In [1]:
import os
import json
from pyspark.sql import SparkSession
import pyspark.pandas as ps

os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'
spark = SparkSession.builder \
    .appName("pandas_udf_by_site") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.cores', '12') \
    .config('spark.executor.memory', '8g') \
    .config('spark.executor.cores', '12') \
    .config('spark.cores.max', '12') \
    .config('spark.driver.host', '192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()



In [2]:
import pyspark
import pandas as pd
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, when, sum as spark_sum, monotonically_increasing_id, countDistinct

In [3]:
df_pandas = pd.read_csv(
    "D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/wat_algorithm/wat_select.csv")
# df_pandas = df_pandas[df_pandas['PRODUCT_ID'].isin(
#     ["AEMNRM01N.0B01", "AEMNE801N.0B01", "AFXNE001N.0C01", "AGKNCE01N.0A01", "AFXNJ701N.0B01"])]
df_pandas = df_pandas[df_pandas['PRODUCT_ID'].isin(["AEMNRM01N.0B01", "AEMNE801N.0B01"])]
df_spark = ps.from_pandas(df_pandas).to_spark()
print(f"df_spark shape: ({df_spark.count()}, {len(df_spark.columns)})")
df_spark.show()

json_loads_dict = {"requestId": "269",
                   "algorithm": "wat_by_wafer",
                   "requestParam": {"dateRange": {"start": "2021-12-06 19:50:49", "end": "2024-03-06 19:50:49"},
                                    "operNo": [],
                                    "uploadId": "84f6a2b46a5443ec9797347424402058",
                                    "flagMergeAllProdg1": "0",
                                    "flagMergeAllProductId": "0",
                                    "flagMergeAllChamber": "0",
                                    "mergeProdg1": [],
                                    # "mergeProductId": [
                                    #     {"xx1": ["AEMNRM01N.0B01", "AEMNE801N.0B01"]},
                                    #     {"xx2": ["AGKNCE01N.0A01", "AFXNJ701N.0B01"]}],
                                    "mergeProductId": [],
                                    "mergeEqp": [],
                                    "mergeChamber": [],
                                    "mergeOperno": [],
                                    "goodSite": ["SITE1_VAL", "SITE2_VAL", "SITE3_VAL"],
                                    "badSite": ["SITE4_VAL", "SITE8_VAL"],
                                    }
                   }

df_ = pd.DataFrame({"requestId": [json_loads_dict["requestId"]],
                    "requestParam": [json.dumps(json_loads_dict["requestParam"])]})

request_id_ = df_["requestId"].values[0]
request_params = df_["requestParam"].values[0]
parse_dict = json.loads(request_params)

merge_operno = list(parse_dict.get('mergeOperno')) if parse_dict.get('mergeOperno') else None
merge_prodg1 = list(parse_dict.get('mergeProdg1')) if parse_dict.get('mergeProdg1') else None
merge_product = list(parse_dict.get('mergeProductId')) if parse_dict.get('mergeProductId') else None
good_site_columns_ = list(parse_dict.get('goodSite')) if parse_dict.get('goodSite') else None
bad_site_columns_ = list(parse_dict.get('badSite')) if parse_dict.get('badSite') else None
grpby_list_ = ['PRODG1']

  fields = [
  for column, series in pdf.iteritems():


df_spark shape: (6368, 156)
+---------------+---------+--------+--------------------+---------+------------+-----------+--------------+----------------+---------+--------------+----------------+---------------------+----------+---------+----------+----------+---------+----------+----------+---------+------------+---------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+--------

In [4]:
class DataPreprocessorForWat:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 grpby_list: list[str],
                 columns_list: list[str],
                 convert_to_numeric_list: list[str],
                 merge_operno_list: List[Dict[str, List[str]]],
                 merge_prodg1_list: List[Dict[str, List[str]]],
                 merge_product_list: List[Dict[str, List[str]]]
                 ):
        self.df = df
        self.grpby_list = grpby_list
        self.columns_list = columns_list
        self.convert_to_numeric_list = convert_to_numeric_list
        self.merge_operno_list = merge_operno_list
        self.merge_prodg1_list = merge_prodg1_list
        self.merge_product_list = merge_product_list

    @staticmethod
    def select_columns(df: pyspark.sql.dataframe, columns_list: list[str]) -> pyspark.sql.dataframe:
        return df.select(columns_list)

    # @staticmethod
    # def exclude_some_data(df: pyspark.sql.dataframe, key_words: list[str],
    #                       certain_column: str) -> pyspark.sql.dataframe:
    #     key_words_str = '|'.join(key_words)
    #     df_filtered = df.filter(~col(certain_column).rlike(key_words_str))
    #     return df_filtered

    @staticmethod
    def pre_process(df: pyspark.sql.dataframe, convert_to_numeric_list: list[str]) -> pyspark.sql.dataframe:
        for column in convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        if 'SITE_COUNT' in convert_to_numeric_list:
            convert_to_numeric_list.remove('SITE_COUNT')
        df = df.dropna(subset=convert_to_numeric_list, how='all')
        return df

    @staticmethod
    def integrate_columns(df: pyspark.sql.dataframe,
                          merge_operno_list: List[Dict[str, List[str]]],
                          merge_prodg1_list: List[Dict[str, List[str]]],
                          merge_product_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :param merge_prodg1_list: A list of dictionaries for merging 'PRODG1' column in a similar fashion.
        :param merge_product_list: A list of dictionaries for merging 'PRODUCT_ID' column in a similar fashion.

        :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
        """
        df_merged = DataPreprocessorForWat.integrate_single_column(df, merge_operno_list, 'OPE_NO')
        df_merged = DataPreprocessorForWat.integrate_single_column(df_merged, merge_prodg1_list, 'PRODG1')
        df_merged = DataPreprocessorForWat.integrate_single_column(df_merged, merge_product_list, 'PRODUCT_ID')
        return df_merged

    @staticmethod
    def integrate_single_column(df: pyspark.sql.dataframe,
                                merge_list: List[Dict[str, List[str]]],
                                column_name: str) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_list: A list of dictionaries where each dictionary contains values to be merged.
        :param column_name: The name of the column to be merged.

        :return: DataFrame with specified column integrated according to the merge rules.
        """
        splitter_comma = ","
        if merge_list is not None and len(merge_list) > 0:
            values_to_replace = [list(rule.values())[0] for rule in merge_list]
            merged_values = [splitter_comma.join(list(rule.values())[0]) for rule in merge_list]

            for values, replacement_value in zip(values_to_replace, merged_values):
                df = df.withColumn(column_name,
                                   when(col(column_name).isin(values), replacement_value).otherwise(col(column_name)))
        return df

    @staticmethod
    def commonality_analysis(df_run: pyspark.sql.dataframe, grpby_list: list[str]) -> pyspark.sql.dataframe:
        grps_all = df_run.groupBy(grpby_list).agg(countDistinct('WAFER_ID').alias('GOOD_NUM'))
        return grps_all

    # @staticmethod
    # def extract_unique_params_within_groups(df: pyspark.sql.dataframe, grpby_list) -> pyspark.sql.dataframe:
    #     grouped = df.groupby(*grpby_list).agg(collect_set('PARAMETRIC_NAME').alias('unique_values'))
    #     exploded = grouped.select(*grpby_list, explode(col('unique_values')).alias('PARAMETRIC_NAME'))
    #     unique_params_within_groups = exploded.dropDuplicates()
    #     return unique_params_within_groups

    @staticmethod
    def add_feature_stats_within_groups(df_integrate: pyspark.sql.dataframe, grpby_list) -> pyspark.sql.dataframe:
        unique_params_within_groups = (df_integrate.groupBy(grpby_list + ['PARAMETRIC_NAME'])
                                       .agg(countDistinct('WAFER_ID').alias('GOOD_NUM'))
                                       .na.fill(0))
        return unique_params_within_groups

    def run(self) -> pyspark.sql.dataframe:
        df_select = self.select_columns(df=self.df, columns_list=self.columns_list)
        df_integrate = self.integrate_columns(df=df_select,
                                              merge_operno_list=self.merge_operno_list,
                                              merge_prodg1_list=self.merge_prodg1_list,
                                              merge_product_list=self.merge_product_list)
        grps_all = self.commonality_analysis(df_run=df_integrate, grpby_list=self.grpby_list)
        add_parametric_stats_df = self.add_feature_stats_within_groups(df_integrate=df_integrate, grpby_list=self.grpby_list)
        df_preprocess = self.pre_process(df=df_integrate, convert_to_numeric_list=self.convert_to_numeric_list)
        return grps_all, add_parametric_stats_df, df_preprocess


class ExtractFeaturesBySite:
    @staticmethod
    def process_missing_values_for_site(df: pd.DataFrame,
                                        good_site_columns: list[str],
                                        bad_site_columns: list[str],
                                        missing_value_threshold: Union[int, float] = 0.7,
                                        process_miss_site_mode: str = 'drop') -> pd.DataFrame:
        assert process_miss_site_mode in ['drop', 'fill']
        site_columns = good_site_columns + bad_site_columns
        if process_miss_site_mode == 'drop':
            # drop rows based on the missing value threshold
            df = df.dropna(subset=site_columns, thresh=missing_value_threshold)
        else:
            # fill missing values in the corresponding site rows using the AVERAGE of that row
            df[site_columns] = df[site_columns].apply(lambda column: column.fillna(df['AVERAGE']))
        return df

    @staticmethod
    def calculate_statistics(row):
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std()})

    @staticmethod
    def calculate_site_stats(df: pd.DataFrame, grpby_list: list[str], site_columns: list[str],
                             good_or_bad: str) -> pd.DataFrame:
        assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
        selected_df = df[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME'] + site_columns].reset_index(drop=True)
        # Perform statistical calculations for each row
        side_features = selected_df.apply(lambda row: ExtractFeaturesBySite.calculate_statistics(row[site_columns]),
                                          axis=1)
        side_features = side_features.fillna(0)
        df_with_features = pd.concat([selected_df, side_features], axis=1)
        if good_or_bad == 'good':
            df_with_features['label'] = 0
        else:
            df_with_features['label'] = 1
        return df_with_features

    @staticmethod
    def extract_features_by_site(df: pd.DataFrame,
                                 grpby_list: list[str],
                                 good_site_columns: list[str],
                                 bad_site_columns: list[str],
                                 missing_value_threshold: Union[int, float] = 0.7,
                                 process_miss_site_mode: str = 'drop') -> Union[pd.DataFrame, None]:
        """
        Extracts features from a DataFrame based on good and bad site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - grp_list: ['OPE_NO'] for most the case.
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Threshold for missing values.
        - process_miss_site_mode (str): Mode for handling missing values in site columns, e.g. drop or fill
        Returns:
        - Union[pd.DataFrame, None]: DataFrame with extracted features or None if no data is available.
        """
        df_pandas_specific_ = ExtractFeaturesBySite.process_missing_values_for_site(df=df,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
        if df_pandas_specific_.shape[0] != 0:
            side_with_features1 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list,
                                                                             good_site_columns,
                                                                             good_or_bad='good')
            side_with_features2 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list,
                                                                             bad_site_columns,
                                                                             good_or_bad='bad')
            side_with_features1_select = side_with_features1[
                grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'label']]
            side_with_features2_select = side_with_features2[
                grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'label']]
            side_with_features_all = pd.concat([side_with_features1_select, side_with_features2_select], axis=0)
            return side_with_features_all

In [20]:
class FitWatModelBySite:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 grpby_list: list[str],
                 good_site_columns: list[str],
                 bad_site_columns: list[str],
                 process_miss_site_mode: str,
                 columns_to_process: list[str],
                 missing_value_threshold: Union[int, float],
                 model: str = 'pca'):
        """
        Initialize the FitInlineModelBySite object.

        Parameters:
        - df: pyspark.sql.dataframe, the input data
        - grpby_list: list[str], the grouping variable, inline data should be ["OPE_NO"] mostly
        - good_site_columns: List of str, column names for good sites
        - bad_site_columns: List of str, column names for bad sites
        - process_miss_site_mode: str, mode for handling missing values in site data, e.g. drop or fill
        - columns_to_process: List of str, columns to process in missing value functions
        - missing_value_threshold: Union[int, float], threshold for missing values
        - model: str, default is 'pca', other options include 'rf' for random forest, 'decisionTree' for decision tree,
                 svc, logistic and sgd.
        """
        self.df = df
        self.grpby_list = grpby_list
        self.good_site_columns = good_site_columns
        self.bad_site_columns = bad_site_columns
        self.process_miss_site_mode = process_miss_site_mode
        self.columns_to_process = columns_to_process
        self.missing_value_threshold = missing_value_threshold
        self.model = model

    @staticmethod
    def process_missing_values(df, columns_to_process, missing_value_threshold):
        for column in columns_to_process:
            missing_percentage = df[column].isnull().mean()
            if missing_percentage > missing_value_threshold:
                df = df.drop(columns=[column])
            else:
                df[column] = df[column].fillna(df[column].mean())
        return df

    @staticmethod
    def get_pivot_table(df, grpby_list, columns_to_process, missing_value_threshold):
        df = FitWatModelBySite.process_missing_values(df, columns_to_process, missing_value_threshold)
        index_list = ['WAFER_ID', 'label']
        columns_list = grpby_list + ['PARAMETRIC_NAME']
        values_list = df.columns.difference(['WAFER_ID', 'PARAMETRIC_NAME', 'label'] + grpby_list)
        pivot_result = df.pivot_table(index=index_list,
                                      columns=columns_list,
                                      values=values_list)
        pivot_result.columns = pivot_result.columns.map('#'.join)
        pivot_result = FitWatModelBySite.process_missing_values(pivot_result, pivot_result.columns,
                                                                missing_value_threshold)
        pivot_result = pivot_result.reset_index(drop=False)
        # Remove completely identical columns
        for column in pivot_result.columns.difference(index_list):
            if pivot_result[column].nunique() == 1:
                pivot_result = pivot_result.drop(column, axis=1)
        return pivot_result

    @staticmethod
    def construct_features_when_not_satisfied(x_train) -> pd.DataFrame:
        x_len = len(x_train.columns)
        res_top_select = pd.DataFrame({"features": x_train.columns,
                                       "importance": [-1.0] * x_len,
                                       "algorithm_satisfied": ['FALSE'] * x_len,
                                       "x_train_shape": [str(x_train.shape)] * x_len})
        return res_top_select

    @staticmethod
    def construct_features_when_satisfy_pca(x_train) -> pd.DataFrame:
        # 得到PCA算法结果res_top_select
        n_components = min(min(x_train.shape) - 2, 20)
        model = pca(n_components=n_components, verbose=None)
        results = model.fit_transform(x_train)
        res_top = results['topfeat']
        res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
        res_top_select['importance'] = abs(res_top_select['loading'])
        res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading",
                                                                                     axis=1).drop_duplicates()
        res_top_select['x_train_shape'] = str(x_train.shape)
        res_top_select['algorithm_satisfied'] = 'TRUE'
        return res_top_select

    @staticmethod
    def fit_pca_model(df: pyspark.sql.dataframe, grpby_list, good_site_columns, bad_site_columns, columns_to_process,
                      process_miss_site_mode,
                      missing_value_threshold) -> pyspark.sql.dataframe:
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True),
                                 StructField("algorithm_satisfied", StringType(), True),
                                 StructField("x_train_shape", StringType(), True)])

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
                                                                                    grpby_list=grpby_list,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
            # 如果df_run中的good_site_columns和bad_site_columns的每列缺失比例都大于70%, 则无法提取特征, side_with_features_all就是None
            if side_with_features_all is None:
                grpby_values = [df_run[item].iloc[0] for item in grpby_list]
                features_value = f"STATS#{'#'.join(map(str, grpby_values))}#PARAM"
                res_top_select = pd.DataFrame({"features": features_value,
                                               "importance": -2.0,
                                               "algorithm_satisfied": 'FALSE',
                                               "x_train_shape": str(0)}, index=[0])
                return res_top_select

            pivot_result = FitWatModelBySite.get_pivot_table(df=side_with_features_all,
                                                             grpby_list=grpby_list,
                                                             columns_to_process=columns_to_process,
                                                             missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

            if min(x_train.shape) > 2:
                res_top_select = FitWatModelBySite.construct_features_when_satisfy_pca(x_train=x_train)
                return res_top_select
            else:
                res_top_select = FitWatModelBySite.construct_features_when_not_satisfied(x_train=x_train)
                return res_top_select

        return df.groupby(grpby_list).apply(get_model_result)

    def run(self):
        if self.model == 'pca':
            res = self.fit_pca_model(df=self.df, grpby_list=self.grpby_list,
                                     good_site_columns=self.good_site_columns,
                                     bad_site_columns=self.bad_site_columns,
                                     columns_to_process=self.columns_to_process,
                                     process_miss_site_mode=self.process_miss_site_mode,
                                     missing_value_threshold=self.missing_value_threshold)
        else:
            res = None
        return res

In [7]:
good_site_columns = list(set(good_site_columns_))
bad_site_columns = list(set(bad_site_columns_))
site_columns = good_site_columns + bad_site_columns
site_columns

['SITE2_VAL', 'SITE3_VAL', 'SITE1_VAL', 'SITE8_VAL', 'SITE4_VAL']

In [8]:
columns_list = ['WAFER_ID', 'PRODG1', 'OPE_NO', 'PRODUCT_ID', 'PARAMETRIC_NAME', 'SITE_COUNT', 'AVERAGE'] + site_columns
convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns

In [13]:
grps_all, add_parametric_stats_df, df_preprocess = DataPreprocessorForWat(df=df_spark,
                                                                          grpby_list=grpby_list_,
                                                                          columns_list=columns_list,
                                                                          convert_to_numeric_list=convert_to_numeric_list,
                                                                          merge_operno_list=merge_operno,
                                                                          merge_prodg1_list=merge_prodg1,
                                                                          merge_product_list=merge_product).run()

In [15]:
grps_all.show()

+------+--------+
|PRODG1|GOOD_NUM|
+------+--------+
|  HGHJ|      13|
+------+--------+



In [16]:
print("unique_params_within_groups:", add_parametric_stats_df.count())
add_parametric_stats_df.show(10)

unique_params_within_groups: 424
+------+--------------------+--------+
|PRODG1|     PARAMETRIC_NAME|GOOD_NUM|
+------+--------------------+--------+
|  HGHJ|            RsM2_p09|      13|
|  HGHJ|        IfSF1_p28p45|       5|
|  HGHJ|           VtcsCMPN1|      13|
|  HGHJ|Idxdec_levelshift...|       8|
|  HGHJ|          BVjpPHLNLW|      13|
|  HGHJ|         ToxLPLW_inv|      13|
|  HGHJ|            RsLNW_2S|       5|
|  HGHJ|    RcSTK_1CV1_Npoly|       8|
|  HGHJ|          IdLP_p4p09|      13|
|  HGHJ|            RsLPW_2S|       5|
+------+--------------------+--------+
only showing top 10 rows



In [17]:
print("df_preprocess:", df_preprocess.count())
df_preprocess.show()

df_preprocess: 6368
+---------+------+--------+--------------+--------------------+----------+------------+---------+---------+---------+---------+---------+
| WAFER_ID|PRODG1|  OPE_NO|    PRODUCT_ID|     PARAMETRIC_NAME|SITE_COUNT|     AVERAGE|SITE2_VAL|SITE3_VAL|SITE1_VAL|SITE8_VAL|SITE4_VAL|
+---------+------+--------+--------------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|NHM078-15|  HGHJ|ST.TTS10|AEMNRM01N.0B01|         BVcbMLPNP_5|       9.0|       -14.6|    -14.6|    -14.6|    -14.6|    -14.6|    -14.6|
|NHM078-15|  HGHJ|ST.TTS10|AEMNRM01N.0B01|         BVceMLPNP_5|       9.0|       -15.2|    -15.2|    -15.2|    -15.2|    -15.2|    -15.2|
|NHM078-15|  HGHJ|ST.TTS10|AEMNRM01N.0B01|   BVjDNWDNW_S3_10nA|       9.0| 14.55555556|     14.6|     14.4|     14.4|     14.6|     14.6|
|NHM078-15|  HGHJ|ST.TTS10|AEMNRM01N.0B01|    BVjDNWNW_S2_10nA|       9.0| 14.53333333|     14.6|     14.4|     14.4|     14.6|     14.6|
|NHM078-15|  H

In [22]:
df_preprocess_pandas = df_preprocess.toPandas()
df_preprocess_pandas

Unnamed: 0,WAFER_ID,PRODG1,OPE_NO,PRODUCT_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE2_VAL,SITE3_VAL,SITE1_VAL,SITE8_VAL,SITE4_VAL
0,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVcbMLPNP_5,9.0,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000
1,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVceMLPNP_5,9.0,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000
2,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjDNWDNW_S3_10nA,9.0,14.555556,14.600000,14.400000,14.400000,14.600000,14.600000
3,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjDNWNW_S2_10nA,9.0,14.533333,14.600000,14.400000,14.400000,14.600000,14.600000
4,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjGR_S2_10nA,9.0,14.488889,14.400000,14.400000,14.400000,14.600000,14.400000
...,...,...,...,...,...,...,...,...,...,...,...,...
6363,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsTX_p7p38,9.0,-0.056430,-0.063450,-0.007673,-0.042231,-0.065479,-0.043970
6364,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsVLN1,9.0,0.192239,0.194373,0.195248,0.189287,0.190760,0.185206
6365,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsVLN2,9.0,0.190023,0.190073,0.188890,0.189869,0.192523,0.182971
6366,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,Vtcsxdec_levelshift_nfet3,9.0,0.559120,0.550910,0.544319,0.544663,0.582070,0.546110


In [24]:
prodg1 = 'HGHJ'
df_preprocess_pandas1 = df_preprocess_pandas.query(f"PRODG1 == '{prodg1}'")
df_preprocess_pandas1

Unnamed: 0,WAFER_ID,PRODG1,OPE_NO,PRODUCT_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE2_VAL,SITE3_VAL,SITE1_VAL,SITE8_VAL,SITE4_VAL
0,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVcbMLPNP_5,9.0,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000
1,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVceMLPNP_5,9.0,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000
2,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjDNWDNW_S3_10nA,9.0,14.555556,14.600000,14.400000,14.400000,14.600000,14.600000
3,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjDNWNW_S2_10nA,9.0,14.533333,14.600000,14.400000,14.400000,14.600000,14.600000
4,NHM078-15,HGHJ,ST.TTS10,AEMNRM01N.0B01,BVjGR_S2_10nA,9.0,14.488889,14.400000,14.400000,14.400000,14.600000,14.400000
...,...,...,...,...,...,...,...,...,...,...,...,...
6363,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsTX_p7p38,9.0,-0.056430,-0.063450,-0.007673,-0.042231,-0.065479,-0.043970
6364,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsVLN1,9.0,0.192239,0.194373,0.195248,0.189287,0.190760,0.185206
6365,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,VtcsVLN2,9.0,0.190023,0.190073,0.188890,0.189869,0.192523,0.182971
6366,NHH106-23,HGHJ,ST.TTS10,AEMNRM01N.0B01,Vtcsxdec_levelshift_nfet3,9.0,0.559120,0.550910,0.544319,0.544663,0.582070,0.546110


In [28]:
side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_preprocess_pandas1,
                                                                        grpby_list=grpby_list_,
                                                                        good_site_columns=good_site_columns,
                                                                        bad_site_columns=bad_site_columns,
                                                                        missing_value_threshold=0.7,
                                                                        process_miss_site_mode='drop')
side_with_features_all

Unnamed: 0,PRODG1,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,label
0,HGHJ,NHM078-15,BVcbMLPNP_5,-14.600000,-14.600000,-14.600000,-14.600000,0.000000e+00,0
1,HGHJ,NHM078-15,BVceMLPNP_5,-15.200000,-15.200000,-15.200000,-15.200000,2.175584e-15,0
2,HGHJ,NHM078-15,BVjDNWDNW_S3_10nA,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
3,HGHJ,NHM078-15,BVjDNWNW_S2_10nA,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
4,HGHJ,NHM078-15,BVjGR_S2_10nA,14.400000,14.400000,14.400000,14.400000,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...
6363,HGHJ,NHH106-23,VtcsTX_p7p38,-0.043970,-0.065479,-0.054724,-0.054724,1.520915e-02,1
6364,HGHJ,NHH106-23,VtcsVLN1,0.190760,0.185206,0.187983,0.187983,3.927625e-03,1
6365,HGHJ,NHH106-23,VtcsVLN2,0.192523,0.182971,0.187747,0.187747,6.754213e-03,1
6366,HGHJ,NHH106-23,Vtcsxdec_levelshift_nfet3,0.582070,0.546110,0.564090,0.564090,2.542770e-02,1


In [32]:
pivot_result = FitWatModelBySite.get_pivot_table(df=side_with_features_all,
                                             grpby_list=grpby_list_,
                                             columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV'],
                                             missing_value_threshold=0.7)
x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
pivot_result

Unnamed: 0,WAFER_ID,label,AVERAGE#HGHJ#BVcbMLPNP_5,AVERAGE#HGHJ#BVceMLPNP_5,AVERAGE#HGHJ#BVjDNWDNW_S3_10nA,AVERAGE#HGHJ#BVjDNWNW_S2_10nA,AVERAGE#HGHJ#BVjGR_S2_10nA,AVERAGE#HGHJ#BVjLN+LN+_I_p14_10nA,AVERAGE#HGHJ#BVjLP+LP+_I_p14_10nA,AVERAGE#HGHJ#BVjMN+MN+_I_p18_10nA,...,STD_DEV#HGHJ#VtcsTXB_p5p35,STD_DEV#HGHJ#VtcsTXC_p5p35,STD_DEV#HGHJ#VtcsTXD_p5p35,STD_DEV#HGHJ#VtcsTX_p7p38,STD_DEV#HGHJ#VtcsVLN1,STD_DEV#HGHJ#VtcsVLN2,STD_DEV#HGHJ#Vtcsxdec_levelshift_nch33,STD_DEV#HGHJ#Vtcsxdec_levelshift_nch33_2,STD_DEV#HGHJ#Vtcsxdec_levelshift_nfet3,STD_DEV#HGHJ#Vtcsxdec_levelshift_nfetbc3
0,NHH106-13,0,-14.666667,-15.266667,14.666667,14.666667,14.466667,9.666667,-8.2,9.6,...,0.066452,0.030716,0.041656,0.05686,0.00313,0.003908,0.011099,0.006735,0.017937,0.009665
1,NHH106-13,1,-14.8,-15.4,14.7,14.8,14.6,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.018893,0.006472,0.004226,0.011099,0.006735,0.025219,0.009597
2,NHH106-18,0,-14.666667,-15.2,14.6,14.6,14.466667,9.666667,-8.2,9.6,...,0.066452,0.030716,0.041656,0.034039,0.001569,0.001017,0.011099,0.006735,0.023337,0.004799
3,NHH106-18,1,-14.8,-15.3,14.6,14.6,14.6,9.8,-8.2,9.6,...,0.066452,0.030716,0.041656,0.033479,0.007973,0.005546,0.011099,0.006735,0.005224,0.001712
4,NHH106-23,0,-14.666667,-15.266667,14.6,14.666667,14.466667,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.028153,0.003219,0.000632,0.011099,0.006735,0.00371,0.00875
5,NHH106-23,1,-14.8,-15.3,14.7,14.6,14.6,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.015209,0.003928,0.006754,0.011099,0.006735,0.025428,0.001499
6,NHM078-01,0,-14.6,-15.2,14.6,14.6,14.4,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.048107,0.00384,0.004481,0.011099,0.006735,0.017221,0.007024
7,NHM078-01,1,-14.6,-15.2,14.6,14.6,14.5,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.004159,0.004594,0.00388,0.011099,0.006735,0.016195,0.00114
8,NHM078-03,0,-14.6,-15.2,14.6,14.533333,14.4,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.025562,0.001884,0.002578,0.011099,0.006735,0.008645,0.00426
9,NHM078-03,1,-14.6,-15.2,14.6,14.6,14.4,9.6,-8.2,9.6,...,0.066452,0.030716,0.041656,0.043692,0.0016,0.004213,0.011099,0.006735,0.005035,0.000854


In [48]:
pivot_result1 = FitWatModelBySite.get_pivot_table(df=side_with_features_all,
                                             grpby_list=grpby_list_,
                                             columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV'],
                                             missing_value_threshold=0.7)
x_train1 = pivot_result1[pivot_result1.columns.difference(['WAFER_ID', 'label']).tolist()]

In [49]:
x_train.equals(x_train1)

True

In [33]:
res_top_select = FitWatModelBySite.construct_features_when_satisfy_pca(x_train=x_train)
res_top_select

Unnamed: 0,features,importance,x_train_shape,algorithm_satisfied
0,AVERAGE#HGHJ#BVjpPHLNLW,0.419209,"(26, 2118)",True
1,MIN_VAL#HGHJ#BVjpPHLNW,0.465532,"(26, 2118)",True
2,STD_DEV#HGHJ#BVjpPHLNW,0.651756,"(26, 2118)",True
3,MEDIAN#HGHJ#BVjpPHLNW,0.773884,"(26, 2118)",True
4,STD_DEV#HGHJ#BVjpPHLNLW,0.749859,"(26, 2118)",True
5,STD_DEV#HGHJ#BVjpPHLNW,0.50398,"(26, 2118)",True
6,AVERAGE#HGHJ#BVjpPHLNW,0.642288,"(26, 2118)",True
7,AVERAGE#HGHJ#BVcbMLPNP_5,0.956127,"(26, 2118)",True
8,AVERAGE#HGHJ#BVceMLPNP_5,0.798123,"(26, 2118)",True
9,MEDIAN#HGHJ#RsNL_2,0.538975,"(26, 2118)",True


In [34]:
res_top_select = FitWatModelBySite.construct_features_when_satisfy_pca(x_train=x_train)
res_top_select

Unnamed: 0,features,importance,x_train_shape,algorithm_satisfied
0,AVERAGE#HGHJ#BVjpPHLNLW,0.419209,"(26, 2118)",True
1,MIN_VAL#HGHJ#BVjpPHLNW,0.465532,"(26, 2118)",True
2,STD_DEV#HGHJ#BVjpPHLNW,0.651756,"(26, 2118)",True
3,MEDIAN#HGHJ#BVjpPHLNW,0.773884,"(26, 2118)",True
4,STD_DEV#HGHJ#BVjpPHLNLW,0.586065,"(26, 2118)",True
5,STD_DEV#HGHJ#BVjpPHLNLW,0.647826,"(26, 2118)",True
6,AVERAGE#HGHJ#BVcbMLPNP_5,0.669555,"(26, 2118)",True
7,AVERAGE#HGHJ#BVjpPHLNLW,0.58125,"(26, 2118)",True
8,AVERAGE#HGHJ#BVceMLPNP_5,0.904031,"(26, 2118)",True
9,MEDIAN#HGHJ#RsNL_2,0.429226,"(26, 2118)",True


In [58]:
n_components = min(min(x_train.shape) - 2, 20)
model = pca(n_components=n_components, verbose=None, random_state=0)
results = model.fit_transform(x_train)
res_top = results['topfeat']
res_top_select1 = res_top[res_top['type'] == 'best'][['feature', 'loading']]
res_top_select1.sort_values('loading')

Unnamed: 0,feature,loading
3,MEDIAN#HGHJ#BVjpPHLNW,-0.773884
10,MAX_VAL#HGHJ#RcBV_0520,-0.707106
5,AVERAGE#HGHJ#BVjpPHLNLW,-0.582105
17,MAX_VAL#HGHJ#RsPL_2,-0.496356
1,MIN_VAL#HGHJ#BVjpPHLNW,-0.465532
12,MEDIAN#HGHJ#RsPL_2,-0.40972
18,MAX_VAL#HGHJ#IdNPASS1_p1p1,0.288339
13,MAX_VAL#HGHJ#RsPL_2,0.322771
9,MEDIAN#HGHJ#RsPL_2,0.366173
16,MIN_VAL#HGHJ#RsPL_2,0.368068


In [57]:
n_components = min(min(x_train.shape) - 2, 20)
model = pca(n_components=n_components, verbose=None, random_state=0)
results = model.fit_transform(x_train)
res_top = results['topfeat']
res_top_select2 = res_top[res_top['type'] == 'best'][['feature', 'loading']]
res_top_select2.sort_values('loading')

Unnamed: 0,feature,loading
3,MEDIAN#HGHJ#BVjpPHLNW,-0.773884
10,MAX_VAL#HGHJ#RcBV_0520,-0.707106
5,AVERAGE#HGHJ#BVjpPHLNLW,-0.582105
17,MAX_VAL#HGHJ#RsPL_2,-0.496356
1,MIN_VAL#HGHJ#BVjpPHLNW,-0.465532
12,MEDIAN#HGHJ#RsPL_2,-0.40972
18,MAX_VAL#HGHJ#IdNPASS1_p1p1,0.288339
13,MAX_VAL#HGHJ#RsPL_2,0.322771
9,MEDIAN#HGHJ#RsPL_2,0.366173
16,MIN_VAL#HGHJ#RsPL_2,0.368068


In [59]:
res_top_select1.equals(res_top_select2)

True