In [167]:
import pyspark
import pandas as pd
import pyspark.pandas as ps
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, when

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

import json

In [3]:
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '1024m') \
    .config('spark.driver.cores', '3') \
    .config('spark.executor.memory', '1024m') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '2') \
    .config('spark.driver.host','192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [4]:
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/inline_algorithm/codes_version6/inline_test_data2_bysite.csv")
df_pandas

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,MEASURE_TIME,RANGE_INDEX,FAB_ID,PRODUCT_ID,LOT_ID,AVERAGE,MAX_VAL,...,ACT_CODE,ETL_INSERT_TIME,ETL_ARC_FLAG,ETL_BATCH_SYNC_TS,ETL_DEL_FLAG,ETL_DS_JOB_NM,ETL_SRC_DB,ETL_SRC_TBL,ETL_TBL_OPER_TS,label
0,NBX219-17,1V.PQA10,MCW0,2023-09-07 16:38:00,0,N1,AFPNR901N.0B0J,NBX219000,0.000000,,...,,2023-09-07 16:47:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,0
1,NBX219-17,1U.CDG20,OEW0,2023-08-31 23:37:00,0,N1,AFPNR901N.0B0J,NBX219000,6000.092727,,...,,2023-08-31 23:47:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,0
2,NBX219-17,1U.CDG20,PEW0,2023-08-31 23:37:00,0,N1,AFPNR901N.0B0J,NBX219000,6.999584,,...,,2023-08-31 23:47:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,0
3,NBX219-17,1U.CDG20,PTW0,2023-08-31 23:37:00,0,N1,AFPNR901N.0B0J,NBX219000,5500.000000,,...,,2023-08-31 23:47:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,0
4,NBX219-17,1U.CDG20,REW0,2023-08-31 23:37:00,0,N1,AFPNR901N.0B0J,NBX219000,198.846013,,...,,2023-08-31 23:47:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11489,NAZ415-13,1U.CDG10,HFT0,2022-12-07 00:11:00,0,N1,AFPNM301N.0A01,NAZ415000,0.930000,,...,,2023-05-29 04:40:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,1
11490,NAZ415-13,1U.CDG10,OEW0,2022-12-07 00:11:00,0,N1,AFPNM301N.0A01,NAZ415000,6000.166667,,...,,2023-05-29 04:40:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,1
11491,NAZ415-13,1U.CDG10,PEW0,2022-12-07 00:11:00,0,N1,AFPNM301N.0A01,NAZ415000,7.008722,,...,,2023-05-29 04:40:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,1
11492,NAZ415-13,1U.CDG10,PTW0,2022-12-07 00:11:00,0,N1,AFPNM301N.0A01,NAZ415000,930.000000,,...,,2023-05-29 04:40:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,1


In [5]:
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

11494

In [6]:
def parse_JSON_config(df: pd.DataFrame):
    request_id = df["requestId"].values[0]
    request_params = df["requestParam"].values[0]
    parse_dict = json.loads(request_params)

    # PRODUCT_ID, PROG1, EQP, CHAMBER, OPER_NO存在部分合并的情况
    try:
        # OPER_NO的部分合并结果
        merge_operno = list(parse_dict.get('mergeOperno')) if parse_dict.get('mergeOperno') else None
    except KeyError:
        merge_operno = None

    try:
        # PROG1的部分合并结果
        merge_prodg1 = list(parse_dict.get('mergeProdg1')) if parse_dict.get('mergeProdg1') else None
    except KeyError:
        merge_prodg1 = None

    try:
        # PRODUCT_ID的部分合并结果
        merge_product = list(parse_dict.get('mergeProductId')) if parse_dict.get('mergeProductId') else None
    except KeyError:
        merge_product = None

    try:
        # EQP的部分合并结果
        merge_eqp = list(parse_dict.get('mergeEqp')) if parse_dict.get('mergeEqp') else None
    except KeyError:
        merge_eqp = None

    try:
        # CHAMBER的部分合并结果
        merge_chamber = list(parse_dict.get('mergeChamber')) if parse_dict.get('mergeChamber') else None
    except KeyError:
        merge_chamber = None

    # 获取good_site和bad_site
    try:
        good_site = list(parse_dict.get('goodSite')) if parse_dict.get('goodSite') else None
    except KeyError:
        good_site = None

    try:
        bad_site = list(parse_dict.get('badSite')) if parse_dict.get('badSite') else None
    except KeyError:
        bad_site = None

    # group by 子句中的字段
    group_by_list = parse_dict.get("groupByList")
    if group_by_list is None or len(group_by_list) == 0:
        group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
        # PRODUCT_ID, PROG1, CHAMBER 这3个存在一键合并的切换开关
        # 且一键合并PROG1时会自动一键合并PRODUCT_ID
        flag_merge_prodg1 = parse_dict.get('flagMergeAllProdg1')
        flag_merge_product_id = parse_dict.get('flagMergeAllProductId')
        flag_merge_chamber = parse_dict.get('flagMergeAllChamber')

        if flag_merge_prodg1 == '1':
            # 一键合并PROG1时，部分合并PROG1和PRODUCT_ID的情况都会被忽略
            merge_prodg1 = None
            merge_product = None
            group_by_list = ['OPER_NO', "EQP_NAME", 'TOOL_NAME']
            if flag_merge_chamber == '1':
                group_by_list = ['OPER_NO', "EQP_NAME"]
        elif flag_merge_product_id == '1':
            # 一键合并PRODUCT_ID时，部分合并PRODUCT_ID的情况会被忽略
            merge_product = None
            group_by_list = ["PRODG1", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
            if flag_merge_chamber == '1':
                # 一键合并CHAMBER时，部分合并CHAMBER的情况会被忽略
                group_by_list = ["PRODG1", 'OPER_NO', "EQP_NAME"]
        elif flag_merge_chamber == '1':
            merge_chamber = None
            group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME"]

    return parse_dict, request_id, group_by_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site

In [134]:
json_config_ = {"requestId": "269",
                    "algorithm": "inline_by_wafer",
                    "requestParam": {"dateRange": {"start": "2021-12-06 19:50:49",
                                                   "end": "2024-03-06 19:50:49"},
                                     "operNo": ["1U.CDG10", "1U.CDG20", "1V.PQA10", "2U.PQA10", "2V.PQW10", "3U.PQA10",
                                                "6V.CDG10", "7U.PQA10",
                                                "7U.PQX10", "TM.PQX10", "XX.PQW01", "XX.PQX02", "1U.EQW20", "1U.PQW10",
                                                "1U.PQX10", "1V.PQX10",
                                                "1V.PQX20", "2U.PQW10", "2U.PQX10"],
                                     "uploadId": "84f6a2b46a5443ec9797347424402058",
                                     "flagMergeAllProdg1": "0",
                                     "flagMergeAllProductId": "0",
                                     "flagMergeAllChamber": "0",
                                     "mergeProdg1": [],
                                     "mergeProductId": [],
                                     "mergeEqp": [],
                                     "mergeChamber": [],
                                     "mergeOperno": [{
                                         "1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10":
                                             ["1U.CDG10", "1U.CDG20", "1V.PQA10", "2U.PQA10",
                                              "2V.PQW10", "3U.PQA10", "6V.CDG10", "7U.PQA10", "7U.PQX10",
                                              "TM.PQX10", "XX.PQW01", "XX.PQX02", "1U.EQW20", "1U.PQW10",
                                              "1U.PQX10", "1V.PQX10", "1V.PQX20", "2U.PQW10", "2U.PQX10"]}],
                                     "goodSite": ["SITE41_VAL", "SITE18_VAL", "SITE19_VAL", "SITE12_VAL", "SITE22_VAL"],
                                     "badSite": ["SITE2_VAL", "SITE6_VAL", "SITE7_VAL", "SITE10_VAL", "SITE11_VAL"],
                                     }
                    }

In [135]:
df_info_ = pd.DataFrame({"requestId": [json_config_["requestId"]],
                         "requestParam": [json.dumps(json_config_["requestParam"])]})

# 解析JSON并且读取数据
parse_dict, request_id, grpby_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site = parse_JSON_config(
    df_info_)
print("parse_dict:")
print(parse_dict)
print("request_id:")
print(request_id)
print("grpby_list:")
print(grpby_list)
print("merge_operno:")
print(merge_operno)
print("merge_prodg1:")
print(merge_prodg1)
print("merge_product:")
print(merge_product)
print("merge_eqp:")
print(merge_eqp)
print("merge_chamber:")
print(merge_chamber)
print("good_site:")
print(good_site)
print("bad_site:")
print(bad_site)

parse_dict:
{'dateRange': {'start': '2021-12-06 19:50:49', 'end': '2024-03-06 19:50:49'}, 'operNo': ['1U.CDG10', '1U.CDG20', '1V.PQA10', '2U.PQA10', '2V.PQW10', '3U.PQA10', '6V.CDG10', '7U.PQA10', '7U.PQX10', 'TM.PQX10', 'XX.PQW01', 'XX.PQX02', '1U.EQW20', '1U.PQW10', '1U.PQX10', '1V.PQX10', '1V.PQX20', '2U.PQW10', '2U.PQX10'], 'uploadId': '84f6a2b46a5443ec9797347424402058', 'flagMergeAllProdg1': '0', 'flagMergeAllProductId': '0', 'flagMergeAllChamber': '0', 'mergeProdg1': [], 'mergeProductId': [], 'mergeEqp': [], 'mergeChamber': [], 'mergeOperno': [{'1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10': ['1U.CDG10', '1U.CDG20', '1V.PQA10', '2U.PQA10', '2V.PQW10', '3U.PQA10', '6V.CDG10', '7U.PQA10', '7U.PQX10', 'TM.PQX10', 'XX.PQW01', 'XX.PQX02', '1U.EQW20', '1U.PQW10', '1U.PQX10', '1V.PQX10', '1V.PQX20', '2U.PQW10', '2U.PQX10']}], 'goodSite': ['SITE41_VAL', 'SITE18_VA

### DataPreprocessorForInline

In [125]:
class DataPreprocessorForInline:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 columns_list: list[str],
                 certain_column: str,
                 key_words: list[str],
                 convert_to_numeric_list: list[str],
                 merge_operno_list: List[Dict[str, List[str]]]):
        self.df = df
        self.columns_list = columns_list
        self.certain_column = certain_column
        self.key_words = key_words
        self.convert_to_numeric_list = convert_to_numeric_list
        self.merge_operno_list = merge_operno_list

    @staticmethod
    def select_columns(df: pyspark.sql.dataframe, columns_list: list[str]) -> pyspark.sql.dataframe:
        return df.select(columns_list)

    @staticmethod
    def exclude_some_data(df: pyspark.sql.dataframe, key_words: list[str], certain_column: str) -> pyspark.sql.dataframe:
        key_words_str = '|'.join(key_words)
        df_filtered = df.filter(~col(certain_column).rlike(key_words_str))
        return df_filtered

    @staticmethod
    def pre_process(df: pyspark.sql.dataframe, convert_to_numeric_list: list[str]) -> pyspark.sql.dataframe:
        for column in convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        if 'SITE_COUNT' in convert_to_numeric_list:
            convert_to_numeric_list.remove('SITE_COUNT')
        df = df.dropna(subset=convert_to_numeric_list, how='all')
        return df

    @staticmethod
    def integrate_columns(df: pyspark.sql.dataframe, merge_operno_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
        """
        # split using comma
        splitter_comma = ","
        if merge_operno_list is not None and len(merge_operno_list) > 0:
            print('safsav')
            values_to_replace = [list(rule.values())[0] for rule in merge_operno_list]
            merged_values = [splitter_comma.join(list(rule.values())[0]) for rule in merge_operno_list]

            for values, replacement_value in zip(values_to_replace, merged_values):
                df = df.withColumn("OPE_NO", when(col("OPE_NO").isin(values), replacement_value).otherwise(col("OPE_NO")))
        return df

    def run(self) -> pyspark.sql.dataframe:
        df_select = self.select_columns(df=self.df, columns_list=self.columns_list)
        df_esd = self.exclude_some_data(df=df_select, key_words=self.key_words, certain_column=self.certain_column)
        df_pp = self.pre_process(df=df_esd, convert_to_numeric_list=self.convert_to_numeric_list)
        df_integrate = self.integrate_columns(df=df_pp, merge_operno_list=self.merge_operno_list)
        return df_integrate

In [144]:
good_site_columns = good_site
bad_site_columns = bad_site

good_site_columns = list(set(good_site_columns))
bad_site_columns = list(set(bad_site_columns))
site_columns = good_site_columns + bad_site_columns


grpby_list = ['PRODUCT_ID', 'OPE_NO']
# grpby_list = ['OPE_NO']

columns_list = grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'AVERAGE'] + site_columns
key_words = ['CXS', 'CYS', 'FDS']
convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns
certain_column = 'INLINE_PARAMETER_ID'
merge_operno_list = merge_operno

df_preprocess = DataPreprocessorForInline(df=df1,
                                          columns_list=columns_list,
                                          certain_column=certain_column,
                                          key_words=key_words,
                                          convert_to_numeric_list=convert_to_numeric_list,
                                          merge_operno_list=merge_operno_list).run()

safsav


In [203]:
df_preprocess_pandas = df_preprocess.toPandas()
df_preprocess_pandas

Unnamed: 0,PRODUCT_ID,OPE_NO,WAFER_ID,INLINE_PARAMETER_ID,SITE_COUNT,AVERAGE,SITE12_VAL,SITE18_VAL,SITE22_VAL,SITE41_VAL,SITE19_VAL,SITE10_VAL,SITE11_VAL,SITE2_VAL,SITE6_VAL,SITE7_VAL
0,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,MCW0,,0.000000,,,,,,,,,,
1,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,OEW0,,6000.092727,,,,,,,,,,
2,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,PEW0,,6.999584,,,,,,,,,,
3,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,PTW0,,5500.000000,,,,,,,,,,
4,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,REW0,,198.846013,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11398,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,HFT0,,0.930000,,,,,,,,,,
11399,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,OEW0,,6000.166667,,,,,,,,,,
11400,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,PEW0,,7.008722,,,,,,,,,,
11401,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,PTW0,,930.000000,,,,,,,,,,


### ExtractFeaturesBySite

In [204]:
class ExtractFeaturesBySite:
    @staticmethod
    def process_missing_values_for_site(df: pd.DataFrame,
                                        good_site_columns: list[str],
                                        bad_site_columns: list[str],
                                        missing_value_threshold: Union[int, float] = 0.6,
                                        process_miss_site_mode: str = 'drop') -> pd.DataFrame:
        assert process_miss_site_mode in ['drop', 'fill']
        site_columns = good_site_columns + bad_site_columns
        if process_miss_site_mode == 'drop':
            # drop rows based on the missing value threshold
            df = df.dropna(subset=site_columns, thresh=missing_value_threshold)
        else:
            # fill missing values in the corresponding site rows using the AVERAGE of that row
            df[site_columns] = df[site_columns].apply(lambda column: column.fillna(df['AVERAGE']))
        return df

    @staticmethod
    def calculate_statistics(row):
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std(),
            'PERCENTILE_25': row.quantile(0.25),
            'PERCENTILE_75': row.quantile(0.75)})

    @staticmethod
    def calculate_site_stats(df: pd.DataFrame, grpby_list: list[str], site_columns: list[str], good_or_bad: str) -> pd.DataFrame:
        assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
        selected_df = df[grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID'] + site_columns].reset_index(drop=True)
        # Perform statistical calculations for each row
        side_features = selected_df.apply(lambda row: ExtractFeaturesBySite.calculate_statistics(row[site_columns]), axis=1)
        side_features = side_features.fillna(0)
        df_with_features = pd.concat([selected_df, side_features], axis=1)
        if good_or_bad == 'good':
            df_with_features['label'] = 0
        else:
            df_with_features['label'] = 1
        return df_with_features

    @staticmethod
    def extract_features_by_site(df: pd.DataFrame,
                                 grpby_list: list[str],
                                 good_site_columns: list[str],
                                 bad_site_columns: list[str],
                                 missing_value_threshold: Union[int, float] = 0.6,
                                 process_miss_site_mode: str = 'drop') -> Union[pd.DataFrame, None]:
        """
        Extracts features from a DataFrame based on good and bad site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - grp_list: ['OPE_NO'] for most the case.
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Threshold for missing values.
        - process_miss_site_mode (str): Mode for handling missing values in site columns, e.g. drop or fill
        Returns:
        - Union[pd.DataFrame, None]: DataFrame with extracted features or None if no data is available.
        """
        df_pandas_specific_ = ExtractFeaturesBySite.process_missing_values_for_site(df=df,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
        if df_pandas_specific_.shape[0] != 0:
            side_with_features1 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list, good_site_columns,
                                                                             good_or_bad='good')
            side_with_features2 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list, bad_site_columns,
                                                                             good_or_bad='bad')
            side_with_features1_select = side_with_features1[
                grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features2_select = side_with_features2[
                grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features_all = pd.concat([side_with_features1_select, side_with_features2_select], axis=0)
            return side_with_features_all

In [205]:
print(df_preprocess_pandas['OPE_NO'].unique())
print(df_preprocess_pandas['PRODUCT_ID'].unique())
print(grpby_list)

['1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10']
['AFPNR901N.0B0J' 'AFPNR901N.0B01' 'AFPNM301N.0B01' 'AFPNM301N.0A01']
['PRODUCT_ID', 'OPE_NO']


In [174]:
missing_value_threshold=0.6
process_miss_site_mode='drop'
print(good_site_columns)
print(bad_site_columns)

['SITE12_VAL', 'SITE18_VAL', 'SITE22_VAL', 'SITE41_VAL', 'SITE19_VAL']
['SITE10_VAL', 'SITE11_VAL', 'SITE2_VAL', 'SITE6_VAL', 'SITE7_VAL']


In [212]:
oper = '1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10'
prod = 'AFPNM301N.0A01'
df_pandas_specific = df_preprocess_pandas.query(f"OPE_NO == '{oper}' & PRODUCT_ID == '{prod}'")
print(df_pandas_specific.shape)

side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_pandas_specific,
                                                                        grpby_list=grpby_list,
                                                                        good_site_columns=good_site_columns,
                                                                        bad_site_columns=bad_site_columns,
                                                                        missing_value_threshold=missing_value_threshold,
                                                                        process_miss_site_mode=process_miss_site_mode)
side_with_features_all

(3082, 16)


Unnamed: 0,PRODUCT_ID,OPE_NO,WAFER_ID,INLINE_PARAMETER_ID,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-06,QY03,0.006801,0.006801,0.006801,0.006801,0.000000,0.006801,0.006801,0
1,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-06,QY04,-0.003730,-0.003730,-0.003730,-0.003730,0.000000,-0.003730,-0.003730,0
2,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-06,QY05,0.003688,0.003688,0.003688,0.003688,0.000000,0.003688,0.003688,0
3,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-06,QX01,-0.002484,-0.002484,-0.002484,-0.002484,0.000000,-0.002484,-0.002484,0
4,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-06,QX02,-0.002548,-0.002548,-0.002548,-0.002548,0.000000,-0.002548,-0.002548,0
...,...,...,...,...,...,...,...,...,...,...,...,...
595,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX01,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
596,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX02,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
597,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX03,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
598,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX04,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1


In [213]:
columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV','PERCENTILE_25', 'PERCENTILE_75']
missing_value_threshold=0.6

pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
                                                    grpby_list=grpby_list,
                                                    columns_to_process=columns_to_process,
                                                    missing_value_threshold=missing_value_threshold)
pivot_result

Unnamed: 0,WAFER_ID,label,"AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FX01","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FX02","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FX03","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FX04","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FX05","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FY01","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FY02","AVERAGE#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#FY03",...,"STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QX01","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QX02","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QX03","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QX04","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QX05","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QY01","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QY02","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QY03","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QY04","STD_DEV#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3U.PQA10,6V.CDG10,7U.PQA10,7U.PQX10,TM.PQX10,XX.PQW01,XX.PQX02,1U.EQW20,1U.PQW10,1U.PQX10,1V.PQX10,1V.PQX20,2U.PQW10,2U.PQX10#QY05"
0,NAZ415-06,0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NAZ415-06,1,-0.6,-0.6,-0.6,-0.6,-0.6,-1.6,-1.6,-1.6,...,0.0023,0.003176,0.001443,0.003362,0.002058,0.002507,0.002549,0.003281,0.002729,0.002497
2,NAZ415-08,0,-1.714286,-1.714286,-1.714286,-1.714286,-1.714286,-2.142857,-2.142857,-2.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NAZ415-08,1,0.085714,0.085714,0.085714,0.085714,0.085714,-1.371429,-1.371429,-1.371429,...,0.001092,0.00154,0.001323,0.001547,0.001617,0.001296,0.001359,0.000607,0.001615,0.00092
4,NAZ415-12,0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,NAZ415-12,1,-0.6,-0.6,-0.6,-0.6,-0.6,-1.6,-1.6,-1.6,...,0.001832,0.00242,0.002438,0.002341,0.002699,0.003303,0.001573,0.002602,0.001816,0.002488
6,NAZ415-13,0,-1.714286,-1.714286,-1.714286,-1.714286,-1.714286,-2.142857,-2.142857,-2.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,NAZ415-13,1,0.085714,0.085714,0.085714,0.085714,0.085714,-1.371429,-1.371429,-1.371429,...,0.001777,0.001583,0.001002,0.001491,0.001043,0.001033,0.001163,0.001049,0.000958,0.001093


In [157]:
ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_oper, grpby_list, good_site_columns, good_or_bad='good')

Unnamed: 0,PRODUCT_ID,OPE_NO,WAFER_ID,INLINE_PARAMETER_ID,SITE12_VAL,SITE18_VAL,SITE22_VAL,SITE41_VAL,SITE19_VAL,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TY04,0.001449,0.001909,0.001801,,0.001191,0.001909,0.001191,0.001625,0.001588,0.000329,0.001384,0.001828,0
1,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TY05,0.000833,0.001289,0.001117,,0.000482,0.001289,0.000482,0.000975,0.000930,0.000353,0.000745,0.001160,0
2,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX02,0.000540,0.000553,0.000618,,0.000493,0.000618,0.000493,0.000547,0.000551,0.000052,0.000528,0.000569,0
3,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX03,0.000399,0.000580,0.000526,,0.000449,0.000580,0.000399,0.000487,0.000488,0.000080,0.000436,0.000540,0
4,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX04,0.000529,0.000587,0.000534,,0.000527,0.000587,0.000527,0.000531,0.000544,0.000029,0.000528,0.000547,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2707,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX01,-3.000000,,,,,-3.000000,-3.000000,-3.000000,-3.000000,0.000000,-3.000000,-3.000000,0
2708,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX02,-3.000000,,,,,-3.000000,-3.000000,-3.000000,-3.000000,0.000000,-3.000000,-3.000000,0
2709,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX03,-3.000000,,,,,-3.000000,-3.000000,-3.000000,-3.000000,0.000000,-3.000000,-3.000000,0
2710,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX04,-3.000000,,,,,-3.000000,-3.000000,-3.000000,-3.000000,0.000000,-3.000000,-3.000000,0


In [158]:
ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_oper, grpby_list, bad_site_columns, good_or_bad='bad')

Unnamed: 0,PRODUCT_ID,OPE_NO,WAFER_ID,INLINE_PARAMETER_ID,SITE10_VAL,SITE11_VAL,SITE2_VAL,SITE6_VAL,SITE7_VAL,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TY04,0.001612,0.001064,0.001592,0.001175,0.001274,0.001612,0.001064,0.001274,0.001343,0.000248,0.001175,0.001592,1
1,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TY05,0.002019,0.001127,0.000589,0.001036,0.001152,0.002019,0.000589,0.001127,0.001185,0.000519,0.001036,0.001152,1
2,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX02,0.000422,0.000472,0.000478,0.000486,0.000450,0.000486,0.000422,0.000472,0.000462,0.000026,0.000450,0.000478,1
3,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX03,0.000469,0.000469,0.000497,0.000535,0.000576,0.000576,0.000469,0.000497,0.000509,0.000046,0.000469,0.000535,1
4,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NBX219-17,TX04,0.000564,0.000473,0.000441,0.000492,0.000513,0.000564,0.000441,0.000492,0.000497,0.000046,0.000473,0.000513,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2707,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX01,-1.000000,-2.000000,1.000000,3.000000,2.000000,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
2708,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX02,-1.000000,-2.000000,1.000000,3.000000,2.000000,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
2709,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX03,-1.000000,-2.000000,1.000000,3.000000,2.000000,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1
2710,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,1V.PQA10,2U.PQA10,2V.PQW10,3...",NAZ415-13,FX04,-1.000000,-2.000000,1.000000,3.000000,2.000000,3.000000,-2.000000,1.000000,0.600000,2.073644,-1.000000,2.000000,1


### FitInlineModelBySite

In [235]:
# class FitInlineModelBySite:
#     def __init__(self,
#                  df: pyspark.sql.dataframe,
#                  grpby_list: list[str],
#                  good_site_columns: list[str],
#                  bad_site_columns: list[str],
#                  process_miss_site_mode: str,
#                  columns_to_process: list[str],
#                  missing_value_threshold: Union[int, float],
#                  model: str = 'pca'):
#         """
#         Initialize the FitInlineModelBySite object.

#         Parameters:
#         - df: pyspark.sql.dataframe, the input data
#         - grpby_list: list[str], the grouping variable, inline data should be ["OPE_NO"] mostly
#         - good_site_columns: List of str, column names for good sites
#         - bad_site_columns: List of str, column names for bad sites
#         - process_miss_site_mode: str, mode for handling missing values in site data, e.g. drop or fill
#         - columns_to_process: List of str, columns to process in missing value functions
#         - missing_value_threshold: Union[int, float], threshold for missing values
#         - model: str, default is 'pca', other options include 'rf' for random forest, 'decisionTree' for decision tree,
#                  svc, logistic and sgd.
#         """
#         self.df = df
#         self.grpby_list = grpby_list
#         self.good_site_columns = good_site_columns
#         self.bad_site_columns = bad_site_columns
#         self.process_miss_site_mode = process_miss_site_mode
#         self.columns_to_process = columns_to_process
#         self.missing_value_threshold = missing_value_threshold
#         self.model = model

#     @staticmethod
#     def process_missing_values(df, columns_to_process, missing_value_threshold):
#         for column in columns_to_process:
#             missing_percentage = df[column].isnull().mean()
#             if missing_percentage > missing_value_threshold:
#                 df = df.drop(columns=[column])
#             else:
#                 df[column] = df[column].fillna(df[column].mean())
#         return df

#     @staticmethod
#     def get_pivot_table(df, grpby_list, columns_to_process, missing_value_threshold):
#         df = FitInlineModelBySite.process_missing_values(df, columns_to_process, missing_value_threshold)
#         index_list = ['WAFER_ID', 'label']
#         columns_list = grpby_list + ['INLINE_PARAMETER_ID']
#         values_list = df.columns.difference(['WAFER_ID', 'INLINE_PARAMETER_ID', 'label'] + grpby_list)
#         pivot_result = df.pivot_table(index=index_list,
#                                       columns=columns_list,
#                                       values=values_list)
#         pivot_result.columns = pivot_result.columns.map('#'.join)
#         pivot_result = FitInlineModelBySite.process_missing_values(pivot_result, pivot_result.columns, missing_value_threshold)
#         pivot_result = pivot_result.reset_index(drop=False)
#         # Remove completely identical columns
#         for column in pivot_result.columns.difference(index_list):
#             if pivot_result[column].nunique() == 1:
#                 pivot_result = pivot_result.drop(column, axis=1)
#         return pivot_result

#     @staticmethod
#     def fit_pca_model(df, grpby_list, good_site_columns, bad_site_columns, columns_to_process, process_miss_site_mode, missing_value_threshold):
#         schema_all = StructType([StructField("features", StringType(), True),
#                                  StructField("importance", FloatType(), True)])

#         @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
#         def get_model_result(df_run):
#             side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
#                                                                                     grpby_list=grpby_list,
#                                                                                     good_site_columns=good_site_columns,
#                                                                                     bad_site_columns=bad_site_columns,
#                                                                                     missing_value_threshold=missing_value_threshold,
#                                                                                     process_miss_site_mode=process_miss_site_mode)
#             if side_with_features_all is None:
#                 return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

#             pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
#                                                                 grpby_list=grpby_list,
#                                                                 columns_to_process=columns_to_process,
#                                                                 missing_value_threshold=missing_value_threshold)
#             x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

#             if x_train.shape[1] > 1:
#                 n_components = min(min(x_train.shape) - 2, 20)
#                 model = pca(n_components=n_components, verbose=None)
#                 results = model.fit_transform(x_train)
#                 res_top = results['topfeat']
#                 res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
#                 res_top_select['importance'] = abs(res_top_select['loading'])
#                 res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading", axis=1).drop_duplicates()
#                 return res_top_select
#             else:
#                 res_top_select = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
#                 return res_top_select
#         return df.groupby(grpby_list).apply(get_model_result)

#     def fit_rf_model(self):
#         schema_all = StructType([StructField("features", StringType(), True),
#                                  StructField("importance", FloatType(), True)])
#         good_site_columns = self.good_site_columns
#         bad_site_columns = self.bad_site_columns
#         missing_value_threshold = self.missing_value_threshold
#         process_miss_site_mode = self.process_miss_site_mode
#         columns_to_process = self.columns_to_process

#         @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
#         def get_model_result(df_run):
#             side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
#                                                                                     good_site_columns=good_site_columns,
#                                                                                     bad_site_columns=bad_site_columns,
#                                                                                     missing_value_threshold=missing_value_threshold,
#                                                                                     process_miss_site_mode=process_miss_site_mode)
#             if side_with_features_all is None:
#                 return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

#             pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
#                                                                 columns_to_process=columns_to_process,
#                                                                 missing_value_threshold=missing_value_threshold)
#             x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
#             y_train = pivot_result[['label']]
#             if min(x_train.shape) > 4 and y_train['label'].nunique() > 1:
#                 pipe = Pipeline(steps=[
#                     ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
#                     ('scaler', StandardScaler()),
#                     ('model', RandomForestClassifier(random_state=2024))])
#                 param_grid = {'model__n_estimators': [*range(10, 60, 10)],
#                               'model__max_depth': [*range(5, 50, 10)],
#                               'model__min_samples_split': [2, 5],
#                               'model__min_samples_leaf': [1, 3]}
#                 grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
#                 grid.fit(x_train.values, y_train.values.ravel())
#                 roc_auc_score_ = grid.best_score_
#                 if roc_auc_score_ >= 0.6:
#                     small_importance_res = pd.DataFrame({'features': x_train.columns,
#                                                          'importance': grid.best_estimator_.steps[2][1].feature_importances_})
#                     return small_importance_res
#                 else:
#                     small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
#                     return small_importance_res
#             else:
#                 small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -102}, index=[0])
#                 return small_importance_res
#         return self.df.groupby(self.by).apply(get_model_result)

#     def run(self):
#         if self.model == 'pca':
#             res = self.fit_pca_model(df=self.df, grpby_list=self.grpby_list,
#                                      good_site_columns=self.good_site_columns,
#                                      bad_site_columns=self.bad_site_columns,
#                                      columns_to_process=self.columns_to_process,
#                                      process_miss_site_mode=self.process_miss_site_mode,
#                                      missing_value_threshold=self.missing_value_threshold)
#         elif self.model == 'rf':
#             res = self.fit_rf_model()
#         else:
#             res = None
#         return res

class FitInlineModelBySite:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 grpby_list: list[str],
                 good_site_columns: list[str],
                 bad_site_columns: list[str],
                 process_miss_site_mode: str,
                 columns_to_process: list[str],
                 missing_value_threshold: Union[int, float],
                 model: str = 'pca'):
        """
        Initialize the FitInlineModelBySite object.

        Parameters:
        - df: pyspark.sql.dataframe, the input data
        - grpby_list: list[str], the grouping variable, inline data should be ["OPE_NO"] mostly
        - good_site_columns: List of str, column names for good sites
        - bad_site_columns: List of str, column names for bad sites
        - process_miss_site_mode: str, mode for handling missing values in site data, e.g. drop or fill
        - columns_to_process: List of str, columns to process in missing value functions
        - missing_value_threshold: Union[int, float], threshold for missing values
        - model: str, default is 'pca', other options include 'rf' for random forest, 'decisionTree' for decision tree,
                 svc, logistic and sgd.
        """
        self.df = df
        self.grpby_list = grpby_list
        self.good_site_columns = good_site_columns
        self.bad_site_columns = bad_site_columns
        self.process_miss_site_mode = process_miss_site_mode
        self.columns_to_process = columns_to_process
        self.missing_value_threshold = missing_value_threshold
        self.model = model

    @staticmethod
    def process_missing_values(df, columns_to_process, missing_value_threshold):
        for column in columns_to_process:
            missing_percentage = df[column].isnull().mean()
            if missing_percentage > missing_value_threshold:
                df = df.drop(columns=[column])
            else:
                df[column] = df[column].fillna(df[column].mean())
        return df

    @staticmethod
    def get_pivot_table(df, grpby_list, columns_to_process, missing_value_threshold):
        df = FitInlineModelBySite.process_missing_values(df, columns_to_process, missing_value_threshold)
        index_list = ['WAFER_ID', 'label']
        columns_list = grpby_list + ['INLINE_PARAMETER_ID']
        values_list = df.columns.difference(['WAFER_ID', 'INLINE_PARAMETER_ID', 'label'] + grpby_list)
        pivot_result = df.pivot_table(index=index_list,
                                      columns=columns_list,
                                      values=values_list)
        pivot_result.columns = pivot_result.columns.map('#'.join)
        pivot_result = FitInlineModelBySite.process_missing_values(pivot_result, pivot_result.columns,
                                                                   missing_value_threshold)
        pivot_result = pivot_result.reset_index(drop=False)
        # Remove completely identical columns
        for column in pivot_result.columns.difference(index_list):
            if pivot_result[column].nunique() == 1:
                pivot_result = pivot_result.drop(column, axis=1)
        return pivot_result

    @staticmethod
    def fit_pca_model(df, grpby_list, good_site_columns, bad_site_columns, columns_to_process, process_miss_site_mode,
                      missing_value_threshold):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
                                                                                    grpby_list=grpby_list,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
            if side_with_features_all is None:
                return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

            pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
                                                                grpby_list=grpby_list,
                                                                columns_to_process=columns_to_process,
                                                                missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

            if x_train.shape[1] > 1:
                n_components = min(min(x_train.shape) - 2, 20)
                model = pca(n_components=n_components, verbose=None)
                results = model.fit_transform(x_train)
                res_top = results['topfeat']
                res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
                res_top_select['importance'] = abs(res_top_select['loading'])
                res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading",
                                                                                             axis=1).drop_duplicates()
                return res_top_select
            else:
                res_top_select = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
                return res_top_select

        return df.groupby(grpby_list).apply(get_model_result)

    @staticmethod
    def get_pipe_params(model):
        common_steps = [
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('scaler', StandardScaler())
        ]
        models = {
            'rf': (RandomForestClassifier(random_state=2024), {
                'model__n_estimators': [*range(10, 60, 10)],
                'model__max_depth': [*range(5, 50, 10)],
                'model__min_samples_split': [2, 5],
                'model__min_samples_leaf': [1, 3]
            }),

            'decisionTree': (DecisionTreeClassifier(random_state=2024), {
                'model__max_depth': [None, 5, 10, 15],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4]
            }),

            'svc': (LinearSVC(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'squared_hinge'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'logistic': (LogisticRegression(random_state=2024, fit_intercept=False, solver='liblinear'), {
                'model__penalty': ['l1', 'l2'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'sgd': (SGDClassifier(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'log_loss', 'perceptron', 'huber'],
                'model__penalty': ['l1', 'l2', 'elasticnet', None],
                'model__alpha': [0.0001, 0.001, 0.01, 0.1],
                'model__max_iter': [100, 500, 1000]
            })
        }

        if model in models:
            model_class, param_grid = models[model]
            steps = common_steps + [('model', model_class)]
            pipe = Pipeline(steps)
        else:
            raise Exception('Wrong Model Selection. Supported models are: pca, rf, decisionTree, svc, logistic, sgd.')
        return pipe, param_grid

    @staticmethod
    def fit_classification_model(df, grpby_list, good_site_columns, bad_site_columns,
                                 columns_to_process, process_miss_site_mode, missing_value_threshold, model):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
                                                                                    grpby_list=grpby_list,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
            if side_with_features_all is None:
                return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

            pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
                                                                grpby_list=grpby_list,
                                                                columns_to_process=columns_to_process,
                                                                missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
            y_train = pivot_result[['label']]

            if min(x_train.shape) > 4 and y_train['label'].nunique() > 1:
                pipe, param_grid = FitInlineModelBySite.get_pipe_params(model=model)
                try:
                    grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
                    grid.fit(x_train.values, y_train.values.ravel())

                except ValueError:
                    small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
                    return small_importance_res

                best_est = grid.best_estimator_.steps[-1][-1]
                if hasattr(best_est, 'feature_importances_'):
                    small_importance_res = pd.DataFrame({'features': x_train.columns,
                                                         'importance': best_est.feature_importances_})
                else:
                    small_importance_res = pd.DataFrame({'features': x_train.columns,
                                                         'importance': abs(best_est.coef_.ravel())})
                return small_importance_res

            else:
                small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -102}, index=[0])
                return small_importance_res

        return df.groupby(grpby_list).apply(get_model_result)

    def run(self):
        if self.model == 'pca':
            res = self.fit_pca_model(df=self.df, grpby_list=self.grpby_list,
                                     good_site_columns=self.good_site_columns,
                                     bad_site_columns=self.bad_site_columns,
                                     columns_to_process=self.columns_to_process,
                                     process_miss_site_mode=self.process_miss_site_mode,
                                     missing_value_threshold=self.missing_value_threshold)
        else:
            res = self.fit_classification_model(df=self.df, grpby_list=self.grpby_list,
                                                good_site_columns=self.good_site_columns,
                                                bad_site_columns=self.bad_site_columns,
                                                columns_to_process=self.columns_to_process,
                                                process_miss_site_mode=self.process_miss_site_mode,
                                                missing_value_threshold=self.missing_value_threshold,
                                                model=self.model)
        return res

In [232]:
res = FitInlineModelBySite(df=df_preprocess,
                            grpby_list=grpby_list,
                            good_site_columns=good_site_columns,
                            bad_site_columns=bad_site_columns,
                            process_miss_site_mode='drop',
                            columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75'],
                            missing_value_threshold=0.6,
                            model='sgd').run()

In [233]:
res_pandas = res.toPandas()
res_pandas.sort_values('importance')

Unnamed: 0,features,importance
435,STATS#OPE#PARAM,-102.000000
434,STATS#OPE#PARAM,-102.000000
293,"MEDIAN#AFPNM301N.0B01#1U.CDG10,1U.CDG20,1V.PQA...",0.000000
292,"MEDIAN#AFPNM301N.0B01#1U.CDG10,1U.CDG20,1V.PQA...",0.000000
291,"MEDIAN#AFPNM301N.0B01#1U.CDG10,1U.CDG20,1V.PQA...",0.000000
...,...,...
111,"MIN_VAL#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQ...",15.286822
167,"PERCENTILE_75#AFPNM301N.0A01#1U.CDG10,1U.CDG20...",15.331213
140,"PERCENTILE_25#AFPNM301N.0A01#1U.CDG10,1U.CDG20...",15.766070
110,"MIN_VAL#AFPNM301N.0A01#1U.CDG10,1U.CDG20,1V.PQ...",17.807478


In [234]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.show()

+--------------+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|    PRODUCT_ID|             OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+--------------+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               QX01|                      0.0|       269|0.062398493|      6.239849|       1|
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               QX04|                      0.0|       269|0.050567783|     5.0567784|       2|
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               MY03|                      0.0|       269|0.049038637|     4.9038634|       3|
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               QY01|                      0.0|       269| 0.04872076|      4.872076|       4|
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               QX02|                      0.0|

### SplitInlineModelResults

In [237]:
class SplitInlineModelResults:
    def __init__(self, df: pyspark.sql.dataframe, grpby_list: List[str], request_id: str):
        self.df = df
        self.grpby_list = grpby_list
        self.request_id = request_id

    @staticmethod
    def split_features(df: pd.DataFrame, index: int) -> str:
        return df['features'].apply(lambda x: x.split('#')[index])

    @staticmethod
    def get_split_features(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
        n_feats = len(grpby_list)
        for i in range(n_feats):
            df[grpby_list[i]] = SplitInlineModelResults.split_features(df, i + 1)

        df['INLINE_PARAMETER_ID'] = SplitInlineModelResults.split_features(df, n_feats + 1)
        df = df.drop(['features'], axis=1).reset_index(drop=True)
        return df

    @staticmethod
    def split_calculate_features(df: pyspark.sql.dataframe, grpby_list: List[str], by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                              StructField("importance", FloatType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            split_table = SplitInlineModelResults.get_split_features(df=df_run, grpby_list=grpby_list)
            split_table_grpby = split_table.groupby(grpby_list + ['INLINE_PARAMETER_ID'])[
                'importance'].sum().reset_index(drop=False)
            return split_table_grpby

        return df.groupby(by).apply(get_model_result)

    @staticmethod
    def add_certain_column(df: pyspark.sql.dataframe, grpby_list: List[str], request_id: str,
                           by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                              StructField("AVG_SPEC_CHK_RESULT_COUNT", FloatType(), True),
                              StructField("request_id", StringType(), True),
                              StructField("weight", FloatType(), True),
                              StructField("weight_percent", FloatType(), True),
                              StructField("index_no", IntegerType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_result(final_res):
            # Calculate weights and normalize
            final_res['importance'] = final_res['importance'].astype(float)
            final_res = final_res.query("importance > 0")
            final_res['weight'] = final_res['importance'] / final_res['importance'].sum()
            final_res['weight_percent'] = final_res['weight'] * 100
            final_res = final_res.sort_values('weight', ascending=False)

            final_res['index_no'] = [i + 1 for i in range(len(final_res))]
            final_res['AVG_SPEC_CHK_RESULT_COUNT'] = 0.0
            final_res['request_id'] = request_id
            final_res = final_res.drop(['importance', 'temp'], axis=1)
            return final_res

        return df.groupby(by).apply(get_result)

    def run(self):
        df = self.df.filter("importance > 0").withColumn('temp', lit(0))
        res = self.split_calculate_features(df=df, grpby_list=self.grpby_list, by='temp')
        res = res.withColumn('temp', lit(1))
        final_res = self.add_certain_column(df=res, grpby_list=self.grpby_list, request_id=self.request_id, by='temp')
        final_res = final_res.withColumnRenamed('OPE_NO', 'OPER_NO')
        return final_res

In [150]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.show()

+--------------+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|    PRODUCT_ID|             OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+--------------+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|               QX04|                      0.0|       269| 0.17291498|     17.291498|       1|
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|               QX03|                      0.0|       269| 0.09726248|      9.726248|       2|
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|               FX04|                      0.0|       269| 0.09353577|      9.353577|       3|
|AFPNM301N.0A01|1U.CDG10,1U.CDG20...|               FX01|                      0.0|       269| 0.08503456|      8.503456|       4|
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|               FX02|                      0.0|

In [143]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.show()

+--------------------+--------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|             OPER_NO|    PRODUCT_ID|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+--------------------+--------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|1U.CDG10,1U.CDG20...|AFPNM301N.0B01|               QX04|                      0.0|       269| 0.17291498|     17.291498|       1|
|1U.CDG10,1U.CDG20...|AFPNM301N.0B01|               QX03|                      0.0|       269| 0.09726248|      9.726248|       2|
|1U.CDG10,1U.CDG20...|AFPNM301N.0B01|               FX04|                      0.0|       269| 0.09353577|      9.353577|       3|
|1U.CDG10,1U.CDG20...|AFPNM301N.0A01|               FX01|                      0.0|       269| 0.08503456|      8.503456|       4|
|1U.CDG10,1U.CDG20...|AFPNM301N.0B01|               FX02|                      0.0|

In [133]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.show()

+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|             OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+--------------------+-------------------+-------------------------+----------+-----------+--------------+--------+
|1U.CDG10,1U.CDG20...|               QX04|                      0.0|       269| 0.29911017|     29.911018|       1|
|1U.CDG10,1U.CDG20...|               FX05|                      0.0|       269|  0.1355169|      13.55169|       2|
|1U.CDG10,1U.CDG20...|               QX05|                      0.0|       269| 0.13385752|     13.385753|       3|
|1U.CDG10,1U.CDG20...|               QX03|                      0.0|       269| 0.10604785|     10.604784|       4|
|1U.CDG10,1U.CDG20...|               FX04|                      0.0|       269| 0.10411761|     10.411761|       5|
|1U.CDG10,1U.CDG20...|               QX01|                      0.0|    

### ExertInlineBySite

In [265]:
class ExertInlineBySite:
    @staticmethod
    def fit_by_site_model(df: pyspark.sql.dataframe,
                          request_id: str,
                          merge_operno_list: List[Dict[str, List[str]]],
                          good_site_columns: List[str],
                          bad_site_columns: List[str],
                          columns_list=None,
                          key_words=None,
                          convert_to_numeric_list=None,
                          grpby_list=None,
                          certain_column=None,
                          model='pca') -> Union[str, pyspark.sql.dataframe.DataFrame]:
        # drop duplicates
        good_site_columns = list(set(good_site_columns))
        bad_site_columns = list(set(bad_site_columns))
        site_columns = good_site_columns + bad_site_columns

        if grpby_list is None or len(grpby_list) == 0:
            grpby_list = ['OPE_NO']

        if columns_list is None:
            columns_list = grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'AVERAGE'] + site_columns

        if key_words is None:
            key_words = ['CXS', 'CYS', 'FDS']

        if convert_to_numeric_list is None:
            convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns

        if certain_column is None:
            certain_column = 'INLINE_PARAMETER_ID'

        df_preprocess = DataPreprocessorForInline(df=df,
                                                  columns_list=columns_list,
                                                  certain_column=certain_column,
                                                  key_words=key_words,
                                                  convert_to_numeric_list=convert_to_numeric_list,
                                                  merge_operno_list=merge_operno_list).run()
        print(df_preprocess.count())
        if df_preprocess.isEmpty():
            msg = 'No data of this type in the database!'
            raise RCABaseException(msg)

        res = FitInlineModelBySite(df=df_preprocess,
                                   grpby_list=grpby_list,
                                   good_site_columns=good_site_columns,
                                   bad_site_columns=bad_site_columns,
                                   process_miss_site_mode='drop',
                                   columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV',
                                                       'PERCENTILE_25', 'PERCENTILE_75'],
                                   missing_value_threshold=0.6,
                                   model=model).run()
        # res.show()
        if res.isEmpty():
            msg = 'No difference in this data. The output of the algorithm is 0.'
            raise RCABaseException(msg)

        final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
        if final_res.isEmpty():
            msg = 'Temporary exception in adding columns to algorithm results'
            raise RCABaseException(msg)
        else:
            return final_res

In [268]:
# - model: str, default is 'pca', other options include 'rf' for random forest, 'decisionTree' for decision tree,
#                  svc, logistic and sgd.

f_rres = ExertInlineBySite.fit_by_site_model(df=df1, 
                                              request_id=request_id,
                                              merge_operno_list=None,
                                              good_site_columns=good_site_columns,
                                              bad_site_columns=bad_site_columns,
                                              model='rf',
                                              grpby_list=[])

11403


In [269]:
f_rres.show()

+--------+-------------------+-------------------------+----------+-----------+--------------+--------+
| OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+--------+-------------------+-------------------------+----------+-----------+--------------+--------+
|1U.PQX10|               FX03|                      0.0|       269|0.042857144|     4.2857146|       1|
|1V.PQX10|               FY05|                      0.0|       269|0.042857144|     4.2857146|       2|
|2U.PQX10|               FX01|                      0.0|       269| 0.03465909|     3.4659092|       3|
|XX.PQX02|               FX01|                      0.0|       269|0.028571429|      2.857143|       4|
|TM.PQX10|               MX04|                      0.0|       269|0.028571429|      2.857143|       5|
|1V.PQX10|               FX03|                      0.0|       269|0.028571429|      2.857143|       6|
|TM.PQX10|               FY05|                      0.0|       2