In [2]:
import pyspark
import pandas as pd
import pyspark.pandas as ps
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, when

from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

import json



In [3]:
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.cores', '12') \
    .config('spark.executor.memory', '8g') \
    .config('spark.executor.cores', '12') \
    .config('spark.cores.max', '12') \
    .config('spark.driver.host', '192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [4]:
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/inline_algorithm/codes_version7/inline_test_data3_bysite.csv")
df_pandas

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,MEASURE_TIME,RANGE_INDEX,FAB_ID,PRODUCT_ID,LOT_ID,AVERAGE,MAX_VAL,...,ACT_CODE,ETL_INSERT_TIME,ETL_ARC_FLAG,ETL_BATCH_SYNC_TS,ETL_DEL_FLAG,ETL_DS_JOB_NM,ETL_SRC_DB,ETL_SRC_TBL,ETL_TBL_OPER_TS,label
0,IKAS02.082-15,IKAS.OPEN.1,01W0,2023-07-08 16:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.064000,,...,,2023-07-08 16:44:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
1,IKAS02.082-15,IKAS.OPEN.3,YTW0,2023-07-18 13:31:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.000393,,...,,2023-07-18 13:41:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
2,IKAS02.082-15,IKAS.OPEN.3,YMW0,2023-07-18 13:31:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.002098,,...,,2023-07-18 13:41:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
3,IKAS02.082-15,IKAS.OPEN.3,YSIG,2023-07-16 21:09:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.006500,,...,,2023-07-16 21:19:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
4,IKAS02.082-15,IKAS.OPEN.3,YSIG,2023-07-17 04:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.017300,,...,,2023-07-17 04:41:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32273,IKAS02.082-16,IKAS.OPEN.1,04W0,2023-07-08 16:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.064500,,...,,2023-07-08 16:44:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
32274,IKAS02.082-16,IKAS.OPEN.1,05W0,2023-07-08 16:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.064100,,...,,2023-07-08 16:44:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
32275,IKAS02.082-16,IKAS.OPEN.1,06W0,2023-07-08 16:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.064400,,...,,2023-07-08 16:44:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
32276,IKAS02.082-16,IKAS.OPEN.1,07W0,2023-07-08 16:30:00,0,N1,IKAS.PROJECT.0A02,IKAS02.082030,0.064300,,...,,2023-07-08 16:44:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1


In [5]:
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

32278

In [6]:
def parse_JSON_config(df: pd.DataFrame):
    request_id = df["requestId"].values[0]
    request_params = df["requestParam"].values[0]
    parse_dict = json.loads(request_params)

    # PRODUCT_ID, PROG1, EQP, CHAMBER, OPER_NO存在部分合并的情况
    try:
        # OPER_NO的部分合并结果
        merge_operno = list(parse_dict.get('mergeOperno')) if parse_dict.get('mergeOperno') else None
    except KeyError:
        merge_operno = None

    try:
        # PROG1的部分合并结果
        merge_prodg1 = list(parse_dict.get('mergeProdg1')) if parse_dict.get('mergeProdg1') else None
    except KeyError:
        merge_prodg1 = None

    try:
        # PRODUCT_ID的部分合并结果
        merge_product = list(parse_dict.get('mergeProductId')) if parse_dict.get('mergeProductId') else None
    except KeyError:
        merge_product = None

    try:
        # EQP的部分合并结果
        merge_eqp = list(parse_dict.get('mergeEqp')) if parse_dict.get('mergeEqp') else None
    except KeyError:
        merge_eqp = None

    try:
        # CHAMBER的部分合并结果
        merge_chamber = list(parse_dict.get('mergeChamber')) if parse_dict.get('mergeChamber') else None
    except KeyError:
        merge_chamber = None

    # 获取good_site和bad_site
    try:
        good_site = list(parse_dict.get('goodSite')) if parse_dict.get('goodSite') else None
    except KeyError:
        good_site = None

    try:
        bad_site = list(parse_dict.get('badSite')) if parse_dict.get('badSite') else None
    except KeyError:
        bad_site = None

    # group by 子句中的字段
    group_by_list = parse_dict.get("groupByList")
    if group_by_list is None or len(group_by_list) == 0:
        group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
        # PRODUCT_ID, PROG1, CHAMBER 这3个存在一键合并的切换开关
        # 且一键合并PROG1时会自动一键合并PRODUCT_ID
        flag_merge_prodg1 = parse_dict.get('flagMergeAllProdg1')
        flag_merge_product_id = parse_dict.get('flagMergeAllProductId')
        flag_merge_chamber = parse_dict.get('flagMergeAllChamber')

        if flag_merge_prodg1 == '1':
            # 一键合并PROG1时，部分合并PROG1和PRODUCT_ID的情况都会被忽略
            merge_prodg1 = None
            merge_product = None
            group_by_list = ['OPER_NO', "EQP_NAME", 'TOOL_NAME']
            if flag_merge_chamber == '1':
                group_by_list = ['OPER_NO', "EQP_NAME"]
        elif flag_merge_product_id == '1':
            # 一键合并PRODUCT_ID时，部分合并PRODUCT_ID的情况会被忽略
            merge_product = None
            group_by_list = ["PRODG1", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
            if flag_merge_chamber == '1':
                # 一键合并CHAMBER时，部分合并CHAMBER的情况会被忽略
                group_by_list = ["PRODG1", 'OPER_NO', "EQP_NAME"]
        elif flag_merge_chamber == '1':
            merge_chamber = None
            group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME"]

    return parse_dict, request_id, group_by_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site

In [7]:
json_config_ = {"requestId": "715", "algorithm": "inline_by_site",
                    "requestParam": {"dateRange": {"start": "2021-12-14 16:22:40", "end": "2024-03-14 16:22:40"},
                                     "uploadId": "e6eefec4ba2e4456bdde5efbcf47c438",
                                     "goodSite": ["SITE16_VAL", "SITE7_VAL", "SITE11_VAL", "SITE14_VAL", "SITE17_VAL",
                                                  "SITE20_VAL",
                                                  "SITE9_VAL", "SITE12_VAL", "SITE13_VAL", "SITE15_VAL", "SITE18_VAL",
                                                  "SITE19_VAL",
                                                  "SITE10_VAL", "SITE8_VAL"],
                                     "badSite": ["SITE6_VAL", "SITE2_VAL", "SITE3_VAL", "SITE5_VAL", "SITE1_VAL",
                                                 "SITE4_VAL"],
                                     "flagMergeAllProdg1": "0", "flagMergeAllProductId": "0",
                                     "flagMergeAllChamber": "0",
                                     "mergeProdg1": [], "mergeProductId": [], "mergeEqp": [], "mergeChamber": [],
                                     "mergeOperno": []}}

In [8]:
df_info_ = pd.DataFrame({"requestId": [json_config_["requestId"]],
                         "requestParam": [json.dumps(json_config_["requestParam"])]})

# 解析JSON并且读取数据
parse_dict, request_id, grpby_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site = parse_JSON_config(
    df_info_)
print("parse_dict:")
print(parse_dict)
print("request_id:")
print(request_id)
print("grpby_list:")
print(grpby_list)
print("merge_operno:")
print(merge_operno)
print("merge_prodg1:")
print(merge_prodg1)
print("merge_product:")
print(merge_product)
print("merge_eqp:")
print(merge_eqp)
print("merge_chamber:")
print(merge_chamber)
print("good_site:")
print(good_site)
print("bad_site:")
print(bad_site)

parse_dict:
{'dateRange': {'start': '2021-12-14 16:22:40', 'end': '2024-03-14 16:22:40'}, 'uploadId': 'e6eefec4ba2e4456bdde5efbcf47c438', 'goodSite': ['SITE16_VAL', 'SITE7_VAL', 'SITE11_VAL', 'SITE14_VAL', 'SITE17_VAL', 'SITE20_VAL', 'SITE9_VAL', 'SITE12_VAL', 'SITE13_VAL', 'SITE15_VAL', 'SITE18_VAL', 'SITE19_VAL', 'SITE10_VAL', 'SITE8_VAL'], 'badSite': ['SITE6_VAL', 'SITE2_VAL', 'SITE3_VAL', 'SITE5_VAL', 'SITE1_VAL', 'SITE4_VAL'], 'flagMergeAllProdg1': '0', 'flagMergeAllProductId': '0', 'flagMergeAllChamber': '0', 'mergeProdg1': [], 'mergeProductId': [], 'mergeEqp': [], 'mergeChamber': [], 'mergeOperno': []}
request_id:
715
grpby_list:
['PRODG1', 'PRODUCT_ID', 'OPER_NO', 'EQP_NAME', 'TOOL_NAME']
merge_operno:
None
merge_prodg1:
None
merge_product:
None
merge_eqp:
None
merge_chamber:
None
good_site:
['SITE16_VAL', 'SITE7_VAL', 'SITE11_VAL', 'SITE14_VAL', 'SITE17_VAL', 'SITE20_VAL', 'SITE9_VAL', 'SITE12_VAL', 'SITE13_VAL', 'SITE15_VAL', 'SITE18_VAL', 'SITE19_VAL', 'SITE10_VAL', 'SITE8_V

In [9]:
class DataPreprocessorForInline:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 columns_list: list[str],
                 certain_column: str,
                 key_words: list[str],
                 convert_to_numeric_list: list[str],
                 merge_operno_list: List[Dict[str, List[str]]]):
        self.df = df
        self.columns_list = columns_list
        self.certain_column = certain_column
        self.key_words = key_words
        self.convert_to_numeric_list = convert_to_numeric_list
        self.merge_operno_list = merge_operno_list

    def select_columns(self, df):
        return df.select(self.columns_list)

    def exclude_some_data(self, df):
        key_words_str = '|'.join(self.key_words)
        df_filtered = df.filter(~col(self.certain_column).rlike(key_words_str))
        return df_filtered

    def pre_process(self, df):
        for column in self.convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        if 'SITE_COUNT' in self.convert_to_numeric_list:
            self.convert_to_numeric_list.remove('SITE_COUNT')
        df = df.dropna(subset=self.convert_to_numeric_list, how='all')
        return df

    @staticmethod
    def integrate_columns(df, merge_operno_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
        """
        # split using comma
        splitter_comma = ","
        if merge_operno_list is not None and len(merge_operno_list) > 0:
            # Extract values from each dictionary in merge_operno_list and create a list
            values_to_replace = [list(rule.values())[0] for rule in merge_operno_list]
            # Concatenate values from each dictionary
            merged_values = [splitter_comma.join(list(rule.values())[0]) for rule in merge_operno_list]

            # Replace values in 'OPER_NO' column based on the rules defined in merge_operno_list
            for values, replacement_value in zip(values_to_replace, merged_values):
                df = df.withColumn("OPE_NO", when(col("OPE_NO").isin(values), replacement_value).otherwise(col("OPE_NO")))
        return df

    def run(self):
        df_select = self.select_columns(df=self.df)
        df_esd = self.exclude_some_data(df=df_select)
        df_pp = self.pre_process(df=df_esd)
        df_integrate = self.integrate_columns(df=df_pp, merge_operno_list=self.merge_operno_list)
        return df_integrate

In [10]:
class ExtractFeaturesBySite:
    @staticmethod
    def process_missing_values_for_site(df: pd.DataFrame,
                                        good_site_columns: list[str],
                                        bad_site_columns: list[str],
                                        missing_value_threshold: Union[int, float] = 0.6,
                                        process_miss_site_mode: str = 'drop') -> pd.DataFrame:
        assert process_miss_site_mode in ['drop', 'fill']
        site_columns = good_site_columns + bad_site_columns
        if process_miss_site_mode == 'drop':
            # drop rows based on the missing value threshold
            df = df.dropna(subset=site_columns, thresh=missing_value_threshold)
        else:
            # fill missing values in the corresponding site rows using the AVERAGE of that row
            df[site_columns] = df[site_columns].apply(lambda column: column.fillna(df['AVERAGE']))
        return df

    @staticmethod
    def calculate_statistics(row):
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std(),
            'PERCENTILE_25': row.quantile(0.25),
            'PERCENTILE_75': row.quantile(0.75)})

    @staticmethod
    def calculate_site_stats(df: pd.DataFrame, site_columns: list[str], good_or_bad: str) -> pd.DataFrame:
        assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
        selected_df = df[['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID'] + site_columns].reset_index(drop=True)
        # Perform statistical calculations for each row
        side_features = selected_df.apply(lambda row: ExtractFeaturesBySite.calculate_statistics(row[site_columns]), axis=1)
        side_features = side_features.fillna(0)
        df_with_features = pd.concat([selected_df, side_features], axis=1)
        if good_or_bad == 'good':
            df_with_features['label'] = 0
        else:
            df_with_features['label'] = 1
        return df_with_features

    @staticmethod
    def extract_features_by_site(df: pd.DataFrame,
                                 good_site_columns: list[str],
                                 bad_site_columns: list[str],
                                 missing_value_threshold: Union[int, float] = 0.6,
                                 process_miss_site_mode: str = 'drop') -> Union[pd.DataFrame, None]:
        """
        Extracts features from a DataFrame based on good and bad site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Threshold for missing values.
        - process_miss_site_mode (str): Mode for handling missing values in site columns, e.g. drop or fill
        Returns:
        - Union[pd.DataFrame, None]: DataFrame with extracted features or None if no data is available.
        """
        df_pandas_specific_oper = ExtractFeaturesBySite.process_missing_values_for_site(df=df,
                                                                                        good_site_columns=good_site_columns,
                                                                                        bad_site_columns=bad_site_columns,
                                                                                        missing_value_threshold=missing_value_threshold,
                                                                                        process_miss_site_mode=process_miss_site_mode)
        if df_pandas_specific_oper.shape[0] != 0:
            side_with_features1 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_oper, good_site_columns,
                                                                             good_or_bad='good')
            side_with_features2 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_oper, bad_site_columns,
                                                                             good_or_bad='bad')
            side_with_features1_select = side_with_features1[
                ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                 'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features2_select = side_with_features2[
                ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                 'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features_all = pd.concat([side_with_features1_select, side_with_features2_select], axis=0)
            return side_with_features_all

In [11]:
class FitInlineModelBySite:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 by: list[str],
                 good_site_columns: list[str],
                 bad_site_columns: list[str],
                 process_miss_site_mode: str,
                 columns_to_process: list[str],
                 missing_value_threshold: Union[int, float],
                 model: str = 'pca'):
        """
        Initialize the FitInlineModelBySite object.

        Parameters:
        - df: pyspark.sql.dataframe, the input data
        - by: list[str], the grouping variable, inline data should be ["OPE_NO"]
        - good_site_columns: List of str, column names for good sites
        - bad_site_columns: List of str, column names for bad sites
        - process_miss_site_mode: str, mode for handling missing values in site data, e.g. drop or fill
        - columns_to_process: List of str, columns to process in missing value functions
        - missing_value_threshold: Union[int, float], threshold for missing values
        - model: str, default is 'pca', other options include 'rf' for random forest
        """
        self.df = df
        self.by = by
        self.model = model
        self.good_site_columns = good_site_columns
        self.bad_site_columns = bad_site_columns
        self.process_miss_site_mode = process_miss_site_mode
        self.columns_to_process = columns_to_process
        self.missing_value_threshold = missing_value_threshold

    @staticmethod
    def process_missing_values(df, columns_to_process, missing_value_threshold):
        for column in columns_to_process:
            missing_percentage = df[column].isnull().mean()
            if missing_percentage > missing_value_threshold:
                df = df.drop(columns=[column])
            else:
                df[column] = df[column].fillna(df[column].mean())
        return df

    @staticmethod
    def get_pivot_table(df, columns_to_process, missing_value_threshold):
        df = FitInlineModelBySite.process_missing_values(df, columns_to_process, missing_value_threshold)
        index_list = ['WAFER_ID', 'label']
        values_list = df.columns.difference(['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'label'])
        pivot_result = df.pivot_table(index=index_list,
                                      columns=['OPE_NO', 'INLINE_PARAMETER_ID'],
                                      values=values_list)
        pivot_result.columns = pivot_result.columns.map('#'.join)
        pivot_result = FitInlineModelBySite.process_missing_values(pivot_result, pivot_result.columns, missing_value_threshold)
        pivot_result = pivot_result.reset_index(drop=False)
        # Remove completely identical columns
        for column in pivot_result.columns.difference(index_list):
            if pivot_result[column].nunique() == 1:
                pivot_result = pivot_result.drop(column, axis=1)
        return pivot_result

    def fit_pca_model(self):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])
        good_site_columns = self.good_site_columns
        bad_site_columns = self.bad_site_columns
        missing_value_threshold = self.missing_value_threshold
        process_miss_site_mode = self.process_miss_site_mode
        columns_to_process = self.columns_to_process

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
            if side_with_features_all is None:
                return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

            pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
                                                                columns_to_process=columns_to_process,
                                                                missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

            if x_train.shape[1] > 1:
                n_components = min(min(x_train.shape) - 2, 20)
                model = pca(n_components=n_components, verbose=None)
                results = model.fit_transform(x_train)
                res_top = results['topfeat']
                res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
                res_top_select['importance'] = abs(res_top_select['loading'])
                res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading", axis=1).drop_duplicates()
                return res_top_select
            else:
                res_top_select = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
                return res_top_select
        return self.df.groupby(self.by).apply(get_model_result)

    def fit_rf_model(self):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])
        good_site_columns = self.good_site_columns
        bad_site_columns = self.bad_site_columns
        missing_value_threshold = self.missing_value_threshold
        process_miss_site_mode = self.process_miss_site_mode
        columns_to_process = self.columns_to_process

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(df=df_run,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
            if side_with_features_all is None:
                return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

            pivot_result = FitInlineModelBySite.get_pivot_table(df=side_with_features_all,
                                                                columns_to_process=columns_to_process,
                                                                missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
            y_train = pivot_result[['label']]
            if min(x_train.shape) > 4 and y_train['label'].nunique() > 1:
                pipe = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
                    ('scaler', StandardScaler()),
                    ('model', RandomForestClassifier(random_state=2024))])
                param_grid = {'model__n_estimators': [*range(10, 60, 10)],
                              'model__max_depth': [*range(5, 50, 10)],
                              'model__min_samples_split': [2, 5],
                              'model__min_samples_leaf': [1, 3]}
                grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
                grid.fit(x_train.values, y_train.values.ravel())
                roc_auc_score_ = grid.best_score_
                if roc_auc_score_ >= 0.6:
                    small_importance_res = pd.DataFrame({'features': x_train.columns,
                                                         'importance': grid.best_estimator_.steps[2][1].feature_importances_})
                    return small_importance_res
                else:
                    small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -101}, index=[0])
                    return small_importance_res
            else:
                small_importance_res = pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -102}, index=[0])
                return small_importance_res
        return self.df.groupby(self.by).apply(get_model_result)

    def run(self):
        if self.model == 'pca':
            res = self.fit_pca_model()
        elif self.model == 'rf':
            res = self.fit_rf_model()
        else:
            res = None
        return res

In [43]:
class SplitInlineModelResults:
    def __init__(self, df: pyspark.sql.dataframe, grpby_list: List[str], request_id: str):
        self.df = df
        self.grpby_list = grpby_list
        self.request_id = request_id

    @staticmethod
    def split_features(df: pd.DataFrame, index: int) -> str:
        return df['features'].apply(lambda x: x.split('#')[index])

    @staticmethod
    def get_split_features(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
        n_feats = len(grpby_list)
        for i in range(n_feats):
            df[grpby_list[i]] = SplitInlineModelResults.split_features(df, i + 1)

        df['INLINE_PARAMETER_ID'] = SplitInlineModelResults.split_features(df, n_feats + 1)
        df = df.drop(['features'], axis=1).reset_index(drop=True)
        return df

    @staticmethod
    def split_calculate_features(df: pyspark.sql.dataframe, grpby_list: List[str], by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                              StructField("importance", FloatType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            split_table = SplitInlineModelResults.get_split_features(df=df_run, grpby_list=grpby_list)
            split_table_grpby = split_table.groupby(grpby_list + ['INLINE_PARAMETER_ID'])[
                'importance'].sum().reset_index(drop=False)
            return split_table_grpby

        return df.groupby(by).apply(get_model_result)

    @staticmethod
    def add_certain_column(df: pyspark.sql.dataframe, grpby_list: List[str], request_id: str,
                           by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                              StructField("AVG_SPEC_CHK_RESULT_COUNT", FloatType(), True),
                              StructField("request_id", StringType(), True),
                              StructField("weight", FloatType(), True),
                              StructField("weight_percent", FloatType(), True),
                              StructField("index_no", IntegerType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_result(final_res):
            # Calculate weights and normalize
            final_res['importance'] = final_res['importance'].astype(float)
            final_res = final_res.query("importance > 0")
            final_res['weight'] = final_res['importance'] / final_res['importance'].sum()
            final_res['weight_percent'] = final_res['weight'] * 100
            final_res = final_res.sort_values('weight', ascending=False)

            final_res['index_no'] = [i + 1 for i in range(len(final_res))]
            final_res['AVG_SPEC_CHK_RESULT_COUNT'] = 0.0
            final_res['request_id'] = request_id
            final_res = final_res.drop(['importance', 'temp'], axis=1)
            return final_res
        return df.groupby(by).apply(get_result)

    def run(self):
        df = self.df.withColumn('temp', lit(0))
        res = self.split_calculate_features(df=df, by='temp')
        res = res.withColumn('temp', lit(1))
        final_res = self.add_certain_column(df=res, by='temp')
        final_res = final_res.withColumn('request_id', lit(self.request_id))
        return final_res


# class ExertInlineBySite:
#     @staticmethod
#     def fit_by_site_model(df: pyspark.sql.dataframe,
#                            request_id: str,
#                            merge_operno_list: List[Dict[str, List[str]]],
#                            good_site_columns: List[str],
#                            bad_site_columns: List[str],
#                            columns_list=None,
#                            key_words=None,
#                            convert_to_numeric_list=None,
#                            grpby_list=None,
#                            certain_column=None) -> Union[str, pyspark.sql.dataframe.DataFrame]:
#         # drop duplicates
#         good_site_columns = list(set(good_site_columns))
#         bad_site_columns = list(set(bad_site_columns))
#         site_columns = good_site_columns + bad_site_columns

#         if columns_list is None:
#             columns_list = ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'AVERAGE'] + site_columns

#         if key_words is None:
#             key_words = ['CXS', 'CYS', 'FDS']

#         if convert_to_numeric_list is None:
#             convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns

#         if grpby_list is None:
#             grpby_list = ['OPE_NO']

#         if certain_column is None:
#             certain_column = 'INLINE_PARAMETER_ID'

#         df_preprocess = DataPreprocessorForInline(df=df,
#                                                   columns_list=columns_list,
#                                                   certain_column=certain_column,
#                                                   key_words=key_words,
#                                                   convert_to_numeric_list=convert_to_numeric_list,
#                                                   merge_operno_list=merge_operno_list).run()
#         print(df_preprocess.count())
#         if df_preprocess.isEmpty():
#             msg = 'No data of this type in the database!'
#             raise RCABaseException(msg)

#         res = FitInlineModelBySite(df=df_preprocess,
#                                     by=grpby_list,
#                                     model='pca',
#                                     good_site_columns=good_site_columns,
#                                     bad_site_columns=bad_site_columns,
#                                     process_miss_site_mode='drop',
#                                     columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV',
#                                                         'PERCENTILE_25', 'PERCENTILE_75'],
#                                     missing_value_threshold=0.6).run()
#         # res.show()
#         if res.isEmpty():
#             msg = 'No difference in this data. The output of the algorithm is 0.'
#             raise RCABaseException(msg)

#         final_res = SplitInlineModelResults(df=res, request_id=request_id).run()
#         if final_res.isEmpty():
#             msg = 'Temporary exception in adding columns to algorithm results'
#             raise RCABaseException(msg)
#         else:
#             return final_res

In [26]:
good_site_columns = good_site
bad_site_columns = bad_site

good_site_columns = list(set(good_site_columns))
bad_site_columns = list(set(bad_site_columns))
site_columns = good_site_columns + bad_site_columns

grpby_list = ['OPE_NO']

columns_list = grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'AVERAGE'] + site_columns

key_words = ['CXS', 'CYS', 'FDS']

convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns

certain_column = 'INLINE_PARAMETER_ID'

merge_operno_list = merge_operno

In [20]:
request_id

'715'

In [14]:
df_preprocess = DataPreprocessorForInline(df=df1,
                                                  columns_list=columns_list,
                                                  certain_column=certain_column,
                                                  key_words=key_words,
                                                  convert_to_numeric_list=convert_to_numeric_list,
                                                  merge_operno_list=merge_operno_list).run()
df_preprocess.count()

31791

In [16]:
res = FitInlineModelBySite(df=df_preprocess,
                           by=grpby_list,
                           good_site_columns=good_site_columns,
                           bad_site_columns=bad_site_columns,
                           process_miss_site_mode='drop',
                           columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV',
                                               'PERCENTILE_25', 'PERCENTILE_75'],
                           missing_value_threshold=0.6,
                           model='pca').run()
res.show()

+--------------------+----------+
|            features|importance|
+--------------------+----------+
|PERCENTILE_75#IKA...|0.24843656|
|PERCENTILE_75#IKA...|0.25839117|
|MIN_VAL#IKAS.OPEN...|0.30845588|
|PERCENTILE_75#IKA...|0.35949597|
|MAX_VAL#IKAS.OPEN...| 0.3102229|
|MIN_VAL#IKAS.OPEN...|0.41644922|
|MAX_VAL#IKAS.OPEN...| 0.3663416|
|MAX_VAL#IKAS.OPEN...|0.39013106|
|MIN_VAL#IKAS.OPEN...|0.62744516|
|MAX_VAL#IKAS.OPEN...|0.34464037|
|MIN_VAL#IKAS.OPEN...|0.37356374|
|STD_DEV#IKAS.OPEN...|  0.449199|
|MAX_VAL#IKAS.OPEN...|0.30251545|
|MAX_VAL#IKAS.OPEN...| 0.3224624|
|MIN_VAL#IKAS.OPEN...|0.84759843|
|MAX_VAL#IKAS.OPEN...|0.19547674|
|MAX_VAL#IKAS.OPEN...|0.35736012|
|MAX_VAL#IKAS.OPEN...|0.47721264|
|MAX_VAL#IKAS.OPEN...|  0.269516|
|MAX_VAL#IKAS.OPEN...|0.61858433|
+--------------------+----------+
only showing top 20 rows



In [17]:
res_pandas = res.toPandas()
res_pandas

Unnamed: 0,features,importance
0,PERCENTILE_75#IKAS.OPEN.1#FX01,0.248437
1,PERCENTILE_75#IKAS.OPEN.1#MX02,0.258391
2,MIN_VAL#IKAS.OPEN.1#MY03,0.308456
3,PERCENTILE_75#IKAS.OPEN.1#MX01,0.359496
4,MAX_VAL#IKAS.OPEN.1#MY03,0.310223
5,MIN_VAL#IKAS.OPEN.1#MY03,0.416449
6,MAX_VAL#IKAS.OPEN.1#MY01,0.366342
7,MAX_VAL#IKAS.OPEN.1#MY02,0.390131
8,MIN_VAL#IKAS.OPEN.1#MY02,0.627445
9,MAX_VAL#IKAS.OPEN.1#MX03,0.34464


In [44]:
df = res.withColumn('temp', lit(0))
df.show()

+--------------------+----------+----+
|            features|importance|temp|
+--------------------+----------+----+
|PERCENTILE_75#IKA...|0.24843656|   0|
|PERCENTILE_75#IKA...|0.25839117|   0|
|MIN_VAL#IKAS.OPEN...|0.30845588|   0|
|PERCENTILE_75#IKA...|0.35949597|   0|
|MAX_VAL#IKAS.OPEN...| 0.3102229|   0|
|MIN_VAL#IKAS.OPEN...|0.41644922|   0|
|MAX_VAL#IKAS.OPEN...| 0.3663416|   0|
|MAX_VAL#IKAS.OPEN...|0.39013106|   0|
|MIN_VAL#IKAS.OPEN...|0.62744516|   0|
|MAX_VAL#IKAS.OPEN...|0.34464037|   0|
|MIN_VAL#IKAS.OPEN...|0.37356374|   0|
|STD_DEV#IKAS.OPEN...|  0.449199|   0|
|MAX_VAL#IKAS.OPEN...|0.30251545|   0|
|MAX_VAL#IKAS.OPEN...| 0.3224624|   0|
|MIN_VAL#IKAS.OPEN...|0.84759843|   0|
|MAX_VAL#IKAS.OPEN...|0.19547674|   0|
|MAX_VAL#IKAS.OPEN...|0.35736012|   0|
|MAX_VAL#IKAS.OPEN...|0.47721264|   0|
|MAX_VAL#IKAS.OPEN...|  0.269516|   0|
|MAX_VAL#IKAS.OPEN...|0.61858433|   0|
+--------------------+----------+----+
only showing top 20 rows



In [45]:
res_split = SplitInlineModelResults.split_calculate_features(df=df, grpby_list=grpby_list, by='temp')
res_split.show()

+-----------+-------------------+----------+
|     OPE_NO|INLINE_PARAMETER_ID|importance|
+-----------+-------------------+----------+
|IKAS.OPEN.1|               FX01|0.69763553|
|IKAS.OPEN.1|               MX01|0.35949597|
|IKAS.OPEN.1|               MX02|0.25839117|
|IKAS.OPEN.1|               MX03|0.34464037|
|IKAS.OPEN.1|               MY01| 0.3663416|
|IKAS.OPEN.1|               MY02| 1.0175762|
|IKAS.OPEN.1|               MY03| 1.4086918|
|IKAS.OPEN.2|               FGS1|0.32949623|
|IKAS.OPEN.2|               FX01|  0.269516|
|IKAS.OPEN.2|               FX04|0.31579003|
|IKAS.OPEN.2|               FX05| 0.6957592|
|IKAS.OPEN.2|               TGS1| 2.6643157|
|IKAS.OPEN.2|               TGS2|  4.867313|
|IKAS.OPEN.3|               FX01|0.71381706|
|IKAS.OPEN.3|               MX01|0.33225435|
|IKAS.OPEN.3|               MX02|0.54970974|
|IKAS.OPEN.3|               MX03|0.90920126|
|IKAS.OPEN.3|               MX04| 0.7738442|
|IKAS.OPEN.3|               MX05|0.32785642|
|IKAS.OPEN

In [46]:
res_split = res_split.withColumn('temp', lit(1))
res_split.show()

+-----------+-------------------+----------+----+
|     OPE_NO|INLINE_PARAMETER_ID|importance|temp|
+-----------+-------------------+----------+----+
|IKAS.OPEN.1|               FX01|0.69763553|   1|
|IKAS.OPEN.1|               MX01|0.35949597|   1|
|IKAS.OPEN.1|               MX02|0.25839117|   1|
|IKAS.OPEN.1|               MX03|0.34464037|   1|
|IKAS.OPEN.1|               MY01| 0.3663416|   1|
|IKAS.OPEN.1|               MY02| 1.0175762|   1|
|IKAS.OPEN.1|               MY03| 1.4086918|   1|
|IKAS.OPEN.2|               FGS1|0.32949623|   1|
|IKAS.OPEN.2|               FX01|  0.269516|   1|
|IKAS.OPEN.2|               FX04|0.31579003|   1|
|IKAS.OPEN.2|               FX05| 0.6957592|   1|
|IKAS.OPEN.2|               TGS1| 2.6643157|   1|
|IKAS.OPEN.2|               TGS2|  4.867313|   1|
|IKAS.OPEN.3|               FX01|0.71381706|   1|
|IKAS.OPEN.3|               MX01|0.33225435|   1|
|IKAS.OPEN.3|               MX02|0.54970974|   1|
|IKAS.OPEN.3|               MX03|0.90920126|   1|


In [48]:
final_res = SplitInlineModelResults.add_certain_column(df=res_split, grpby_list=grpby_list, by='temp', request_id=request_id)
final_res = final_res.withColumn('request_id', lit(request_id))
final_res.show()

+-----------+-------------------+-------------------------+----------+-----------+--------------+--------+
|    OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|     weight|weight_percent|index_no|
+-----------+-------------------+-------------------------+----------+-----------+--------------+--------+
|IKAS.OPEN.2|               TGS2|                      0.0|       715|   0.244264|     24.426401|       1|
|IKAS.OPEN.2|               TGS1|                      0.0|       715| 0.13370754|     13.370753|       2|
|IKAS.OPEN.1|               MY03|                      0.0|       715| 0.07069459|      7.069459|       3|
|IKAS.OPEN.1|               MY02|                      0.0|       715|0.051066626|     5.1066623|       4|
|IKAS.OPEN.3|               MX03|                      0.0|       715|0.045627873|     4.5627875|       5|
|IKAS.OPEN.3|               MY01|                      0.0|       715|0.041278422|     4.1278424|       6|
|IKAS.OPEN.3|               MX04|    

In [38]:
final_res = SplitInlineModelResults.add_certain_column(df=res_split, by='temp')
final_res = final_res.withColumn('request_id', lit(request_id))
print(final_res.count())
final_res.show()

24
+-----------+-------------------+-------------------------+-----------+--------------+--------+----------+
|    OPER_NO|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|     weight|weight_percent|index_no|request_id|
+-----------+-------------------+-------------------------+-----------+--------------+--------+----------+
|IKAS.OPEN.2|               TGS2|                      0.0|   0.244264|     24.426401|       1|       715|
|IKAS.OPEN.2|               TGS1|                      0.0| 0.13370754|     13.370753|       2|       715|
|IKAS.OPEN.1|               MY03|                      0.0| 0.07069459|      7.069459|       3|       715|
|IKAS.OPEN.1|               MY02|                      0.0|0.051066626|     5.1066623|       4|       715|
|IKAS.OPEN.3|               MX03|                      0.0|0.045627873|     4.5627875|       5|       715|
|IKAS.OPEN.3|               MY01|                      0.0|0.041278422|     4.1278424|       6|       715|
|IKAS.OPEN.3|               MX04| 