In [1]:
import json
import requests
import pymysql
import numpy as np
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as F

from pca import pca
from scipy import stats
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
# from backend_spark.doris_common.doris_client import DorisClient
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, monotonically_increasing_id, lit, col, when, countDistinct



In [2]:
import os
import warnings
warnings.filterwarnings('ignore')

from pyspark.sql import SparkSession
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '1024m') \
    .config('spark.driver.cores', '3') \
    .config('spark.executor.memory', '1024m') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '2') \
    .config('spark.driver.host','192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [3]:
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/inline_algorithm/codes_version6/inline_test_data1.csv")
df_pandas

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,MEASURE_TIME,RANGE_INDEX,FAB_ID,PRODUCT_ID,LOT_ID,AVERAGE,MAX_VAL,...,ACT_CODE,ETL_INSERT_TIME,ETL_ARC_FLAG,ETL_BATCH_SYNC_TS,ETL_DEL_FLAG,ETL_DS_JOB_NM,ETL_SRC_DB,ETL_SRC_TBL,ETL_TBL_OPER_TS,label
0,NBX219-17,1U.CDG10,CIW0,2023-08-31 09:58:00,0,N1,AFPNR901N.0B0J,NBX219000,,,...,,2023-08-31 10:08:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
1,NBX219-17,6V.CDG10,SFW0,2023-09-13 12:01:00,0,N1,AFPNR901N.0B0J,NBX219000,1.000000,,...,,2023-09-13 12:13:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
2,NBX219-17,6V.CDG10,TSW0,2023-09-13 12:01:00,0,N1,AFPNR901N.0B0J,NBX219000,350.719689,,...,,2023-09-13 12:13:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
3,NBX219-17,6V.CDG10,FEW0,2023-09-13 12:01:00,0,N1,AFPNR901N.0B0J,NBX219000,346.513125,,...,,2023-09-13 12:13:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
4,NBX219-17,6V.CDG10,HFT0,2023-09-13 12:01:00,0,N1,AFPNR901N.0B0J,NBX219000,38.800000,,...,,2023-09-13 12:13:00,0,1970-01-01 00:00:00,0,,,,1970-01-01 00:00:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,NAZ415-08,1U.CDG20,HFT0,2022-12-12 16:06:00,0,N1,AFPNM301N.0A01,NAZ415000,5.319000,,...,,2023-05-29 04:48:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,0
612,NAZ415-08,1U.CDG20,OEW0,2022-12-12 16:06:00,0,N1,AFPNM301N.0A01,NAZ415000,6000.152830,,...,,2023-05-29 04:48:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,0
613,NAZ415-08,1U.CDG20,PEW0,2022-12-12 16:06:00,0,N1,AFPNM301N.0A01,NAZ415000,7.000239,,...,,2023-05-29 04:48:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,0
614,NAZ415-08,1U.CDG20,PTW0,2022-12-12 16:06:00,0,N1,AFPNM301N.0A01,NAZ415000,5321.000000,,...,,2023-05-29 04:48:00,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00,0


In [4]:
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

616

In [5]:
def parse_JSON_config(df: pd.DataFrame):
    request_id = df["requestId"].values[0]
    request_params = df["requestParam"].values[0]
    parse_dict = json.loads(request_params)

    # PRODUCT_ID, PROG1, EQP, CHAMBER, OPER_NO存在部分合并的情况
    try:
        # OPER_NO的部分合并结果
        merge_operno = list(parse_dict.get('mergeOperno')) if parse_dict.get('mergeOperno') else None
    except KeyError:
        merge_operno = None

    try:
        # PROG1的部分合并结果
        merge_prodg1 = list(parse_dict.get('mergeProdg1')) if parse_dict.get('mergeProdg1') else None
    except KeyError:
        merge_prodg1 = None

    try:
        # PRODUCT_ID的部分合并结果
        merge_product = list(parse_dict.get('mergeProductId')) if parse_dict.get('mergeProductId') else None
    except KeyError:
        merge_product = None

    try:
        # EQP的部分合并结果
        merge_eqp = list(parse_dict.get('mergeEqp')) if parse_dict.get('mergeEqp') else None
    except KeyError:
        merge_eqp = None

    try:
        # CHAMBER的部分合并结果
        merge_chamber = list(parse_dict.get('mergeChamber')) if parse_dict.get('mergeChamber') else None
    except KeyError:
        merge_chamber = None

    # 获取good_site和bad_site
    try:
        good_site = list(parse_dict.get('goodSite')) if parse_dict.get('goodSite') else None
    except KeyError:
        good_site = None

    try:
        bad_site = list(parse_dict.get('badSite')) if parse_dict.get('badSite') else None
    except KeyError:
        bad_site = None

    # group by 子句中的字段
    group_by_list = parse_dict.get("groupByList")
    if group_by_list is None or len(group_by_list) == 0:
        group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
        # PRODUCT_ID, PROG1, CHAMBER 这3个存在一键合并的切换开关
        # 且一键合并PROG1时会自动一键合并PRODUCT_ID
        flag_merge_prodg1 = parse_dict.get('flagMergeAllProdg1')
        flag_merge_product_id = parse_dict.get('flagMergeAllProductId')
        flag_merge_chamber = parse_dict.get('flagMergeAllChamber')

        if flag_merge_prodg1 == '1':
            # 一键合并PROG1时，部分合并PROG1和PRODUCT_ID的情况都会被忽略
            merge_prodg1 = None
            merge_product = None
            group_by_list = ['OPER_NO', "EQP_NAME", 'TOOL_NAME']
            if flag_merge_chamber == '1':
                group_by_list = ['OPER_NO', "EQP_NAME"]
        elif flag_merge_product_id == '1':
            # 一键合并PRODUCT_ID时，部分合并PRODUCT_ID的情况会被忽略
            merge_product = None
            group_by_list = ["PRODG1", "OPER_NO", "EQP_NAME", "TOOL_NAME"]
            if flag_merge_chamber == '1':
                # 一键合并CHAMBER时，部分合并CHAMBER的情况会被忽略
                group_by_list = ["PRODG1", 'OPER_NO', "EQP_NAME"]
        elif flag_merge_chamber == '1':
            merge_chamber = None
            group_by_list = ["PRODG1", "PRODUCT_ID", "OPER_NO", "EQP_NAME"]

    return parse_dict, request_id, group_by_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site

In [6]:
json_config_ = {"requestId": "346",
                "algorithm": "inline_by_wafer",
                "requestParam": {"dateRange":
                                     {"start": "2022-12-07 18:21:16",
                                      "end": "2024-03-07 18:21:16"},
                                 "operNo": ["1U.CDG10",
                                            "1U.CDG20",
                                            "1V.PQA10",
                                            "2U.PQA10",
                                            "3U.PQA10",
                                            "6V.CDG10"],
                                 "uploadId": "4311a393034a4e0c898d83cb54f86b4e",
                                 "goodSite": [], "badSite": [],
                                 "flagMergeAllProdg1": "0",
                                 "flagMergeAllProductId": "0",
                                 "flagMergeAllChamber": "0",
                                 "mergeProdg1": [],
                                 "mergeProductId": [],
                                 "mergeEqp": [],
                                 "mergeChamber": [],
                                 "mergeOperno": [
                                     {
                                         "1U.CDG10,1U.CDG20,6V.CDG10": [
                                             "1U.CDG10",
                                             "1U.CDG20",
                                             "6V.CDG10"]}
                                            ]
                                 }
                }

In [7]:
df_info_ = pd.DataFrame({"requestId": [json_config_["requestId"]],
                             "requestParam": [json.dumps(json_config_["requestParam"])]})

# 解析JSON并且读取数据
parse_dict, request_id, grpby_list, merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber, good_site, bad_site = parse_JSON_config(
    df_info_)
print("parse_dict:")
print(parse_dict)
print("request_id:")
print(request_id)
print("grpby_list:")
print(grpby_list)
print("merge_operno:")
print(merge_operno)
print("merge_prodg1:")
print(merge_prodg1)
print("merge_product:")
print(merge_product)
print("merge_eqp:")
print(merge_eqp)
print("merge_chamber:")
print(merge_chamber)
print("good_site:")
print(good_site)
print("bad_site:")
print(bad_site)

parse_dict:
{'dateRange': {'start': '2022-12-07 18:21:16', 'end': '2024-03-07 18:21:16'}, 'operNo': ['1U.CDG10', '1U.CDG20', '1V.PQA10', '2U.PQA10', '3U.PQA10', '6V.CDG10'], 'uploadId': '4311a393034a4e0c898d83cb54f86b4e', 'goodSite': [], 'badSite': [], 'flagMergeAllProdg1': '0', 'flagMergeAllProductId': '0', 'flagMergeAllChamber': '0', 'mergeProdg1': [], 'mergeProductId': [], 'mergeEqp': [], 'mergeChamber': [], 'mergeOperno': [{'1U.CDG10,1U.CDG20,6V.CDG10': ['1U.CDG10', '1U.CDG20', '6V.CDG10']}]}
request_id:
346
grpby_list:
['PRODG1', 'PRODUCT_ID', 'OPER_NO', 'EQP_NAME', 'TOOL_NAME']
merge_operno:
[{'1U.CDG10,1U.CDG20,6V.CDG10': ['1U.CDG10', '1U.CDG20', '6V.CDG10']}]
merge_prodg1:
None
merge_product:
None
merge_eqp:
None
merge_chamber:
None
good_site:
None
bad_site:
None


### DataPreprocessorForInline

In [23]:
class DataPreprocessorForInline:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 columns_list: list[str],
                 certain_column: str,
                 key_words: list[str],
                 convert_to_numeric_list: list[str],
                 merge_operno_list: List[Dict[str, List[str]]]):
        self.df = df
        self.columns_list = columns_list
        self.certain_column = certain_column
        self.key_words = key_words
        self.convert_to_numeric_list = convert_to_numeric_list
        self.merge_operno_list = merge_operno_list

    @staticmethod
    def select_columns(df: pyspark.sql.dataframe, columns_list: list[str]) -> pyspark.sql.dataframe:
        return df.select(columns_list)

    @staticmethod
    def exclude_some_data(df: pyspark.sql.dataframe, key_words: list[str], certain_column: str) -> pyspark.sql.dataframe:
        key_words_str = '|'.join(key_words)
        df_filtered = df.filter(~col(certain_column).rlike(key_words_str))
        return df_filtered

    @staticmethod
    def pre_process(df: pyspark.sql.dataframe, convert_to_numeric_list: list[str]) -> pyspark.sql.dataframe:
        for column in convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        if 'SITE_COUNT' in convert_to_numeric_list:
            convert_to_numeric_list.remove('SITE_COUNT')
        df = df.dropna(subset=convert_to_numeric_list, how='all')
        return df

    @staticmethod
    def integrate_columns(df: pyspark.sql.dataframe, merge_operno_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
        """
        # split using comma
        splitter_comma = ","
        if merge_operno_list is not None and len(merge_operno_list) > 0:
            values_to_replace = [list(rule.values())[0] for rule in merge_operno_list]
            merged_values = [splitter_comma.join(list(rule.values())[0]) for rule in merge_operno_list]

            for values, replacement_value in zip(values_to_replace, merged_values):
                df = df.withColumn("OPE_NO", when(col("OPE_NO").isin(values), replacement_value).otherwise(col("OPE_NO")))
        return df

    def run(self) -> pyspark.sql.dataframe:
        df_select = self.select_columns(df=self.df, columns_list=self.columns_list)
        df_esd = self.exclude_some_data(df=df_select, key_words=self.key_words, certain_column=self.certain_column)
        df_pp = self.pre_process(df=df_esd, convert_to_numeric_list=self.convert_to_numeric_list)
        df_integrate = self.integrate_columns(df=df_pp, merge_operno_list=self.merge_operno_list)
        return df_integrate

In [558]:
grpby_list = ['PRODUCT_ID', 'OPE_NO']

columns_list = grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL',
                            'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'SITE_COUNT', 'label']
key_words = ['CXS', 'CYS', 'FDS']
convert_to_numeric_list = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25',
                                       'PERCENTILE_75', 'SITE_COUNT']

certain_column = 'INLINE_PARAMETER_ID'

merge_operno_list = merge_operno

In [545]:
df_preprocess = DataPreprocessorForInline(df=df1,
                                      columns_list=columns_list,
                                      certain_column=certain_column,
                                      key_words=key_words,
                                      convert_to_numeric_list=convert_to_numeric_list,
                                      merge_operno_list=merge_operno_list).run()

In [546]:
df_preprocess.toPandas()

Unnamed: 0,PRODUCT_ID,OPE_NO,LOT_ID,WAFER_ID,INLINE_PARAMETER_ID,AVERAGE,MAX_VAL,MEDIAN,MIN_VAL,STD_DEV,PERCENTILE_25,PERCENTILE_75,SITE_COUNT,label
0,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,6V.CDG10",NBX219000,NBX219-17,SFW0,1.000000,,,,,,,,1
1,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,6V.CDG10",NBX219000,NBX219-17,TSW0,350.719689,,,,,,,,1
2,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,6V.CDG10",NBX219000,NBX219-17,FEW0,346.513125,,,,,,,,1
3,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,6V.CDG10",NBX219000,NBX219-17,HFT0,38.800000,,,,,,,,1
4,AFPNR901N.0B0J,"1U.CDG10,1U.CDG20,6V.CDG10",NBX219000,NBX219-17,OEW0,5999.517098,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
570,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ415000,NAZ415-08,HFT0,5.319000,,,,,,,,0
571,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ415000,NAZ415-08,OEW0,6000.152830,,,,,,,,0
572,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ415000,NAZ415-08,PEW0,7.000239,,,,,,,,0
573,AFPNM301N.0A01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ415000,NAZ415-08,PTW0,5321.000000,,,,,,,,0


### GetTrainDataForInline

In [547]:
class GetTrainDataForInline:
    def __init__(self, df: pyspark.sql.dataframe, grpby_list: list[str]):
        """
        Initialize the GetTrainDataForInline class.

        Parameters:
        - df (pyspark.sql.dataframe): Input DataFrame.
        - grpby_list (list): List of grouping columns, inline data should be ["OPE_NO"]

        This class is designed to perform commonality analysis and retrieve training data based on the
        condition "bad_num > 1 AND wafer_count > 2" in each grpby_list, i.e. each OPE_NO for inline data.
        """
        self.df_run = df
        self.grpby_list = grpby_list

    @staticmethod
    def commonality_analysis(df_run: pyspark.sql.dataframe, grpby_list: list[str]) -> pyspark.sql.dataframe:
        grps = (df_run.groupBy(grpby_list)
                .agg(countDistinct('WAFER_ID').alias('wafer_count'),
                     countDistinct('WAFER_ID', when(df_run['label'] == 0, 1)).alias('good_num'),
                     countDistinct('WAFER_ID', when(df_run['label'] == 1, 1)).alias('bad_num'))
                .na.fill(0)
                .orderBy(['bad_num', 'good_num'], ascending=False))
        if grps.count() == 1:
            return grps
        else:
            grps = grps.filter("bad_num > 1 AND wafer_count > 2")
            return grps

    @staticmethod
    def get_data_list(common_res: pyspark.sql.dataframe, grpby_list: list[str]) -> List[Dict[str, str]]:
        data_list = common_res.select(grpby_list).collect()
        data_dict_list = [row.asDict() for row in data_list]
        return data_dict_list

    @staticmethod
    def get_train_data(df_run: pyspark.sql.dataframe, data_dict_list: List[Dict[str, str]]) -> pyspark.sql.dataframe:
        first_data_dict = data_dict_list[0]
        conditions = " AND ".join(["{} == '{}'".format(col_, first_data_dict[col_]) for col_ in first_data_dict])
        print(conditions)
        df_s = df_run.filter(conditions)
        for i in range(1, len(data_dict_list)):
            data_dict = data_dict_list[i]
            conditions = " AND ".join(["{} == '{}'".format(col_, data_dict[col_]) for col_ in data_dict])
            print(conditions)
            df_m = df_run.filter(conditions)
            df_s = df_s.union(df_m)
        return df_s

    def run(self) -> pyspark.sql.dataframe:
        common_res = self.commonality_analysis(df_run=self.df_run, grpby_list=self.grpby_list)
        common_res.show()
        data_dict_list = self.get_data_list(common_res=common_res, grpby_list=self.grpby_list)
        train_data = self.get_train_data(df_run=self.df_run, data_dict_list=data_dict_list)
        return train_data

In [548]:
df_train = GetTrainDataForInline(df=df_preprocess, grpby_list=grpby_list).run()

+--------------+--------------------+---------+-----------+--------+-------+
|    PRODUCT_ID|              OPE_NO|   LOT_ID|wafer_count|good_num|bad_num|
+--------------+--------------------+---------+-----------+--------+-------+
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|NAZ703000|          3|       0|      3|
+--------------+--------------------+---------+-----------+--------+-------+

PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '1U.CDG10,1U.CDG20,6V.CDG10' AND LOT_ID == 'NAZ703000'


In [549]:
df_train.toPandas()

Unnamed: 0,PRODUCT_ID,OPE_NO,LOT_ID,WAFER_ID,INLINE_PARAMETER_ID,AVERAGE,MAX_VAL,MEDIAN,MIN_VAL,STD_DEV,PERCENTILE_25,PERCENTILE_75,SITE_COUNT,label
0,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,TSW0,350.1,,,,,,,,1
1,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,HFT0,0.839,,,,,,,,1
2,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,OEW0,5999.825,,,,,,,,1
3,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,PEW0,7.013338,,,,,,,,1
4,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,PTW0,890.0,,,,,,,,1
5,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,REW0,151.044719,,,,,,,,1
6,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,SEW0,1.3066,,,,,,,,1
7,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,SFW0,1.0,,,,,,,,1
8,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,EEW0,3999.65,,,,,,,,1
9,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,NAZ703-09,FEW0,25.074,,,,,,,,1


### FitInlineModelByWafer

In [58]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [550]:
class FitInlineModelByWafer:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 grpby_list: list[str],
                 columns_to_process: list[str],
                 missing_value_threshold: float,
                 model: str = 'pca'):
        """
        Initialize the FitInlineModelByWafer object.

        Parameters:
        - df: pyspark.sql.dataframe, the input data
        - grpby_list: list[str], the grouping variable, inline data should be ["OPE_NO"] mostly the case
        - columns_to_process: List of str, columns to process in missing value functions
        - missing_value_threshold: Union[int, float], threshold for missing values
        - model: str, default is 'pca', other options include 'rf' for random forest, 'decisionTree' for decision tree,
                 svc, logistic and sgd.
        """
        self.df = df
        self.grpby_list = grpby_list
        self.columns_to_process = columns_to_process
        self.missing_value_threshold = missing_value_threshold
        self.model = model

    @staticmethod
    def process_missing_values(df, columns_to_process, missing_value_threshold):
        for column in columns_to_process:
            missing_percentage = df[column].isnull().mean()
            if missing_percentage > missing_value_threshold:
                df = df.drop(columns=[column])
            else:
                df[column] = df[column].fillna(df[column].mean())
        return df

    @staticmethod
    def get_pivot_table(df, grpby_list, columns_to_process, missing_value_threshold):
        df_specific = FitInlineModelByWafer.process_missing_values(df, columns_to_process, missing_value_threshold)
        index_list = ['WAFER_ID', 'label']
        columns_list = grpby_list + ['INLINE_PARAMETER_ID']
        values_list = df_specific.columns.difference(
            ['WAFER_ID', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'label'] + grpby_list)
        pivot_result = df_specific.pivot_table(index=index_list,
                                               columns=columns_list,
                                               values=values_list)
        pivot_result.columns = pivot_result.columns.map('#'.join)
        pivot_result = FitInlineModelByWafer.process_missing_values(pivot_result, pivot_result.columns,
                                                                    missing_value_threshold)
        pivot_result = pivot_result.reset_index(drop=False)
        # Remove completely identical columns
        for column in pivot_result.columns.difference(index_list):
            if pivot_result[column].nunique() == 1:
                pivot_result = pivot_result.drop(column, axis=1)
        return pivot_result

    @staticmethod
    def fit_pca_model(df, grpby_list, columns_to_process, missing_value_threshold):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            pivot_result = FitInlineModelByWafer.get_pivot_table(df=df_run,
                                                                 grpby_list=grpby_list,
                                                                 columns_to_process=columns_to_process,
                                                                 missing_value_threshold=missing_value_threshold)

            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
            if x_train.shape[1] > 1:
                n_components = min(min(x_train.shape) - 2, 20)
                model = pca(n_components=n_components, verbose=None)
                results = model.fit_transform(x_train)
                res_top = results['topfeat']
                res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
                res_top_select['importance'] = abs(res_top_select['loading'])
                res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading",
                                                                                             axis=1).drop_duplicates()
                return res_top_select
            else:
                res_top_select = pd.DataFrame()
                return res_top_select

        return df.groupby(grpby_list).apply(get_model_result)

    @staticmethod
    def get_pipe_params(model):
        common_steps = [
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('scaler', StandardScaler())
        ]
        models = {
            'rf': (RandomForestClassifier(random_state=2024), {
                'model__n_estimators': [*range(10, 60, 10)],
                'model__max_depth': [*range(5, 50, 10)],
                'model__min_samples_split': [2, 5],
                'model__min_samples_leaf': [1, 3]
            }),

            'decisionTree': (DecisionTreeClassifier(random_state=2024), {
                'model__max_depth': [None, 5, 10, 15],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4]
            }),

            'svc': (LinearSVC(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'squared_hinge'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'logistic': (LogisticRegression(random_state=2024, fit_intercept=False, solver='liblinear'), {
                'model__penalty': ['l1', 'l2'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'sgd': (SGDClassifier(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'log_loss', 'perceptron', 'huber'],
                'model__penalty': ['l1', 'l2', 'elasticnet', None],
                'model__alpha': [0.0001, 0.001, 0.01, 0.1],
                'model__max_iter': [100, 500, 1000]
            })
        }

        if model in models:
            model_class, param_grid = models[model]
            steps = common_steps + [('model', model_class)]
            pipe = Pipeline(steps)
        else:
            raise Exception('Wrong Model Selection. Supported models are: pca, rf, decisionTree, svc, logistic, sgd.')
        return pipe, param_grid

    @staticmethod
    def fit_classification_model(df, grpby_list, columns_to_process, missing_value_threshold, model):
        schema_all = StructType([StructField("features", StringType(), True),
                                 StructField("importance", FloatType(), True)])

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            pivot_result = FitInlineModelByWafer.get_pivot_table(df=df_run,
                                                                 grpby_list=grpby_list,
                                                                 columns_to_process=columns_to_process,
                                                                 missing_value_threshold=missing_value_threshold)
            x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
            y_train = pivot_result[['label']]

            if x_train.shape[1] > 1 and y_train['label'].nunique() > 1:
                pipe, param_grid = FitInlineModelByWafer.get_pipe_params(model=model)
                try:  # cv=3 may be large
                    grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
                    grid.fit(x_train.values, y_train.values.ravel())
#                     roc_auc_score_ = grid.best_score_
                except ValueError:
                    return pd.DataFrame()
                
                best_est = grid.best_estimator_.steps[-1][-1]
                if hasattr(best_est, 'feature_importances_'):
                    small_importance_res = pd.DataFrame({'features': x_train.columns,
                                                         'importance': best_est.feature_importances_})
                else:
                    small_importance_res = pd.DataFrame({'features': x_train.columns,
                                                         'importance': abs(best_est.coef_.ravel())})
                return small_importance_res

            else:
                small_importance_res = pd.DataFrame()
                return small_importance_res

        return df.groupby(grpby_list).apply(get_model_result)

    def run(self):
        if self.model == 'pca':
            res = self.fit_pca_model(df=self.df, grpby_list=self.grpby_list,
                                     columns_to_process=self.columns_to_process,
                                     missing_value_threshold=self.missing_value_threshold)
        else:
            res = self.fit_classification_model(df=self.df, grpby_list=self.grpby_list,
                                                columns_to_process=self.columns_to_process,
                                                missing_value_threshold=self.missing_value_threshold,
                                                model=self.model)
        return res

In [556]:
res = FitInlineModelByWafer(df=df_train,
                            grpby_list=grpby_list,
                            columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75'],
                            missing_value_threshold=0.6,
                            model='pca').run()

res.toPandas()

Unnamed: 0,features,importance
0,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",0.999041


In [483]:
# res.show()

In [557]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.toPandas()

Unnamed: 0,PRODUCT_ID,OPER_NO,LOT_ID,INLINE_PARAMETER_ID,AVG_SPEC_CHK_RESULT_COUNT,request_id,weight,weight_percent,index_no
0,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",NAZ703000,REW0,0.0,346,1.0,100.0,1


In [320]:
from sklearn.svm import LinearSVC
from sklearn.inspection import permutation_importance
from sklearn.model_selection import permutation_test_score

In [93]:
# pivot_result = df_pandas_after1.pivot_table(index=['WAFER_ID', 'label'],
#                columns=['INLINE_PARAMETER_ID'] + grpby_list,
#                values=df_pandas_after1.columns.difference(['WAFER_ID', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'label'] + grpby_list))

# pivot_result

In [518]:
# OPE_NO == '2U.PQA10'
# OPE_NO == '3U.PQA10'
# PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '1U.CDG10,1U.CDG20,6V.CDG10'
# PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '2U.PQA10'
# PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '1V.PQA10'
# PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '3U.PQA10'

df_pandas_after = df_preprocess.toPandas()
df_pandas_after1 = df_pandas_after.query("PRODUCT_ID == 'AFPNM301N.0B01' & OPE_NO == '1U.CDG10,1U.CDG20,6V.CDG10'")
pivot_result = FitInlineModelByWafer.get_pivot_table(df=df_pandas_after1,
                                                     grpby_list=grpby_list,
                                            columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 
                                                                'PERCENTILE_25', 'PERCENTILE_75'],
                                            missing_value_threshold=0.6)

In [519]:
pivot_result

Unnamed: 0,WAFER_ID,label,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#EEW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#FEW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#HFT0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#OEW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#PEW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#PTW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#REW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#SEW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#SFW0","AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CDG10#TSW0"
0,NAZ439-03,0,3999.640122,232.60945,22.1495,5999.878578,7.000028,22100.5,554.54035,3.653428,1.5,350.416324
1,NAZ439-06,1,3999.702446,231.411181,22.1495,5999.9408,7.000159,22100.5,553.472673,3.653167,2.0,350.28005
2,NAZ439-07,0,3999.754604,233.108691,21.7,5999.946867,6.999741,21650.5,555.789099,3.654112,1.0,350.51162
3,NAZ703-01,1,3999.705126,162.351153,14.502667,5999.785144,7.00381,14510.0,415.68075,2.870381,1.0,350.320516
4,NAZ703-08,1,3999.760017,163.421002,14.459667,5999.944475,7.004016,14510.333333,413.592696,2.870856,1.0,350.173968
5,NAZ703-09,1,3999.727076,162.850214,14.502333,5999.838045,7.004393,14510.666667,418.327115,2.870584,1.666667,350.243187
6,NBX082-05,1,3999.607028,155.407716,15.049,5999.720876,7.003859,15027.0,412.656204,2.871205,1.666667,350.236616
7,NBX082-12,1,3999.604033,167.136649,14.942667,5999.698879,7.003024,14967.0,414.913292,2.869925,2.0,350.1333
8,NBX082-15,1,3999.731531,163.213194,15.065667,5999.577791,7.003612,15043.333333,411.914438,2.869582,1.333333,350.317455
9,NBX082-16,1,3999.731531,162.283899,15.066,5999.577791,7.003612,15043.333333,412.459568,2.869582,1.333333,350.146031


In [530]:
x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
y_train = pivot_result[['label']]

pipe = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
                        ('scaler', StandardScaler()),
                        ('model', LinearSVC(random_state=2024, fit_intercept=False))])

# 定义要搜索的参数组合
param_grid = {'model__loss': ['hinge', 'squared_hinge'], 
              'model__C': [0.1, 0.5, 1, 10, 50]}

# 使用GridSearchCV执行网格搜索
grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(x_train.values, y_train.values.ravel())
roc_auc_score_ = grid.best_score_

In [531]:
best_est = grid.best_estimator_.steps[-1][-1]
# hasattr(best_est, 'feature_importances_')

In [532]:
grid.predict(x_train.values)

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

In [533]:
roc_auc_score_

nan

In [525]:
grid.best_params_

{'model__C': 0.1, 'model__loss': 'hinge'}

In [526]:
pd.DataFrame({'features': x_train.columns,
              'importance': grid.best_estimator_.steps[2][1].coef_.ravel()})

Unnamed: 0,features,importance
0,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.002239
1,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.213943
2,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.206444
3,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.028268
4,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",0.226793
5,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.206147
6,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.206896
7,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.205279
8,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",0.057295
9,"AVERAGE#AFPNM301N.0B01#1U.CDG10,1U.CDG20,6V.CD...",-0.168261


In [302]:
permutation_importance(grid.best_estimator_.steps[2][1], x_train.values, y_train.values.ravel(), n_repeats=10, random_state=42)

{'importances_mean': array([0., 0., 0., 0.]),
 'importances_std': array([0., 0., 0., 0.]),
 'importances': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])}

In [159]:
from sklearn.tree import DecisionTreeClassifier

In [382]:
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=2024))])
param_grid = {
    'model__max_depth': [None, 5, 10, 15],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}
grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(x_train.values, y_train.values.ravel())
roc_auc_score_ = grid.best_score_

In [383]:
roc_auc_score_

0.5

In [306]:
grid.best_estimator_.steps[2][1].feature_importances_

array([0., 0., 0., 1.])

In [307]:
permutation_importance(grid.best_estimator_.steps[2][1], x_train.values, y_train.values.ravel(), n_repeats=10, random_state=42)

{'importances_mean': array([0., 0., 0., 0.]),
 'importances_std': array([0., 0., 0., 0.]),
 'importances': array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])}

In [384]:
best_est = grid.best_estimator_.steps[-1][-1]
hasattr(best_est, 'feature_importances_')

True

In [166]:
from sklearn.linear_model import LogisticRegression

In [309]:
pipe = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
                    ('scaler', StandardScaler()),
                    ('model', LogisticRegression(random_state=2024, fit_intercept=False))])

param_grid = {'model__penalty': ['l1', 'l2', 'elasticnet', None],
              'model__C': [0.1, 0.5, 1, 10, 50]}

grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(x_train.values, y_train.values.ravel())

roc_auc_score_ = grid.best_score_

In [310]:
roc_auc_score_

0.0

In [311]:
abs(grid.best_estimator_.steps[2][1].coef_)

array([[0.12041855, 0.12041855, 0.09127949, 0.09298407]])

In [312]:
permutation_importance(grid.best_estimator_.steps[2][1], x_train.values, y_train.values.ravel(), n_repeats=10, random_state=42)

{'importances_mean': array([ 0.        ,  0.        ,  0.        , -0.01428571]),
 'importances_std': array([0.        , 0.        , 0.        , 0.04285714]),
 'importances': array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , -0.14285714,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ,  0.        ]])}

In [173]:
from sklearn.linear_model import SGDClassifier

In [313]:
pipe = Pipeline(steps=[
                    ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
                    ('scaler', StandardScaler()),
                    ('model', SGDClassifier(random_state=2024, fit_intercept=False))])
param_grid = {
    'model__loss': ['hinge', 'log_loss', 'perceptron', 'huber'],
    'model__penalty': ['l1', 'l2', 'elasticnet', None],
    'model__alpha': [0.0001, 0.001, 0.01, 0.1],
    'model__max_iter': [100, 500, 1000]
}
grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
grid.fit(x_train.values, y_train.values.ravel())
roc_auc_score_ = grid.best_score_

In [314]:
grid.best_params_

{'model__alpha': 0.1,
 'model__loss': 'huber',
 'model__max_iter': 100,
 'model__penalty': 'l1'}

In [315]:
roc_auc_score_

0.5

In [378]:
grid.best_estimator_.steps[-1][-1]

In [328]:
score, permutation_scores, pvalue = permutation_test_score(grid.best_estimator_.steps[2][1], x_train.values, y_train.values.ravel(),  random_state=42, cv=3)

In [333]:
permutation_scores

array([0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.44444444, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555556,
       0.55555556, 0.55555556, 0.55555556, 0.55555556, 0.55555

In [449]:
res.toPandas()

Unnamed: 0,features,importance
0,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#PTW0",0.999744
1,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#REW0",0.806053
2,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#FEW0",0.806279
3,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#SFW0",0.977271
4,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#OEW0",0.718583
5,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#TSW0",0.818122
6,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#EEW0",0.844371
7,"AVERAGE#1U.CDG10,1U.CDG20,6V.CDG10#HFT0",0.97998
8,AVERAGE#1V.PQA10#SNW0,0.999942
9,AVERAGE#2U.PQA10#SNW0,0.999456


### SplitInlineModelResults

In [492]:
class SplitInlineModelResults:
    def __init__(self, df: pyspark.sql.dataframe, grpby_list: List[str], request_id: str):
        self.df = df
        self.grpby_list = grpby_list
        self.request_id = request_id

    @staticmethod
    def split_features(df: pd.DataFrame, index: int) -> str:
        return df['features'].apply(lambda x: x.split('#')[index])

    @staticmethod
    def get_split_features(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
        n_feats = len(grpby_list)
        for i in range(n_feats):
            df[grpby_list[i]] = SplitInlineModelResults.split_features(df, i + 1)

        df['INLINE_PARAMETER_ID'] = SplitInlineModelResults.split_features(df, n_feats + 1)
        df = df.drop(['features'], axis=1).reset_index(drop=True)
        return df

    @staticmethod
    def split_calculate_features(df: pyspark.sql.dataframe, grpby_list: List[str], by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                              StructField("importance", FloatType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            split_table = SplitInlineModelResults.get_split_features(df_run, grpby_list=grpby_list)
            split_table_grpby = split_table.groupby(grpby_list + ['INLINE_PARAMETER_ID'])['importance'].sum().reset_index(drop=False)
            return split_table_grpby
        return df.groupby(by).apply(get_model_result)

    @staticmethod
    def add_certain_column(df: pyspark.sql.dataframe, grpby_list:List[str], request_id: str, by: str) -> pyspark.sql.dataframe:
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("INLINE_PARAMETER_ID", StringType(), True),
                             StructField("AVG_SPEC_CHK_RESULT_COUNT", FloatType(), True),
                             StructField("request_id", StringType(), True),
                             StructField("weight", FloatType(), True),
                             StructField("weight_percent", FloatType(), True),
                             StructField("index_no", IntegerType(), True)])
        schema_all = StructType(struct_fields)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_result(final_res):
            # Calculate weights and normalize
            final_res['importance'] = final_res['importance'].astype(float)
            final_res = final_res.query("importance > 0")
            final_res['weight'] = final_res['importance'] / final_res['importance'].sum()
            final_res['weight_percent'] = final_res['weight'] * 100
            final_res = final_res.sort_values('weight', ascending=False)

            final_res['index_no'] = [i + 1 for i in range(len(final_res))]
            final_res['AVG_SPEC_CHK_RESULT_COUNT'] = 0.0
            final_res['request_id'] = request_id
            final_res = final_res.drop(['importance', 'temp'], axis=1)
            return final_res
        return df.groupby(by).apply(get_result)

    def run(self):
        df = self.df.withColumn('temp', lit(0))
        res = self.split_calculate_features(df=df, grpby_list=self.grpby_list, by='temp')
        res = res.withColumn('temp', lit(1))
        final_res = self.add_certain_column(df=res, grpby_list=self.grpby_list, request_id=self.request_id, by='temp')
        final_res = final_res.withColumnRenamed('OPE_NO', 'OPER_NO')
        return final_res

In [493]:
final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
final_res.toPandas()

Unnamed: 0,PRODUCT_ID,OPER_NO,INLINE_PARAMETER_ID,AVG_SPEC_CHK_RESULT_COUNT,request_id,weight,weight_percent,index_no
0,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",EEW0,0.0,346,0.179004,17.900446,1
1,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",PTW0,0.0,346,0.128604,12.860449,2
2,AFPNM301N.0B01,3U.PQA10,SNW0,0.0,346,0.128284,12.828409,3
3,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",SFW0,0.0,346,0.126857,12.685661,4
4,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",SEW0,0.0,346,0.114746,11.474552,5
5,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",FEW0,0.0,346,0.112127,11.212702,6
6,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",REW0,0.0,346,0.112055,11.205503,7
7,AFPNM301N.0B01,"1U.CDG10,1U.CDG20,6V.CDG10",OEW0,0.0,346,0.098323,9.832278,8


### ExertInlineByWafer

In [561]:
class ExertInlineByWafer:
    @staticmethod
    def fit_by_wafer_model(df: pyspark.sql.dataframe,
                           request_id: str,
                           merge_operno_list: List[Dict[str, List[str]]],
                           columns_list=None,
                           key_words=None,
                           convert_to_numeric_list=None,
                           grpby_list=None,
                           certain_column=None) -> Union[str, pyspark.sql.dataframe.DataFrame]:
        if grpby_list is None:
            grpby_list = ['OPE_NO']

        if columns_list is None:
            columns_list = grpby_list + ['WAFER_ID', 'INLINE_PARAMETER_ID', 'AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL',
                                         'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'SITE_COUNT', 'label']
        if key_words is None:
            key_words = ['CXS', 'CYS', 'FDS']

        if convert_to_numeric_list is None:
            convert_to_numeric_list = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25',
                                       'PERCENTILE_75', 'SITE_COUNT']

        if certain_column is None:
            certain_column = 'INLINE_PARAMETER_ID'

        df_preprocess = DataPreprocessorForInline(df=df,
                                                  columns_list=columns_list,
                                                  certain_column=certain_column,
                                                  key_words=key_words,
                                                  convert_to_numeric_list=convert_to_numeric_list,
                                                  merge_operno_list=merge_operno_list).run()
        if df_preprocess.isEmpty():
            msg = 'No data of this type in the database!'
            raise RCABaseException(msg)

        df_train = GetTrainDataForInline(df=df_preprocess, grpby_list=grpby_list).run()
        if df_train.isEmpty():
            msg = 'Get train data Exception!'
            raise RCABaseException(msg)

        res = FitInlineModelByWafer(df=df_train,
                                    grpby_list=grpby_list,
                                    columns_to_process=['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV',
                                                        'PERCENTILE_25', 'PERCENTILE_75'],
                                    missing_value_threshold=0.6,
                                    model='pca').run()
        if res.isEmpty():
            msg = 'No difference in this data. The output of the algorithm is 0.'
            raise RCABaseException(msg)

        final_res = SplitInlineModelResults(df=res, grpby_list=grpby_list, request_id=request_id).run()
        if final_res.isEmpty():
            msg = 'Temporary exception in adding columns to algorithm results'
            raise RCABaseException(msg)
        else:
            return final_res

In [567]:
grpby_list = ['PRODUCT_ID', 'OPE_NO', 'LOT_ID']
final_res = ExertInlineByWafer.fit_by_wafer_model(df=df1, request_id=request_id, merge_operno_list=merge_operno, grpby_list=grpby_list)
final_res.show()

+--------------+--------------------+---------+-----------+--------+-------+
|    PRODUCT_ID|              OPE_NO|   LOT_ID|wafer_count|good_num|bad_num|
+--------------+--------------------+---------+-----------+--------+-------+
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|NAZ703000|          3|       0|      3|
+--------------+--------------------+---------+-----------+--------+-------+

PRODUCT_ID == 'AFPNM301N.0B01' AND OPE_NO == '1U.CDG10,1U.CDG20,6V.CDG10' AND LOT_ID == 'NAZ703000'
+--------------+--------------------+---------+-------------------+-------------------------+----------+------+--------------+--------+
|    PRODUCT_ID|             OPER_NO|   LOT_ID|INLINE_PARAMETER_ID|AVG_SPEC_CHK_RESULT_COUNT|request_id|weight|weight_percent|index_no|
+--------------+--------------------+---------+-------------------+-------------------------+----------+------+--------------+--------+
|AFPNM301N.0B01|1U.CDG10,1U.CDG20...|NAZ703000|               REW0|                      0.0|       346|  