In [1]:
import json
import requests
import pymysql
import numpy as np
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as F

from pca import pca
from scipy import stats
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
# from backend_spark.doris_common.doris_client import DorisClient
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, monotonically_increasing_id, lit, col, when, countDistinct



In [2]:
import os
import warnings
# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession

# Python interpreter path handed to PySpark through the PYSPARK_PYTHON variable.
# It is set before the session is built below, so keep this statement order.
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

# Build (or reuse) the Spark session against the two listed standalone masters.
# NOTE(review): master/driver hosts and memory/core sizes are hard-coded for one
# environment; move them to a config cell or environment variables before sharing.
spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '1024m') \
    .config('spark.driver.cores', '3') \
    .config('spark.executor.memory', '1024m') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '2') \
    .config('spark.driver.host','192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [78]:
# Load the raw inline-measurement export into a pandas DataFrame and display it.
# NOTE(review): absolute, machine-specific Windows path; prefer a configurable
# data directory so the notebook can be re-run elsewhere.
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/inline_algorithm/inline_case5.csv")
df_pandas

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,MEASURE_TIME,RANGE_INDEX,FAB_ID,PRODUCT_ID,LOT_ID,AVERAGE,MAX_VAL,...,RANGE,ACT_CODE,ETL_INSERT_TIME,ETL_ARC_FLAG,ETL_BATCH_SYNC_TS,ETL_DEL_FLAG,ETL_DS_JOB_NM,ETL_SRC_DB,ETL_SRC_TBL,ETL_TBL_OPER_TS
0,NAZ439-03,1F.FQE10,CXS1,2023-01-31 22:52:24,0,N1,AFPNM301N.0A01,NAZ439000,0.000000,144.06000,...,288.120000,,2023-05-29 06:00:43,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
1,NAZ439-03,1F.FQE10,CYS1,2023-01-31 22:52:24,0,N1,AFPNM301N.0A01,NAZ439000,0.287947,144.06000,...,283.220490,,2023-05-29 06:00:43,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
2,NAZ439-03,1F.FQE10,FDS1,2023-01-31 22:52:24,0,N1,AFPNM301N.0A01,NAZ439000,0.993052,0.99309,...,0.000082,,2023-05-29 06:00:43,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
3,NAZ439-03,1F.FQE10,TAW1,2023-01-31 22:52:24,0,N1,AFPNM301N.0A01,NAZ439000,109.251900,,...,,,2023-05-29 06:00:43,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
4,NAZ439-03,1F.FQE10,TAWB,2023-01-31 22:52:24,0,N1,AFPNM301N.0A01,NAZ439000,109.251900,,...,,,2023-05-29 06:00:43,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32273,NAZ703-01,1U.PQA10,MCW0,2023-03-30 06:36:50,0,N1,AFPNM301N.0B01,NAZ703050,0.000000,,...,,,2023-05-29 07:42:52,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
32274,NAZ703-01,1U.PQA10,MEW0,2023-03-30 06:36:50,0,N1,AFPNM301N.0B01,NAZ703050,0.000000,,...,,,2023-05-29 07:42:52,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
32275,NAZ703-01,1U.PQA10,MFW0,2023-03-30 06:36:50,0,N1,AFPNM301N.0B01,NAZ703050,1.000000,,...,,,2023-05-29 07:42:52,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00
32276,NAZ703-01,1U.PQA10,MSW0,2023-03-30 06:36:50,0,N1,AFPNM301N.0B01,NAZ703050,1.000000,,...,,,2023-05-29 07:42:52,0,1970-01-01 00:00:00,0,,EDA,INLINE_WAFER_SUMMARY,1970-01-01 00:00:00


In [79]:
# Convert the pandas frame to a Spark DataFrame via the pandas-on-Spark API,
# then show its row count.
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

32278

### 打标签

In [5]:
def _wafer_match_condition(wafers):
    """Build a Spark Column that is true when WAFER_ID matches any entry of ``wafers``.

    Every entry is applied twice, exactly as the original string-built
    expression did: once as a SQL ``LIKE`` pattern (so ``%`` and ``_`` act as
    wildcards) and once as an exact value through ``isin``.

    The condition is composed from Column objects instead of being assembled
    as Python source and passed to ``eval``; that removes the failure on IDs
    containing a quote and the ``SyntaxError`` on an empty list.
    """
    wafers = list(wafers)
    wafer_id = col('WAFER_ID')
    # Start from the exact-match test and OR in one LIKE test per entry.
    return reduce(lambda acc, pattern: acc | wafer_id.like(str(pattern)),
                  wafers,
                  wafer_id.isin(wafers))


def get_label_single(df, bad_wafer):
    """Add an integer ``label`` column: 1 for wafers matching ``bad_wafer``, else 0.

    Args:
        df: Spark DataFrame with a ``WAFER_ID`` column.
        bad_wafer: iterable of wafer IDs / LIKE patterns marking bad wafers.

    Returns:
        ``df`` with the extra ``label`` column; no rows are removed.
    """
    is_bad = _wafer_match_condition(bad_wafer)
    return df.withColumn('label', when(is_bad, 1).otherwise(0))


def get_label_double(df, bad_wafer, good_wafer):
    """Label good wafers 0 and bad wafers 1, dropping wafers in neither list.

    The good list is tested first, so a wafer matching both lists is labelled 0.

    Args:
        df: Spark DataFrame with a ``WAFER_ID`` column.
        bad_wafer: iterable of wafer IDs / LIKE patterns marking bad wafers.
        good_wafer: iterable of wafer IDs / LIKE patterns marking good wafers.

    Returns:
        Spark DataFrame restricted to labelled wafers, with an integer
        ``label`` column.
    """
    is_good = _wafer_match_condition(good_wafer)
    is_bad = _wafer_match_condition(bad_wafer)
    # Without ``otherwise`` unmatched rows get a null label, which replaces the
    # former magic sentinel value used only to filter them out again.
    labelled = df.withColumn('label', when(is_good, 0).when(is_bad, 1))
    return labelled.filter(col('label').isNotNull())

In [80]:
# Wafer IDs chosen as the good (label 0) and bad (label 1) populations for this case.
good_wafer = ['NAZ439-03', 'NAZ439-07']
bad_wafer = ['NAZ415-06', 'NAZ415-08', 'NAZ415-12', 'NAZ415-13', 'NAZ439-06', 'NAZ703-01', 
             'NAZ703-08', 'NAZ703-09', 'NBX082-05', 'NBX082-12', 'NBX082-15', 'NBX082-16', 'NBX219-17']

# Attach the label column; wafers in neither list are filtered out by get_label_double.
df1_with_label = get_label_double(df1, bad_wafer, good_wafer)

# Report (rows, columns); count() triggers a Spark job.
num_rows = df1_with_label.count()
num_columns = len(df1_with_label.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (32278, 144)


In [11]:
# df1_with_label.toPandas().to_csv('inline_case5_label.csv', index=False)

### 数据预处理

In [81]:
def select_columns(df, columns_list):
    """Return the projection of ``df`` onto ``columns_list``.

    Uses plain ``df[...]`` indexing with a list of column names, so the
    result keeps the columns in the order given.
    """
    projected = df[columns_list]
    return projected


def exclude_some_data(df, certain_column: str, key_words: list[str]):
    """Drop the rows whose ``certain_column`` value matches any of ``key_words``.

    The key words are joined with ``|`` and applied with ``rlike``, so each
    entry is interpreted as a regular expression and matches anywhere in the
    value.

    Args:
        df: Spark DataFrame to filter.
        certain_column: name of the string column to test.
        key_words: regex fragments; a row is removed when any of them matches.

    Returns:
        The filtered DataFrame, or ``df`` unchanged when ``key_words`` is empty.
    """
    if not key_words:
        # Joining nothing yields the empty regex, which matches every value and
        # would silently discard all rows. Nothing to exclude means keep everything.
        return df
    pattern = '|'.join(key_words)
    return df.filter(~col(certain_column).rlike(pattern))


def pre_process(df, convert_to_numeric_list):
    """Cast the listed columns to double and drop rows with no usable statistic.

    Every column in ``convert_to_numeric_list`` is cast to ``double``. A row is
    then dropped only when all of those columns except ``SITE_COUNT`` are null.

    The caller's list is left untouched; the previous implementation removed
    ``'SITE_COUNT'`` from it in place, so re-running the calling cell saw a
    different list.

    Args:
        df: Spark DataFrame containing the listed columns.
        convert_to_numeric_list: names of the columns to cast.

    Returns:
        The cast and filtered Spark DataFrame.
    """
    for column in convert_to_numeric_list:
        df = df.withColumn(column, df[column].cast('double'))

    # SITE_COUNT is cast but does not count as evidence that a row has data.
    required_columns = [name for name in convert_to_numeric_list if name != 'SITE_COUNT']
    return df.dropna(subset=required_columns, how='all')

In [82]:
# Keep only the identifier columns, the per-wafer summary statistics and the label.
columns_list = ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 
                'PERCENTILE_25', 'PERCENTILE_75', 'SITE_COUNT', 'label']

# NOTE(review): this rebinds df1_with_label to the narrowed frame, so the
# 144-column version from the labelling cell is no longer reachable by name.
df1_with_label = select_columns(df1_with_label, columns_list)

num_rows = df1_with_label.count()
num_columns = len(df1_with_label.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (32278, 12)


In [83]:
# Remove rows whose INLINE_PARAMETER_ID contains CXS, CYS or FDS
# (the key words are regex-matched anywhere in the value).
df1_esd = exclude_some_data(df=df1_with_label, 
                            certain_column='INLINE_PARAMETER_ID',
                            key_words=['CXS', 'CYS', 'FDS'])

num_rows = df1_esd.count()
num_columns = len(df1_esd.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (32083, 12)


In [84]:
# Columns to cast to double before modelling; pre_process also drops rows
# in which none of the statistics (SITE_COUNT aside) has a value.
convert_to_numeric_list = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'SITE_COUNT']
df1_pp = pre_process(df1_esd, convert_to_numeric_list)

num_rows = df1_pp.count()
num_columns = len(df1_pp.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (31791, 12)


In [85]:
# Sanity check: number of distinct wafers labelled 1 (bad) left after preprocessing.
df1_pp.filter("label == 1").select('WAFER_ID').distinct().count()

13

In [86]:
# Sanity check: number of distinct wafers labelled 0 (good) left after preprocessing.
df1_pp.filter("label == 0").select('WAFER_ID').distinct().count()

2

### 共性分析

In [87]:
def commonality_analysis(df_run, grpby_list=['OPE_NO']):
    """Count total, good and bad wafers per group and keep the relevant groups.

    For each combination of ``grpby_list`` the distinct ``WAFER_ID`` values are
    counted overall (``wafer_count``), for label 0 (``good_num``) and for
    label 1 (``bad_num``). The result is sorted by ``bad_num`` then
    ``good_num``, both descending.

    When the aggregation yields exactly one group it is returned as is
    (the single-operation / single-chamber case). Otherwise only groups with
    ``bad_num > 1`` and ``wafer_count > 2`` are kept.
    """
    label = df_run['label']
    # ``when`` without ``otherwise`` is null for non-matching rows, and
    # countDistinct skips tuples containing a null.
    good_marker = when(label == 0, 1)
    bad_marker = when(label == 1, 1)

    counted = df_run.groupBy(grpby_list).agg(
        countDistinct('WAFER_ID').alias('wafer_count'),
        countDistinct('WAFER_ID', good_marker).alias('good_num'),
        countDistinct('WAFER_ID', bad_marker).alias('bad_num'),
    )
    ranked = counted.na.fill(0).orderBy(['bad_num', 'good_num'], ascending=False)

    # Single operation + single chamber: nothing to compare against, return it untouched.
    if ranked.count() == 1:
        return ranked
    return ranked.filter("bad_num > 1 AND wafer_count > 2")

In [88]:
# Per-operation wafer counts (default grouping is OPE_NO); show how many
# operations pass the filter and preview the top rows.
common_res = commonality_analysis(df1_pp)
print(common_res.count())
common_res.show(10)

83
+--------+-----------+--------+-------+
|  OPE_NO|wafer_count|good_num|bad_num|
+--------+-----------+--------+-------+
|3U.CDG10|         15|       2|     13|
|7U.PQX10|         15|       2|     13|
|1U.CDG20|         15|       2|     13|
|6V.CDG20|         15|       2|     13|
|TV.EQA20|         15|       2|     13|
|3U.CDG20|         15|       2|     13|
|TM.PQX10|         15|       2|     13|
|1C.CDG10|         15|       2|     13|
|2U.CDG20|         15|       2|     13|
|6V.CDG10|         15|       2|     13|
+--------+-----------+--------+-------+
only showing top 10 rows



In [89]:
def get_data_list(common_res):
    """Collect the ``OPE_NO`` column of ``common_res`` to the driver.

    Returns:
        A list with one ``{'OPE_NO': ...}`` dict per row of ``common_res``.
    """
    collected_rows = common_res.select(['OPE_NO']).collect()
    return list(map(lambda record: record.asDict(), collected_rows))

In [90]:
# Bring the surviving operation numbers to the driver as a list of dicts and display them.
data_dict_list = get_data_list(common_res)
print(len(data_dict_list))
data_dict_list

83


[{'OPE_NO': '7U.PQA20'},
 {'OPE_NO': 'PV.CDG10'},
 {'OPE_NO': 'TV.EQA20'},
 {'OPE_NO': '2U.CDG10'},
 {'OPE_NO': '2U.CDG20'},
 {'OPE_NO': 'TM.PQX10'},
 {'OPE_NO': '3U.CDG10'},
 {'OPE_NO': '1U.CDG20'},
 {'OPE_NO': '6V.CDG20'},
 {'OPE_NO': '3U.CDG20'},
 {'OPE_NO': '1C.CDG10'},
 {'OPE_NO': '6V.CDG10'},
 {'OPE_NO': '1U.CDG10'},
 {'OPE_NO': 'TV.PQX10'},
 {'OPE_NO': '7U.PQX10'},
 {'OPE_NO': '2V.PQW10'},
 {'OPE_NO': '6V.PQX10'},
 {'OPE_NO': 'TV.CDG10'},
 {'OPE_NO': '3U.PQW10'},
 {'OPE_NO': '2V.PQX10'},
 {'OPE_NO': '2U.PQX10'},
 {'OPE_NO': '1V.PQA10'},
 {'OPE_NO': '2U.PQA10'},
 {'OPE_NO': '7U.EQA20'},
 {'OPE_NO': '7U.PQA10'},
 {'OPE_NO': '3U.PQX10'},
 {'OPE_NO': '1V.PQX10'},
 {'OPE_NO': '1V.PQX20'},
 {'OPE_NO': '7U.ECU10'},
 {'OPE_NO': 'TM.EQA10'},
 {'OPE_NO': '2V.PQX20'},
 {'OPE_NO': 'PV.PQA10'},
 {'OPE_NO': '2V.PQA10'},
 {'OPE_NO': '7U.EQA10'},
 {'OPE_NO': 'PV.PQX10'},
 {'OPE_NO': 'TM.PQA10'},
 {'OPE_NO': 'TV.PQW10'},
 {'OPE_NO': 'TM.PQW10'},
 {'OPE_NO': '6V.PQW10'},
 {'OPE_NO': '7U.PQW10'},


In [91]:
def get_train_data(df_run, data_dict_list):
    """Keep only the rows of ``df_run`` whose ``OPE_NO`` appears in ``data_dict_list``.

    A single ``isin`` filter replaces the previous chain of one filtered
    DataFrame per operation glued together with ``union``. That chain built a
    query plan that grew with the number of operations, raised ``IndexError``
    on an empty list, and interpolated the values into a SQL string.

    Args:
        df_run: Spark DataFrame with an ``OPE_NO`` column.
        data_dict_list: list of dicts, each carrying an ``'OPE_NO'`` key.

    Returns:
        The filtered Spark DataFrame; it has no rows when the list is empty.
    """
    wanted_opers = [entry['OPE_NO'] for entry in data_dict_list]
    return df_run.filter(df_run['OPE_NO'].isin(wanted_opers))

In [281]:
# Restrict the preprocessed data to the operations selected by the commonality analysis.
df_run = get_train_data(df1_pp, data_dict_list)

num_rows = df_run.count()
num_columns = len(df_run.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (31019, 12)


### PCA建模

In [280]:
def process_missing_values(df, columns_to_process, missing_value_threshold):
    """Drop sparse columns and mean-impute the remaining gaps.

    For every column in ``columns_to_process``: if its fraction of missing
    values exceeds ``missing_value_threshold`` the column is dropped,
    otherwise its missing values are replaced by the column mean.

    The work is done on a copy. The previous version wrote the imputed values
    straight into the caller's frame, which is a chained assignment when that
    frame is a slice of another one (for example the result of ``query``).

    Args:
        df: pandas DataFrame.
        columns_to_process: iterable of numeric column names present in ``df``.
        missing_value_threshold: maximum tolerated missing fraction (0..1).

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    result = df.copy()
    too_sparse = []
    for column in columns_to_process:
        values = result[column]
        if values.isnull().mean() > missing_value_threshold:
            too_sparse.append(column)
        else:
            result[column] = values.fillna(values.mean())
    # Drop once at the end instead of rebuilding the frame for every sparse column.
    return result.drop(columns=too_sparse)

def get_pivot_table(df, columns_to_process, missing_value_threshold=0.6):
    """Reshape long per-parameter statistics into one feature row per wafer.

    Steps:
      1. clean ``columns_to_process`` with ``process_missing_values``;
      2. pivot on ``(OPE_NO, INLINE_PARAMETER_ID)`` with ``WAFER_ID`` and
         ``label`` as the index, using every remaining column other than the
         identifier columns and ``SITE_COUNT`` as values;
      3. flatten the resulting column MultiIndex by joining its levels with ``#``;
      4. clean the wide table with ``process_missing_values`` again;
      5. drop every feature column that holds a single distinct value.

    Returns:
        pandas DataFrame with ``WAFER_ID``, ``label`` and the surviving features.
    """
    key_columns = ['WAFER_ID', 'label']
    non_value_columns = ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'label']

    cleaned = process_missing_values(df, columns_to_process, missing_value_threshold)
    wide = cleaned.pivot_table(index=key_columns,
                               columns=['OPE_NO', 'INLINE_PARAMETER_ID'],
                               values=cleaned.columns.difference(non_value_columns))
    wide.columns = wide.columns.map('#'.join)
    wide = process_missing_values(wide, wide.columns, missing_value_threshold)
    wide = wide.reset_index(drop=False)

    # A column with one distinct value carries no information for the models.
    constant_columns = [name for name in wide.columns.difference(key_columns)
                        if wide[name].nunique() == 1]
    return wide.drop(columns=constant_columns)

In [128]:
def fit_pca_model(df, by, columns_to_process, missing_value_threshold=0.61):
    """Fit a PCA per group and return the best-loading features of each group.

    Each group of ``df`` (grouped by ``by``) is pivoted with
    ``get_pivot_table`` and passed to the ``pca`` package. From its
    ``topfeat`` table the rows of type ``'best'`` are kept and the absolute
    loading is reported as ``importance``. Groups with at most one usable
    feature column yield the placeholder row ``("STATS#OPE#PARAM", -100.0)``.

    Args:
        df: Spark DataFrame holding the long-format statistics.
        by: grouping column name(s).
        columns_to_process: statistic columns handed to ``get_pivot_table``.
        missing_value_threshold: missing-fraction limit forwarded to
            ``get_pivot_table``. It defaults to the value that used to be
            hard-coded here, so existing calls behave as before.

    Returns:
        Spark DataFrame with columns ``features`` (string) and ``importance`` (float).
    """
    schema_all = StructType([StructField("features", StringType(), True),
                             StructField("importance", FloatType(), True)])

    def get_model_result(df_run):
        pivot_result = get_pivot_table(df=df_run,
                                       columns_to_process=columns_to_process,
                                       missing_value_threshold=missing_value_threshold)
        # Independent variables: everything except the wafer key and the label.
        x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
        if x_train.shape[1] <= 1:
            # Float literal so the value already matches the FloatType column.
            return pd.DataFrame({"features": ["STATS#OPE#PARAM"], "importance": [-100.0]})

        n_components = min(min(x_train.shape) - 1, 5)
        model = pca(n_components=n_components, verbose=None)
        results = model.fit_transform(x_train)
        res_top = results['topfeat']
        best = res_top.loc[res_top['type'] == 'best', ['feature', 'loading']]
        # ``assign`` builds a new frame, avoiding assignment onto a filtered slice.
        return (best.assign(importance=best['loading'].abs())
                    .rename(columns={'feature': 'features'})
                    .drop(columns='loading')
                    .drop_duplicates())

    # applyInPandas is the supported replacement for the GROUPED_MAP pandas_udf.
    return df.groupby(by).applyInPandas(get_model_result, schema=schema_all)

In [129]:
# Run the PCA model once per operation on the seven summary statistics.
by = ['OPE_NO']
columns_to_process = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']

res = fit_pca_model(df=df_run, by=by, columns_to_process=columns_to_process)
res.show()

+--------------------+----------+
|            features|importance|
+--------------------+----------+
|AVERAGE#1C.CDG10#...| 0.9999995|
|AVERAGE#1C.CDG10#...|  0.782774|
|AVERAGE#1C.CDG10#...|0.77940005|
|AVERAGE#1C.CDG10#...|0.94151205|
|AVERAGE#1C.CDG10#...| 0.7007957|
|     STATS#OPE#PARAM|    -100.0|
|AVERAGE#1C.PQW10#...|0.40030748|
|AVERAGE#1C.PQW10#...| 0.3518084|
|AVERAGE#1C.PQW10#...|0.44084978|
|AVERAGE#1C.PQX10#...|0.53265846|
|AVERAGE#1C.PQX10#...| 0.7599824|
|AVERAGE#1C.PQX10#...| 0.5194199|
|AVERAGE#1C.PQX10#...| 0.8075725|
|AVERAGE#1C.PQX10#...| 0.6866065|
|AVERAGE#1U.CDG10#...|0.87329966|
|AVERAGE#1U.CDG10#...| 0.7692767|
|AVERAGE#1U.CDG10#...|0.81461805|
|AVERAGE#1U.CDG10#...| 0.5973031|
|AVERAGE#1U.CDG10#...|0.76068807|
|AVERAGE#1U.CDG20#...| 0.9999055|
+--------------------+----------+
only showing top 20 rows



In [130]:
# Shape of the PCA result; count() re-executes the grouped computation.
num_rows = res.count()
num_columns = len(res.columns)
print(f"DataFrame shape: ({num_rows}, {num_columns})")

DataFrame shape: (307, 2)


In [101]:
# Pull the training data to the driver as pandas for local debugging of the pivot step.
df_run_pandas = df_run.toPandas()

In [172]:
# oper_st = []
# for oper in df_run_pandas['OPE_NO'].unique():
#     df_run_oper = df_run_pandas.query(f"OPE_NO == '{oper}'")
#     pivot_result = get_pivot_table(df=df_run_oper, columns_to_process=columns_to_process, missing_value_threshold=0.61)
#     x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
#     if x_train.shape[1] > 1:
#         oper_st.append(oper)
#         print("OPE_NO:", oper, "x_train:", x_train.shape)

In [133]:
# Debug one operation locally: build its pivot table and display it.
# The commented lines below are the inlined body of get_pivot_table, kept for step-by-step checks.
oper = '7U.ECU10'
df_run_oper = df_run_pandas.query(f"OPE_NO == '{oper}'")
# df_specific = process_missing_values(df_run_oper, columns_to_process, missing_value_threshold=0.6)
# index_list = ['WAFER_ID', 'label']
# values_list = df_specific.columns.difference(['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'SITE_COUNT', 'label'])
# pivot_result = df_specific.pivot_table(index=index_list,
#                                        columns=['OPE_NO', 'INLINE_PARAMETER_ID'],
#                                        values=values_list)
# pivot_result.columns = pivot_result.columns.map('#'.join)
# pivot_result = process_missing_values(pivot_result, pivot_result.columns, missing_value_threshold=0.6)
# pivot_result = pivot_result.reset_index(drop=False)
pivot_result = get_pivot_table(df=df_run_oper, columns_to_process=columns_to_process, missing_value_threshold=0.6)
pivot_result

Unnamed: 0,WAFER_ID,label
0,NAZ415-08,1
1,NAZ415-13,1
2,NAZ439-03,0
3,NAZ439-06,1
4,NAZ439-07,0
5,NAZ703-01,1
6,NAZ703-08,1
7,NAZ703-09,1
8,NBX082-05,1
9,NBX082-12,1


### rf建模

In [136]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import ClusterCentroids

In [162]:
# Reload labelled data from CSV for the local random-forest experiments.
# NOTE(review): presumably the file written by the commented-out to_csv export earlier
# in the notebook, but that export targets the working directory, not '../' — confirm the path.
df_run_pandas1 = pd.read_csv('../inline_case5_label.csv') 

In [241]:
# Inspect the pivot table of one selected operation (entry 15 of data_dict_list).
oper = data_dict_list[15]['OPE_NO']
df_rune = df_run_pandas1.query(f"OPE_NO == '{oper}'")
pivot_result = get_pivot_table(df=df_rune, columns_to_process=columns_to_process, missing_value_threshold=0.6)
pivot_result
# x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

Unnamed: 0,WAFER_ID,label,AVERAGE#2V.PQW10#01W0,AVERAGE#2V.PQW10#02W0,AVERAGE#2V.PQW10#03W0,AVERAGE#2V.PQW10#04W0,AVERAGE#2V.PQW10#05W0,AVERAGE#2V.PQW10#06W0,AVERAGE#2V.PQW10#07W0,AVERAGE#2V.PQW10#08W0,...,VALID_LOW#2V.PQW10#09W0,VALID_LOW#2V.PQW10#10W0,VALID_LOW#2V.PQW10#11W0,VALID_LOW#2V.PQW10#12W0,VALID_LOW#2V.PQW10#13W0,VALID_LOW#2V.PQW10#14W0,VALID_LOW#2V.PQW10#15W0,VALID_LOW#2V.PQW10#16W0,VALID_LOW#2V.PQW10#17W0,VALID_LOW#2V.PQW10#DNW01
0,NAZ415-06,1,0.0649,0.066,0.0658,0.0653,0.0659,0.0658,0.0635,0.0671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,NAZ415-08,1,0.0658,0.065,0.0655,0.0658,0.0666,0.0672,0.0647,0.0652,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,NAZ415-12,1,0.0658,0.0658,0.0651,0.0648,0.0649,0.0652,0.0645,0.0665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NAZ415-13,1,0.065,0.0649,0.0645,0.0651,0.0654,0.0659,0.065,0.0671,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,NAZ439-03,0,0.0655,0.066,0.0655,0.0652,0.0655,0.0653,0.0638,0.0659,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0
5,NAZ439-06,1,0.0652,0.0658,0.0655,0.0656,0.0655,0.0652,0.0653,0.0644,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0
6,NAZ439-07,0,0.0646,0.0644,0.0655,0.0642,0.0649,0.0645,0.0632,0.0654,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0
7,NAZ703-08,1,0.0644,0.0653,0.0638,0.0641,0.0641,0.0637,0.0622,0.0654,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0
8,NAZ703-09,1,0.0647,0.0639,0.0654,0.0641,0.0646,0.0647,0.0637,0.0648,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0
9,NBX082-05,1,0.0649,0.0661,0.0675,0.0654,0.0647,0.0652,0.0659,0.0652,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,0.0,0.0,0.0,0.0


In [277]:
def fit_rf_model(df, oper):
    """Local (pandas) prototype: grid-search a random forest for one operation.

    The rows of ``df`` with ``OPE_NO == oper`` are pivoted with
    ``get_pivot_table`` (threshold 0.6) using the module-level
    ``columns_to_process``. A pipeline of constant imputation, scaling and a
    random forest is then tuned with a 2-fold grid search scored by ROC AUC.

    Note: a later cell defines another ``fit_rf_model`` with a different
    signature, which replaces this one once it has been executed.

    Returns:
        pandas DataFrame with ``features`` and ``importance`` when the best
        cross-validated AUC is at least 0.6; ``None`` when it is lower, when
        there is at most one feature column, or when only one label is present.
    """
    station_rows = df.query(f"OPE_NO == '{oper}'")
    pivot_result = get_pivot_table(df=station_rows, columns_to_process=columns_to_process, missing_value_threshold=0.6)
    print(oper, pivot_result.shape)

    feature_names = pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()
    x_train = pivot_result[feature_names]
    y_train = pivot_result[['label']]

    # Guard clause: a model needs several features and both classes.
    if x_train.shape[1] <= 1 or y_train['label'].nunique() <= 1:
        return None

    pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier()),
    ])
    param_grid = {
        'model__n_estimators': list(range(10, 60, 10)),
        'model__max_depth': list(range(5, 50, 10)),
        'model__min_samples_split': [2, 5],
        'model__min_samples_leaf': [1, 3],
    }
    grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=2, n_jobs=-1)
    grid.fit(x_train.values, y_train.values.ravel())

    best_auc = grid.best_score_
    if best_auc >= 0.6:
        forest = grid.best_estimator_.named_steps['model']
        return pd.DataFrame({'features': x_train.columns,
                             'importance': forest.feature_importances_})
    return None

In [278]:
# Fit the local random-forest prototype on the first two selected operations and
# stack their importance tables. Results are collected first and concatenated
# once, because calling pd.concat inside the loop re-copies the growing frame
# on every iteration.
importance_frames = []
for i in range(2):
    oper = data_dict_list[i]['OPE_NO']
    ress = fit_rf_model(df=df_run_pandas1, oper=oper)
    # fit_rf_model returns None when no acceptable model was found.
    if ress is not None:
        importance_frames.append(ress)
    print("*******************************************")
importance_res = pd.concat(importance_frames, axis=0) if importance_frames else pd.DataFrame()

7U.PQA20 (15, 9)
0.7797619047619048
*******************************************
PV.CDG10 (15, 61)
0.9642857142857143
*******************************************


In [286]:
def fit_rf_model(df, by, columns_to_process, missing_value_threshold):
    """Grid-search a random forest per group and return its feature importances.

    Each group of ``df`` (grouped by ``by``) is pivoted with
    ``get_pivot_table``. A pipeline of constant imputation, scaling and a
    random forest is tuned with a 2-fold grid search scored by ROC AUC. When
    the best cross-validated AUC is at least 0.6 the forest's feature
    importances are returned for that group. Otherwise, and when the group has
    at most one feature column or a single label value, the placeholder row
    ``("STATS#OPE#PARAM", -100.0)`` is emitted.

    Note: this definition shares its name with the earlier pandas prototype
    ``fit_rf_model(df, oper)`` and replaces it once this cell has run.

    Args:
        df: Spark DataFrame holding the long-format statistics and ``label``.
        by: grouping column name(s).
        columns_to_process: statistic columns handed to ``get_pivot_table``.
        missing_value_threshold: missing-fraction limit for ``get_pivot_table``.

    Returns:
        Spark DataFrame with columns ``features`` (string) and ``importance`` (float).
    """
    schema_all = StructType([StructField("features", StringType(), True),
                             StructField("importance", FloatType(), True)])

    def _placeholder():
        # Single definition of the "no model" marker row; float to match the schema.
        return pd.DataFrame({"features": ["STATS#OPE#PARAM"], "importance": [-100.0]})

    def get_model_result(df_run):
        pivot_result = get_pivot_table(df=df_run,
                                       columns_to_process=columns_to_process,
                                       missing_value_threshold=missing_value_threshold)
        x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
        y_train = pivot_result[['label']]

        if x_train.shape[1] > 1 and y_train['label'].nunique() > 1:
            pipe = Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
                ('scaler', StandardScaler()),
                ('model', RandomForestClassifier())])
            param_grid = {'model__n_estimators': [*range(10, 60, 10)],
                          'model__max_depth': [*range(5, 50, 10)],
                          'model__min_samples_split': [2, 5],
                          'model__min_samples_leaf': [1, 3]}
            grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=2, n_jobs=-1)
            grid.fit(x_train.values, y_train.values.ravel())
            # ``>=`` is kept deliberately: a NaN score falls through to the placeholder.
            if grid.best_score_ >= 0.6:
                # Look the estimator up by step name instead of by position.
                forest = grid.best_estimator_.named_steps['model']
                return pd.DataFrame({'features': x_train.columns,
                                     'importance': forest.feature_importances_})
        return _placeholder()

    # applyInPandas is the supported replacement for the GROUPED_MAP pandas_udf.
    return df.groupby(by).applyInPandas(get_model_result, schema=schema_all)

In [287]:
# Run the random-forest model once per operation on the seven summary statistics.
# NOTE(review): `res` is rebound here; it previously held the PCA result.
by = ['OPE_NO']
columns_to_process = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']
missing_value_threshold = 0.6

res = fit_rf_model(df=df_run, by=by, columns_to_process=columns_to_process, missing_value_threshold=missing_value_threshold)
res.show()

+--------------------+-----------+
|            features| importance|
+--------------------+-----------+
|AVERAGE#1C.CDG10#...|        0.0|
|AVERAGE#1C.CDG10#...| 0.33333334|
|AVERAGE#1C.CDG10#...| 0.35069445|
|AVERAGE#1C.CDG10#...|        0.0|
|AVERAGE#1C.CDG10#...|0.017361112|
|AVERAGE#1C.CDG10#...|0.055555556|
|AVERAGE#1C.CDG10#...|0.083333336|
|AVERAGE#1C.CDG10#...| 0.13194445|
|AVERAGE#1C.CDG10#...|        0.0|
|AVERAGE#1C.CDG10#...|0.027777778|
|     STATS#OPE#PARAM|     -100.0|
|     STATS#OPE#PARAM|     -100.0|
|     STATS#OPE#PARAM|     -100.0|
|AVERAGE#1U.CDG10#...|  0.1891892|
|AVERAGE#1U.CDG10#...| 0.08108108|
|AVERAGE#1U.CDG10#...|  0.0929054|
|AVERAGE#1U.CDG10#...| 0.10810811|
|AVERAGE#1U.CDG10#...| 0.16216215|
|AVERAGE#1U.CDG10#...|0.027027028|
|AVERAGE#1U.CDG10#...|0.054054055|
+--------------------+-----------+
only showing top 20 rows



In [288]:
# Collect the per-feature importances to the driver; this re-executes the grouped model fit.
res_p = res.toPandas()

In [289]:
# Display the collected importance table.
res_p

Unnamed: 0,features,importance
0,AVERAGE#1C.CDG10#EEW0,0.000000
1,AVERAGE#1C.CDG10#FEW0,0.333333
2,AVERAGE#1C.CDG10#HFT0,0.350694
3,AVERAGE#1C.CDG10#OEW0,0.000000
4,AVERAGE#1C.CDG10#PEW0,0.017361
...,...,...
3141,STATS#OPE#PARAM,-100.000000
3142,STATS#OPE#PARAM,-100.000000
3143,STATS#OPE#PARAM,-100.000000
3144,STATS#OPE#PARAM,-100.000000


In [290]:
def split_features(df, index) -> pd.Series:
    """Return the ``index``-th ``#``-separated part of every ``features`` value.

    Raises ``IndexError`` when a value has fewer than ``index + 1`` parts.
    """
    return df['features'].apply(lambda x: x.split('#')[index])


def get_split_features(df):
    """Replace the ``features`` column by ``OPE_NO`` and ``INLINE_PARAMETER_ID``.

    ``features`` values have the form ``STAT#OPE_NO#INLINE_PARAMETER_ID``. The
    statistic part is discarded, and parts 1 and 2 become the two new columns,
    appended after the remaining original columns.

    The input frame is left untouched; the previous version added three
    columns to the caller's frame before dropping two of them from the copy
    it returned.

    Returns:
        A new DataFrame with a fresh 0..n-1 index.
    """
    with_parts = df.assign(OPE_NO=split_features(df, 1),
                           INLINE_PARAMETER_ID=split_features(df, 2))
    # 'STATISTIC_RESULT' is listed only so a pre-existing column of that name is
    # still removed, as it was before; it is no longer created here.
    return (with_parts
            .drop(columns=['features', 'STATISTIC_RESULT'], errors='ignore')
            .reset_index(drop=True))


def split_calculate_features(df, by):
    """Sum importances per (operation, parameter) inside every group of ``df``.

    Within each group (grouped by ``by``) the ``features`` strings are split
    with ``get_split_features`` and ``importance`` is summed over
    ``OPE_NO`` and ``INLINE_PARAMETER_ID``.

    Returns:
        Spark DataFrame with columns ``OPE_NO``, ``INLINE_PARAMETER_ID`` and
        ``importance``.
    """
    result_schema = StructType([
        StructField("OPE_NO", StringType(), True),
        StructField("INLINE_PARAMETER_ID", StringType(), True),
        StructField("importance", FloatType(), True),
    ])

    @pandas_udf(returnType=result_schema, functionType=PandasUDFType.GROUPED_MAP)
    def get_model_result(df_run):
        parts = get_split_features(df_run)
        summed = parts.groupby(['OPE_NO', 'INLINE_PARAMETER_ID'])['importance'].sum()
        return summed.reset_index(drop=False)

    grouped = df.groupby(by)
    return grouped.apply(get_model_result)


def add_certain_column(df, by, request_id):
    """Turn summed importances into ranked, normalised weights per group.

    Within each group of ``df`` (grouped by ``by``), rows with a non-positive
    importance are discarded. ``weight`` is the importance divided by the
    group total and ``weight_percent`` is ``weight * 100``. Rows are ranked by
    descending weight in ``index_no`` (starting at 1). ``request_id`` and a
    constant ``AVG_SPEC_CHK_RESULT_COUNT`` of 0.0 are attached, and ``OPE_NO``
    is renamed to ``OPER_NO``.

    Args:
        df: Spark DataFrame with ``OPE_NO``, ``INLINE_PARAMETER_ID``,
            ``importance`` and the grouping column(s).
        by: grouping column name(s).
        request_id: identifier written into every output row.

    Returns:
        Spark DataFrame with exactly the columns of the output schema.
    """
    schema_all = StructType([
        StructField("OPER_NO", StringType(), True),
        StructField("INLINE_PARAMETER_ID", StringType(), True),
        StructField("AVG_SPEC_CHK_RESULT_COUNT", FloatType(), True),
        StructField("weight", FloatType(), True),
        StructField("request_id", StringType(), True),
        StructField("weight_percent", FloatType(), True),
        StructField("index_no", IntegerType(), True)])
    output_columns = [field.name for field in schema_all.fields]

    def get_result(final_res):
        # Compute the weight (normalisation) on an explicit copy of the positive
        # rows, so the later column assignments never target a filtered view.
        ranked = final_res.assign(importance=final_res['importance'].astype(float))
        ranked = ranked[ranked['importance'] > 0].copy()
        ranked['weight'] = ranked['importance'] / ranked['importance'].sum()
        ranked['weight_percent'] = ranked['weight'] * 100
        ranked = ranked.sort_values('weight', ascending=False)
        # Add the bookkeeping columns.
        ranked['index_no'] = list(range(1, len(ranked) + 1))
        ranked['request_id'] = request_id
        ranked['AVG_SPEC_CHK_RESULT_COUNT'] = 0.0
        ranked = ranked.rename(columns={'OPE_NO': 'OPER_NO'})
        # Select the schema columns rather than dropping a hard-coded 'add'
        # column, so any grouping key passed through ``by`` works.
        return ranked[output_columns]

    # applyInPandas is the supported replacement for the GROUPED_MAP pandas_udf.
    return df.groupby(by).applyInPandas(get_result, schema=schema_all)

In [291]:
# Add a constant column so the whole table forms one group for the grouped computation.
res_add = res.withColumn('add', lit(0))
final_res = split_calculate_features(df=res_add, by='add')

In [292]:
# Collect the per-(operation, parameter) importance sums to the driver and display them.
final_res_pandas = final_res.toPandas()
final_res_pandas

Unnamed: 0,OPE_NO,INLINE_PARAMETER_ID,importance
0,1C.CDG10,EEW0,0.000000
1,1C.CDG10,FEW0,0.333333
2,1C.CDG10,HFT0,0.350694
3,1C.CDG10,OEW0,0.000000
4,1C.CDG10,PEW0,0.017361
...,...,...,...
1888,TV.PQX10,YDW0,0.000000
1889,TV.PQX10,YIW0,0.000000
1890,TV.PQX10,YMW0,0.000000
1891,TV.PQX10,YSW0,0.000000


In [295]:
# final_res_pandas['OPE_NO'].unique()

In [296]:
# Re-attach the constant grouping column (the previous step's output schema does not carry it),
# then compute the ranked weights for a sample request id.
# NOTE(review): this rebinds final_res, so re-running the cell stacks the column onto the new frame.
final_res = final_res.withColumn('add', lit(0))
final_res_add = add_certain_column(df=final_res, by='add', request_id='855s')

In [297]:
# Collect the final ranked weights to the driver and display them.
final_res_add_p = final_res_add.toPandas()
final_res_add_p

Unnamed: 0,OPER_NO,INLINE_PARAMETER_ID,AVG_SPEC_CHK_RESULT_COUNT,weight,request_id,weight_percent,index_no
0,TM.PQA10,SNW0,0.0,0.025000,855s,2.500000,1
1,7U.EQA10,SNW0,0.0,0.024948,855s,2.494792,2
2,7U.EQA20,SNW0,0.0,0.024905,855s,2.490530,3
3,1V.PQA10,SNW0,0.0,0.024316,855s,2.431596,4
4,2U.PQA10,SNW0,0.0,0.023995,855s,2.399471,5
...,...,...,...,...,...,...,...
533,6V.CDG10,OEW0,0.0,0.000119,855s,0.011905,534
534,7U.EQA20,MEW0,0.0,0.000095,855s,0.009470,535
535,7U.EQA10,MEW0,0.0,0.000052,855s,0.005208,536
536,2U.PQA10,MEW0,0.0,0.000042,855s,0.004209,537


In [300]:
# final_res_add_p[final_res_add_p['OPER_NO'].str.contains('V.EQW')]