In [1]:
import warnings
warnings.filterwarnings('ignore')
import pyspark
import json
import requests
import pymysql
import numpy as np
import pandas as pd
import pyspark.pandas as ps
import pyspark.sql.functions as F
from scipy import stats
from pca import pca

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

from scipy import stats
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional
# from backend_spark.doris_common.doris_client import DorisClient
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, monotonically_increasing_id, lit, col, when, countDistinct



In [2]:
import os
import warnings
warnings.filterwarnings('ignore')
from pyspark.sql import SparkSession

# Python interpreter that PySpark should use for this session.
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

# NOTE(review): driver/master addresses and resource sizes below are hard-coded
# for one specific cluster — confirm before running elsewhere.
_SPARK_SETTINGS = {
    'spark.sql.session.timeZone': 'Asia/Shanghai',
    "spark.scheduler.mode": "FAIR",
    'spark.driver.memory': '1024m',
    'spark.driver.cores': '3',
    'spark.executor.memory': '1024m',
    'spark.executor.cores': '1',
    'spark.cores.max': '2',
    'spark.driver.host': '192.168.22.28',
}

# Apply every setting to the builder, then attach to the standalone master(s).
_builder = SparkSession.builder.appName("pandas_udf")
for _setting_name, _setting_value in _SPARK_SETTINGS.items():
    _builder = _builder.config(_setting_name, _setting_value)
spark = _builder.master("spark://192.168.12.47:7077,192.168.12.48:7077").getOrCreate()

### 数据预处理

In [3]:
class DataPreprocessorForInline:
    """Select, filter and numerically cast inline-measurement rows of a Spark DataFrame.

    ``run`` chains three steps: keep only ``columns_list``; drop rows whose
    ``certain_column`` matches any entry of ``key_words``; cast every column in
    ``convert_to_numeric_list`` to double and drop rows in which all of those
    columns (``SITE_COUNT`` excluded) are null.
    """

    def __init__(self,
                 df: "pyspark.sql.DataFrame",
                 columns_list: list[str],
                 certain_column: str,
                 key_words: list[str],
                 convert_to_numeric_list: list[str]):
        """Store the input DataFrame and the preprocessing configuration.

        :param df: Spark DataFrame to preprocess.
        :param columns_list: columns kept by ``select_columns``.
        :param certain_column: column whose text is matched against ``key_words``.
        :param key_words: patterns; rows matching any of them are excluded.
        :param convert_to_numeric_list: columns cast to ``double``.
        """
        self.df = df
        self.columns_list = columns_list
        self.certain_column = certain_column
        self.key_words = key_words
        self.convert_to_numeric_list = convert_to_numeric_list

    def select_columns(self, df):
        """Return ``df`` restricted to ``self.columns_list``."""
        return df.select(self.columns_list)

    def exclude_some_data(self, df):
        """Drop rows whose ``certain_column`` matches any of ``self.key_words``.

        The key words are joined with ``|`` and handed to ``rlike``, so each one
        is interpreted as a regular-expression alternative.
        """
        if not self.key_words:
            # An empty pattern would match every row and filter everything out;
            # with nothing to exclude the frame is returned as-is.
            return df
        key_words_str = '|'.join(self.key_words)
        return df.filter(~col(self.certain_column).rlike(key_words_str))

    def pre_process(self, df):
        """Cast the configured columns to double and drop rows with no measurement.

        ``SITE_COUNT`` is cast like the other columns but is not considered when
        deciding whether a row is empty.
        """
        for column in self.convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        # Build the subset locally instead of removing 'SITE_COUNT' from
        # self.convert_to_numeric_list: that list belongs to the caller, and
        # mutating it would make a second run() skip the SITE_COUNT cast.
        value_columns = [name for name in self.convert_to_numeric_list if name != 'SITE_COUNT']
        if value_columns:
            df = df.dropna(subset=value_columns, how='all')
        # With no measurement columns there is nothing to judge emptiness on,
        # so the rows are left untouched.
        return df

    def run(self):
        """Run select -> exclude -> pre_process on ``self.df`` and return the result."""
        df_select = self.select_columns(df=self.df)
        df_esd = self.exclude_some_data(df=df_select)
        df_pp = self.pre_process(df=df_esd)
        return df_pp

In [4]:
# 1. Read the data and convert it to a Spark DataFrame.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
INLINE_CSV_PATH = "D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/inline_algorithm/inline_case5_label.csv"
df_pandas = pd.read_csv(INLINE_CSV_PATH)
df_spark = ps.from_pandas(df_pandas).to_spark()
num_rows = df_spark.count()
num_columns = len(df_spark.columns)
print(f"df_spark shape: ({num_rows}, {num_columns})")

# 2. Preprocess the data.
# The site value columns are derived once so that the selected columns and the
# columns cast to numeric cannot drift apart.
SITE_VALUE_COLUMNS = [f'SITE{site_no}_VAL' for site_no in range(1, 18)]
NUMERIC_COLUMNS = ['SITE_COUNT', 'AVERAGE'] + SITE_VALUE_COLUMNS
dp = DataPreprocessorForInline(df=df_spark,
                               columns_list=['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID'] + NUMERIC_COLUMNS,
                               certain_column='INLINE_PARAMETER_ID',
                               key_words=['CXS', 'CYS', 'FDS'],
                               # A copy is passed so the shared constant cannot be altered by the preprocessor.
                               convert_to_numeric_list=list(NUMERIC_COLUMNS))
df_pp_ = dp.run()
num_rows = df_pp_.count()
num_columns = len(df_pp_.columns)
print(f"df_pp_ shape: ({num_rows}, {num_columns})")

df_spark shape: (32278, 144)
df_pp_ shape: (31791, 22)


In [5]:
# Collect the preprocessed Spark DataFrame into a pandas DataFrame for the pandas-based steps below.
df_pandas_select = df_pp_.toPandas()

In [123]:
# Distinct OPE_NO values, in order of first appearance; the per-operation loops below iterate over this array.
unique_oper = df_pandas_select['OPE_NO'].unique()
unique_oper

array(['1F.FQE10', '1C.CDG10', '1U.CDG10', '1U.CDG20', '1U.EQW10',
       '1U.PQW10', '1U.PQX10', '1U.ECU10', '1V.ECU10', '1V.PQA10',
       '1V.PQX10', '1V.PQX20', '2U.CDG10', '2U.CDG20', '2U.EQW10',
       '2U.PQA10', '2U.PQX10', '2V.ECU10', '2V.PQA10', '2V.PQW10',
       '2V.PQX10', '2V.PQX20', '3U.CDG10', '3U.CDG20', '3U.PQA10',
       '3U.PQW10', '3U.PQX10', '6V.CDG10', '6V.CDG20', '6V.PQA10',
       '6V.PQW10', '6V.PQX10', '7U.ECU10', '7U.EQA10', '7U.EQA20',
       '7U.PQA10', '7U.PQA20', '7U.PQW10', '7U.PQX10', 'PV.CDG10',
       'PV.EQA10', 'PV.PQA10', 'PV.PQX10', 'TM.EQA10', 'TM.PQA10',
       'TM.PQW10', 'TM.PQX10', 'TV.CDG10', 'TV.EQA20', 'TV.PQA10',
       'TV.PQW10', 'TV.PQX10', '1V.EQW10', '1V.PQW10', '2U.CQC50',
       '2U.PQW10', '2V.EQW10', '3U.CQC10', '3U.EQW10', '5V.PQA10',
       '6U.CDG10', '6U.CDG20', '6U.PQA10', '7U.CDG10', '7U.EQW10',
       'PV.PQW10', 'XX.PQW03', 'XX.PQX01', 'XX.PQX02', '1C.PQA10',
       '1V.EQW20', '6V.EQA10', 'XX.CCX01', 'XX.CCZ01', '1U.EQW

In [128]:
# Count the distinct wafers seen at each operation (OPE_NO).
# The one-row frames are collected in a list and concatenated once: calling
# pd.concat inside the loop re-copies every earlier row on each iteration.
wafer_count_frames = []
for oper in unique_oper:
    # A boolean mask avoids interpolating `oper` into a query string, which
    # would break if the value contained a quote character.
    df_oper = df_pandas_select[df_pandas_select['OPE_NO'] == oper]
    wafer_count_frames.append(pd.DataFrame({'OPE_NO': oper,
                                            'WAFER_ID_NUNIQUE': df_oper['WAFER_ID'].nunique()}, index=[0]))
# pd.concat raises on an empty list, so fall back to an empty frame as before.
result_df = pd.concat(wafer_count_frames, axis=0) if wafer_count_frames else pd.DataFrame()

In [132]:
# Show the 20 operations with the fewest distinct wafers.
result_df.sort_values('WAFER_ID_NUNIQUE', ascending=False).tail(20)

Unnamed: 0,OPE_NO,WAFER_ID_NUNIQUE
0,6V.CQC40,4
0,XX.PQX01,4
0,3U.CQC50,4
0,XX.CCX01,3
0,1V.EQC10,2
0,XX.IQW01,2
0,1U.EQC10,2
0,1F.FQE10,2
0,1C.EQW10,2
0,XX.CCY01,2


### 根据site来处理数据

In [7]:
# def calculate_statistics(row, direction:str):
#     return pd.Series({
#         f'{direction}_MAX_VAL': row.max(),
#         f'{direction}_MIN_VAL': row.min(),
#         f'{direction}_MEDIAN': row.median(),
#         f'{direction}_AVERAGE': row.mean(),
#         f'{direction}_STD_DEV': row.std(),
#         f'{direction}_PERCENTILE_25': row.quantile(0.25),
#         f'{direction}_PERCENTILE_75': row.quantile(0.75)})


# def calculate_site_stats(df, site_columns, which_side):
#     selected_df = df[['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID'] + site_columns].reset_index(drop=True)
#     # 对每一行进行统计计算
#     side_features = selected_df.apply(lambda row: calculate_statistics(row[site_columns], which_side), axis=1)
#     print("side_features:", side_features.isna().any().any())
#     side_features = side_features.fillna(0)
#     df_with_features = pd.concat([selected_df, side_features], axis=1)
#     print("df_with_features:", df_with_features.isna().any().any())
#     return df_with_features

def process_missing_values_for_site(df, good_site_columns, bad_site_columns, missing_value_threshold=0.6, process_miss_site_mode='drop'):
    """Handle missing site readings row by row.

    :param df: pandas DataFrame holding the site columns (and ``AVERAGE`` for 'fill').
    :param good_site_columns: list of site columns treated as the 'good' group.
    :param bad_site_columns: list of site columns treated as the 'bad' group.
    :param missing_value_threshold: largest fraction of missing site values a
        row may have and still be kept (used in 'drop' mode only).
    :param process_miss_site_mode: 'drop' removes rows with too many missing
        site values; 'fill' replaces each missing site value with that row's
        ``AVERAGE``.
    :return: a new DataFrame; the input frame is never modified.
    """
    assert process_miss_site_mode in ['drop', 'fill']
    site_columns = good_site_columns + bad_site_columns
    if process_miss_site_mode == 'drop':
        # DataFrame.dropna(thresh=...) expects an absolute count of non-NA
        # values, so passing the fractional threshold there only removed rows
        # that were entirely empty. Compare the per-row missing fraction
        # against the threshold instead.
        missing_fraction = df[site_columns].isna().mean(axis=1)
        return df[missing_fraction <= missing_value_threshold]
    # 'fill': work on a copy so a caller's frame (often a filtered slice) is
    # not written to, and fill each site column from the row's own AVERAGE.
    filled = df.copy()
    for site_column in site_columns:
        filled[site_column] = filled[site_column].fillna(filled['AVERAGE'])
    return filled

def calculate_statistics(row):
    """Summarise one Series of site readings.

    :param row: pandas Series of values.
    :return: Series indexed by MAX_VAL, MIN_VAL, MEDIAN, AVERAGE, STD_DEV,
        PERCENTILE_25 and PERCENTILE_75.
    """
    lower_quartile, upper_quartile = (row.quantile(q) for q in (0.25, 0.75))
    stat_names = ['MAX_VAL', 'MIN_VAL', 'MEDIAN', 'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']
    stat_values = [row.max(), row.min(), row.median(), row.mean(), row.std(), lower_quartile, upper_quartile]
    return pd.Series(stat_values, index=stat_names)

def calculate_site_stats(df, site_columns, good_or_bad):
    """Attach per-row summary statistics of ``site_columns`` plus a numeric label.

    :param df: pandas DataFrame with WAFER_ID, OPE_NO, INLINE_PARAMETER_ID and the site columns.
    :param site_columns: list of site columns summarised for every row.
    :param good_or_bad: 'good' labels every row 0, 'bad' labels every row 1.
    :return: DataFrame of the identifier and site columns, the statistics
        produced by ``calculate_statistics`` (missing statistics set to 0) and ``label``.
    """
    assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
    identifier_columns = ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID']
    # The index is reset so the row-wise statistics line up positionally in the concat below.
    selected_df = df[identifier_columns + site_columns].reset_index(drop=True)
    # Compute the statistics for each row; undefined statistics become 0.
    side_features = selected_df.apply(
        lambda record: calculate_statistics(record[site_columns]), axis=1
    ).fillna(0)
    df_with_features = pd.concat([selected_df, side_features], axis=1)
    df_with_features['label'] = 0 if good_or_bad == 'good' else 1
    return df_with_features

In [64]:
def extract_features_by_site(df, oper, good_site_columns, bad_site_columns, missing_value_threshold=0.6, process_miss_site_mode='drop'):
    """Build the labelled statistics table for one operation.

    :param df: pandas DataFrame with OPE_NO, identifier and site columns.
    :param oper: OPE_NO value to analyse.
    :param good_site_columns: site columns whose statistics get label 0.
    :param bad_site_columns: site columns whose statistics get label 1.
    :param missing_value_threshold: forwarded to ``process_missing_values_for_site``.
    :param process_miss_site_mode: 'drop' or 'fill', forwarded likewise.
    :return: DataFrame with one 'good' and one 'bad' row per surviving input
        row, or ``None`` when no row survives the missing-value handling.
    """
    # A boolean mask is used instead of an f-string query so that an `oper`
    # containing a quote character cannot break the expression.
    df_pandas_specific_oper = df[df['OPE_NO'] == oper]
    df_pandas_specific_oper = process_missing_values_for_site(df=df_pandas_specific_oper,
                                                              good_site_columns=good_site_columns,
                                                              bad_site_columns=bad_site_columns,
                                                              missing_value_threshold=missing_value_threshold,
                                                              process_miss_site_mode=process_miss_site_mode)
    if df_pandas_specific_oper.shape[0] == 0:
        # Callers test for None to detect "no usable rows for this operation".
        return None

    output_columns = ['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                      'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']
    labelled_frames = [
        calculate_site_stats(df_pandas_specific_oper, site_columns, good_or_bad=tag)[output_columns]
        for site_columns, tag in ((good_site_columns, 'good'), (bad_site_columns, 'bad'))
    ]
    return pd.concat(labelled_frames, axis=0)

In [161]:
def process_missing_values(df, columns_to_process, missing_value_threshold):
    """Drop sparse columns and mean-fill the remaining ones.

    :param df: pandas DataFrame to clean.
    :param columns_to_process: columns to inspect; each must exist in ``df``
        (a missing name raises ``KeyError``).
    :param missing_value_threshold: a column whose missing fraction exceeds
        this value is dropped; otherwise its gaps are filled with its mean.
    :return: a new DataFrame; ``df`` itself is left unchanged.
    """
    # Work on a copy: assigning into `df` directly altered the caller's frame,
    # and only up to the first dropped column.
    cleaned = df.copy()
    for column in list(columns_to_process):
        missing_percentage = cleaned[column].isnull().mean()
        if missing_percentage > missing_value_threshold:
            cleaned = cleaned.drop(columns=[column])
        else:
            cleaned[column] = cleaned[column].fillna(cleaned[column].mean())
    return cleaned


def get_pivot_table(df, columns_to_process, missing_value_threshold):
    """Pivot labelled statistics into one row per (WAFER_ID, label).

    Every statistic/operation/parameter combination becomes a column named
    ``STAT#OPE_NO#INLINE_PARAMETER_ID``. Sparse columns are dropped or
    mean-filled via ``process_missing_values`` both before and after the pivot,
    and columns that carry a single value (or none) are removed.

    :param df: DataFrame with WAFER_ID, OPE_NO, INLINE_PARAMETER_ID, label and statistic columns.
    :param columns_to_process: statistic columns cleaned before pivoting.
    :param missing_value_threshold: forwarded to ``process_missing_values``.
    :return: DataFrame with WAFER_ID, label and the surviving feature columns.
    """
    df_specific = process_missing_values(df, columns_to_process, missing_value_threshold)
    index_list = ['WAFER_ID', 'label']
    # Pivot the cleaned frame (not the raw input) so that dropped columns stay
    # dropped and filled values are actually used.
    values_list = df_specific.columns.difference(['WAFER_ID', 'OPE_NO', 'INLINE_PARAMETER_ID', 'label']).tolist()
    if not values_list:
        # Nothing left to pivot: return only the distinct index combinations.
        return df_specific[index_list].drop_duplicates().sort_values(index_list).reset_index(drop=True)
    pivot_result = df_specific.pivot_table(index=index_list,
                                           columns=['OPE_NO', 'INLINE_PARAMETER_ID'],
                                           values=values_list)
    pivot_result.columns = pivot_result.columns.map('#'.join)
    pivot_result = process_missing_values(pivot_result, pivot_result.columns, missing_value_threshold)
    pivot_result = pivot_result.reset_index(drop=False)
    # Remove columns that cannot discriminate between rows (one distinct value or none).
    uninformative = [column for column in pivot_result.columns.difference(index_list)
                     if pivot_result[column].nunique() <= 1]
    return pivot_result.drop(columns=uninformative)


def fit_pca_model(df, oper, 
                  good_site_columns, bad_site_columns, process_miss_site_mode,
                  columns_to_process, missing_value_threshold):
    """Rank the pivoted features of one operation by absolute PCA loading.

    :param df: pandas DataFrame with the preprocessed inline measurements.
    :param oper: OPE_NO value to analyse.
    :param good_site_columns: site columns labelled 0.
    :param bad_site_columns: site columns labelled 1.
    :param process_miss_site_mode: 'drop' or 'fill', forwarded to ``extract_features_by_site``.
    :param columns_to_process: statistic columns cleaned by ``get_pivot_table``.
    :param missing_value_threshold: used both for the row-level site handling
        and for the column-level cleaning in ``get_pivot_table``.
    :return: DataFrame with ``features`` and ``importance`` columns. It is
        empty when the operation has no usable rows, and a single
        ``STATS#OPE#PARAM`` row with importance -100 when there is too little
        data to fit a PCA.
    """
    side_with_features_all = extract_features_by_site(df=df, oper=oper, good_site_columns=good_site_columns, 
                                                      bad_site_columns=bad_site_columns, 
                                                      missing_value_threshold=missing_value_threshold, 
                                                      process_miss_site_mode=process_miss_site_mode)
    print("side_with_features_all的类型是：", type(side_with_features_all))
    if side_with_features_all is None:
        print("No features available for PCA.")
        return pd.DataFrame(columns=["features", "importance"])
    
    pivot_result = get_pivot_table(df=side_with_features_all, columns_to_process=columns_to_process, missing_value_threshold=missing_value_threshold)
    x_train = pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]
    print("x_train：", x_train.shape)
    n_components = min(min(x_train.shape) - 1, 10)
    # Besides needing more than one feature, at least one component must be
    # requestable; a single-row matrix would otherwise ask for 0 components.
    if x_train.shape[1] > 1 and n_components >= 1:
        model = pca(n_components=n_components, verbose=None)
        results = model.fit_transform(x_train)
        # Expects results['topfeat'] to carry 'feature', 'loading' and 'type' columns.
        res_top = results['topfeat']
        # Built as one chain from a .loc selection so no column is assigned on
        # a filtered slice (which triggers SettingWithCopyWarning).
        res_top_select = (res_top.loc[res_top['type'] == 'best', ['feature', 'loading']]
                          .assign(importance=lambda top: top['loading'].abs())
                          .rename(columns={'feature': 'features'})
                          .drop(columns='loading')
                          .drop_duplicates())
        return res_top_select
    # Placeholder row marking "not enough data for PCA".
    return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": -100}, index=[0])

In [None]:
# |7U.ECU10|               FGS1|                   
# |7U.ECU10|               TGS1|                    
# |7U.ECU10|               TGS2|           

----PCA

In [156]:
# Parameters for inspecting a single operation step by step.
oper = '7U.ECU10'
# Site groups handed to extract_features_by_site as the 'good' and 'bad' columns.
good_site_columns = [f'SITE{site_no}_VAL' for site_no in (4, 8, 9, 12, 13)]
bad_site_columns = [f'SITE{site_no}_VAL' for site_no in (2, 6, 7, 10, 11)]
process_miss_site_mode = 'drop'
columns_to_process = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL',
                      'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']
missing_value_threshold = 0.6

In [157]:
# Step-by-step check: rows of the selected operation after the site-level missing-value handling.
df_pandas_specific_oper = df_pandas_select.query(f"OPE_NO == '{oper}'") 
df_pandas_specific_oper = process_missing_values_for_site(df=df_pandas_specific_oper, 
                                                          good_site_columns=good_site_columns, 
                                                          bad_site_columns=bad_site_columns, 
                                                          missing_value_threshold=missing_value_threshold, 
                                                          process_miss_site_mode=process_miss_site_mode)
df_pandas_specific_oper

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,SITE_COUNT,AVERAGE,SITE1_VAL,SITE2_VAL,SITE3_VAL,SITE4_VAL,SITE5_VAL,...,SITE8_VAL,SITE9_VAL,SITE10_VAL,SITE11_VAL,SITE12_VAL,SITE13_VAL,SITE14_VAL,SITE15_VAL,SITE16_VAL,SITE17_VAL
1275,NAZ703-08,7U.ECU10,FGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1276,NAZ703-08,7U.ECU10,TGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1277,NAZ703-08,7U.ECU10,TGS2,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5260,NAZ703-01,7U.ECU10,FGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5261,NAZ703-01,7U.ECU10,TGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5262,NAZ703-01,7U.ECU10,TGS2,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6936,NAZ439-07,7U.ECU10,FGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6937,NAZ439-07,7U.ECU10,TGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6938,NAZ439-07,7U.ECU10,TGS2,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8732,NAZ439-03,7U.ECU10,FGS1,17.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
# Step-by-step check: labelled per-row statistics for the selected operation.
side_with_features_all = extract_features_by_site(df=df_pandas_select, oper=oper, good_site_columns=good_site_columns, 
                                                  bad_site_columns=bad_site_columns, missing_value_threshold=missing_value_threshold, 
                                                  process_miss_site_mode=process_miss_site_mode)
side_with_features_all

Unnamed: 0,WAFER_ID,OPE_NO,INLINE_PARAMETER_ID,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,NAZ703-08,7U.ECU10,FGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,NAZ703-08,7U.ECU10,TGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,NAZ703-08,7U.ECU10,TGS2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,NAZ703-01,7U.ECU10,FGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,NAZ703-01,7U.ECU10,TGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
31,NAZ415-13,7U.ECU10,TGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
32,NAZ415-13,7U.ECU10,TGS2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
33,NBX082-15,7U.ECU10,FGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
34,NBX082-15,7U.ECU10,TGS1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [163]:
# Step-by-step check: pivoted feature table, one row per (WAFER_ID, label).
pivot_result = get_pivot_table(df=side_with_features_all, columns_to_process=columns_to_process, missing_value_threshold=missing_value_threshold)
pivot_result

Unnamed: 0,WAFER_ID,label
0,NAZ415-08,0
1,NAZ415-08,1
2,NAZ415-13,0
3,NAZ415-13,1
4,NAZ439-03,0
5,NAZ439-03,1
6,NAZ439-06,0
7,NAZ439-06,1
8,NAZ439-07,0
9,NAZ439-07,1


In [165]:
# pivot_result[pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()]

In [166]:
# Run the PCA ranking end to end for the selected operation.
fit_pca_model(df=df_pandas_select, oper=oper, 
                  good_site_columns=good_site_columns, bad_site_columns=bad_site_columns,  process_miss_site_mode=process_miss_site_mode,
                  columns_to_process=columns_to_process, missing_value_threshold=missing_value_threshold)

side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (24, 0)


Unnamed: 0,features,importance
0,STATS#OPE#PARAM,-100


In [81]:
# Parameters shared by every operation in the PCA sweep.
good_site_columns = ['SITE4_VAL', 'SITE8_VAL', 'SITE9_VAL', 'SITE12_VAL', 'SITE13_VAL']
bad_site_columns = ['SITE2_VAL', 'SITE6_VAL', 'SITE7_VAL', 'SITE10_VAL', 'SITE11_VAL']
process_miss_site_mode = 'drop'
columns_to_process = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']
missing_value_threshold = 0.6


# Fit the PCA ranking for every operation. The per-operation tables are
# collected in a list and concatenated once; calling pd.concat inside the loop
# re-copies all earlier rows on every iteration.
pca_results_per_oper = []
for oper in unique_oper:
    print(f"*************当前站点是{oper}*************")

    res_top_select = fit_pca_model(df=df_pandas_select,
                                   oper=oper,
                                   good_site_columns=good_site_columns,
                                   bad_site_columns=bad_site_columns,
                                   process_miss_site_mode=process_miss_site_mode,
                                   columns_to_process=columns_to_process,
                                   missing_value_threshold=missing_value_threshold)
    pca_results_per_oper.append(res_top_select)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
res_top_select_all = pd.concat(pca_results_per_oper, axis=0) if pca_results_per_oper else pd.DataFrame()

*************当前站点是1F.FQE10*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (4, 7)
*************当前站点是1C.CDG10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1U.CDG10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1U.CDG20*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1U.EQW10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1U.PQW10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1U.PQX10*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (14, 210)
*************当前站点是1U.ECU10*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (14, 35)
*************当前站点是1V.ECU10*************

side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (18, 210)
*************当前站点是1C.PQA10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是1V.EQW20*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是6V.EQA10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是XX.CCX01*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (6, 21)
*************当前站点是XX.CCZ01*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (2, 21)
*************当前站点是1U.EQW20*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是6V.EQW10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for PCA.
*************当前站点是TM.EQW10*************
side_with_features_all的类型是： <class 'Non

In [92]:
# Show the 9 rows with the smallest importance values.
res_top_select_all.sort_values('importance').head(9)

Unnamed: 0,features,importance
4,PERCENTILE_75#PV.PQX10#MY04,0.199323
1,MEDIAN#6V.PQX10#FY01,0.201347
1,MIN_VAL#TV.PQX10#FY01,0.201777
1,MIN_VAL#7U.PQX10#FY01,0.201789
1,MIN_VAL#TM.PQX10#FY01,0.201818
1,MEDIAN#PV.PQX10#FY01,0.210319
1,MAX_VAL#2V.PQX20#MY04,0.211106
1,MIN_VAL#1V.PQX30#MX09,0.214443
0,MAX_VAL#TM.PQX10#FY01,0.217876


----RF

In [147]:
def fit_rf_model(df, oper, 
                  good_site_columns, 
                  bad_site_columns, 
                  process_miss_site_mode,
                  columns_to_process, 
                  missing_value_threshold):
    """Rank the pivoted features of one operation with a random-forest classifier.

    The classifier separates 'good' (label 0) from 'bad' (label 1) rows; a grid
    search scored by ROC AUC picks the hyper-parameters.

    :return: DataFrame with ``features`` and ``importance`` columns. When no
        model is reported it holds a single ``STATS#OPE#PARAM`` row whose
        importance encodes the reason: -100 no usable rows, -102 too little
        data or a single class, -101 best cross-validated ROC AUC below 0.6.
    """
    def _placeholder(code):
        # One-row table standing in for "no feature importances available".
        return pd.DataFrame({"features": "STATS#OPE#PARAM", "importance": code}, index=[0])

    side_with_features_all = extract_features_by_site(df=df, oper=oper, good_site_columns=good_site_columns,
                                                      bad_site_columns=bad_site_columns,
                                                      missing_value_threshold=missing_value_threshold,
                                                      process_miss_site_mode=process_miss_site_mode)
    print("side_with_features_all的类型是：", type(side_with_features_all))
    if side_with_features_all is None:
        print("No features available for RF.")
        return _placeholder(-100)

    pivot_result = get_pivot_table(df=side_with_features_all, columns_to_process=columns_to_process, missing_value_threshold=missing_value_threshold)
    feature_names = pivot_result.columns.difference(['WAFER_ID', 'label']).tolist()
    x_train = pivot_result[feature_names]
    y_train = pivot_result[['label']]
    print("x_train：", x_train.shape)
    print("y_train：", y_train.value_counts())

    # Both dimensions must exceed 4 and both labels must be present.
    trainable = min(x_train.shape) > 4 and y_train['label'].nunique() > 1
    if not trainable:
        print("x_train.min less than 4")
        return _placeholder(-102)

    pipe = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
        ('scaler', StandardScaler()),
        ('model', RandomForestClassifier(random_state=2024))])
    param_grid = {'model__n_estimators': list(range(10, 60, 10)),
                  'model__max_depth': list(range(5, 50, 10)),
                  'model__min_samples_split': [2, 5],
                  'model__min_samples_leaf': [1, 3]}
    grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
    grid.fit(x_train.values, y_train.values.ravel())

    if grid.best_score_ >= 0.6:
        # The forest is the pipeline step registered under the name 'model'.
        forest = grid.best_estimator_.named_steps['model']
        return pd.DataFrame({'features': x_train.columns,
                             'importance': forest.feature_importances_})

    print("roc_auc low")
    return _placeholder(-101)

In [148]:
# Parameters shared by every operation in the random-forest sweep.
good_site_columns = ['SITE4_VAL', 'SITE8_VAL', 'SITE9_VAL', 'SITE12_VAL', 'SITE13_VAL']
bad_site_columns = ['SITE2_VAL', 'SITE6_VAL', 'SITE7_VAL', 'SITE10_VAL', 'SITE11_VAL']
process_miss_site_mode = 'drop'
columns_to_process = ['AVERAGE', 'MAX_VAL', 'MEDIAN', 'MIN_VAL', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']
missing_value_threshold = 0.6


# Fit the random-forest ranking for every operation. The per-operation tables
# are collected in a list and concatenated once; calling pd.concat inside the
# loop re-copies all earlier rows on every iteration.
rf_results_per_oper = []
for oper in unique_oper:
    print(f"*************当前站点是{oper}*************")

    res_top_select = fit_rf_model(df=df_pandas_select,
                                  oper=oper,
                                  good_site_columns=good_site_columns,
                                  bad_site_columns=bad_site_columns,
                                  process_miss_site_mode=process_miss_site_mode,
                                  columns_to_process=columns_to_process,
                                  missing_value_threshold=missing_value_threshold)
    rf_results_per_oper.append(res_top_select)
# pd.concat raises on an empty list, so fall back to an empty frame as before.
res_top_select_all = pd.concat(rf_results_per_oper, axis=0) if rf_results_per_oper else pd.DataFrame()

*************当前站点是1F.FQE10*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (4, 7)
y_train： label
0        2
1        2
dtype: int64
x_train.min less than 4
*************当前站点是1C.CDG10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1U.CDG10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1U.CDG20*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1U.EQW10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1U.PQW10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1U.PQX10*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (14, 210)
y_train： label
0        7
1        7
dtype: int64
*************当前站点是1U.ECU10*************
side

side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (8, 210)
y_train： label
0        4
1        4
dtype: int64
*************当前站点是XX.PQX02*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (18, 210)
y_train： label
0        9
1        9
dtype: int64
*************当前站点是1C.PQA10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是1V.EQW20*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是6V.EQA10*************
side_with_features_all的类型是： <class 'NoneType'>
No features available for RF.
*************当前站点是XX.CCX01*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (6, 21)
y_train： label
0        3
1        3
dtype: int64
*************当前站点是XX.CCZ01*************
side_with_features_all的类型是： <class 'pandas.core.frame.DataFrame'>
x_train： (2, 21)
y_train： label
0        1
1        1
dtype: 

In [154]:
# Count the negative placeholder importances (-100 / -101 / -102 are the codes returned by fit_rf_model when no model is reported).
res_top_select_all.query("importance < 0")['importance'].value_counts()

-100.0    61
-102.0     7
-101.0     6
Name: importance, dtype: int64

In [155]:
# Keep only rows with positive importance, sorted ascending.
res_top_select_all.query("importance > 0").sort_values('importance').reset_index(drop=True)

Unnamed: 0,features,importance
0,PERCENTILE_75#1U.ECU10#T2S1,0.000085
1,MEDIAN#1U.ECU10#TGS1,0.000548
2,MIN_VAL#XX.PQX02#MX05,0.001118
3,AVERAGE#1U.ECU10#TGS1,0.001326
4,MEDIAN#XX.PQX02#QX04,0.001575
...,...,...
467,MAX_VAL#3U.CQC10#TDS1,0.233333
468,MAX_VAL#6V.CQC40#TDS1,0.236667
469,PERCENTILE_75#1U.CQC10#TDS1,0.238679
470,STD_DEV#3U.CQC50#TDS2,0.300000


In [140]:
# oper = '1C.CDG10'
# oper = '1U.CDG10'
# oper = '1U.EQW10'

# dfg = df_pandas_select.query(f"OPE_NO == '{oper}'")[good_site_columns+bad_site_columns]
# print(dfg.shape)
# dfg.isna().sum()

### 结果整理

In [None]:
def split_features(df, index) -> pd.Series:
    """Return one '#'-separated part of every entry in ``df['features']``.

    :param df: feature-importance table with a ``features`` column whose
        entries look like ``STAT#OPE_NO#INLINE_PARAMETER_ID``.
    :param index: position of the part to extract (0, 1 or 2 for such names).
    :return: Series aligned with ``df`` holding the selected part.
    :raises IndexError: if an entry has fewer than ``index + 1`` parts.
    """
    return df['features'].map(lambda feature_name: feature_name.split('#')[index])


def get_split_feature_importance_table(df):
    """Replace the combined ``features`` column by OPER_NO and INLINE_PARAMETER_ID.

    :param df: feature-importance table with a ``features`` column.
    :return: a new DataFrame (fresh 0..n-1 index) without ``features`` and with
        ``OPER_NO`` / ``INLINE_PARAMETER_ID`` added. The input frame is not
        modified, so the cell calling this can be re-run safely.
    """
    # assign() works on a copy; writing the columns into `df` directly would
    # alter the caller's table. The statistic part (index 0) is not kept in
    # the result, so it is no longer computed.
    split_table = df.assign(OPER_NO=split_features(df, 1),
                            INLINE_PARAMETER_ID=split_features(df, 2))
    return (split_table
            .drop(columns=['features', 'STATISTIC_RESULT'], errors='ignore')
            .reset_index(drop=True))

In [None]:
# Split the feature names into operation / parameter columns, then total the importance per (OPER_NO, INLINE_PARAMETER_ID) pair.
# NOTE(review): res_top_select_all still contains the "STATS#OPE#PARAM" placeholder rows with negative importance returned by fit_rf_model;
# they appear to land in an ('OPE', 'PARAM') group here — confirm whether they should be filtered out first.
split_table = get_split_feature_importance_table(res_top_select_all)
print(split_table.shape)
split_table_grpby = split_table.groupby(['OPER_NO', 'INLINE_PARAMETER_ID'])['importance'].sum().reset_index(drop=False)

In [None]:
# Show the 30 (OPER_NO, INLINE_PARAMETER_ID) pairs with the largest summed importance.
split_table_grpby.sort_values('importance', ascending=False).reset_index(drop=True).head(30)