In [3]:
import pyspark
import pandas as pd
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [4]:
class ExtractFeaturesBySite:
    @staticmethod
    def process_missing_values_for_site(df: pd.DataFrame,
                                        good_site_columns: list[str],
                                        bad_site_columns: list[str],
                                        missing_value_threshold: Union[int, float] = 0.7,
                                        process_miss_site_mode: str = 'drop') -> pd.DataFrame:
        assert process_miss_site_mode in ['drop', 'fill']
        site_columns = good_site_columns + bad_site_columns
        if process_miss_site_mode == 'drop':
            # drop rows based on the missing value threshold
            df = df.dropna(subset=site_columns, thresh=missing_value_threshold)
        else:
            # fill missing values in the corresponding site rows using the AVERAGE of that row
            df[site_columns] = df[site_columns].apply(lambda column: column.fillna(df['AVERAGE']))
        return df

    @staticmethod
    def calculate_statistics(row):
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std(),
            'PERCENTILE_25': row.quantile(0.25),
            'PERCENTILE_75': row.quantile(0.75)})

    @staticmethod
    def calculate_site_stats(df: pd.DataFrame, grpby_list: list[str], site_columns: list[str],
                             good_or_bad: str) -> pd.DataFrame:
        assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
        selected_df = df[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME'] + site_columns].reset_index(drop=True)
        # Perform statistical calculations for each row
        side_features = selected_df.apply(lambda row: ExtractFeaturesBySite.calculate_statistics(row[site_columns]),
                                          axis=1)
        side_features = side_features.fillna(0)
        df_with_features = pd.concat([selected_df, side_features], axis=1)
        if good_or_bad == 'good':
            df_with_features['label'] = 0
        else:
            df_with_features['label'] = 1
        return df_with_features

    @staticmethod
    def extract_features_by_site(df: pd.DataFrame,
                                 grpby_list: list[str],
                                 good_site_columns: list[str],
                                 bad_site_columns: list[str],
                                 missing_value_threshold: Union[int, float] = 0.7,
                                 process_miss_site_mode: str = 'drop') -> Union[pd.DataFrame, None]:
        """
        Extracts features from a DataFrame based on good and bad site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - grp_list: ['OPE_NO'] for most the case.
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Threshold for missing values.
        - process_miss_site_mode (str): Mode for handling missing values in site columns, e.g. drop or fill
        Returns:
        - Union[pd.DataFrame, None]: DataFrame with extracted features or None if no data is available.
        """
        df_pandas_specific_ = ExtractFeaturesBySite.process_missing_values_for_site(df=df,
                                                                                    good_site_columns=good_site_columns,
                                                                                    bad_site_columns=bad_site_columns,
                                                                                    missing_value_threshold=missing_value_threshold,
                                                                                    process_miss_site_mode=process_miss_site_mode)
        if df_pandas_specific_.shape[0] != 0:
            side_with_features1 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list,
                                                                             good_site_columns,
                                                                             good_or_bad='good')
            side_with_features2 = ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list,
                                                                             bad_site_columns,
                                                                             good_or_bad='bad')
            side_with_features1_select = side_with_features1[
                grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features2_select = side_with_features2[
                grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN',
                              'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75', 'label']]
            side_with_features_all = pd.concat([side_with_features1_select, side_with_features2_select], axis=0)
            return side_with_features_all

In [6]:
# 示例数据
df = pd.DataFrame({
    'OPE_NO': [1, 2, 3, 4, 5],
    'WAFER_ID': [101, 102, 103, 104, 105],
    'PARAMETRIC_NAME': ['A', 'B', 'C', 'D', 'E'],
    'SITE1_VAL': [10, 20, 30, 40, 50],
    'SITE3_VAL': [15, 25, 35, 45, 55],
    'SITE5_VAL': [20, 30, 40, 50, 60],
    'SITE7_VAL': [25, 35, 45, 55, 65],
    'SITE8_VAL': [30, 40, 50, 60, 70],
    'SITE2_VAL': [35, 45, 55, 65, 75],
    'SITE4_VAL': [40, 50, 60, 70, 80],
    'SITE9_VAL': [45, 55, 65, 75, 85]
})
df 

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,SITE1_VAL,SITE3_VAL,SITE5_VAL,SITE7_VAL,SITE8_VAL,SITE2_VAL,SITE4_VAL,SITE9_VAL
0,1,101,A,10,15,20,25,30,35,40,45
1,2,102,B,20,25,30,35,40,45,50,55
2,3,103,C,30,35,40,45,50,55,60,65
3,4,104,D,40,45,50,55,60,65,70,75
4,5,105,E,50,55,60,65,70,75,80,85


In [10]:
## BySite分析处理后的数据

ExtractFeaturesBySite.extract_features_by_site(df=df,
                         grpby_list=['OPE_NO'],
                         good_site_columns=['SITE1_VAL', 'SITE3_VAL', 'SITE5_VAL', 'SITE7_VAL'],
                         bad_site_columns=['SITE8_VAL', 'SITE2_VAL', 'SITE4_VAL', 'SITE9_VAL'])

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,1,101,A,25.0,10.0,17.5,17.5,6.454972,13.75,21.25,0
1,2,102,B,35.0,20.0,27.5,27.5,6.454972,23.75,31.25,0
2,3,103,C,45.0,30.0,37.5,37.5,6.454972,33.75,41.25,0
3,4,104,D,55.0,40.0,47.5,47.5,6.454972,43.75,51.25,0
4,5,105,E,65.0,50.0,57.5,57.5,6.454972,53.75,61.25,0
0,1,101,A,45.0,30.0,37.5,37.5,6.454972,33.75,41.25,1
1,2,102,B,55.0,40.0,47.5,47.5,6.454972,43.75,51.25,1
2,3,103,C,65.0,50.0,57.5,57.5,6.454972,53.75,61.25,1
3,4,104,D,75.0,60.0,67.5,67.5,6.454972,63.75,71.25,1
4,5,105,E,85.0,70.0,77.5,77.5,6.454972,73.75,81.25,1


In [None]:
## ByZone分析

def extract_features_from_inline_info(df, inline_info, grpby_list):
    all_features = []
    
    for label, zones in inline_info.items():
        for zone, site_columns in zones.items():
            # 提取特征并标记
            zone_features = ExtractFeaturesBySite.extract_features_by_site(
                df=df,
                grpby_list=grpby_list,
                good_site_columns=[],  # Assuming all site columns are bad for this case
                bad_site_columns=site_columns,
                missing_value_threshold=0.7,
                process_miss_site_mode='drop'
            )
            
            if zone_features is not None:
                zone_features['label'] = 1  # 设置标签为 1
                all_features.append(zone_features)
    
    if all_features:
        return pd.concat(all_features, axis=0).reset_index(drop=True)
    else:
        return None

In [None]:
inline_info = {
    'label1': {
        'zone1': ['SITE1_VAL', 'SITE3_VAL'],
        'zone2': ['SITE5_VAL', 'SITE7_VAL']
    },
    'label2': {
        'zone3': ['SITE8_VAL', 'SITE2_VAL'],
        'zone4': ['SITE4_VAL', 'SITE9_VAL']
    }
}

In [12]:
for label, zones in inline_info.items():
    print(label)
    print(zones)

label1
{'zone1': ['SITE1_VAL', 'SITE3_VAL'], 'zone2': ['SITE5_VAL', 'SITE7_VAL']}
label2
{'zone3': ['SITE8_VAL', 'SITE2_VAL'], 'zone4': ['SITE4_VAL', 'SITE9_VAL']}


In [7]:
grpby_list = ['OPE_NO']
extracted_features = extract_features_from_inline_info(df, inline_info, grpby_list)
extracted_features

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,1,101,A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,2,102,B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,3,103,C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,4,104,D,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,5,105,E,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5,1,101,A,15.0,10.0,12.5,12.5,3.535534,11.25,13.75,1
6,2,102,B,25.0,20.0,22.5,22.5,3.535534,21.25,23.75,1
7,3,103,C,35.0,30.0,32.5,32.5,3.535534,31.25,33.75,1
8,4,104,D,45.0,40.0,42.5,42.5,3.535534,41.25,43.75,1
9,5,105,E,55.0,50.0,52.5,52.5,3.535534,51.25,53.75,1
