In [1]:
import pyspark
import pandas as pd
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, when, sum as spark_sum, monotonically_increasing_id, collect_set, explode, countDistinct

In [2]:
class ExtractFeaturesBySite:
    """Summarise 'good' and 'bad' measurement-site columns into per-row statistics.

    All methods are static. ``extract_features_by_site`` is the entry point: it
    filters or fills rows with missing site values, computes row-wise statistics
    separately over the good and the bad site columns, and stacks both results
    with a ``label`` column (0 for good sites, 1 for bad sites).
    """

    # Statistic columns produced by ``calculate_statistics``, in output order.
    _STAT_COLUMNS = ['MAX_VAL', 'MIN_VAL', 'MEDIAN', 'AVERAGE', 'STD_DEV', 'PERCENTILE_25', 'PERCENTILE_75']

    @staticmethod
    def process_missing_values_for_site(df: pd.DataFrame,
                                        good_site_columns: list[str],
                                        bad_site_columns: list[str],
                                        missing_value_threshold: Union[int, float] = 0.7,
                                        process_miss_site_mode: str = 'drop') -> pd.DataFrame:
        """
        Drop or fill rows according to the missing values in their site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame. It is never modified.
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Only used in 'drop' mode.
          A value strictly between 0 and 1 is the maximum fraction of site columns that
          may be missing in a row; rows above it are dropped. Any other value is the
          minimum number of non-missing site values a row must have (the meaning of
          ``thresh`` in ``DataFrame.dropna``).
        - process_miss_site_mode (str): 'drop' removes rows, 'fill' replaces every missing
          site value with the row's 'AVERAGE' value.
        Returns:
        - pd.DataFrame: A new DataFrame with the rows dropped or the site values filled.
        """
        assert process_miss_site_mode in ['drop', 'fill']
        site_columns = good_site_columns + bad_site_columns
        if process_miss_site_mode == 'drop':
            site_values = df[site_columns]
            if 0 < missing_value_threshold < 1:
                # A fraction handed to dropna(thresh=...) would be read as a count,
                # i.e. "at least one value present"; compare missing ratios instead.
                keep_mask = site_values.isna().mean(axis=1) <= missing_value_threshold
            else:
                keep_mask = site_values.notna().sum(axis=1) >= missing_value_threshold
            # A positional mask avoids index alignment problems with duplicate labels.
            return df.loc[keep_mask.to_numpy()]
        # Work on a copy so the caller's frame (often a slice of a larger one) is untouched.
        filled = df.copy()
        filled[site_columns] = filled[site_columns].apply(lambda column: column.fillna(filled['AVERAGE']))
        return filled

    @staticmethod
    def calculate_statistics(row):
        """Return max, min, median, mean, standard deviation and the 25th/75th percentiles of ``row``."""
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std(),
            'PERCENTILE_25': row.quantile(0.25),
            'PERCENTILE_75': row.quantile(0.75)})

    @staticmethod
    def calculate_site_stats(df: pd.DataFrame, grpby_list: list[str], site_columns: list[str],
                             good_or_bad: str) -> pd.DataFrame:
        """
        Compute row-wise statistics over ``site_columns`` and attach a class label.
        Parameters:
        - df (pd.DataFrame): Must contain grpby_list, 'WAFER_ID', 'PARAMETRIC_NAME' and site_columns.
        - grpby_list (list): Key columns carried through to the result.
        - site_columns (list): Site columns the statistics are computed over.
        - good_or_bad (str): 'good' labels every row 0, 'bad' labels every row 1.
        Returns:
        - pd.DataFrame: The selected columns (index reset), the statistic columns
          (missing statistics replaced by 0) and 'label'.
        """
        assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
        selected_df = df[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME'] + site_columns].reset_index(drop=True)
        # Apply over the site columns only, so each row holds just the site values
        # instead of an object-dtype mix of key strings and numbers.
        side_features = selected_df[site_columns].apply(ExtractFeaturesBySite.calculate_statistics, axis=1)
        side_features = side_features.fillna(0)
        df_with_features = pd.concat([selected_df, side_features], axis=1)
        df_with_features['label'] = 0 if good_or_bad == 'good' else 1
        return df_with_features

    @staticmethod
    def extract_features_by_site(df: pd.DataFrame,
                                 grpby_list: list[str],
                                 good_site_columns: list[str],
                                 bad_site_columns: list[str],
                                 missing_value_threshold: Union[int, float] = 0.7,
                                 process_miss_site_mode: str = 'drop') -> Union[pd.DataFrame, None]:
        """
        Extracts features from a DataFrame based on good and bad site columns.
        Parameters:
        - df (pd.DataFrame): The input DataFrame.
        - grpby_list (list): Key columns kept in the output, e.g. ['OPE_NO'].
        - good_site_columns (list): List of columns representing good sites.
        - bad_site_columns (list): List of columns representing bad sites.
        - missing_value_threshold (Union[int, float]): Threshold for missing values,
          see process_missing_values_for_site.
        - process_miss_site_mode (str): Mode for handling missing values in site columns, e.g. drop or fill
        Returns:
        - Union[pd.DataFrame, None]: DataFrame with extracted features or None if no data is available.
          Good-site rows (label 0) come first, followed by bad-site rows (label 1); each
          part keeps its own 0-based index, so index values repeat.
        """
        df_pandas_specific_ = ExtractFeaturesBySite.process_missing_values_for_site(
            df=df,
            good_site_columns=good_site_columns,
            bad_site_columns=bad_site_columns,
            missing_value_threshold=missing_value_threshold,
            process_miss_site_mode=process_miss_site_mode)
        if df_pandas_specific_.shape[0] == 0:
            return None
        output_columns = (grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME']
                          + ExtractFeaturesBySite._STAT_COLUMNS + ['label'])
        labelled_parts = [
            ExtractFeaturesBySite.calculate_site_stats(df_pandas_specific_, grpby_list, columns,
                                                       good_or_bad=kind)[output_columns]
            for columns, kind in ((good_site_columns, 'good'), (bad_site_columns, 'bad'))
        ]
        return pd.concat(labelled_parts, axis=0)

In [3]:
def process_missing_values(df, columns_to_process, missing_value_threshold):
    """Drop sparsely populated columns and mean-fill the remaining ones.

    For every column in ``columns_to_process``: if its fraction of missing values
    is greater than ``missing_value_threshold`` the column is dropped, otherwise
    its missing values are replaced by the column mean.

    Parameters:
    - df (pd.DataFrame): Input frame. It is not modified.
    - columns_to_process: Iterable of column labels to inspect.
    - missing_value_threshold (float): Maximum tolerated fraction of missing values per column.
    Returns:
    - pd.DataFrame: A new frame with the columns dropped or filled.
    """
    # Copy first: assigning filled columns into ``df`` itself would silently
    # change the caller's DataFrame.
    result = df.copy()
    for column in columns_to_process:
        missing_percentage = result[column].isnull().mean()
        if missing_percentage > missing_value_threshold:
            result = result.drop(columns=[column])
        else:
            result[column] = result[column].fillna(result[column].mean())
    return result


def get_pivot_table(df, grpby_list, columns_to_process, missing_value_threshold):
    """Pivot per-parameter feature rows into one wide row per (WAFER_ID, label).

    Steps:
    1. Clean ``columns_to_process`` with ``process_missing_values``.
    2. Pivot with index ['WAFER_ID', 'label'] and columns ``grpby_list + ['PARAMETRIC_NAME']``;
       every remaining column is used as a value column (pivot_table's default aggregation applies).
    3. Flatten the resulting column levels into single names joined by '#'.
    4. Clean the pivoted columns again with the same threshold.
    5. Drop feature columns that hold a single distinct value.

    Parameters:
    - df (pd.DataFrame): Must contain 'WAFER_ID', 'label', 'PARAMETRIC_NAME' and ``grpby_list``.
    - grpby_list (list): Extra key columns that become part of the pivoted column names.
    - columns_to_process: Column labels of ``df`` to clean before pivoting.
    - missing_value_threshold (float): Maximum tolerated fraction of missing values per column.
    Returns:
    - pd.DataFrame: Wide table with 'WAFER_ID' and 'label' as regular columns.
    """
    # The helper is the module-level function defined in this notebook; the
    # previous FitInlineModelBySite prefix referred to a name that does not exist here.
    df = process_missing_values(df, columns_to_process, missing_value_threshold)
    index_list = ['WAFER_ID', 'label']
    columns_list = grpby_list + ['PARAMETRIC_NAME']
    values_list = df.columns.difference(['WAFER_ID', 'PARAMETRIC_NAME', 'label'] + grpby_list)
    pivot_result = df.pivot_table(index=index_list,
                                  columns=columns_list,
                                  values=values_list)
    # Column labels are tuples of level values here; join them into flat string names.
    pivot_result.columns = pivot_result.columns.map('#'.join)
    pivot_result = process_missing_values(pivot_result, pivot_result.columns, missing_value_threshold)
    pivot_result = pivot_result.reset_index(drop=False)
    # Remove constant columns (exactly one distinct non-missing value): they carry no signal.
    constant_columns = [column for column in pivot_result.columns.difference(index_list)
                        if pivot_result[column].nunique() == 1]
    return pivot_result.drop(columns=constant_columns)

In [4]:
# Load the input table; the first CSV column is used as the DataFrame index.
# NOTE(review): hard-coded absolute Windows path - this cell only runs on a machine
# with this exact directory layout; consider a configurable data directory.
data = pd.read_csv("D:/ikas-rca-job/src/inline/df_preprocess_pandas.csv", index_col=0)

In [10]:
# OPE_NO value whose rows are inspected in the cells below.
oper = '1F.FQE10,1C.CDG10'

# Filter with a boolean mask instead of an f-string-built query expression, so the
# value is compared literally even if it ever contains a quote character.
df1 = data[data['OPE_NO'] == oper]
df1

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE9_VAL,SITE4_VAL,SITE12_VAL,SITE8_VAL,SITE13_VAL,SITE10_VAL,SITE2_VAL,SITE11_VAL,SITE6_VAL,SITE7_VAL
0,"1F.FQE10,1C.CDG10",NAZ439-03,TAW1,,109.251900,,,,,,,,,,
1,"1F.FQE10,1C.CDG10",NAZ439-03,TAWB,,109.251900,,,,,,,,,,
2,"1F.FQE10,1C.CDG10",NAZ439-03,TDS0,1.0,108.411700,,,,,,,,,,
3,"1F.FQE10,1C.CDG10",NAZ439-03,TDS1,17.0,109.251924,109.0181,108.8928,109.5629,108.8353,109.7379,110.171,109.1703,110.1915,108.6326,108.7895
4,"1F.FQE10,1C.CDG10",NAZ439-03,THW1,,110.191500,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2355,"1F.FQE10,1C.CDG10",NAZ703-01,THW1,,112.162200,,,,,,,,,,
2356,"1F.FQE10,1C.CDG10",NAZ703-01,TLW1,,110.857500,,,,,,,,,,
2357,"1F.FQE10,1C.CDG10",NAZ703-01,TRW1,,1.304665,,,,,,,,,,
2358,"1F.FQE10,1C.CDG10",NAZ703-01,TUW1,,0.005854,,,,,,,,,,


In [15]:
# Number of distinct wafers for each (OPE_NO, PARAMETRIC_NAME) pair.
(
    df1
    .groupby(["OPE_NO", 'PARAMETRIC_NAME'])['WAFER_ID']
    .nunique()
    .reset_index()
)

Unnamed: 0,OPE_NO,PARAMETRIC_NAME,WAFER_ID
0,"1F.FQE10,1C.CDG10",EEW0,15
1,"1F.FQE10,1C.CDG10",FEW0,15
2,"1F.FQE10,1C.CDG10",HFT0,15
3,"1F.FQE10,1C.CDG10",OEW0,15
4,"1F.FQE10,1C.CDG10",PEW0,15
5,"1F.FQE10,1C.CDG10",PTW0,15
6,"1F.FQE10,1C.CDG10",REW0,15
7,"1F.FQE10,1C.CDG10",SEW0,15
8,"1F.FQE10,1C.CDG10",SFW0,15
9,"1F.FQE10,1C.CDG10",TAW1,2


In [11]:
# Number of distinct PARAMETRIC_NAME values among the selected rows.
df1['PARAMETRIC_NAME'].nunique()

22

In [8]:
# Preview which rows of df1 remain after the 'drop' missing-value handling
# for the chosen good/bad site columns.
df_pandas_specific_ = ExtractFeaturesBySite.process_missing_values_for_site(
    df=df1,
    good_site_columns=["SITE4_VAL", "SITE8_VAL", "SITE9_VAL", "SITE12_VAL", "SITE13_VAL"],
    bad_site_columns=["SITE2_VAL", "SITE6_VAL", "SITE7_VAL", "SITE10_VAL", "SITE11_VAL"],
    missing_value_threshold=0.7,
    process_miss_site_mode='drop',
)
df_pandas_specific_

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE9_VAL,SITE4_VAL,SITE12_VAL,SITE8_VAL,SITE13_VAL,SITE10_VAL,SITE2_VAL,SITE11_VAL,SITE6_VAL,SITE7_VAL
3,"1F.FQE10,1C.CDG10",NAZ439-03,TDS1,17.0,109.251924,109.0181,108.8928,109.5629,108.8353,109.7379,110.171,109.1703,110.1915,108.6326,108.7895
2354,"1F.FQE10,1C.CDG10",NAZ703-01,TDS1,17.0,111.4316,111.6268,111.4483,111.269,111.4877,111.2835,111.8836,111.7336,112.1622,111.2949,111.3615


In [6]:
# Build the labelled good-site / bad-site statistics for the selected operation.
side_with_features_all = ExtractFeaturesBySite.extract_features_by_site(
    df=df1,
    grpby_list=['OPE_NO'],
    good_site_columns=["SITE4_VAL", "SITE8_VAL", "SITE9_VAL", "SITE12_VAL", "SITE13_VAL"],
    bad_site_columns=["SITE2_VAL", "SITE6_VAL", "SITE7_VAL", "SITE10_VAL", "SITE11_VAL"],
    missing_value_threshold=0.7,
    process_miss_site_mode='drop',
)

In [7]:
# Show the extracted features: good-site rows (label 0) followed by bad-site rows (label 1).
side_with_features_all

Unnamed: 0,OPE_NO,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,PERCENTILE_25,PERCENTILE_75,label
0,"1F.FQE10,1C.CDG10",NAZ439-03,TDS1,109.7379,108.8353,109.0181,109.2094,0.412631,108.8928,109.5629,0
1,"1F.FQE10,1C.CDG10",NAZ703-01,TDS1,111.6268,111.269,111.4483,111.42306,0.149614,111.2835,111.4877,0
0,"1F.FQE10,1C.CDG10",NAZ439-03,TDS1,110.1915,108.6326,109.1703,109.39098,0.747476,108.7895,110.171,1
1,"1F.FQE10,1C.CDG10",NAZ703-01,TDS1,112.1622,111.2949,111.7336,111.68716,0.362743,111.3615,111.8836,1
