In [47]:
import json
import numpy as np

import pandas as pd
import pyspark.sql.dataframe
from pca import pca
from pyspark.sql.functions import max, countDistinct, when, rank, lit, pandas_udf, PandasUDFType, monotonically_increasing_id
from pyspark.sql.window import Window
from typing import List, Dict, Union, Tuple, Optional
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import ClusterCentroids

In [2]:
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression

In [3]:
import warnings
import os
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession

# Point PySpark at the Python interpreter to use, and silence all warnings
# for the remainder of the notebook.
warnings.filterwarnings('ignore')
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

# Create (or reuse) the Spark session against the standalone cluster masters.
# NOTE(review): hosts, memory and core settings are hard-coded for one
# environment; adjust them here when running elsewhere.
spark = (
    SparkSession.builder
    .appName("pandas_udf")
    .master("spark://192.168.12.47:7077,192.168.12.48:7077")
    .config('spark.sql.session.timeZone', 'Asia/Shanghai')
    .config("spark.scheduler.mode", "FAIR")
    .config('spark.driver.memory', '8g')
    .config('spark.driver.cores', '12')
    .config('spark.executor.memory', '8g')
    .config('spark.executor.cores', '12')
    .config('spark.cores.max', '12')
    .config('spark.driver.host', '192.168.22.28')
    .getOrCreate()
)



In [4]:
# Load the labelled run-data statistics CSV into pandas and print its (rows, columns) shape.
# NOTE(review): this is an absolute, machine-specific path; the notebook cannot be
# re-run elsewhere without editing it.
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/fdc_advanced_algorithm/CASE1_RUNDATA_stats_results_with_label.csv")
print(df_pandas.shape)

(276752, 22)


In [5]:
# Convert the pandas frame to a Spark DataFrame through the pandas-on-Spark API,
# then print the Spark row count.
df1_ = ps.from_pandas(df_pandas).to_spark()
print(df1_.count())

276752


In [6]:
df_pandas.columns

Index(['WAFER_ID', 'TOOL_ID', 'RUN_ID', 'EQP_NAME', 'PRODUCT_ID', 'PRODG1',
       'TOOL_NAME', 'LOT_ID', 'RECIPE_NAME', 'OPER_NO', 'START_TIME',
       'parametric_name', 'CASE_INFO', 'mean', 'std', 'min', '25percentpoint',
       'median', '75percentpoint', 'max', 'range1', 'label'],
      dtype='object')

### PreprocessForRunData

In [7]:
class PreprocessForRunData:
    """Static helpers that turn raw run-data rows (Spark DataFrames) into modeling input."""

    @staticmethod
    def _merge_column_values(df: "pyspark.sql.DataFrame",
                             column: str,
                             merge_rules: Optional[List[Dict[str, List[str]]]]) -> "pyspark.sql.DataFrame":
        """Apply the merge rules of one column; return ``df`` unchanged when there are none."""
        if not merge_rules:
            return df
        for rule in merge_rules:
            # Only the first value list of each rule is used; the dictionary key is ignored.
            values = list(rule.values())[0]
            replacement_value = ",".join(values)
            df = df.withColumn(column,
                               when(col(column).isin(values), replacement_value).otherwise(col(column)))
        return df

    @staticmethod
    def integrate_columns(df: "pyspark.sql.DataFrame",
                          merge_operno_list: List[Dict[str, List[str]]],
                          merge_prodg1_list: List[Dict[str, List[str]]],
                          merge_product_list: List[Dict[str, List[str]]],
                          merge_eqp_list: List[Dict[str, List[str]]],
                          merge_chamber_list: List[Dict[str, List[str]]]) -> "pyspark.sql.DataFrame":
        """
        Integrate (merge) values of selected columns according to the given rules.

        For every rule, each cell whose value is one of the rule's members is replaced by
        the members joined with a comma (e.g. ['2F.CDS10', 'XX.TDS01'] -> '2F.CDS10,XX.TDS01').
        The dictionary key of a rule is not used for the replacement value.

        :param df: The input DataFrame.
        :param merge_operno_list: Rules for the 'OPER_NO' column.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :param merge_prodg1_list: Rules for the 'PRODG1' column, same format.
        :param merge_product_list: Rules for the 'PRODUCT_ID' column, same format.
        :param merge_eqp_list: Rules for the 'EQP_NAME' column, same format.
        :param merge_chamber_list: Rules for the 'TOOL_NAME' column, same format.

        :return: DataFrame with the five columns rewritten according to the rules;
                 ``None`` or empty rule lists leave the corresponding column untouched.
        """
        column_rules = (
            ("OPER_NO", merge_operno_list),
            ("PRODG1", merge_prodg1_list),
            ("PRODUCT_ID", merge_product_list),
            ("EQP_NAME", merge_eqp_list),
            ("TOOL_NAME", merge_chamber_list),
        )
        for column, merge_rules in column_rules:
            df = PreprocessForRunData._merge_column_values(df, column, merge_rules)
        return df

    @staticmethod
    def pre_process(df: "pyspark.sql.DataFrame") -> "pyspark.sql.DataFrame":
        """
        Preprocess the data extracted from the database for a specific CASE.

        :param df: Data for a specific CASE retrieved from the database.
        :return: De-duplicated rows restricted to the used columns, keeping for every
                 ('WAFER_ID', 'OPER_NO', 'TOOL_ID') only the rows of its largest 'RUN_ID'.
        """
        # Local alias so the aggregation does not depend on the builtin ``max`` being
        # shadowed by a module-level ``from pyspark.sql.functions import max``.
        from pyspark.sql import functions as F

        # Select only the columns that will be used
        df = df.select('WAFER_ID', 'TOOL_ID', 'RUN_ID', 'EQP_NAME', 'PRODUCT_ID', 'PRODG1', 'TOOL_NAME',
                       'LOT_ID', 'RECIPE_NAME', 'OPER_NO', 'parametric_name', 'mean', 'std', 'min', '25percentpoint',
                       'median', '75percentpoint', 'max', 'range1', 'label')
        # Drop duplicates based on all columns
        df_unique = df.dropDuplicates()

        # Latest RUN_ID per key; groupBy already yields one row per key, so no
        # further de-duplication is needed before the join.
        key_cols = ['WAFER_ID', 'OPER_NO', 'TOOL_ID']
        latest_run = df_unique.groupBy(*key_cols).agg(F.max('RUN_ID').alias('RUN_ID'))
        return df_unique.join(latest_run, on=key_cols + ['RUN_ID'], how='inner')

    @staticmethod
    def commonality_analysis(df_run: "pyspark.sql.DataFrame", grpby_list: List[str]) -> "pyspark.sql.DataFrame":
        """
        Perform commonality analysis on preprocessed data.

        :param df_run: Preprocessed data after data preprocessing.
        :param grpby_list: List of columns ['PRODG1', 'EQP_NAME', 'OPER_NO', 'PRODUCT_ID', 'TOOL_NAME'] for grouping.
                Example: grpby_list = ['PRODG1', 'TOOL_NAME', 'OPER_NO'], grpby_list = ['PRODUCT_ID', 'OPER_NO']
        :return: One row per group with 'wafer_count', 'good_num' and 'bad_num', ordered by
                 'bad_num' descending. When more than one group exists, only groups with
                 bad_num > 1 and wafer_count > 2 are kept.
        """
        # The ``when`` without ``otherwise`` is null for non-matching rows, so
        # those rows are not counted by the distinct count.
        grps = (df_run.groupBy(grpby_list)
                .agg(countDistinct('WAFER_ID').alias('wafer_count'),
                     countDistinct('WAFER_ID', when(df_run['label'] == 0, 1)).alias('good_num'),
                     countDistinct('WAFER_ID', when(df_run['label'] == 1, 1)).alias('bad_num'))
                .orderBy('bad_num', ascending=False))

        # A single group is returned as is, even if it would not pass the filter below.
        if grps.count() == 1:
            return grps
        return grps.filter("bad_num > 1 AND wafer_count > 2")

    @staticmethod
    def get_data_list(common_res: "pyspark.sql.DataFrame",
                      grpby_list: List[str],
                      top_n: int = 10) -> List[Dict[str, str]]:
        """
        Get a list of dictionaries for corresponding groups based on commonality analysis.

        :param common_res: Result of commonality analysis.
        :param grpby_list:  List of columns ['PRODG1', 'EQP_NAME', 'OPER_NO', 'PRODUCT_ID', 'TOOL_NAME'] for grouping.
        :param top_n: Maximum number of groups to return (default 10).
        :return: List of dictionaries for corresponding groups.
                Example: [{'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN2J01N.0U01'},
                          {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN4X01N.0B01'},
                          {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFGN1501N.0C02'}]
        """
        # Order the results and keep only the top groups
        good_bad_grps = (common_res
                         .orderBy(col("bad_num").desc(), col("wafer_count").desc(), col("good_num").desc())
                         .limit(top_n))

        # Collect the group keys on the driver and convert each row into a dictionary
        return [row.asDict() for row in good_bad_grps[grpby_list].collect()]

    @staticmethod
    def get_train_data(df_run: "pyspark.sql.DataFrame",
                       data_dict_list: List[Dict[str, str]]) -> "pyspark.sql.DataFrame":
        """
        Get the actual combination data for modeling from the original data.

        :param df_run: Preprocessed data after data preprocessing.
        :param data_dict_list: List of dictionaries with filtering conditions.
               Example: [{'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN2J01N.0U01'},
                          {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN4X01N.0B01'},
                          {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFGN1501N.0C02'}]
        :return: Union of the rows matching each dictionary (all its column == value pairs
                 must hold). An empty ``data_dict_list`` yields an empty DataFrame with the
                 schema of ``df_run``.
        """
        # No group selected: return an empty frame instead of failing on data_dict_list[0].
        if not data_dict_list:
            return df_run.limit(0)

        df_s = None
        for data_dict in data_dict_list:
            # Build the predicate from Column expressions rather than a formatted SQL
            # string, so values containing quotes cannot break the filter.
            condition = lit(True)
            for col_, value in data_dict.items():
                condition = condition & (col(col_) == value)
            df_m = df_run.filter(condition)
            df_s = df_m if df_s is None else df_s.union(df_m)
        return df_s

    @staticmethod
    def get_all_bad_wafer_num(df: "pyspark.sql.DataFrame") -> int:
        """
        Get the number of distinct 'WAFER_ID' values among rows with label == 1.
        """
        return df.filter("label == 1").select('WAFER_ID').distinct().count()

In [8]:
# Merge rules passed to PreprocessForRunData.integrate_columns; an empty list
# leaves the corresponding column untouched.
merge_operno_list = []
merge_prodg1_list = []
merge_product_list = []
merge_eqp_list = []
merge_chamber_list = []
# Columns used as the grouping key in the commonality analysis and model fitting.
grpby_list = ['OPER_NO', 'TOOL_NAME']

In [9]:
# Apply the (currently empty) merge rules to the raw Spark DataFrame.
df_integrate_columns = PreprocessForRunData.integrate_columns(df=df1_, 
                                                              merge_operno_list=merge_operno_list, 
                                                              merge_prodg1_list=merge_prodg1_list,
                                                              merge_product_list=merge_product_list, 
                                                              merge_eqp_list=merge_eqp_list, 
                                                              merge_chamber_list=merge_chamber_list)

In [10]:
# Report the row count (triggers a Spark job) and column count after merging.
m, n = df_integrate_columns.count(), len(df_integrate_columns.columns)
print(f"Merged data: ({m}, {n})")

Merged data: (276752, 22)


In [20]:
# df_integrate_columns.show()

In [11]:
# Select the used columns, drop duplicates and keep only the latest RUN_ID per
# (WAFER_ID, OPER_NO, TOOL_ID); then report the resulting shape.
df_run = PreprocessForRunData.pre_process(df_integrate_columns)
m, n = df_run.count(), len(df_run.columns)
print(f"Preprocessed data: ({m}, {n})")

Preprocessed data: (276752, 20)


In [12]:
# Count total / good / bad wafers per group defined by grpby_list and show the result.
common_res = PreprocessForRunData.commonality_analysis(df_run=df_run, grpby_list=grpby_list)
common_res.show()

+--------+---------+-----------+--------+-------+
| OPER_NO|TOOL_NAME|wafer_count|good_num|bad_num|
+--------+---------+-----------+--------+-------+
|1F.EEK10|EKT72_PM1|       3892|    1207|   2685|
+--------+---------+-----------+--------+-------+



In [13]:
# Collect the group keys of the top-ranked groups as a list of dictionaries.
data_dict_list = PreprocessForRunData.get_data_list(common_res=common_res, grpby_list=grpby_list)
data_dict_list

[{'OPER_NO': '1F.EEK10', 'TOOL_NAME': 'EKT72_PM1'}]

In [20]:
# Keep only the rows of df_run that belong to the selected groups, then count them.
train_data = PreprocessForRunData.get_train_data(df_run=df_run, data_dict_list=data_dict_list)
train_data.count()

176479

In [21]:
# Number of distinct bad wafers (label == 1) in the preprocessed run data.
# The original cell referenced `df_run_bs`, which is not defined anywhere in this
# notebook (left-over kernel state) and fails on a fresh Restart & Run All.
# NOTE(review): `df_run` is assumed to be the intended input — confirm.
bad_wafer_num = PreprocessForRunData.get_all_bad_wafer_num(df=df_run)
bad_wafer_num

2685

In [22]:
# Collect the training rows to the driver as a pandas DataFrame.
train_data_pandas = train_data.toPandas()

### FitModelForRunData

In [23]:
class FitModelForRunData:
    """Fit per-group models on pivoted run data and report feature importances."""

    # A classifier is fitted only when the pivot has more feature columns than this.
    MIN_FEATURE_COLUMNS = 6
    # Undersample when |share of label 0 - share of label 1| exceeds this value.
    IMBALANCE_THRESHOLD = 0.7

    @staticmethod
    def _result_schema(grpby_list: List[str]):
        """Build the Spark schema shared by the classification and PCA result frames."""
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("bad_wafer", IntegerType(), True),
                              StructField("roc_auc_score", FloatType(), True),
                              StructField("features", StringType(), True),
                              StructField("importance", FloatType(), True)])
        return StructType(struct_fields)

    @staticmethod
    def get_pivot_table(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
        """
        Pivot the DataFrame based on specified grouping columns.

        Parameters:
        - df: Data for modeling.
        - grpby_list: List of grouping columns.

        Returns:
        - DataFrame: One row per ('WAFER_ID', 'label') and one column per
          statistic/group/parameter combination, named
          '<statistic>#<group values...>#<parametric_name>'. Missing cells are filled
          with the column mean and columns holding a single distinct value are removed.
        """
        index_cols = ['WAFER_ID', 'label']
        columns_cols = grpby_list + ['parametric_name']
        values_cols = ['mean', 'std', 'min', '25percentpoint', 'median', '75percentpoint', 'max', 'range1']
        # Rows with a missing value in any column are dropped before pivoting.
        df_pivot = df.dropna(axis=0).pivot_table(index=index_cols,
                                                 columns=columns_cols,
                                                 values=values_cols)
        # Flatten the column MultiIndex into '#'-joined names.
        df_pivot.columns = df_pivot.columns.map('#'.join)
        df_pivot = df_pivot.fillna(df_pivot.mean()).reset_index(drop=False)

        # Remove feature columns that carry a single distinct value, in one drop.
        feature_cols = df_pivot.columns.difference(index_cols)
        constant_cols = [column for column in feature_cols if df_pivot[column].nunique() == 1]
        return df_pivot.drop(columns=constant_cols)

    @staticmethod
    def get_pipe_params(model):
        """
        Build the sklearn pipeline and hyper-parameter grid for a classifier name.

        Parameters:
        - model: One of 'rf', 'decisionTree', 'svc', 'logistic', 'sgd'.

        Returns:
        - (Pipeline, dict): Pipeline of constant imputation (-999), standard scaling and
          the estimator (step name 'model'), plus the matching GridSearchCV parameter grid.

        Raises:
        - ValueError: If ``model`` is not a supported name. PCA is not handled here;
          use ``fit_pca_model`` instead.
        """
        common_steps = [
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('scaler', StandardScaler())
        ]
        models = {
            'rf': (RandomForestClassifier(random_state=2024), {
                'model__n_estimators': [*range(10, 60, 10)],
                'model__max_depth': [*range(5, 50, 10)],
                'model__min_samples_split': [2, 5],
                'model__min_samples_leaf': [1, 3]
            }),

            'decisionTree': (DecisionTreeClassifier(random_state=2024), {
                'model__max_depth': [None, 5, 10, 15],
                'model__min_samples_split': [2, 5, 10],
                'model__min_samples_leaf': [1, 2, 4]
            }),

            'svc': (LinearSVC(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'squared_hinge'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'logistic': (LogisticRegression(random_state=2024, fit_intercept=False, solver='liblinear'), {
                'model__penalty': ['l1', 'l2'],
                'model__C': [0.1, 0.5, 1, 10, 50]
            }),

            'sgd': (SGDClassifier(random_state=2024, fit_intercept=False), {
                'model__loss': ['hinge', 'log_loss', 'perceptron', 'huber'],
                'model__penalty': ['l1', 'l2', 'elasticnet', None],
                'model__alpha': [0.0001, 0.001, 0.01, 0.1],
                'model__max_iter': [100, 500, 1000]
            })
        }

        if model not in models:
            # ValueError is still an Exception, so existing broad handlers keep working.
            raise ValueError(
                'Wrong Model Selection. Supported models are: {}.'.format(', '.join(models)))

        estimator, param_grid = models[model]
        pipe = Pipeline(common_steps + [('model', estimator)])
        return pipe, param_grid

    @staticmethod
    def fit_classification_model(df: "pyspark.sql.DataFrame", grpby_list: List[str],
                                 model) -> "pyspark.sql.DataFrame":
        """
        Fit the chosen classifier per group with grid search and return its importances.

        Parameters:
        - df: Spark DataFrame with the training rows.
        - grpby_list: Columns defining one group; one model is fitted per group.
        - model: Classifier name accepted by ``get_pipe_params``.

        Returns:
        - Spark DataFrame with columns grpby_list + ['bad_wafer', 'roc_auc_score',
          'features', 'importance']. Per group it holds one row per feature (only
          'features'/'importance' set) followed by one summary row (only the group keys,
          'bad_wafer' and 'roc_auc_score' set). Groups that cannot be fitted give no rows.

        Raises:
        - ValueError: If ``model`` is not supported (raised on the driver before any
          Spark work is scheduled).
        """
        # Fail fast on the driver rather than inside every executor task.
        FitModelForRunData.get_pipe_params(model=model)
        schema_all = FitModelForRunData._result_schema(grpby_list)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run: pd.DataFrame) -> pd.DataFrame:
            # Pivot the table
            df_pivot = FitModelForRunData.get_pivot_table(df=df_run, grpby_list=grpby_list)

            # Define independent and dependent variables
            x_train = df_pivot[df_pivot.columns.difference(['WAFER_ID', 'label']).tolist()]
            y_train = df_pivot[['label']]

            # Too few features or a single class: nothing to fit for this group.
            if (x_train.shape[1] <= FitModelForRunData.MIN_FEATURE_COLUMNS
                    or y_train['label'].nunique() <= 1):
                return pd.DataFrame()

            # Look the class shares up by label on the label Series; indexing the
            # MultiIndex-keyed DataFrame.value_counts() result with 0/1 is fragile.
            label_ratio = y_train['label'].value_counts(normalize=True)
            good_ratio = label_ratio.get(0, 0.0)
            bad_ratio = label_ratio.get(1, 0.0)
            if abs(good_ratio - bad_ratio) > FitModelForRunData.IMBALANCE_THRESHOLD:
                undersampler = ClusterCentroids(random_state=1024)
                x_train, y_train = undersampler.fit_resample(x_train, y_train)

            pipe, param_grid = FitModelForRunData.get_pipe_params(model=model)
            try:
                grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
                grid.fit(x_train.values, y_train.values.ravel())
            except ValueError:
                # Best-effort: a group on which the grid search fails contributes no rows.
                return pd.DataFrame()

            best_est = grid.best_estimator_.steps[-1][-1]
            if hasattr(best_est, 'feature_importances_'):
                importance = best_est.feature_importances_
            else:
                # Estimators without feature_importances_ are ranked by |coefficient|.
                importance = abs(best_est.coef_.ravel())
            small_importance_res = pd.DataFrame({'features': x_train.columns, 'importance': importance})

            # bad_wafer is taken from the full pivot, i.e. before any undersampling.
            sample_res_dict = {'bad_wafer': df_pivot['label'].sum(), 'roc_auc_score': grid.best_score_}
            sample_res_dict.update({col_: df_run[col_].values[0] for col_ in grpby_list})
            small_sample_res = pd.DataFrame(sample_res_dict, index=[0])
            return pd.concat([small_importance_res, small_sample_res])

        return df.groupby(grpby_list).apply(get_model_result)

    @staticmethod
    def fit_pca_model(df: "pyspark.sql.DataFrame", grpby_list: List[str]) -> "pyspark.sql.DataFrame":
        """
        Fit a PCA per group and return the absolute loadings of the 'best' features.

        Parameters:
        - df: Spark DataFrame with the training rows.
        - grpby_list: Columns defining one group; one PCA is fitted per group.

        Returns:
        - Spark DataFrame with the same layout as ``fit_classification_model``;
          'roc_auc_score' of the summary row is the placeholder -100. Groups whose
          (doubled) pivot has a dimension of 2 or less give no rows.
        """
        schema_all = FitModelForRunData._result_schema(grpby_list)

        @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
        def get_model_result(df_run):
            df_pivot = FitModelForRunData.get_pivot_table(df=df_run, grpby_list=grpby_list)
            # NOTE(review): the pivot is stacked onto a copy of itself, doubling the row
            # count that feeds the size check and n_components below. Kept as in the
            # original; confirm whether this duplication is intended.
            df_pivot_all = pd.concat([df_pivot, df_pivot.copy()], axis=0)

            x_train = df_pivot_all[df_pivot_all.columns.difference(['WAFER_ID', 'label']).tolist()]
            if min(x_train.shape) <= 2:
                return pd.DataFrame()

            n_components = min(min(x_train.shape) - 2, 20)
            model = pca(n_components=n_components, verbose=None)
            results = model.fit_transform(x_train)
            res_top = results['topfeat']
            # Build the result with assign/rename instead of writing a new column into a
            # filtered slice (avoids pandas' SettingWithCopy pitfall).
            res_top_select = (res_top.loc[res_top['type'] == 'best', ['feature', 'loading']]
                              .assign(importance=lambda top: abs(top['loading']))
                              .rename(columns={'feature': 'features'})
                              .drop("loading", axis=1)
                              .drop_duplicates())

            sample_res_dict = {'bad_wafer': df_pivot['label'].sum(), 'roc_auc_score': -100}
            sample_res_dict.update({col_: df_run[col_].values[0] for col_ in grpby_list})
            small_sample_res = pd.DataFrame(sample_res_dict, index=[0])

            return pd.concat([res_top_select, small_sample_res])

        return df.groupby(grpby_list).apply(get_model_result)

In [24]:
grpby_list

['OPER_NO', 'TOOL_NAME']

In [25]:
# Fit the PCA model per group, then collect the result to the driver as pandas.
res_pca_spark = FitModelForRunData.fit_pca_model(df=train_data, grpby_list=grpby_list)

res_pca = res_pca_spark.toPandas()

In [26]:
res_pca

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,range1#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+ESC+VOL...,0.706383
1,,,,,max#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+LO+RF+POWER,0.325073
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.406761
3,,,,,max#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+ESC+CURRENT,0.704845
4,,,,,median#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+LO+RF+VPP,0.330718
5,,,,,range1#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+LO+C1+V...,0.631531
6,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+ESC+VOLTAGE,0.723054
7,,,,,max#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+EDGE+HE+FLOW,0.59978
8,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+PROCESS+GA...,0.736919
9,,,,,max#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+LO+RF+REF+...,0.674563


-------------------------

In [27]:
# Fit the random-forest pipeline ('rf') per group and collect the result as pandas.
res_rf_spark = FitModelForRunData.fit_classification_model(df=train_data, grpby_list=grpby_list, model='rf')

res_rf_pandas = res_rf_spark.toPandas()

In [29]:
res_rf_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.003595
1,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.000000
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.007630
3,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.000573
4,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.000000
...,...,...,...,...,...,...
227,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPER+TEMP...,0.000870
228,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.000000
229,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.000174
230,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+WALL+TEMPE...,0.000123


In [210]:
# importance_res_dict = {'bad_waferss': 20, 'roc_auc_scoress': 0.85}

# importance_res_dict.update({col_+'ss': df_run_bs_pandas[col_].unique() for col_ in grpby_list})

# importance_res_dict

# res_pca.assign(**importance_res_dict)

------

In [30]:
# Fit the decision-tree pipeline ('decisionTree') per group and collect the result as pandas.
res_dt_spark = FitModelForRunData.fit_classification_model(df=train_data, grpby_list=grpby_list, model='decisionTree')

res_dt_pandas = res_dt_spark.toPandas()

In [31]:
res_dt_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
1,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
3,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
4,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
...,...,...,...,...,...,...
227,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPER+TEMP...,0.0
228,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.0
229,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.0
230,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+WALL+TEMPE...,0.0


In [51]:
# res_dt[res_dt['importance'] > 0]

---------

In [32]:
# Fit the linear-SVC pipeline ('svc') per group and collect the result as pandas.
res_svc_spark = FitModelForRunData.fit_classification_model(df=train_data, grpby_list=grpby_list, model='svc')

res_svc_pandas = res_svc_spark.toPandas()

In [33]:
res_svc_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,1.042278
1,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.075305
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.147708
3,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.077592
4,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.002767
...,...,...,...,...,...,...
227,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPER+TEMP...,0.365121
228,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.298399
229,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.298399
230,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+WALL+TEMPE...,0.147836


-----------

In [34]:
# Fit the logistic-regression pipeline ('logistic') per group and collect the result as pandas.
res_logit_spark = FitModelForRunData.fit_classification_model(df=train_data, grpby_list=grpby_list, model='logistic')

res_logit_pandas = res_logit_spark.toPandas()

In [35]:
res_logit_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,5.544278
1,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.462375
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,1.055835
3,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.039750
4,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.016825
...,...,...,...,...,...,...
227,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPER+TEMP...,0.040526
228,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,1.755943
229,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,1.682798
230,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+WALL+TEMPE...,0.329697


---------

In [36]:
# Fit the SGD-classifier pipeline ('sgd') per group and collect the result as pandas.
res_sgd_spark = FitModelForRunData.fit_classification_model(df=train_data, grpby_list=grpby_list, model='sgd')

res_sgd_pandas = res_sgd_spark.toPandas()

In [37]:
res_sgd_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,bad_wafer,roc_auc_score,features,importance
0,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
1,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
2,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
3,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
4,,,,,25percentpoint#1F.EEK10#EKT72_PM1#STEP_ALL_RUN...,0.0
...,...,...,...,...,...,...
227,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPER+TEMP...,0.0
228,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.0
229,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+UPPERMAGNE...,0.0
230,,,,,std#1F.EEK10#EKT72_PM1#STEP_ALL_RUN+WALL+TEMPE...,0.0


In [59]:
# res_sgd[res_sgd['importance'] > 0]

### GetFinalResultsForRunData

In [58]:
class GetFinalResultsForRunData:
    """
    Turn per-group modeling results into a ranked table of parameter weights.

    The input Spark DataFrame must carry the grouping columns plus the columns
    'bad_wafer', 'roc_auc_score', 'features' and 'importance'. Each value of
    'features' is a '#'-separated string: position 0 is ignored here, positions
    1..len(grpby_list) hold the grouping values and the next position holds the
    parameter name.
    """

    def __init__(self, df, grpby_list, request_id, bad_wafer_num):
        """
        Parameters:
        - df: Spark DataFrame with the modeling results.
        - grpby_list: List of grouping columns.
        - request_id: Request ID written into the final result.
        - bad_wafer_num: Divisor used to turn 'bad_wafer' into 'bad_ratio'.
        """
        self.df = df
        self.grpby_list = grpby_list
        self.request_id = request_id
        self.bad_wafer_num = bad_wafer_num

    @staticmethod
    def split_score(df: "pyspark.sql.DataFrame", grpby_list: List[str]) -> "pyspark.sql.DataFrame":
        """
        Extract the score rows from the modeling results.

        Parameters:
        - df: Results after modeling.
        - grpby_list: List of grouping columns.

        Returns:
        - DataFrame: grpby_list columns plus 'bad_wafer' and 'roc_auc_score', keeping
          only rows without nulls and with roc_auc_score > 0.
        """
        select_expr = grpby_list + ['bad_wafer', 'roc_auc_score']
        return df.select(select_expr).dropna().filter(col('roc_auc_score') > 0)

    @staticmethod
    def split_features(df: pd.DataFrame, index: int) -> pd.Series:
        """
        Split the 'features' column on '#' and pick one position.

        Parameters:
        - df: Modeling results with 'features' column.
        - index: Position of the wanted field inside the split string.

        Returns:
        - Series: The field at that position for every row.
        """
        return df['features'].apply(lambda x: x.split('#')[index])

    @staticmethod
    def get_split_feature_importance_table(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
        """
        Expand the 'features' column into the grouping columns and 'parametric_name'.

        Parameters:
        - df: Modeling results with 'features' column. It is not modified.
        - grpby_list: List of grouping columns.

        Returns:
        - DataFrame: Copy of df without 'features', with one column per element of
          grpby_list and a 'parametric_name' column appended.
        """
        # Work on a copy so the caller's frame (often a slice) is never mutated.
        df = df.copy()
        n_feats = len(grpby_list)
        for i, col_name in enumerate(grpby_list):
            # Position 0 of the split string is skipped, hence the +1 offset.
            df[col_name] = GetFinalResultsForRunData.split_features(df, i + 1)

        df['parametric_name'] = GetFinalResultsForRunData.split_features(df, n_feats + 1)
        return df.drop(['features'], axis=1).reset_index(drop=True)

    @staticmethod
    def _top_feature_importance(model_results: pd.DataFrame, grpby_list: List[str],
                                top_ratio: float = 0.6) -> pd.DataFrame:
        """
        Pure-pandas core of split_calculate_features.

        Parameters:
        - model_results: pandas DataFrame with 'features' and 'importance' columns.
        - grpby_list: List of grouping columns.
        - top_ratio: Share of rows kept per group when the group has more than one row.

        Returns:
        - DataFrame: Columns grpby_list + ['parametric_name', 'importance'], where
          'importance' is the sum over the kept rows of each parameter.
        """
        key_cols = list(grpby_list) + ['parametric_name']

        table = model_results[['features', 'importance']].dropna(axis=0)
        split = GetFinalResultsForRunData.get_split_feature_importance_table(df=table, grpby_list=grpby_list)
        positive = split[split['importance'] > 0].reset_index(drop=True)

        if positive.empty:
            # Nothing survived the filter: return an empty frame with the expected
            # columns instead of relying on groupby behaviour for empty input.
            empty = {name: pd.Series(dtype='object') for name in key_cols}
            empty['importance'] = pd.Series(dtype='float64')
            return pd.DataFrame(empty)

        # Keep the top `top_ratio` share of each group; a single-row group is kept whole.
        kept = []
        for _, group in positive.groupby(grpby_list):
            n_rows = group.shape[0]
            n_keep = int(n_rows * top_ratio) if n_rows > 1 else n_rows
            kept.append(group.nlargest(n_keep, 'importance'))
        top = pd.concat(kept, ignore_index=True)

        # Sum the importance for the same combination and parameter.
        return top.groupby(key_cols)['importance'].sum().reset_index()

    @staticmethod
    def split_calculate_features(df: "pyspark.sql.DataFrame", grpby_list: List[str],
                                 top_ratio: float = 0.6) -> "pyspark.sql.DataFrame":
        """
        Split and calculate features based on the specified grouping columns.

        Parameters:
        - df: Results after modeling.
        - grpby_list: List of grouping columns.
        - top_ratio: Share of rows kept per group (default 0.6).

        Returns:
        - DataFrame: Features importance results with columns
          grpby_list + ['parametric_name', 'importance'].
        """
        # Dynamically build schema
        struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
        struct_fields.extend([StructField("parametric_name", StringType(), True),
                              StructField("importance", FloatType(), True)])
        schema_all = StructType(struct_fields)

        def get_result(model_results: pd.DataFrame) -> pd.DataFrame:
            return GetFinalResultsForRunData._top_feature_importance(model_results, grpby_list, top_ratio)

        # applyInPandas replaces the deprecated GROUPED_MAP pandas_udf + apply() pair.
        return df.groupby(grpby_list).applyInPandas(get_result, schema=schema_all)

    @staticmethod
    def get_final_results(s_res: "pyspark.sql.DataFrame",
                          f_res: "pyspark.sql.DataFrame",
                          grpby_list: List[str],
                          bad_wafer_num: int) -> "pyspark.sql.DataFrame":
        """
        Combine scores and feature importances into normalized weights.

        Parameters:
        - s_res: Output of split_score.
        - f_res: Output of split_calculate_features.
        - grpby_list: List of grouping columns.
        - bad_wafer_num: Divisor used to turn 'bad_wafer' into 'bad_ratio'.

        Returns:
        - DataFrame: grpby_list + ['parametric_name', 'weight'], ordered by weight
          descending. 'weight' is null when the weights sum to nothing (None or 0).
        """
        if not s_res.isEmpty():
            roc_auc_score_all = s_res.agg({"roc_auc_score": "sum"}).collect()[0][0]
            s_res = (s_res
                     .withColumn("roc_auc_score_ratio", col("roc_auc_score") / roc_auc_score_all)
                     .withColumn("bad_ratio", col("bad_wafer") / bad_wafer_num))
            df_merge = s_res.join(f_res, on=grpby_list, how='left')
            df_merge = df_merge.withColumn(
                'weight_original', col('roc_auc_score_ratio') * col('bad_ratio') * col('importance'))
        else:
            # No usable score rows: fall back to the raw importance as the weight.
            df_merge = f_res.withColumnRenamed('importance', 'weight_original')

        # Normalize again
        weight_all = df_merge.agg({"weight_original": "sum"}).collect()[0][0]
        if weight_all:
            weight_col = col("weight_original") / weight_all
        else:
            # Sum is None (no rows / all null) or 0: make the null result explicit.
            weight_col = lit(None).cast("double")
        df_merge = df_merge.withColumn("weight", weight_col)
        return df_merge.select(grpby_list + ['parametric_name', 'weight']).orderBy('weight', ascending=False)

    @staticmethod
    def add_certain_column(df, request_id: str, grpby_list: List[str]):
        """
        Add the bookkeeping columns to the final modeling result.

        Parameters:
        - df: Final modeling result with a 'weight' column.
        - request_id: Request ID passed in.
        - grpby_list: List of grouping columns (unused; kept for interface compatibility).

        Returns:
        - DataFrame: df plus 'weight_percent', 'request_id', 'index_no' (1-based rank
          by descending weight) and any of PRODUCT_ID / OPER_NO / EQP_NAME / PRODG1 /
          TOOL_NAME that were missing, filled with nulls.
        """
        # Local import: row_number is not among the notebook-level imports.
        from pyspark.sql.functions import row_number

        # monotonically_increasing_id() is unique but neither consecutive nor tied to the
        # weight ordering across partitions, so rank explicitly. The un-partitioned window
        # moves all rows to one partition.
        rank_window = Window.orderBy(col('weight').desc())

        df = df.withColumn('weight_percent', col('weight') * 100)
        df = df.withColumn('request_id', lit(request_id))
        df = df.withColumn('index_no', row_number().over(rank_window).cast('long'))

        info_list = ['PRODUCT_ID', 'OPER_NO', 'EQP_NAME', 'PRODG1', 'TOOL_NAME']
        for column in info_list:
            if column not in df.columns:
                df = df.withColumn(column, lit(None).cast(StringType()))
        return df

    def run(self):
        """
        Run the whole pipeline, printing every intermediate table.

        Returns:
        - DataFrame: Output of add_certain_column.
        """
        s_res = GetFinalResultsForRunData.split_score(df=self.df, grpby_list=self.grpby_list)
        print("s_res:")
        s_res.show()

        f_res = GetFinalResultsForRunData.split_calculate_features(df=self.df, grpby_list=self.grpby_list)
        print("f_res:")
        f_res.show()

        final_res = GetFinalResultsForRunData.get_final_results(s_res=s_res, f_res=f_res,
                                                                grpby_list=self.grpby_list,
                                                                bad_wafer_num=self.bad_wafer_num)
        print("final_res:")
        final_res.show()

        final_res_add_columns = GetFinalResultsForRunData.add_certain_column(df=final_res,
                                                                             request_id=self.request_id,
                                                                             grpby_list=self.grpby_list)
        print("final_res_add_columns:")
        final_res_add_columns.show()

        return final_res_add_columns

In [66]:
final_logit = GetFinalResultsForRunData(df=res_logit_spark, grpby_list=grpby_list, request_id='ghg', bad_wafer_num=bad_wafer_num_).run()

s_res:
+--------+---------+---------+-------------+
| OPER_NO|TOOL_NAME|bad_wafer|roc_auc_score|
+--------+---------+---------+-------------+
|1F.EEK10|EKT72_PM1|     2685|    0.7779338|
+--------+---------+---------+-------------+

f_res:
+--------+---------+--------------------+----------+
| OPER_NO|TOOL_NAME|     parametric_name|importance|
+--------+---------+--------------------+----------+
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+APC+...| 29.744667|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+AUTO...| 1.0558355|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+CENT...|  17.91408|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+CENT...| 4.3238883|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+CHAM...| 20.452917|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...| 27.947859|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...| 15.473866|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...|  9.320722|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...| 7.7498856|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...| 1.2216109|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+FLOW...| 18.832312|
|1F.EEK10|EKT72_PM

In [67]:
final_logit_pandas = final_logit.toPandas()
final_logit_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,parametric_name,weight,weight_percent,request_id,index_no,PRODUCT_ID,EQP_NAME,PRODG1
0,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+VPP,0.221299,22.129917,ghg,1,,,
1,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+C1+VAR+CAPACITOR,0.069477,6.947706,ghg,2,,,
2,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+PROCESS+GAS+6+CF4,0.06452,6.452049,ghg,3,,,
3,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+APC+POSITION,0.049712,4.971154,ghg,4,,,
4,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+FLOWSPLITRATIO,0.049036,4.903643,ghg,5,,,
5,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+EDGE+GAS+PRESSURE,0.046709,4.670858,ghg,6,,,
6,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+FLOWSPLITEDGE,0.046406,4.640641,ghg,7,,,
7,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+PROCESS+GAS+14+O2,0.039515,3.951538,ghg,8,,,
8,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+POWER,0.039292,3.929232,ghg,9,,,
9,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+C2+VAR+CAPACITOR,0.037581,3.758084,ghg,10,,,


---------

In [59]:
final_pca = GetFinalResultsForRunData(df=res_pca_spark, grpby_list=grpby_list, request_id='ghg', bad_wafer_num=bad_wafer_num_).run()

s_res:
+-------+---------+---------+-------------+
|OPER_NO|TOOL_NAME|bad_wafer|roc_auc_score|
+-------+---------+---------+-------------+
+-------+---------+---------+-------------+

f_res:
+--------+---------+--------------------+----------+
| OPER_NO|TOOL_NAME|     parametric_name|importance|
+--------+---------+--------------------+----------+
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...| 0.5997805|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...| 1.3727214|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...| 1.4294376|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+FLOW...| 0.6854503|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+LO+C...| 1.1642345|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+LO+R...|0.73587495|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+LO+R...|0.67456263|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+LO+R...|0.75137234|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+PROC...|0.73691916|
+--------+---------+--------------------+----------+

final_res:
+--------+---------+--------------------+-------------------+
| OPER_NO|TOOL_NAME|     parametric_name|     

In [60]:
final_pca_pandas = final_pca.toPandas()
final_pca_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,parametric_name,weight,weight_percent,request_id,index_no,PRODUCT_ID,EQP_NAME,PRODG1
0,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+ESC+VOLTAGE,0.175384,17.538351,ghg,1,,,
1,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+ESC+CURRENT,0.168425,16.842477,ghg,2,,,
2,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+C1+VAR+CAPACITOR,0.142845,14.284467,ghg,3,,,
3,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+VPP,0.092189,9.218893,ghg,4,,,
4,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+PROCESS+GAS+7+AR,0.090416,9.041561,ghg,5,,,
5,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+POWER,0.090287,9.028749,ghg,6,,,
6,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+FLOWSPLITRATIO,0.084101,8.410069,ghg,7,,,
7,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+REF+POWER,0.082765,8.276483,ghg,8,,,
8,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+EDGE+HE+FLOW,0.07359,7.358951,ghg,9,,,


In [61]:
final_pca_pandas.dtypes

OPER_NO             object
TOOL_NAME           object
parametric_name     object
weight             float64
weight_percent     float64
request_id          object
index_no             int64
PRODUCT_ID          object
EQP_NAME            object
PRODG1              object
dtype: object

--------

In [62]:
final_sgd = GetFinalResultsForRunData(df=res_sgd_spark, grpby_list=grpby_list, request_id='ghg', bad_wafer_num=bad_wafer_num_).run()

s_res:
+--------+---------+---------+-------------+
| OPER_NO|TOOL_NAME|bad_wafer|roc_auc_score|
+--------+---------+---------+-------------+
|1F.EEK10|EKT72_PM1|     2685|    0.7819108|
+--------+---------+---------+-------------+

f_res:
+--------+---------+--------------------+------------+
| OPER_NO|TOOL_NAME|     parametric_name|  importance|
+--------+---------+--------------------+------------+
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...|6.1878975E-4|
+--------+---------+--------------------+------------+

final_res:
+--------+---------+--------------------+------+
| OPER_NO|TOOL_NAME|     parametric_name|weight|
+--------+---------+--------------------+------+
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...|   1.0|
+--------+---------+--------------------+------+

final_res_add_columns:
+--------+---------+--------------------+------+--------------+----------+--------+----------+--------+------+
| OPER_NO|TOOL_NAME|     parametric_name|weight|weight_percent|request_id|index_no|PRODUCT_ID

In [63]:
final_sgd_pandas = final_sgd.toPandas()
final_sgd_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,parametric_name,weight,weight_percent,request_id,index_no,PRODUCT_ID,EQP_NAME,PRODG1
0,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+ESC+CURRENT,1.0,100.0,ghg,1,,,


-----

In [64]:
final_rf = GetFinalResultsForRunData(df=res_rf_spark, grpby_list=grpby_list, request_id='ghg', bad_wafer_num=bad_wafer_num_).run()

s_res:
+--------+---------+---------+-------------+
| OPER_NO|TOOL_NAME|bad_wafer|roc_auc_score|
+--------+---------+---------+-------------+
|1F.EEK10|EKT72_PM1|     2685|    0.8938744|
+--------+---------+---------+-------------+

f_res:
+--------+---------+--------------------+------------+
| OPER_NO|TOOL_NAME|     parametric_name|  importance|
+--------+---------+--------------------+------------+
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+APC+...|0.0065902984|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+APCTEMP|0.0010451319|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+AUTO...| 0.040297158|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+AUTO...|  0.04526706|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+CENT...| 0.040968854|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+CHAM...|0.0017869326|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...| 0.028456252|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+EDGE...| 0.042974472|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...|  0.04440391|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+ESC+...|  7.42566E-4|
|1F.EEK10|EKT72_PM1|STEP_ALL_RUN+FLOW...| 0.0

In [65]:
final_rf_pandas = final_rf.toPandas()
final_rf_pandas

Unnamed: 0,OPER_NO,TOOL_NAME,parametric_name,weight,weight_percent,request_id,index_no,PRODUCT_ID,EQP_NAME,PRODG1
0,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+RF+TIME,0.254388,25.438836,ghg,1,,,
1,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+C1+VAR+CAPACITOR,0.132408,13.240831,ghg,2,,,
2,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+C2+VAR+CAPACITOR,0.093807,9.380737,ghg,3,,,
3,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+LO+RF+VPP,0.08321,8.320966,ghg,4,,,
4,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+FLOWSPLITCENTER,0.080785,8.078501,ghg,5,,,
5,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+PROCESS+GAS+8+O2,0.049391,4.939094,ghg,6,,,
6,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+AUTO+CHECK+LEAK+RATEC+SHUTTER,0.045926,4.592555,ghg,7,,,
7,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+ESC+CURRENT,0.04505,4.504984,ghg,8,,,
8,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+EDGE+HE+PRESSURE,0.0436,4.359961,ghg,9,,,
9,1F.EEK10,EKT72_PM1,STEP_ALL_RUN+CENTER+GAS+PRESSURE,0.041565,4.156482,ghg,10,,,


In [113]:
# split1[split1['parametric_name'] == 'STEP_ALL_RUN+AUTO+CHECK+LEAK+RATEC+SHUTTER']

In [114]:
# oper, tool = '1F.EEK10', 'EKT72_PM1'
# df_pandas_query = df_pandas.query(f"OPER_NO == '{oper}' & TOOL_NAME == '{tool}'")

# df_pandas_query[df_pandas_query['parametric_name'] == 'STEP_ALL_RUN+AUTO+CHECK+LEAK+RATEC+SHUTTER']

In [112]:
# index_cols = ['WAFER_ID', 'label']
# columns_cols = grpby_list + ['parametric_name']
# values_cols = ['mean', 'std', 'min', '25percentpoint', 'median', '75percentpoint', 'max', 'range1']
# df_pivot = df_pandas_query.dropna(axis=0).pivot_table(index=index_cols,
#                                          columns=columns_cols,
#                                          values=values_cols)
# df_pivot.columns = df_pivot.columns.map('#'.join)
# df_pivot = df_pivot.fillna(df_pivot.mean()).reset_index(drop=False)
# df_pivot

In [111]:
# certain_cols = df_pivot.filter(like='STEP_ALL_RUN+AUTO+CHECK+LEAK+RATEC+SHUTTER').columns
# df_pivot[certain_cols]

In [110]:
# df_pivot.columns[df_pivot.columns.str.contains('STEP_ALL_RUN\+AUTO\+CHECK\+LEAK\+RATEC\+SHUTTER')] 

In [73]:
ddd = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/df_run.csv")

In [71]:
df_dict = [ddd.iloc[i].to_dict() for i in range(len(ddd))]

In [72]:
pd.DataFrame(df_dict)

Unnamed: 0,WAFER_ID,OPER_NO,TOOL_NAME,parametric_name,label,mean,std,min,q25,median,q75,max,range1,PRODG1,PRODUCT_ID,EQP_NAME
0,wafer118-0,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY0,1,56.192925,-37.900608,4.491757,10.787102,16.331970,20.863856,29.652808,23.487218,P1,P11,toolPFT
1,wafer118-0,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY1,1,62.805055,-205.446499,3.392530,11.352952,16.815229,20.480209,28.533770,26.308960,P1,P12,toolPFT
2,wafer118-0,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY2,1,74.744022,53.574687,1.552233,10.047886,16.009955,20.305111,27.964941,24.623259,P1,P13,toolPFT
3,wafer118-0,oper4M.EFM1,toolPFT_SCP_13-1,PLATE_CHILLY3,1,35.347544,-119.300088,1.814826,11.602044,17.632701,20.309359,26.373384,23.648485,P1,P14,toolPFT
4,wafer118-0,oper4M.EFM1,toolPFT_SCP_13-1,PLATE_CHILLY4,1,196.239363,-99.479120,1.771329,12.020322,16.282130,20.435256,29.737618,25.064779,P1,P15,toolPFT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,wafer118-67,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY1,1,75.816229,-16.046649,9.897937,7.075020,5.851767,28.205983,22.362849,52.223474,P1,P83,toolPFT
73,wafer118-67,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY2,1,75.810515,-16.323746,11.174054,6.908946,4.237029,30.621747,22.132919,56.990234,P1,P84,toolPFT
74,wafer118-34,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY0,1,0.000000,-16.600844,0.000000,6.742871,2.622291,0.000000,21.902989,0.000000,P1,P85,toolPFT
75,wafer118-34,oper4M.EFM0,toolPFT_SCP_13-0,PLATE_CHILLY1,1,0.000000,-16.877941,13.726288,6.576797,1.007553,35.453276,21.673058,66.523755,P1,P86,toolPFT
