In [2]:
import pyspark
import pandas as pd
from pca import pca
from typing import Union, List, Dict
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, when, sum as spark_sum, monotonically_increasing_id, collect_set, explode, countDistinct
# from src.exceptions.rca_base_exception import RCABaseException

In [3]:
import os
import json
from pyspark.sql import SparkSession
import pyspark.pandas as ps

os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'
spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.cores', '12') \
    .config('spark.executor.memory', '8g') \
    .config('spark.executor.cores', '12') \
    .config('spark.cores.max', '12') \
    .config('spark.driver.host', '192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()



In [4]:
df_pandas = pd.read_csv(
    "D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/wat_algorithm/wat_select.csv")
df_pandas = df_pandas[df_pandas['PRODUCT_ID'].isin(
    ["AEMNRM01N.0B01", "AEMNE801N.0B01", "AFXNE001N.0C01", "AGKNCE01N.0A01", "AFXNJ701N.0B01"])]
df_spark = ps.from_pandas(df_pandas).to_spark()
print(f"df_spark shape: ({df_spark.count()}, {len(df_spark.columns)})")
df_spark.show()

  fields = [
  for column, series in pdf.iteritems():


df_spark shape: (13536, 155)
+---------------+---------+--------+--------------------+---------+------------+-----------+--------------+----------------+---------+--------------+----------------+---------------------+----------+---------+----------+----------+---------+----------+----------+---------+------------+---------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------

In [5]:
class DataPreprocessorForWat:
    def __init__(self,
                 df: pyspark.sql.dataframe,
                 grpby_list: list[str],
                 columns_list: list[str],
                 convert_to_numeric_list: list[str],
                 merge_operno_list: List[Dict[str, List[str]]],
                 merge_prodg1_list: List[Dict[str, List[str]]],
                 merge_product_list: List[Dict[str, List[str]]]
                 ):
        self.df = df
        self.grpby_list = grpby_list
        self.columns_list = columns_list
        self.convert_to_numeric_list = convert_to_numeric_list
        self.merge_operno_list = merge_operno_list
        self.merge_prodg1_list = merge_prodg1_list
        self.merge_product_list = merge_product_list

    @staticmethod
    def select_columns(df: pyspark.sql.dataframe, columns_list: list[str]) -> pyspark.sql.dataframe:
        return df.select(columns_list)

    # @staticmethod
    # def exclude_some_data(df: pyspark.sql.dataframe, key_words: list[str],
    #                       certain_column: str) -> pyspark.sql.dataframe:
    #     key_words_str = '|'.join(key_words)
    #     df_filtered = df.filter(~col(certain_column).rlike(key_words_str))
    #     return df_filtered

    @staticmethod
    def pre_process(df: pyspark.sql.dataframe, convert_to_numeric_list: list[str]) -> pyspark.sql.dataframe:
        for column in convert_to_numeric_list:
            df = df.withColumn(column, col(column).cast('double'))
        if 'SITE_COUNT' in convert_to_numeric_list:
            convert_to_numeric_list.remove('SITE_COUNT')
        df = df.dropna(subset=convert_to_numeric_list, how='all')
        return df

    @staticmethod
    def integrate_columns(df: pyspark.sql.dataframe,
                          merge_operno_list: List[Dict[str, List[str]]],
                          merge_prodg1_list: List[Dict[str, List[str]]],
                          merge_product_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
               Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                         {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
        :param merge_prodg1_list: A list of dictionaries for merging 'PRODG1' column in a similar fashion.
        :param merge_product_list: A list of dictionaries for merging 'PRODUCT_ID' column in a similar fashion.

        :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
        """
        df_merged = DataPreprocessorForWat.integrate_single_column(df, merge_operno_list, 'OPE_NO')
        df_merged = DataPreprocessorForWat.integrate_single_column(df_merged, merge_prodg1_list, 'PRODG1')
        df_merged = DataPreprocessorForWat.integrate_single_column(df_merged, merge_product_list, 'PRODUCT_ID')
        return df_merged

    @staticmethod
    def integrate_single_column(df: pyspark.sql.dataframe,
                                merge_list: List[Dict[str, List[str]]],
                                column_name: str) -> pyspark.sql.dataframe:
        """
        Integrate columns in the DataFrame based on the provided list.

        :param df: The input DataFrame.
        :param merge_list: A list of dictionaries where each dictionary contains values to be merged.
        :param column_name: The name of the column to be merged.

        :return: DataFrame with specified column integrated according to the merge rules.
        """
        splitter_comma = ","
        if merge_list is not None and len(merge_list) > 0:
            values_to_replace = [list(rule.values())[0] for rule in merge_list]
            merged_values = [splitter_comma.join(list(rule.values())[0]) for rule in merge_list]

            for values, replacement_value in zip(values_to_replace, merged_values):
                df = df.withColumn(column_name,
                                   when(col(column_name).isin(values), replacement_value).otherwise(col(column_name)))
        return df

    @staticmethod
    def commonality_analysis(df_run: pyspark.sql.dataframe, grpby_list: list[str]) -> pyspark.sql.dataframe:
        grps_all = df_run.groupBy(grpby_list).agg(countDistinct('WAFER_ID').alias('GOOD_NUM'))
        return grps_all

    @staticmethod
    def extract_unique_params_within_groups(df: pyspark.sql.dataframe, grpby_list) -> pyspark.sql.dataframe:
        grouped = df.groupby(*grpby_list).agg(collect_set('PARAMETRIC_NAME').alias('unique_values'))
        exploded = grouped.select(*grpby_list, explode(col('unique_values')).alias('PARAMETRIC_NAME'))
        unique_params_within_groups = exploded.dropDuplicates()
        return unique_params_within_groups

    def run(self) -> pyspark.sql.dataframe:
        df_select = self.select_columns(df=self.df, columns_list=self.columns_list)
        df_integrate = self.integrate_columns(df=df_select,
                                              merge_operno_list=self.merge_operno_list,
                                              merge_prodg1_list=self.merge_prodg1_list,
                                              merge_product_list=self.merge_product_list)
        grps_all = self.commonality_analysis(df_run=df_integrate, grpby_list=self.grpby_list)
        print("grps_all:", grps_all.count())
        grps_all.show()

        unique_params_within_groups = self.extract_unique_params_within_groups(df=df_integrate,
                                                                               grpby_list=self.grpby_list)
        df_preprocess = self.pre_process(df=df_integrate, convert_to_numeric_list=self.convert_to_numeric_list)
        return unique_params_within_groups, df_preprocess, grps_all

In [6]:
json_loads_dict = {"requestId": "269",
                   "algorithm": "wat_by_wafer",
                   "requestParam": {"dateRange": {"start": "2021-12-06 19:50:49", "end": "2024-03-06 19:50:49"},
                                    "operNo": [],
                                    "uploadId": "84f6a2b46a5443ec9797347424402058",
                                    "flagMergeAllProdg1": "0",
                                    "flagMergeAllProductId": "0",
                                    "flagMergeAllChamber": "0",
                                    "mergeProdg1": [],
#                                     "mergeProductId": [
#                                         {"xx1": ["AEMNRM01N.0B01", "AEMNE801N.0B01", "AFXNE001N.0C01"]},
#                                         {"xx2": ["AGKNCE01N.0A01", "AFXNJ701N.0B01"]}],
                                    "mergeProductId": [],
                                    "mergeEqp": [],
                                    "mergeChamber": [],
                                    "mergeOperno": [],
                                    "goodSite": ["SITE1_VAL", "SITE2_VAL", "SITE3_VAL"],
                                    "badSite": ["SITE4_VAL", "SITE8_VAL"],
                                    }
                   }

df_ = pd.DataFrame({"requestId": [json_loads_dict["requestId"]],
                    "requestParam": [json.dumps(json_loads_dict["requestParam"])]})

request_id_ = df_["requestId"].values[0]
request_params = df_["requestParam"].values[0]
parse_dict = json.loads(request_params)

merge_operno = list(parse_dict.get('mergeOperno')) if parse_dict.get('mergeOperno') else None
merge_prodg1 = list(parse_dict.get('mergeProdg1')) if parse_dict.get('mergeProdg1') else None
merge_product = list(parse_dict.get('mergeProductId')) if parse_dict.get('mergeProductId') else None
good_site_columns = list(parse_dict.get('goodSite')) if parse_dict.get('goodSite') else None
bad_site_columns = list(parse_dict.get('badSite')) if parse_dict.get('badSite') else None
# grpby_list = ['OPE_NO']
grpby_list = ['OPE_NO', 'PRODUCT_ID']

In [7]:
good_site_columns = list(set(good_site_columns))
bad_site_columns = list(set(bad_site_columns))
site_columns = good_site_columns + bad_site_columns

columns_list = grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'SITE_COUNT', 'AVERAGE'] + site_columns
convert_to_numeric_list = ['SITE_COUNT', 'AVERAGE'] + site_columns

In [8]:
unique_params_within_groups, df_preprocess, grps_all = DataPreprocessorForWat(df=df_spark,
                                                                              grpby_list=grpby_list,
                                                                              columns_list=columns_list,
                                                                              convert_to_numeric_list=convert_to_numeric_list,
                                                                              merge_operno_list=merge_operno,
                                                                              merge_prodg1_list=merge_prodg1,
                                                                              merge_product_list=merge_product).run()
print(f"df_preprocess shape: ({df_preprocess.count()}, {len(df_preprocess.columns)})")
print("unique_params_within_groups:", unique_params_within_groups.count())

grps_all: 5
+--------+--------------+--------+
|  OPE_NO|    PRODUCT_ID|GOOD_NUM|
+--------+--------------+--------+
|ST.TTS10|AEMNRM01N.0B01|       8|
|ST.TTS10|AFXNJ701N.0B01|       3|
|ST.TTS10|AFXNE001N.0C01|      12|
|ST.TTS10|AGKNCE01N.0A01|       2|
|ST.TTS10|AEMNE801N.0B01|       5|
+--------+--------------+--------+

df_preprocess shape: (13536, 11)
unique_params_within_groups: 1381


In [9]:
unique_params_within_groups.show()

+--------+--------------+--------------------+
|  OPE_NO|    PRODUCT_ID|     PARAMETRIC_NAME|
+--------+--------------+--------------------+
|ST.TTS10|AEMNRM01N.0B01|  RcSTK_1CV1V2_Ppoly|
|ST.TTS10|AEMNRM01N.0B01|  VtcsRamp_lcoal_buf|
|ST.TTS10|AEMNRM01N.0B01|      RcSTK_1C_Npoly|
|ST.TTS10|AEMNRM01N.0B01|     RsN+_COp12_serp|
|ST.TTS10|AEMNRM01N.0B01|         IfMNNA_1002|
|ST.TTS10|AEMNRM01N.0B01|       VtclSF1_p3p45|
|ST.TTS10|AEMNRM01N.0B01|         IdlLN_p2p09|
|ST.TTS10|AEMNRM01N.0B01|         IdSDA_PAD_N|
|ST.TTS10|AEMNRM01N.0B01|       VtcsLPL_p2p09|
|ST.TTS10|AEMNRM01N.0B01|            RsMPW_2A|
|ST.TTS10|AEMNRM01N.0B01|Vtcsxdec_levelshi...|
|ST.TTS10|AEMNRM01N.0B01|            BVjNHLPW|
|ST.TTS10|AEMNRM01N.0B01|             RsM3_p1|
|ST.TTS10|AEMNRM01N.0B01|  IdCMPN_STG2_sample|
|ST.TTS10|AEMNRM01N.0B01|         IdlLP_p4p09|
|ST.TTS10|AEMNRM01N.0B01|         IfRS1_p3p28|
|ST.TTS10|AEMNRM01N.0B01|     VtcsDCG1_p24p26|
|ST.TTS10|AEMNRM01N.0B01|       VtclMNNA_1002|
|ST.TTS10|AEM

In [10]:
df_preprocess.show()

+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|     PARAMETRIC_NAME|SITE_COUNT|     AVERAGE|SITE3_VAL|SITE2_VAL|SITE1_VAL|SITE8_VAL|SITE4_VAL|
+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVcbMLPNP_5|       9.0|       -14.6|    -14.6|    -14.6|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVceMLPNP_5|       9.0|       -15.2|    -15.2|    -15.2|    -15.2|    -15.2|    -15.2|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|   BVjDNWDNW_S3_10nA|       9.0| 14.55555556|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVjDNWNW_S2_10nA|       9.0| 14.53333333|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|       BVjGR_S2_10nA|       9.0| 14.48888889|   

In [11]:
df_preprocess_pandas = df_preprocess.toPandas()
df_preprocess_pandas

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE3_VAL,SITE2_VAL,SITE1_VAL,SITE8_VAL,SITE4_VAL
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,9.0,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000
1,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVceMLPNP_5,9.0,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000
2,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWDNW_S3_10nA,9.0,14.555556,14.400000,14.600000,14.400000,14.600000,14.600000
3,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWNW_S2_10nA,9.0,14.533333,14.400000,14.600000,14.400000,14.600000,14.600000
4,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjGR_S2_10nA,9.0,14.488889,14.400000,14.400000,14.400000,14.600000,14.400000
...,...,...,...,...,...,...,...,...,...,...,...
13531,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsTX_p7p38,9.0,-0.056430,-0.007673,-0.063450,-0.042231,-0.065479,-0.043970
13532,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN1,9.0,0.192239,0.195248,0.194373,0.189287,0.190760,0.185206
13533,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN2,9.0,0.190023,0.188890,0.190073,0.189869,0.192523,0.182971
13534,ST.TTS10,AEMNRM01N.0B01,NHH106-23,Vtcsxdec_levelshift_nfet3,9.0,0.559120,0.544319,0.550910,0.544663,0.582070,0.546110


#### 提取特征

In [14]:
def process_missing_values_for_site(df: pd.DataFrame,
                                    good_site_columns: list[str],
                                    bad_site_columns: list[str],
                                    missing_value_threshold: Union[int, float] = 0.7,
                                    process_miss_site_mode: str = 'drop') -> pd.DataFrame:
    assert process_miss_site_mode in ['drop', 'fill']
    site_columns = good_site_columns + bad_site_columns
    if process_miss_site_mode == 'drop':
        # drop rows based on the missing value threshold
        df = df.dropna(subset=site_columns, thresh=missing_value_threshold)
    else:
        # fill missing values in the corresponding site rows using the AVERAGE of that row
        df[site_columns] = df[site_columns].apply(lambda column: column.fillna(df['AVERAGE']))
    return df


def calculate_statistics(row):
        return pd.Series({
            'MAX_VAL': row.max(),
            'MIN_VAL': row.min(),
            'MEDIAN': row.median(),
            'AVERAGE': row.mean(),
            'STD_DEV': row.std()})

    
def calculate_site_stats(df: pd.DataFrame, grpby_list: list[str], site_columns: list[str],
                         good_or_bad: str) -> pd.DataFrame:
    assert good_or_bad in ['good', 'bad'], "Label could only be 'good' or 'bad'"
    selected_df = df[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME'] + site_columns].reset_index(drop=True)
    # Perform statistical calculations for each row
    side_features = selected_df.apply(lambda row: calculate_statistics(row[site_columns]), axis=1)
    side_features = side_features.fillna(0)
    df_with_features = pd.concat([selected_df, side_features], axis=1)
    if good_or_bad == 'good':
        df_with_features['label'] = 0
    else:
        df_with_features['label'] = 1
    return df_with_features

In [13]:
oper, product = 'ST.TTS10', 'AEMNRM01N.0B01'
df_run = df_preprocess_pandas.query(f"OPE_NO == '{oper}' & PRODUCT_ID == '{product}'")
print(df_run.shape)
df_run.isna().any().any()

(3528, 11)


False

In [15]:
df_run_spark = ps.from_pandas(df_run).to_spark()
print((df_run_spark.count(), len(df_run_spark.columns)))
df_run_spark.show()

  fields = [
  for column, series in pdf.iteritems():


(3528, 11)
+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|     PARAMETRIC_NAME|SITE_COUNT|     AVERAGE|SITE3_VAL|SITE2_VAL|SITE1_VAL|SITE8_VAL|SITE4_VAL|
+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVcbMLPNP_5|       9.0|       -14.6|    -14.6|    -14.6|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVceMLPNP_5|       9.0|       -15.2|    -15.2|    -15.2|    -15.2|    -15.2|    -15.2|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|   BVjDNWDNW_S3_10nA|       9.0| 14.55555556|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVjDNWNW_S2_10nA|       9.0| 14.53333333|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|       BVjGR_S2_10nA|       9.0| 14.4

In [16]:
from pyspark.sql.functions import mean
means = df_run_spark.agg(*[mean(c).alias(c) for c in site_columns])
means.first().asDict()

{'SITE3_VAL': 3.117913832199546e+27,
 'SITE2_VAL': 3.6848072562358276e+27,
 'SITE1_VAL': 3.117913832199546e+27,
 'SITE8_VAL': 3.117913832199546e+27,
 'SITE4_VAL': 3.401360544217687e+27}

In [17]:
df_run_spark_filled = df_run_spark.fillna(means.first().asDict())
print((df_run_spark_filled.count(), len(df_run_spark_filled.columns)))

(3528, 11)


In [18]:
df_run_spark_filled.show()

+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|     PARAMETRIC_NAME|SITE_COUNT|     AVERAGE|SITE3_VAL|SITE2_VAL|SITE1_VAL|SITE8_VAL|SITE4_VAL|
+--------+--------------+---------+--------------------+----------+------------+---------+---------+---------+---------+---------+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVcbMLPNP_5|       9.0|       -14.6|    -14.6|    -14.6|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|         BVceMLPNP_5|       9.0|       -15.2|    -15.2|    -15.2|    -15.2|    -15.2|    -15.2|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|   BVjDNWDNW_S3_10nA|       9.0| 14.55555556|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVjDNWNW_S2_10nA|       9.0| 14.53333333|     14.4|     14.6|     14.4|     14.6|     14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|       BVjGR_S2_10nA|       9.0| 14.48888889|   

In [19]:
param = 'BVcbMLPNP_5'
df_run_spark_filled_goood = df_run_spark_filled.filter(f"PARAMETRIC_NAME == '{param}'").select(grpby_list 
                                                                                          + ['WAFER_ID', 'PARAMETRIC_NAME'] 
                                                                                          + good_site_columns)

df_run_spark_filled_bad = df_run_spark_filled.filter(f"PARAMETRIC_NAME == '{param}'").select(grpby_list 
                                                                                          + ['WAFER_ID', 'PARAMETRIC_NAME'] 
                                                                                          + bad_site_columns)

In [20]:
df_run_spark_filled_goood.show()

+--------+--------------+---------+---------------+---------+---------+---------+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|PARAMETRIC_NAME|SITE3_VAL|SITE2_VAL|SITE1_VAL|
+--------+--------------+---------+---------------+---------+---------+---------+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-01|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-23|    BVcbMLPNP_5|    -14.6|    -14.8|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-23|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-18|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-13|    BVcbMLPNP_5|    -14.6|    -14.8|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-18|    BVcbMLPNP_5|    -14.6|    -14.8|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-03|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-23|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|
|ST.TTS10|AEMNRM

In [24]:
from pyspark.sql.functions import expr, percentile_approx

In [21]:
columns_to_aggregate = ["SITE3_VAL", "SITE1_VAL", "SITE2_VAL"]
n = len(columns_to_aggregate)

average_expr = "({}) / {}".format(" + ".join(columns_to_aggregate), n)
print(average_expr)

min_expr = "LEAST({})".format(", ".join(columns_to_aggregate))
print(min_expr)

max_expr = "GREATEST({})".format(", ".join(columns_to_aggregate))
print(max_expr)

median_expr = "percentile_approx([{}], 0.5)".format(", ".join(columns_to_aggregate))
print(median_expr)

std_expr = "stddev_samp({})".format(", ".join(columns_to_aggregate))
print(std_expr)

(SITE3_VAL + SITE1_VAL + SITE2_VAL) / 3
LEAST(SITE3_VAL, SITE1_VAL, SITE2_VAL)
GREATEST(SITE3_VAL, SITE1_VAL, SITE2_VAL)
percentile_approx([SITE3_VAL, SITE1_VAL, SITE2_VAL], 0.5)
stddev_samp(SITE3_VAL, SITE1_VAL, SITE2_VAL)


In [22]:
median_expr

'percentile_approx([SITE3_VAL, SITE1_VAL, SITE2_VAL], 0.5)'

In [25]:
df_run_spark_filled_goood.withColumn("AVERAGE", expr(average_expr)).withColumn("MIN", expr(min_expr)).withColumn("MAX", expr(max_expr)).show()
# withColumn("MIN", expr(min_expr)).  \ 
# withColumn("MAX", expr(max_expr))  \ 
# # withColumn("MEDIAN", expr(median_expr)).  \ 
# # withColumn("STD", expr(std_expr)).show()

+--------+--------------+---------+---------------+---------+---------+---------+-------------------+-----+-----+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|PARAMETRIC_NAME|SITE3_VAL|SITE2_VAL|SITE1_VAL|            AVERAGE|  MIN|  MAX|
+--------+--------------+---------+---------------+---------+---------+---------+-------------------+-----+-----+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|-14.6|-14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-01|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|-14.6|-14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-23|    BVcbMLPNP_5|    -14.6|    -14.8|    -14.6|-14.666666666666666|-14.8|-14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-23|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|-14.6|-14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-18|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|-14.6|-14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-13|    BVcbMLPNP_5|    -14.6|    -14.8|    -14.6|-14.666

In [103]:
df_run_spark_filled_goood.select("*", expr(average_expr).alias("AVERAGE")).show()

+--------+--------------+---------+---------------+---------+---------+---------+-------------------+
|  OPE_NO|    PRODUCT_ID| WAFER_ID|PARAMETRIC_NAME|SITE3_VAL|SITE1_VAL|SITE2_VAL|            AVERAGE|
+--------+--------------+---------+---------------+---------+---------+---------+-------------------+
|ST.TTS10|AEMNRM01N.0B01|NHM078-15|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-01|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-23|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.8|-14.666666666666666|
|ST.TTS10|AEMNRM01N.0B01|NHM078-23|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHM078-18|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.6|              -14.6|
|ST.TTS10|AEMNRM01N.0B01|NHH106-13|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.8|-14.666666666666666|
|ST.TTS10|AEMNRM01N.0B01|NHH106-18|    BVcbMLPNP_5|    -14.6|    -14.6|    -14.8|-

In [87]:
# df_run_spark_filled_bad.show()

In [18]:
df_pandas_specific_ = process_missing_values_for_site(df=df_run, good_site_columns=good_site_columns,
                                                    bad_site_columns=bad_site_columns,
                                                    missing_value_threshold=0.7,
                                                    process_miss_site_mode='drop')
df_pandas_specific_

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,SITE_COUNT,AVERAGE,SITE3_VAL,SITE1_VAL,SITE2_VAL,SITE8_VAL,SITE4_VAL
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,9.0,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000
1,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVceMLPNP_5,9.0,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000
2,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWDNW_S3_10nA,9.0,14.555556,14.400000,14.400000,14.600000,14.600000,14.600000
3,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWNW_S2_10nA,9.0,14.533333,14.400000,14.400000,14.600000,14.600000,14.600000
4,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjGR_S2_10nA,9.0,14.488889,14.400000,14.400000,14.400000,14.600000,14.400000
...,...,...,...,...,...,...,...,...,...,...,...
13531,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsTX_p7p38,9.0,-0.056430,-0.007673,-0.042231,-0.063450,-0.065479,-0.043970
13532,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN1,9.0,0.192239,0.195248,0.189287,0.194373,0.190760,0.185206
13533,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN2,9.0,0.190023,0.188890,0.189869,0.190073,0.192523,0.182971
13534,ST.TTS10,AEMNRM01N.0B01,NHH106-23,Vtcsxdec_levelshift_nfet3,9.0,0.559120,0.544319,0.544663,0.550910,0.582070,0.546110


In [56]:
# df_pandas_specific_['SITE4_VAL'].value_counts()

In [23]:
side_with_features1 = calculate_site_stats(df_pandas_specific_, grpby_list, good_site_columns, good_or_bad='good')
side_with_features2 = calculate_site_stats(df_pandas_specific_, grpby_list, bad_site_columns, good_or_bad='bad')

In [29]:
side_with_features1

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,SITE3_VAL,SITE1_VAL,SITE2_VAL,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,label
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,0.000000e+00,0
1,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVceMLPNP_5,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,2.175584e-15,0
2,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWDNW_S3_10nA,14.400000,14.400000,14.600000,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
3,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWNW_S2_10nA,14.400000,14.400000,14.600000,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
4,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjGR_S2_10nA,14.400000,14.400000,14.400000,14.400000,14.400000,14.400000,14.400000,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3523,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsTX_p7p38,-0.007673,-0.042231,-0.063450,-0.007673,-0.063450,-0.042231,-0.037785,2.815304e-02,0
3524,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN1,0.195248,0.189287,0.194373,0.195248,0.189287,0.194373,0.192969,3.218557e-03,0
3525,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN2,0.188890,0.189869,0.190073,0.190073,0.188890,0.189869,0.189611,6.322904e-04,0
3526,ST.TTS10,AEMNRM01N.0B01,NHH106-23,Vtcsxdec_levelshift_nfet3,0.544319,0.544663,0.550910,0.550910,0.544319,0.544663,0.546630,3.709916e-03,0


In [30]:
side_with_features2

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,SITE8_VAL,SITE4_VAL,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,label
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,-14.600000,0.000000,1
1,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVceMLPNP_5,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,-15.200000,0.000000,1
2,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWDNW_S3_10nA,14.600000,14.600000,14.600000,14.600000,14.600000,14.600000,0.000000,1
3,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWNW_S2_10nA,14.600000,14.600000,14.600000,14.600000,14.600000,14.600000,0.000000,1
4,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjGR_S2_10nA,14.600000,14.400000,14.600000,14.400000,14.500000,14.500000,0.141421,1
...,...,...,...,...,...,...,...,...,...,...,...,...
3523,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsTX_p7p38,-0.065479,-0.043970,-0.043970,-0.065479,-0.054724,-0.054724,0.015209,1
3524,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN1,0.190760,0.185206,0.190760,0.185206,0.187983,0.187983,0.003928,1
3525,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN2,0.192523,0.182971,0.192523,0.182971,0.187747,0.187747,0.006754,1
3526,ST.TTS10,AEMNRM01N.0B01,NHH106-23,Vtcsxdec_levelshift_nfet3,0.582070,0.546110,0.582070,0.546110,0.564090,0.564090,0.025428,1


In [24]:
side_with_features1_select = side_with_features1[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN', 'AVERAGE', 'STD_DEV', 'label']]
side_with_features2_select = side_with_features2[grpby_list + ['WAFER_ID', 'PARAMETRIC_NAME', 'MAX_VAL', 'MIN_VAL', 'MEDIAN', 'AVERAGE', 'STD_DEV', 'label']]
side_with_features_all = pd.concat([side_with_features1_select, side_with_features2_select], axis=0)

In [61]:
side_with_features_all

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,label
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,-14.600000,-14.600000,-14.600000,-14.600000,0.000000e+00,0
1,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVceMLPNP_5,-15.200000,-15.200000,-15.200000,-15.200000,2.175584e-15,0
2,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWDNW_S3_10nA,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
3,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjDNWNW_S2_10nA,14.600000,14.400000,14.400000,14.466667,1.154701e-01,0
4,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVjGR_S2_10nA,14.400000,14.400000,14.400000,14.400000,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...
3523,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsTX_p7p38,-0.043970,-0.065479,-0.054724,-0.054724,1.520915e-02,1
3524,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN1,0.190760,0.185206,0.187983,0.187983,3.927625e-03,1
3525,ST.TTS10,AEMNRM01N.0B01,NHH106-23,VtcsVLN2,0.192523,0.182971,0.187747,0.187747,6.754213e-03,1
3526,ST.TTS10,AEMNRM01N.0B01,NHH106-23,Vtcsxdec_levelshift_nfet3,0.582070,0.546110,0.564090,0.564090,2.542770e-02,1


In [63]:
side_with_features_all1 = side_with_features_all[side_with_features_all['PARAMETRIC_NAME'] == 'BVcbMLPNP_5']
side_with_features_all1

Unnamed: 0,OPE_NO,PRODUCT_ID,WAFER_ID,PARAMETRIC_NAME,MAX_VAL,MIN_VAL,MEDIAN,AVERAGE,STD_DEV,label
0,ST.TTS10,AEMNRM01N.0B01,NHM078-15,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
294,ST.TTS10,AEMNRM01N.0B01,NHM078-01,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
588,ST.TTS10,AEMNRM01N.0B01,NHH106-23,BVcbMLPNP_5,-14.6,-14.8,-14.6,-14.666667,0.11547,0
882,ST.TTS10,AEMNRM01N.0B01,NHM078-23,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
1176,ST.TTS10,AEMNRM01N.0B01,NHM078-18,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
1470,ST.TTS10,AEMNRM01N.0B01,NHH106-13,BVcbMLPNP_5,-14.6,-14.8,-14.6,-14.666667,0.11547,0
1764,ST.TTS10,AEMNRM01N.0B01,NHH106-18,BVcbMLPNP_5,-14.6,-14.8,-14.6,-14.666667,0.11547,0
2058,ST.TTS10,AEMNRM01N.0B01,NHM078-03,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
2352,ST.TTS10,AEMNRM01N.0B01,NHM078-23,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0
2646,ST.TTS10,AEMNRM01N.0B01,NHM078-18,BVcbMLPNP_5,-14.6,-14.6,-14.6,-14.6,0.0,0


In [65]:
index_list = ['WAFER_ID', 'label']
columns_list = grpby_list + ['PARAMETRIC_NAME']
values_list = side_with_features_all1.columns.difference(['WAFER_ID', 'PARAMETRIC_NAME', 'label'] + grpby_list)
pivot_result = side_with_features_all1.pivot_table(index=index_list,
                              columns=columns_list,
                              values=values_list)
pivot_result.columns = pivot_result.columns.map('#'.join)
pivot_result

Unnamed: 0_level_0,Unnamed: 1_level_0,AVERAGE#ST.TTS10#AEMNRM01N.0B01#BVcbMLPNP_5,MAX_VAL#ST.TTS10#AEMNRM01N.0B01#BVcbMLPNP_5,MEDIAN#ST.TTS10#AEMNRM01N.0B01#BVcbMLPNP_5,MIN_VAL#ST.TTS10#AEMNRM01N.0B01#BVcbMLPNP_5,STD_DEV#ST.TTS10#AEMNRM01N.0B01#BVcbMLPNP_5
WAFER_ID,label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
NHH106-13,0,-14.666667,-14.6,-14.6,-14.8,0.11547
NHH106-13,1,-14.8,-14.8,-14.8,-14.8,0.0
NHH106-18,0,-14.666667,-14.6,-14.6,-14.8,0.11547
NHH106-18,1,-14.8,-14.8,-14.8,-14.8,0.0
NHH106-23,0,-14.666667,-14.6,-14.6,-14.8,0.11547
NHH106-23,1,-14.8,-14.8,-14.8,-14.8,0.0
NHM078-01,0,-14.6,-14.6,-14.6,-14.6,0.0
NHM078-01,1,-14.6,-14.6,-14.6,-14.6,0.0
NHM078-03,0,-14.6,-14.6,-14.6,-14.6,0.0
NHM078-03,1,-14.6,-14.6,-14.6,-14.6,0.0
