In [1]:
import pandas as pd
import pyspark.pandas as ps
import requests
import json

from sqlalchemy import create_engine
from pca import pca
from pyspark.sql.functions import pandas_udf, PandasUDFType, max, col, countDistinct, when, rank, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import ClusterCentroids

# from backend_spark.doris_common.doris_client import DorisClient
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional



In [22]:
import pyspark.sql.dataframe
from typing import List, Dict, Union

In [2]:
import os 
from pyspark.sql import SparkSession

os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '1024m') \
    .config('spark.driver.cores', '3') \
    .config('spark.executor.memory', '1024m') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '2') \
    .config('spark.driver.host','192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [7]:
############################################################################
##############################从kafka消息读取需要的资料#########################
############################################################################
def get_some_info(df:pd.DataFrame):
    if len(df) > 0:
        df = df.head(1)

    request_id = df["requestId"].values[0]
    request_params = df["requestParam"].values[0]
    # 避免存在单引号，因为json 引号只有双引号
    request_params = request_params.replace('\'', "\"")   
    parse_dict = json.loads(request_params)
    grpby_list = parse_dict[0]['grpby_list']
    
    try:
        merge_operno = list(parse_dict[0]['mergeOperno'])
    except KeyError:
        merge_operno = None

    return parse_dict, request_id, grpby_list, merge_operno

In [8]:
# 真正的kafka消息里全都是双引号
json_loads_dict = {
    "requestId": "small",
    "requestParam": [
        {'dateRange': [{'start': "2023-12-01 00:00:00", 'end': "2024-01-15 00:00:00"}], 
         'lot': [], 
         'operNo': ["1G.EEG1R","1G.PPB10"], 
         'prodg1': [], 
         'productId': [], 
         'eqp': [], 
         'tool': [], 
         'recipeName': [], 
         'waferId': {'good': ["NBX392-15","NBX392-20","NBX392-24","NBX391-24","NBX391-25","NBX548-09",
                     "NBX391-01","NBX391-02","NBX391-13","NBX391-17"], 
                     'bad': ["NBX500-10","NBX500-01","NBX500-09"]}, 
         'uploadId': '20240110170016023', 
         'grpby_list': ['OPER_NO', 'TOOL_NAME'],
#          'mergeOperno': [{"2F.CDS10_XX.TDS01": ["2F.CDS10", "XX.TDS01"]},
#                            {"2F.CDS20_XX.CDS20": ["2F.CDS20", "XX.CDS20"]}]
        }
    ]
}

df_pa = pd.DataFrame({
    "requestId": [json_loads_dict["requestId"]], 
    "requestParam": [json.dumps(json_loads_dict["requestParam"])]})

df1 = ps.from_pandas(df_pa).to_spark()

  fields = [
  for column, series in pdf.iteritems():


In [10]:
#  1. 解析json 为字典， df1为kafka输入的结果数据，获取到parse_dict, request_id, grpby_list
df2 = df1.toPandas() 
parse_dict, request_id, grpby_list, merge_operno = get_some_info(df2)
print("parse_dict是：", parse_dict)
print("parse_dict的类型是：", type(parse_dict))
print("request_id是：", request_id)
print("grpby_list是：", grpby_list)
print("merge_operno是：", merge_operno)

# 2. 从kafka 关键字映射都具体数据源中的字段,没有的可以删除
# keyword_map_from_json_to_table: dict = {
#     "prodg1": "PRODG1",
#     "waferId": "WAFER_ID",
#     "dateRange": "START_TIME",
#     "productId": "PRODUCT_ID",
#     "operNo": "OPER_NO",
#     "eqp": "EQP_NAME",
#     "tool": "TOOL_NAME",
#     "lot": "LOT_ID",
#     "recipeName": "RECIPE_NAME"}

# # 3. 获取查询条件list
# select_condition_list = parse_dict

# # 4. 指定查询表名, 根据实际情况需要修改
# table_name = "etl.DWD_POC_CASE_FD_UVA_DATA_TEST"

parse_dict是： [{'dateRange': [{'start': '2023-12-01 00:00:00', 'end': '2024-01-15 00:00:00'}], 'lot': [], 'operNo': ['1G.EEG1R', '1G.PPB10'], 'prodg1': [], 'productId': [], 'eqp': [], 'tool': [], 'recipeName': [], 'waferId': {'good': ['NBX392-15', 'NBX392-20', 'NBX392-24', 'NBX391-24', 'NBX391-25', 'NBX548-09', 'NBX391-01', 'NBX391-02', 'NBX391-13', 'NBX391-17'], 'bad': ['NBX500-10', 'NBX500-01', 'NBX500-09']}, 'uploadId': '20240110170016023', 'grpby_list': ['OPER_NO', 'TOOL_NAME']}]
parse_dict的类型是： <class 'list'>
request_id是： small
grpby_list是： ['OPER_NO', 'TOOL_NAME']
merge_operno是： None


In [117]:
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/uva_algorithm/small_samples_data/small1_labeled.csv")
df_pandas.shape

(3921, 39)

In [118]:
df_pandas

Unnamed: 0,TOOL_ID,TOOL_NAME,RUN_ID,EQP_NAME,CASE_INFO,PRODUCT_ID,PRODG1,OPER_NO,LOT_ID,WAFER_ID,...,UPPER_OUTLIER,RULES_ENABLED,ALARM_RULE,RESULT,STATUS,REGION,ERROR_MSG,STATISTIC_RESULT,VERSION,label
0,6739,SCT07_4-1,1516389,SCT07,2023-12-08,AFPNR901N.0B0L,L2800Z2N,2U.WSC50,NBX293.200,NBX293-06,...,,1,,72.495300,NORMAL,NORMAL,,72.495300,4,1
1,6739,SCT07_4-1,1516389,SCT07,2023-12-08,AFPNR901N.0B0L,L2800Z2N,2U.WSC50,NBX293.200,NBX293-06,...,100.310000,1,,78.785714,NORMAL,UPPER_NORMAL,,78.785714,114,1
2,6739,SCT07_4-1,1516389,SCT07,2023-12-08,AFPNR901N.0B0L,L2800Z2N,2U.WSC50,NBX293.200,NBX293-06,...,81.668838,1,,79.930769,NORMAL,LOWER_NORMAL,,79.930769,114,1
3,6740,SCT07_4-2,1498463,SCT07,2023-12-08,AFPNR901N.0B0L,L2800Z2N,2U.WSC50,NBX293.200,NBX293-07,...,0.440000,1,,0.272773,NORMAL,LOWER_NORMAL,,0.272773,114,1
4,6740,SCT07_4-2,1498463,SCT07,2023-12-08,AFPNR901N.0B0L,L2800Z2N,2U.WSC50,NBX293.200,NBX293-07,...,,1,,71.045500,NORMAL,NORMAL,,71.045500,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3916,5444,DGA01_A1,1187372,DGA01,2023-12-16,AFPZM801N.0A01,L2800Z3N,2U.CDG20,NBX265.000,NBX265-07,...,,1,,0.025200,NORMAL,UPPER_NORMAL,,0.025200,6,1
3917,5444,DGA01_A1,1187372,DGA01,2023-12-16,AFPZM801N.0A01,L2800Z3N,2U.CDG20,NBX265.000,NBX265-07,...,,1,,4.185000,NORMAL,NORMAL,,4.185000,2,1
3918,5444,DGA01_A1,1187372,DGA01,2023-12-16,AFPZM801N.0A01,L2800Z3N,2U.CDG20,NBX265.000,NBX265-07,...,,1,,,ERROR,ERROR,No Match Found for Start criteria [ Step_Name ...,,2,1
3919,5444,DGA01_A1,1187372,DGA01,2023-12-16,AFPZM801N.0A01,L2800Z3N,2U.CDG20,NBX265.000,NBX265-07,...,,1,,,ERROR,ERROR,No Match Found for Start criteria [ Step_Numbe...,,56,1


In [7]:
# df_pandas[df_pandas['label'] == 0]['WAFER_ID'].unique()
# df_pandas[df_pandas['label'] == 1]['WAFER_ID'].unique()

In [119]:
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

  fields = [
  for column, series in pdf.iteritems():


3921

In [14]:
#####自己打标签
############################################
######## 1. 客户只定义了bad_wafer = []是什么  ########
############################################
# 将传进来的BAD_WAFER, 用 | 连接起来，
# F.col('WAFER_ID').like('NDJ065%') | F.col('WAFER_ID').like('NDJ067%') 作为条件传入增加label
# 同时将isin模式也作为条件传入增加label

def get_label_single(df, bad_wafer):
    like_conditions = [f"col('WAFER_ID').like('{bad}')" for bad in bad_wafer]
    all_like_conditions = " | ".join(like_conditions)
    isin_conditions = "col('WAFER_ID').isin(bad_wafer)"
    df = df.withColumn('label', 
                when( eval(all_like_conditions) | eval(isin_conditions), int(1)).otherwise(int(0)))
    return df 


############################################
## 2. 客户定义了bad_wafer = [] 和 good_wafer = []######
############################################
# 将传进来的BAD_WAFER, 用 | 连接起来，
# 将传进来的GOOD_WAFER, 也用 | 连接起来，
# 同时将isin模式也作为条件传入增加label

def get_label_double(df, bad_wafer, good_wafer):
    good_like_conditions = [f"col('WAFER_ID').like('{good}')" for good in good_wafer]
    all_good_like_conditions = " | ".join(good_like_conditions)
    good_isin_conditions = "col('WAFER_ID').isin(good_wafer)"

    bad_like_conditions = [f"col('WAFER_ID').like('{bad}')" for bad in bad_wafer]
    all_bad_like_conditions = " | ".join(bad_like_conditions)
    bad_isin_conditions = "col('WAFER_ID').isin(bad_wafer)"

    df = df.withColumn('label',  when(eval(all_good_like_conditions) | eval(good_isin_conditions), int(0)).when(eval(all_bad_like_conditions) | eval(bad_isin_conditions), int(1)).otherwise(222333))
    df = df.filter(df['label'] != int(222333))
    return df

In [120]:
###################################################################
##########################融合OPER_NO字段##########################
###################################################################
def integrate_operno(df: pyspark.sql.dataframe,
                     merge_operno_list: List[Dict[str, List[str]]]) -> pyspark.sql.dataframe:
    """
    Integrate the 'OPER_NO' column in the DataFrame based on the provided merge_operno_list.
    :param df: The input DataFrame.
    :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged.
           Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                     {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
    :return: DataFrame with 'OPER_NO' column integrated according to the merge_operno_list.
    """
    if merge_operno_list is not None and len(merge_operno_list) > 0:
        # Extract values from each dictionary in merge_operno_list and create a list
        values_to_replace = [list(rule.values())[0] for rule in merge_operno_list]
        # Concatenate values from each dictionary
        merged_values = ["_".join(list(rule.values())[0]) for rule in merge_operno_list]

        # Replace values in 'OPER_NO' column based on the rules defined in merge_operno_list
        for values, replacement_value in zip(values_to_replace, merged_values):
            df = df.withColumn("OPER_NO",
                               when(col("OPER_NO").isin(values), replacement_value).otherwise(col("OPER_NO")))
        return df
    else:
        return df

In [24]:
############################################################################
##################################FDC数据预处理###############################
############################################################################
def pre_process(df: pyspark.sql.dataframe) -> pyspark.sql.dataframe:
    """
     Preprocess the data extracted from the database for a specific CASE.
    :param df: Data for a specific CASE retrieved from the database.
    :return: Preprocessed data with relevant columns and filters applied.
    """
    # Select only the columns that will be used
    df = df.select('WAFER_ID', 'TOOL_ID', 'RUN_ID', 'EQP_NAME', 'PRODUCT_ID', 'PRODG1', 'TOOL_NAME',
                   'OPER_NO', 'parametric_name', 'STATISTIC_RESULT', 'label')
    # Remove rows with missing values in 'STATISTIC_RESULT' column
    df = df.filter(col('STATISTIC_RESULT').isNotNull())
    # Drop duplicates based on all columns
    df1 = df.dropDuplicates()
    # Select the rows with the latest 'RUN_ID' for each combination of 'WAFER_ID', 'OPER_NO', 'TOOL_ID'
    df2 = df1.groupBy('WAFER_ID', 'OPER_NO', 'TOOL_ID').agg(max('RUN_ID').alias('RUN_ID'))
    df_run = df1.join(df2.dropDuplicates(subset=['WAFER_ID', 'OPER_NO', 'TOOL_ID', 'RUN_ID']),
                      on=['WAFER_ID', 'OPER_NO', 'TOOL_ID', 'RUN_ID'], how='inner')
    return df_run



def commonality_analysis(df_run: pyspark.sql.dataframe, grpby_list: List[str]) -> pyspark.sql.dataframe:
    """
    Perform commonality analysis on preprocessed data.
    :param df_run: Preprocessed data after data preprocessing.
    :param grpby_list: List of columns ['PRODG1', 'EQP_NAME', 'OPER_NO', 'PRODUCT_ID', 'TOOL_NAME'] for grouping.
            Example: grpby_list = ['PRODG1', 'TOOL_NAME', 'OPER_NO'], grpby_list = ['PRODUCT_ID', 'OPER_NO']
    :return: Results of commonality analysis, showing the top ten combinations with the highest number of bad wafers.
    """
    grps = (df_run.groupBy(grpby_list)
            .agg(countDistinct('WAFER_ID').alias('wafer_count'),
                 countDistinct('WAFER_ID', when(df_run['label'] == 0, 1)).alias('good_num'),
                 countDistinct('WAFER_ID', when(df_run['label'] == 1, 1)).alias('bad_num'))
            .orderBy('bad_num', ascending=False))

    # Handle the case of a single OPER_NO or single TOOL_NAME
    if grps.count() == 1:
        return grps
    else:
        # Filter out groups with no bad wafers
        grps = grps.filter(grps['bad_num'] > 0)
        # Rank the groups based on the number of bad wafers
        window_sep = Window().orderBy(col("bad_num").desc())
        ranked_df = grps.withColumn("rank", rank().over(window_sep))
        # Select the top ten groups and remove the 'rank' column
        grpss = ranked_df.filter(col("rank") <= 10).drop("rank")
        return grpss

In [121]:
df_run = pre_process(df1)
print(df_run.count())

2769


In [122]:
grpby_list = ['EQP_NAME', 'OPER_NO', 'TOOL_NAME']
grpby_list

['EQP_NAME', 'OPER_NO', 'TOOL_NAME']

In [123]:
common_res = commonality_analysis(df_run, grpby_list)
common_res.show()

+--------+--------+---------------+-----------+--------+-------+
|EQP_NAME| OPER_NO|      TOOL_NAME|wafer_count|good_num|bad_num|
+--------+--------+---------------+-----------+--------+-------+
|   EUT0B|1V.EEU10|      EUT0B_PM6|          5|       0|      5|
|   MTA03|2U.CMT10|        MTA03_4|          4|       0|      4|
|   WTL0A|1V.WWK20|       WTL0A_L1|          2|       0|      2|
|   DGA03|2U.CDG10|       DGA03_C1|          2|       0|      2|
|   PBT01|1V.PPB10|PBT01_CGHA_4-14|          2|       0|      2|
|   DGA03|2U.CDG10|       DGA03_C2|          2|       0|      2|
|   MTN01|2U.CMT20|        MTN01_6|          2|       0|      2|
|   SCT06|2U.WSC50|      SCT06_5-3|          2|       0|      2|
|   DBA71|2U.CDB10|       DBA71_B1|          2|       0|      2|
|   SCT07|2U.WSC40|      SCT07_4-4|          2|       0|      2|
|   DBA51|2U.CDB20|       DBA51_B2|          2|       0|      2|
|   SCT07|2U.WSC10|      SCT07_4-3|          2|       0|      2|
|   MTN52|2U.CMT20|      

In [124]:
###########################################################################
#################################获取样本数据#########################
############################################################################
def get_data_list(common_res: pyspark.sql.dataframe,
                  grpby_list: List[str],
                  big_or_small: str = 'big') -> List[Dict[str, str]]:
    """
    Get a list of dictionaries for corresponding groups based on commonality analysis.

    :param common_res: Result of commonality analysis.
    :param grpby_list:  List of columns ['PRODG1', 'EQP_NAME', 'OPER_NO', 'PRODUCT_ID', 'TOOL_NAME'] for grouping.
    :param big_or_small: 'big' or 'small'.
    :return: List of dictionaries for corresponding groups.
            Example: [{'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN2J01N.0U01'},
                      {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN4X01N.0B01'},
                      {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFGN1501N.0C02'}]
    """
    assert big_or_small in ['big', 'small'], "Choose only 'big' or 'small'. Please check the spelling."

    # Filter groups based on big or small sample conditions
    if big_or_small == 'big':
        good_bad_grps = common_res.filter("good_num >= 3 AND bad_num >= 3")
    else:
        good_bad_grps = common_res.filter("bad_num >= 1 AND wafer_count >= 2")

    # Order the results and limit to the top 10 groups
    good_bad_grps = good_bad_grps.orderBy(col("bad_num").desc(), col("wafer_count").desc(),
                                          col("good_num").desc()).limit(10)

    # Collect the data and convert it into a list of dictionaries
    data_list = good_bad_grps[grpby_list].collect()
    data_dict_list = [row.asDict() for row in data_list]
    return data_dict_list


def get_train_data(df_run: pyspark.sql.dataframe, data_dict_list: List[Dict[str, str]]) -> pyspark.sql.dataframe:
    """
    Get the actual combination data for modeling from the original data.

    :param df_run: Preprocessed data after data preprocessing.
    :param data_dict_list: List of dictionaries with filtering conditions.
           Example: [{'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN2J01N.0U01'},
                      {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN4X01N.0B01'},
                      {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFGN1501N.0C02'}]
    :return: Filtered data for modeling.
    """
    # Get the filtering conditions for the first data dictionary
    first_data_dict = data_dict_list[0]
    conditions = " AND ".join(["{} == '{}'".format(col_, first_data_dict[col_]) for col_ in first_data_dict])
    # Filter the data for the first condition
    df_s = df_run.filter(conditions)

    # Loop through the remaining data dictionaries and filter the data accordingly
    for i in range(1, len(data_dict_list)):
        data_dict = data_dict_list[i]
        conditions = " AND ".join(["{} == '{}'".format(col_, data_dict[col_]) for col_ in data_dict])
        df_m = df_run.filter(conditions)
        df_s = df_s.union(df_m)
    return df_s

In [125]:
data_dict_list_ss = get_data_list(common_res, grpby_list, big_or_small='small')
data_dict_list_ss

[{'EQP_NAME': 'EUT0B', 'OPER_NO': '1V.EEU10', 'TOOL_NAME': 'EUT0B_PM6'},
 {'EQP_NAME': 'MTA03', 'OPER_NO': '2U.CMT10', 'TOOL_NAME': 'MTA03_4'},
 {'EQP_NAME': 'WTL0A', 'OPER_NO': '1V.WWK20', 'TOOL_NAME': 'WTL0A_L1'},
 {'EQP_NAME': 'DGA03', 'OPER_NO': '2U.CDG10', 'TOOL_NAME': 'DGA03_C1'},
 {'EQP_NAME': 'DGA03', 'OPER_NO': '2U.CDG10', 'TOOL_NAME': 'DGA03_C2'},
 {'EQP_NAME': 'SCT06', 'OPER_NO': '2U.WSC50', 'TOOL_NAME': 'SCT06_5-3'},
 {'EQP_NAME': 'PBT01', 'OPER_NO': '1V.PPB10', 'TOOL_NAME': 'PBT01_CGHA_4-14'},
 {'EQP_NAME': 'MTN01', 'OPER_NO': '2U.CMT20', 'TOOL_NAME': 'MTN01_6'},
 {'EQP_NAME': 'DBA71', 'OPER_NO': '2U.CDB10', 'TOOL_NAME': 'DBA71_B1'},
 {'EQP_NAME': 'SCT07', 'OPER_NO': '2U.WSC40', 'TOOL_NAME': 'SCT07_4-4'}]

In [126]:
df_run_ss = get_train_data(df_run, data_dict_list_ss)
df_run_ss.count()

660

In [108]:
############################################################################
#########################获取传入的整个数据中的所有bad_wafer个数############
############################################################################
def get_all_bad_wafer_num(df: pyspark.sql.dataframe) -> int:
    """
    Get the number of distinct bad WAFER in the DataFrame.
    """
    return df.filter("label == 1").select('WAFER_ID').distinct().count()

In [127]:
bad_wafer_num_small_sample = get_all_bad_wafer_num(df_run_ss)
bad_wafer_num_small_sample

5

In [130]:
df_run_ss_pandas = df_run_ss.toPandas()

In [142]:
i = 6
oper, tool = data_dict_list_ss[i]['OPER_NO'], data_dict_list_ss[i]['TOOL_NAME']
df_run_ss_pandas1 = df_run_ss_pandas.query(F"OPER_NO == '{oper}' & TOOL_NAME == '{tool}'")
df_pivot = get_pivot_table(df=df_run_ss_pandas1, grpby_list=grpby_list)
df_pivot

Unnamed: 0,WAFER_ID,label
0,NBX265-06,1
1,NBX293-06,1


In [165]:
def get_pca_pandas(df_run):
    df_pivot = get_pivot_table(df=df_run, grpby_list=grpby_list)
    # 由于是小样本，再重新copy一份制造多一点数据传给PCA模型
    df_pivot_copy = df_pivot.copy()
    df_pivot_all = pd.concat([df_pivot, df_pivot_copy], axis=0)

    # 定义自变量
    x_train = df_pivot_all[df_pivot_all.columns.difference(['WAFER_ID', 'label']).tolist()]
    print(x_train.shape)
    
    if min(x_train.shape) <= 0:
        return None

    n_components = min(min(x_train.shape) - 2, 20)
    model = pca(n_components=n_components, verbose=None)
    results = model.fit_transform(x_train)
    res_top = results['topfeat']
    res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
    res_top_select['importance'] = abs(res_top_select['loading'])
    res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading", axis=1).drop_duplicates()

    # 增加一些字段信息
    res_top_select['bad_wafer'] = sum(df_pivot['label'])
    for col_ in grpby_list:
        res_top_select[col_] = df_run[col_].values[0]
    return res_top_select

In [166]:
columns = ['features', 'importance', 'bad_wafer', 'EQP_NAME', 'OPER_NO', 'TOOL_NAME']
empty_df = pd.DataFrame(columns=columns)

for i in range(len(data_dict_list_ss)):
    oper, tool = data_dict_list_ss[i]['OPER_NO'], data_dict_list_ss[i]['TOOL_NAME']
    df_run_ss_pandas1 = df_run_ss_pandas.query(F"OPER_NO == '{oper}' & TOOL_NAME == '{tool}'")
    resss = get_pca_pandas(df_run_ss_pandas1)
    empty_df = pd.concat([empty_df, resss])

(10, 6)
(8, 18)
(4, 19)
(4, 30)
(4, 31)
(4, 17)
(4, 0)
(4, 5)
(4, 11)
(4, 13)


  rad_cc = (xct**2 / (width / 2.)**2) + (yct**2 / (height / 2.)**2)
  width, height = 2 * n_std * np.sqrt(vals)
  width, height = 2 * n_std * np.sqrt(vals)
  width, height = 2 * n_std * np.sqrt(vals)
  width, height = 2 * n_std * np.sqrt(vals)


In [167]:
empty_df

Unnamed: 0,features,importance,bad_wafer,EQP_NAME,OPER_NO,TOOL_NAME
0,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.862031,5,EUT0B,1V.EEU10,EUT0B_PM6
1,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.862479,5,EUT0B,1V.EEU10,EUT0B_PM6
2,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS9...,0.771763,5,EUT0B,1V.EEU10,EUT0B_PM6
3,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.755538,5,EUT0B,1V.EEU10,EUT0B_PM6
0,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#KITS_L...,0.707122,4,MTA03,2U.CMT10,MTA03_4
1,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#DC_VOL...,0.753177,4,MTA03,2U.CMT10,MTA03_4
2,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#DC_VOL...,0.75991,4,MTA03,2U.CMT10,MTA03_4
3,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#TARGET...,0.706517,4,MTA03,2U.CMT10,MTA03_4
4,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#CHAMBE...,1.0,4,MTA03,2U.CMT10,MTA03_4
5,STATISTIC_RESULT#MTA03#2U.CMT10#MTA03_4#CHAMBE...,0.999002,4,MTA03,2U.CMT10,MTA03_4


In [170]:
##########################################################################################
#######################################对bad>=1的数据，用pca建模##############################
##########################################################################################
def get_pivot_table(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
    """
    Pivot the DataFrame based on specified grouping columns.

    Parameters:
    - df: Data for modeling.
    - grpby_list: List of grouping columns.

    Returns:
    - DataFrame: Result of pivoting the table.
    """
    index_cols = ['WAFER_ID', 'label']
    columns_cols = grpby_list + ['parametric_name']
    df_pivot = df.dropna(axis=0).pivot_table(index=index_cols,
                                             columns=columns_cols,
                                             values=['STATISTIC_RESULT'])
    df_pivot.columns = df_pivot.columns.map('#'.join)
    df_pivot = df_pivot.fillna(df_pivot.mean()).reset_index(drop=False)
    
    # Remove completely identical columns
    for column in df_pivot.columns.difference(index_cols):
        if df_pivot[column].nunique() == 1:
            df_pivot = df_pivot.drop(column, axis=1)
    return df_pivot


def fit_pca_small_sample(df: pyspark.sql.dataframe, grpby_list: List[str]) -> pyspark.sql.dataframe:
    """
    Fit a PCA model on the train data. It is for small sample method(bad_wafer_num >= 1 AND wafer_count >= 2)

    Parameters:
    - df: Data for modeling.
    - grpby_list: List of grouping columns.

    Returns:
    - DataFrame: Combined dataframe of every feature and its importance in each combination of grpby_list after PCA modeling.
    """
    # Dynamically build schema according to the grpby_list
    struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
    struct_fields.extend([StructField("features", StringType(), True),
                          StructField("importance", FloatType(), True),
                          StructField("bad_wafer", IntegerType(), True)])
    schema_all = StructType(struct_fields)

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_model_result(df_run):
        df_pivot = get_pivot_table(df=df_run, grpby_list=grpby_list)
        # 由于是小样本，再重新copy一份制造多一点数据传给PCA模型
        df_pivot_copy = df_pivot.copy()
        df_pivot_all = pd.concat([df_pivot, df_pivot_copy], axis=0)

        # 定义自变量
        x_train = df_pivot_all[df_pivot_all.columns.difference(['WAFER_ID', 'label']).tolist()]
        if min(x_train.shape) <= 0:
            return pd.DataFrame()
        
        n_components = min(min(x_train.shape) - 2, 20)
        model = pca(n_components=n_components, verbose=None)
        results = model.fit_transform(x_train)
        res_top = results['topfeat']
        res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
        res_top_select['importance'] = abs(res_top_select['loading'])
        res_top_select = res_top_select.rename(columns={'feature': 'features'}).drop("loading", axis=1).drop_duplicates()

        # 增加一些字段信息
        res_top_select['bad_wafer'] = sum(df_pivot['label'])
        for col_ in grpby_list:
            res_top_select[col_] = df_run[col_].values[0]
        return res_top_select
    return df.groupby(grpby_list).apply(get_model_result)

In [171]:
res = fit_pca_small_sample(df=df_run_ss, grpby_list=grpby_list)
res.show()



+--------+--------+---------+--------------------+----------+---------+
|EQP_NAME| OPER_NO|TOOL_NAME|            features|importance|bad_wafer|
+--------+--------+---------+--------------------+----------+---------+
|   DBA71|2U.CDB10| DBA71_B1|STATISTIC_RESULT#...|0.77276725|        2|
|   DBA71|2U.CDB10| DBA71_B1|STATISTIC_RESULT#...| 0.9843439|        2|
|   DGA03|2U.CDG10| DGA03_C1|STATISTIC_RESULT#...|0.85396254|        2|
|   DGA03|2U.CDG10| DGA03_C1|STATISTIC_RESULT#...| 0.8539707|        2|
|   DGA03|2U.CDG10| DGA03_C2|STATISTIC_RESULT#...| 0.8539624|        2|
|   DGA03|2U.CDG10| DGA03_C2|STATISTIC_RESULT#...|0.85397077|        2|
|   EUT0B|1V.EEU10|EUT0B_PM6|STATISTIC_RESULT#...|0.86203057|        5|
|   EUT0B|1V.EEU10|EUT0B_PM6|STATISTIC_RESULT#...|0.86247903|        5|
|   EUT0B|1V.EEU10|EUT0B_PM6|STATISTIC_RESULT#...| 0.7717628|        5|
|   EUT0B|1V.EEU10|EUT0B_PM6|STATISTIC_RESULT#...| 0.7555381|        5|
|   MTA03|2U.CMT10|  MTA03_4|STATISTIC_RESULT#...| 0.7071218|   

In [172]:
resppp = res.toPandas()

In [173]:
resppp

Unnamed: 0,EQP_NAME,OPER_NO,TOOL_NAME,features,importance,bad_wafer
0,DBA71,2U.CDB10,DBA71_B1,STATISTIC_RESULT#DBA71#2U.CDB10#DBA71_B1#UV_LA...,0.772767,2
1,DBA71,2U.CDB10,DBA71_B1,STATISTIC_RESULT#DBA71#2U.CDB10#DBA71_B1#HEATE...,0.984344,2
2,DGA03,2U.CDG10,DGA03_C1,STATISTIC_RESULT#DGA03#2U.CDG10#DGA03_C1#TOTAL...,0.853963,2
3,DGA03,2U.CDG10,DGA03_C1,STATISTIC_RESULT#DGA03#2U.CDG10#DGA03_C1#TOTAL...,0.853971,2
4,DGA03,2U.CDG10,DGA03_C2,STATISTIC_RESULT#DGA03#2U.CDG10#DGA03_C2#TOTAL...,0.853962,2
5,DGA03,2U.CDG10,DGA03_C2,STATISTIC_RESULT#DGA03#2U.CDG10#DGA03_C2#TOTAL...,0.853971,2
6,EUT0B,1V.EEU10,EUT0B_PM6,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.862031,5
7,EUT0B,1V.EEU10,EUT0B_PM6,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.862479,5
8,EUT0B,1V.EEU10,EUT0B_PM6,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS9...,0.771763,5
9,EUT0B,1V.EEU10,EUT0B_PM6,STATISTIC_RESULT#EUT0B#1V.EEU10#EUT0B_PM6#GAS1...,0.755538,5


In [112]:
#####################################################################################
##################################对bad>=1建模后的结果进行整合############################
#####################################################################################
def split_features(df: pd.DataFrame, index: int) -> str:
    """
    Split the 'features' column based on the specified index.

    Parameters:
    - df: RandomForest modeling results with 'features' column.
    - index: Order value.

    Returns:
    - str: Field attribute value.
    """
    return df['features'].apply(lambda x: x.split('#')[index])


def get_split_feature_importance_table(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
    """
    Get the table after splitting the 'features' column based on the specified grouping columns.

    Parameters:
    - df: RandomForest modeling results with 'features' column.
    - grpby_list: List of grouping columns.

    Returns:
    - DataFrame: Table after splitting features.
    """
    n_feats = len(grpby_list)
    for i in range(n_feats):
        df[grpby_list[i]] = split_features(df, i + 1)

    df['parametric_name'] = split_features(df, n_feats + 1)
    df['step'] = split_features(df, n_feats + 2)
    df['stats'] = split_features(df, n_feats + 3)
    df = df.drop(['features'], axis=1).reset_index(drop=True)
    return df


def add_feature_stats(df: pd.DataFrame, grpby_list: List[str]) -> pd.DataFrame:
    """
    Add a column with all statistical features of parameters.

    Parameters:
    - df: Feature importance table after processing.
    - grpby_list: List of grouping columns.

    Returns:
    - DataFrame: New column containing all statistical features: 'feature_stats'.
    """
    grpby_list_extend = grpby_list + ['parametric_name', 'step']
    feature_stats = df.groupby(grpby_list_extend)['stats'].unique().reset_index()
    feature_stats['stats'] = [feature_stats['stats'].iloc[i].tolist() for i in range(len(feature_stats))]
    feature_stats['stats'] = feature_stats['stats'].apply(lambda x: "#".join(x))
    feature_stats = feature_stats.assign(
        parametric_name=lambda x: x['parametric_name'] + str('#') + x['step']).drop(
        'step', axis=1)
    return feature_stats


def split_calculate_features_small_sample(df: pyspark.sql.dataframe, grpby_list: List[str]) -> pyspark.sql.dataframe:
    """
    param df: PCA建模后的结果
    param by: 分组字段
    return: features和importance结果
    """
    # Dynamically build schema
    struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
    struct_fields.extend([StructField("parametric_name", StringType(), True),
                          StructField("importance", FloatType(), True),
                          StructField("bad_wafer", IntegerType(), True),
                          StructField("stats", StringType(), True),])
    schema_all = StructType(struct_fields)

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_result(model_results):        
        feature_importance_table = model_results[['features',  'importance', 'bad_wafer']].dropna(axis=0)
        # 分裂features
        feature_importance_res_split = get_split_feature_importance_table(df=feature_importance_table, grpby_list=grpby_list)

        # 新增一列，含有参数的所有统计特征:feature_stats
        feature_stats = add_feature_stats(df=feature_importance_res_split, grpby_list=grpby_list)

        #对同一种组合里的同一个参数进行求和:feature_importance_groupby
        feature_importance_groupby = (feature_importance_res_split.groupby(grpby_list + 
                                                    ['bad_wafer', 'parametric_name', 'step'])['importance'].sum().reset_index())
        feature_importance_groupby = feature_importance_groupby.assign(parametric_name=lambda x: x['parametric_name']+str('#')+x['step']).drop('step', axis=1)

        # feature_stats和feature_importance_groupby连接
        grpby_stats = pd.merge(feature_stats, feature_importance_groupby, on=grpby_list + ['parametric_name']).dropna().reset_index(drop=True)
        return grpby_stats
    return df.groupby(grpby_list).apply(get_result)


def get_finall_results_small_sample(f_res: pyspark.sql.dataframe, bad_wafer_num: int) -> pyspark.sql.dataframe:
    """
    param s_res: roc_auc分数结果
    param f_res: features和importance结果
    param bad_wafer_num: 数据中所有bad_wafer的数量
    return: 最后的建模结果
    """
    f_res = f_res.withColumn("bad_ratio", col("bad_wafer") / bad_wafer_num)
    df_merge = f_res.withColumn('weight_original', col('importance') * col('bad_ratio'))

    # 最后再次进行一次归一化
    weight_all = df_merge.agg({"weight_original": "sum"}).collect()[0][0]
    df_merge = df_merge.withColumn("weight", col("weight_original") / weight_all)

    df_merge = df_merge.select(grpby_list + ['parametric_name', 'weight', 'stats']).orderBy('weight', ascending=False)
    return df_merge

In [113]:
#####################################################################################
#############################将建模后的结果增加特定的列####################################
#####################################################################################
def add_certain_column(df: pyspark.sql.dataframe, by: str, request_id: str,
                       grpby_list: List[str]) -> pyspark.sql.dataframe:
    """
    Add specific columns to the final modeling results.

    Parameters:
    - df: Final modeling result.
    - by: Grouping column, manually add a column 'add'.
    - request_id: Request ID passed in.
    - grpby_list: List of grouping columns.

    Returns:
    - DataFrame: Final modeling result with specific columns added.
    """
    # Dynamically build schema_all
    struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
    struct_fields.extend([StructField("stats", StringType(), True),
                          StructField("parametric_name", StringType(), True),
                          StructField("weight", FloatType(), True),
                          StructField("request_id", StringType(), True),
                          StructField("weight_percent", FloatType(), True),
                          StructField("index_no", IntegerType(), True)])
    schema_all = StructType(struct_fields)

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_result(final_res: pd.DataFrame) -> pd.DataFrame:
        final_res['weight'] = final_res['weight'].astype(float)
        final_res = final_res.query("weight > 0")
        final_res['request_id'] = request_id
        final_res['weight_percent'] = final_res['weight'] * 100
        final_res = final_res.sort_values('weight', ascending=False)
        final_res['index_no'] = [i + 1 for i in range(len(final_res))]
        final_res = final_res.drop('add', axis=1)
        return final_res
    return df.groupby(grpby_list).apply(get_result)

In [114]:
f_res = split_calculate_features_small_sample(df=res, grpby_list=grpby_list)
f_res.show()

+--------------+--------+--------+--------+---------------+--------------------+----------+---------+-----+
|    PRODUCT_ID|EQP_NAME| OPER_NO|  PRODG1|      TOOL_NAME|     parametric_name|importance|bad_wafer|stats|
+--------------+--------+--------+--------+---------------+--------------------+----------+---------+-----+
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CGHA_4-24|PLATE_TEMP#HDB205...|       1.0|        1| MEAN|
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CLHA_4-21|PLATE_TEMP#DHP150...|       1.0|        1| MEAN|
+--------------+--------+--------+--------+---------------+--------------------+----------+---------+-----+



In [115]:
model_res_ss = get_finall_results_small_sample(f_res=f_res, bad_wafer_num=bad_wafer_num_small_sample)
model_res_ss.show()

+--------------+--------+--------+--------+---------------+--------------------+------+-----+
|    PRODUCT_ID|EQP_NAME| OPER_NO|  PRODG1|      TOOL_NAME|     parametric_name|weight|stats|
+--------------+--------+--------+--------+---------------+--------------------+------+-----+
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CGHA_4-24|PLATE_TEMP#HDB205...|   0.5| MEAN|
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CLHA_4-21|PLATE_TEMP#DHP150...|   0.5| MEAN|
+--------------+--------+--------+--------+---------------+--------------------+------+-----+



In [116]:
final_res_ss = model_res_ss.withColumn('add', lit(0))
final_res_add_columns = add_certain_column(df=final_res_ss, by='add', request_id=request_id, grpby_list=grpby_list)
final_res_add_columns.show()

+--------------+--------+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+
|    PRODUCT_ID|EQP_NAME| OPER_NO|  PRODG1|      TOOL_NAME|stats|     parametric_name|weight|request_id|weight_percent|index_no|
+--------------+--------+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CGHA_4-24| MEAN|PLATE_TEMP#HDB205...|   0.5|     small|          50.0|       1|
|AFPNR901N.0B0J|   PBT01|1V.PPB10|L2800Z2N|PBT01_CLHA_4-21| MEAN|PLATE_TEMP#DHP150...|   0.5|     small|          50.0|       1|
+--------------+--------+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+



#### 利用CASE1制作一个小样本的CASE

In [50]:
df_case1 = pd.read_csv("../DWD_POC_CASE_FD_UVA_DATA_CASE1_PROCESSED1.csv")
df_case1_small_sample_pandas = df_case1[df_case1['WAFER_ID'].isin(['NGE186-06', 'NGE186-12', 'NGE186-24', 'NGG239-19', 'NGE197-02', 'NGE197-15', 'NGE197-21', 'NGF482-01', 'NGF482-14'])]
df_case1_small_sample_pandas.shape

(736, 16)

In [51]:
df1 = ps.from_pandas(df_case1_small_sample_pandas).to_spark()
df1.count()

  fields = [
  for column, series in pdf.iteritems():


736

In [52]:
grpby_list

['PRODG1', 'OPER_NO', 'TOOL_NAME']

In [53]:
df_run = _pre_process(df1)
print(df_run.count())

736


In [54]:
common_res = commonality_analysis(df_run, grpby_list)
common_res.show()

+--------+--------+---------+-----------+--------+-------+
|  PRODG1| OPER_NO|TOOL_NAME|wafer_count|good_num|bad_num|
+--------+--------+---------+-----------+--------+-------+
|L11CD02A|1F.EEK10|EKT72_PM1|          6|       3|      3|
|L15DV07A|1F.EEK10|EKT72_PM1|          2|       0|      2|
+--------+--------+---------+-----------+--------+-------+



In [55]:
data_dict_list_ss = get_data_list(common_res, grpby_list, big_or_small='small')
data_dict_list_ss

[{'PRODG1': 'L11CD02A', 'OPER_NO': '1F.EEK10', 'TOOL_NAME': 'EKT72_PM1'},
 {'PRODG1': 'L15DV07A', 'OPER_NO': '1F.EEK10', 'TOOL_NAME': 'EKT72_PM1'}]

In [56]:
df_run_ss = get_train_data(df_run, data_dict_list_ss)
df_run_ss.count()

674

In [57]:
bad_wafer_num_small_sample = get_all_bad_wafer_num(df_run_ss)
bad_wafer_num_small_sample

5

In [58]:
res = fit_pca_small_sample(df=df_run_ss, by=grpby_list)
res.show()



+--------+--------+---------+--------------------+----------+---------+
|  PRODG1| OPER_NO|TOOL_NAME|            features|importance|bad_wafer|
+--------+--------+---------+--------------------+----------+---------+
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.5198736|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.9020127|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...|  0.940183|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...|0.76628584|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.6545386|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...|0.92193615|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.9986149|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.9862689|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.9045629|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...|0.98242646|        3|
|L11CD02A|1F.EEK10|EKT72_PM1|STATISTIC_RESULT#...| 0.7724064|   

In [59]:
f_res = split_calculate_features_small_sample(df=res, by=grpby_list)
f_res.show()

+--------+--------+---------+--------------------+----------+---------+----------+
|  PRODG1| OPER_NO|TOOL_NAME|     parametric_name|importance|bad_wafer|     stats|
+--------+--------+---------+--------------------+----------+---------+----------+
|L11CD02A|1F.EEK10|EKT72_PM1|APC_POSITION#AOTU...| 1.9848838|      3.0|MEAN#RANGE|
|L11CD02A|1F.EEK10|EKT72_PM1|BOTTOMFLOWRATE#AO...| 0.9045629|      3.0|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|CENTER_GAS_PRESSU...|0.98242646|      3.0|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|CENTER_HE_PRESSUR...| 0.7724064|      3.0|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|ESC_CURRENT#AOTU_...| 0.6545386|      3.0|       MAX|
|L11CD02A|1F.EEK10|EKT72_PM1|LO_C1_VAR_CAPACIT...|  0.940183|      3.0|     RANGE|
|L11CD02A|1F.EEK10|EKT72_PM1|LO_RF_POWER#AOTU_...|0.92193615|      3.0|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|LO_RF_VPP#AOTU_ST...| 0.5198736|      3.0|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|PROCESS_GAS_5_CHF...| 0.9020127|      3.0|       SUM|
|L11

In [60]:
model_res_ss = get_finall_results_small_sample(f_res=f_res, bad_wafer_num=bad_wafer_num_small_sample)
model_res_ss.show()

+--------+--------+---------+--------------------+--------------------+----------+
|  PRODG1| OPER_NO|TOOL_NAME|     parametric_name|              weight|     stats|
+--------+--------+---------+--------------------+--------------------+----------+
|L11CD02A|1F.EEK10|EKT72_PM1|APC_POSITION#AOTU...|  0.1811487599999665|MEAN#RANGE|
|L11CD02A|1F.EEK10|EKT72_PM1|CENTER_GAS_PRESSU...| 0.08966033032687143|      MEAN|
|L15DV07A|1F.EEK10|EKT72_PM1|APC_POSITION#AOTU...| 0.08591935284863195|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|LO_C1_VAR_CAPACIT...| 0.08580501436300049|     RANGE|
|L11CD02A|1F.EEK10|EKT72_PM1|LO_RF_POWER#AOTU_...|   0.084139732715689|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|BOTTOMFLOWRATE#AO...| 0.08255417630960758|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|PROCESS_GAS_5_CHF...| 0.08232143583157621|       SUM|
|L11CD02A|1F.EEK10|EKT72_PM1|CENTER_HE_PRESSUR...| 0.07049302456904988|      MEAN|
|L11CD02A|1F.EEK10|EKT72_PM1|UPPER_TEMPERATURE...|  0.0699344365422384|      MEAN|
|L15

In [63]:
final_res_ss = model_res_ss.withColumn('add', lit(0))
final_res_add_columns = add_certain_column(df=final_res_ss, by='add', request_id=request_id)
final_res_add_columns.show()

+--------+--------+---------+----------+--------------------+-----------+----------+--------------+--------+
|  PRODG1| OPER_NO|TOOL_NAME|     stats|     parametric_name|     weight|request_id|weight_percent|index_no|
+--------+--------+---------+----------+--------------------+-----------+----------+--------------+--------+
|L11CD02A|1F.EEK10|EKT72_PM1|MEAN#RANGE|APC_POSITION#AOTU...| 0.18114875|       fff|     18.114876|       1|
|L11CD02A|1F.EEK10|EKT72_PM1|      MEAN|CENTER_GAS_PRESSU...| 0.08966033|       fff|      8.966033|       2|
|L15DV07A|1F.EEK10|EKT72_PM1|      MEAN|APC_POSITION#AOTU...| 0.08591935|       fff|      8.591935|       3|
|L11CD02A|1F.EEK10|EKT72_PM1|     RANGE|LO_C1_VAR_CAPACIT...|0.085805014|       fff|      8.580502|       4|
|L11CD02A|1F.EEK10|EKT72_PM1|      MEAN|LO_RF_POWER#AOTU_...|0.084139735|       fff|      8.413973|       5|
|L11CD02A|1F.EEK10|EKT72_PM1|      MEAN|BOTTOMFLOWRATE#AO...| 0.08255418|       fff|      8.255418|       6|
|L11CD02A|1F.EEK10|

In [40]:
# df_run_ss_pandas = df_run_ss.toPandas()
# df_run_ss_pandas

# df_run_bs_pandas[df_run_bs_pandas['label'] == 0]['parametric_name'].nunique()

# df_run_bs_pandas[df_run_bs_pandas['label'] == 1]['parametric_name'].nunique()

# df_pivot = df_run_ss_pandas.dropna(axis=0).pivot_table(index=['WAFER_ID', 'label'], 
#                                                  columns=['OPER_NO', 'TOOL_NAME', 'parametric_name'],
#                                                  values=['STATISTIC_RESULT'])

# df_pivot.columns = df_pivot.columns.map('#'.join)
# df_pivot = df_pivot.fillna(df_pivot.mean()).reset_index(drop=False)
# df_pivot

# df_pivot1 = df_pivot.copy()
# df_pivot_all = pd.concat([df_pivot, df_pivot1], axis=0)

# df_pivot_all

# # 定义自变量
# x_train = df_pivot_all[df_pivot_all.columns.difference(['WAFER_ID', 'label']).tolist()]

# # 建立模型
# model = pca(n_components=min(x_train.shape[0], x_train.shape[1])-1, verbose=None)
# results = model.fit_transform(x_train)
# res_top = results['topfeat']
# res_top_select = res_top[res_top['type'] == 'best'][['feature', 'loading']]
# res_top_select = res_top_select.drop_duplicates()

# res.toPandas().sort_values('importance')

------------------------------

In [77]:
#####################################################################################
####################################小样本算法整合###################################
#####################################################################################

def fit_small_data_model(df_run, common_res, grpby_list, request_id):
    
    df1 = None
    df2 = None

    data_dict_list_ss = get_data_list(common_res=common_res, grpby_list=grpby_list, big_or_small='small')
    print("data_dict_list_ss:", data_dict_list_ss)
    if len(data_dict_list_ss) == 0:
        msg = '该查询条件下数据库中实际BAD_WAFER数量为0, 无法分析'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    df_run_ss = get_train_data(df_run=df_run, data_dict_list=data_dict_list_ss)
    if df_run_ss.count() == 0:
        msg = '数据库中暂无此类数据!'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    bad_wafer_num_small_sample = get_all_bad_wafer_num(df_run_ss)
    if bad_wafer_num_small_sample < 1:
        msg = '该查询条件下数据库中实际BAD_WAFER数量小于1片, 请提供更多的BAD_WAFER数量!'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    res = fit_pca_small_sample(df=df_run_ss, by=grpby_list)
    if res.count() == 0:
        msg = '算法内部暂时异常!'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    f_res = split_calculate_features_small_sample(df=res, by=grpby_list)
    if f_res.count() == 0:
        msg = '算法结果求和暂时异常'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    model_res_ss = get_finall_results_small_sample(f_res=f_res, bad_wafer_num=bad_wafer_num_small_sample)
    if model_res_ss.count() == 0:
        msg = '算法结果拼接暂时异常'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2

    final_res_ss = model_res_ss.withColumn('add', lit(0))
    final_res_add_columns = add_certain_column(df=final_res_ss, by='add', request_id=request_id)
    if final_res_add_columns.count() == 0:
        msg = '算法结果增加列暂时异常'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        return df1, df2
    else:
        return df1, final_res_add_columns

In [78]:
#####################################################################################
################################将最后的结果写回数据库###############################
#####################################################################################
def doris_stream_load_from_df(df, engine, table, is_json=True, chunksize=100000, partitions=None):
    engine_url = engine.url
    url = 'http://%s:18030/api/%s/%s/_stream_load' % (engine_url.host, engine_url.database, table)

    format_str = 'csv' if not is_json else 'json'
    headers = {
        'Content-Type': 'text/plain; charset=UTF-8',
        'format': format_str,
        'Expect': '100-continue'
    }
    if is_json:
        headers['strip_outer_array'] = 'true'
        headers['read_json_by_line'] = 'true'
    else:
        headers['column_separator'] = '@'
    
    if partitions:
        headers['partitions'] = partitions
    
    auth = requests.auth.HTTPBasicAuth(engine_url.username, engine_url.password)
    session = requests.sessions.Session()
    session.should_strip_auth = lambda old_url, new_url: False
    
    l = len(df)
    if l > 0:
        if chunksize and chunksize < l:
            batches = l // chunksize
            if l % chunksize > 0:
                batches += 1
            for i in range(batches):
                si = i * chunksize
                ei = min(si + chunksize, l)
                sub = df[si:ei]
                do_doris_stream_load_from_df(sub, session, url, headers, auth, is_json)
        else:
            do_doris_stream_load_from_df(df, session, url, headers, auth, is_json)


def do_doris_stream_load_from_df(df, session, url, headers, auth, is_json=False):
    data = df.to_csv(header=False, index=False, sep='@') if not is_json else df.to_json(orient='records', date_format='iso')
    #print(data)
    
    resp = session.request(
        'PUT',
        url = url,
        data=data.encode('utf-8'),
        headers=headers,
        auth=auth
    )
    print(resp.reason, resp.text)
    check_stream_load_response(resp.text)


def check_stream_load_response(resp_text):
    resp = json.loads(resp_text)
    if resp['Status'] not in ["Success", "Publish Timeout"]:
        raise Exception(resp['Message'])

In [45]:
##########################################################################################
#######################################正式调用以上函数#######################################
##########################################################################################
# request_id = 'sdd'
# grpby_list = ['OPER_NO', 'TOOL_NAME']

# # 1. 解析json 为字典， df1为kafka输入的结果数据，获取到parse_dict, request_id, grpby_list
# df2 = df1.toPandas() 
# parse_dict, request_id, grpby_list = get_some_info(df2)
# print(type(parse_dict))
# print(grpby_list)

# # 2. 从kafka 关键字映射都具体数据源中的字段,没有的可以删除
# keyword_map_from_json_to_table: dict = {
#     "prodg1": "PRODG1",
#     "waferId": "WAFER_ID",
#     "dateRange": "START_TIME",
#     "productId": "PRODUCT_ID",
#     "operNo": "OPER_NO",
#     "eqp": "EQP_NAME",
#     "tool": "TOOL_NAME",
#     "lot": "LOT_ID",
#     "recipeName": "RECIPE_NAME"}

# # 3. 获取查询条件list
# select_condition_list = parse_dict

# # 4. 指定查询表名, 根据实际情况需要修改
# table_name = "etl.DWD_POC_CASE_FD_UVA_DATA_TEST"

In [None]:
# from pyspark.sql import SparkSession
# spark = (SparkSession.builder
#             .master("local[*]")
#             .config("spark.jars.packages", "ai.catboost:catboost-spark_3.3_2.12:1.2")
#             .appName("RF")
#             .getOrCreate())

-----------------------

In [91]:
df_pandas = pd.read_csv("small4.csv")

df1 = ps.from_pandas(df_pandas).to_spark()
print(df1.count())

good = ['NBX392-15', 'NBX392-20', 'NBX392-24', 'NBX391-24', 'NBX391-25', 'NBX548-09', 'NBX391-01', 'NBX391-02', 'NBX391-13', 'NBX391-17']
bad  = ['NBX500-10', 'NBX500-01', 'NBX500-09']

if 'label' in df1.columns:
    df1 = df1
else:
    df1 = get_label_double(df1, bad, good)

  fields = [
  for column, series in pdf.iteritems():


10


In [92]:
df1.show()

+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+-----------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+------+------------+---------+----------------+-------+-----+
+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+-----------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+------+-----------

In [96]:
# 主程序
try:
    # 从数据库中获取数据
#     df1 = get_data_from_doris(select_condition_list=select_condition_list, table_name=table_name)
#     print(df1.count())
#     if df1.count() == 0:
#         msg = '解析SQL获取数据异常: 数据库中可能没有数据!'
#         df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
#         df1 = spark.createDataFrame(df_kafka)
#         raise ValueError

    # 1. 站点融合和数据预处理
    df1 = integrate_operno(df=df1, merge_operno_list=merge_operno)
    print(df1.count())
    df_run = _pre_process(df1)
    print(df_run.count())
    if df_run.count() == 0:
        msg = '该条件下数据库中暂无数据，请检查！'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        raise ValueError

    # 2. 进行共性分析
    common_res = commonality_analysis(df_run, grpby_list)
    common_res.show()
    if common_res.count() == 0:
        msg = '共性分析结果异常!'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        raise ValueError

    # 3. 挑选出数据：bad和good要同时大于3
    data_dict_list_bs = get_data_list(common_res, grpby_list, big_or_small='big')
    print("data_dict_list_bs:", data_dict_list_bs)
    if len(data_dict_list_bs) != 0:
        print("****************大样本算法调用****************")
        df1, final_res_add_columns = fit_big_data_model(df_run, data_dict_list_bs, grpby_list, request_id)
    else:        
        print("****************小样本算法调用****************")
        df1, final_res_add_columns = fit_small_data_model(df_run, common_res, grpby_list, request_id)
    

    if df1 is not None:
        raise ValueError
    else:
        # final_res_add_columns 是最后的结果，要写回数据库
        # ddd = final_res_add_columns.toPandas()
        # user ="root"
        # host = "10.52.199.81"
        # password = "Nexchip%40123"
        # db = "etl"
        # port = 9030
        # engine = create_engine("mysql+pymysql://{user}:{password}@{host}:{port}/{db}".format(user = user,
        #                                                                                     password = password,
        #                                                                                     host = host,
        #                                                                                     port = port,
        #                                                                                     db = db))
        # doris_stream_load_from_df(ddd, engine, "results")

        # # 最终成功的话，就会输出下面这条
        print("运行成功")
        df_kafka = pd.DataFrame({"code": 0, "msg": "运行成功", "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)

except ValueError as ve:
    pass

except Exception as e:
    df_kafka = pd.DataFrame({"code": 1, "msg": f"主程序发生异常: {str(e)}", "requestId": request_id}, index=[0])
    df1 = spark.createDataFrame(df_kafka)

NameError: name 'spark' is not defined

In [97]:
print("最终的df1是：")
print(type(df1))
df1.show()

print("最终的算法结果是：")
print(type(final_res_add_columns))
final_res_add_columns.show()

最终的df1是：
<class 'NoneType'>


AttributeError: 'NoneType' object has no attribute 'show'

In [95]:
final_res_add_columns.show()

+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+
|  PRODG1| OPER_NO|      TOOL_NAME|stats|     parametric_name|weight|request_id|weight_percent|index_no|
+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+
|L2800Z2N|1G.PPB10|PBT01_CLHA_4-12| MEAN|PLATE_TEMP#DHP150...|   1.0|       fff|         100.0|       1|
+--------+--------+---------------+-----+--------------------+------+----------+--------------+--------+

