In [2]:
import pandas as pd
import pyspark.pandas as ps
import requests
import json

from sqlalchemy import create_engine
from pca import pca
from pyspark.sql.functions import pandas_udf, PandasUDFType, max, col, countDistinct, when, rank, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import ClusterCentroids

# from backend_spark.doris_common.doris_client import DorisClient
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional

In [3]:
import os 
from pyspark.sql import SparkSession

# Point PySpark at a specific Python interpreter path before the session is built.
# NOTE(review): this path and the driver/master addresses below are hard-coded for one
# environment; consider moving them to a configuration cell or environment variables.
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

# Build (or reuse) the Spark session used by every later cell:
#   - session time zone Asia/Shanghai, FAIR scheduler mode
#   - driver: 1024m memory, 3 cores; executors: 1024m memory, 1 core each, at most 2 cores in total
#   - driver host and a two-address spark:// master URL are given explicitly
spark = SparkSession.builder \
    .appName("pandas_udf") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '1024m') \
    .config('spark.driver.cores', '3') \
    .config('spark.executor.memory', '1024m') \
    .config('spark.executor.cores', '1') \
    .config('spark.cores.max', '2') \
    .config('spark.driver.host','192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()

In [None]:
############################################################################
##############################从kafka消息读取需要的资料#########################
############################################################################
def get_some_info(df: pd.DataFrame):
    """
    Extract the request id, parsed request parameters and grouping columns from a message frame.

    Only the first row of ``df`` is used.

    param df: pandas DataFrame with at least the columns ``requestId`` and ``requestParam``;
              ``requestParam`` is a JSON array (single quotes are tolerated) whose first
              element carries ``mergeProdg1`` and, optionally, ``mergeOperno``
    return: tuple ``(parse_dict, request_id, grpby_list, merge_operno)`` where ``parse_dict``
            is the decoded JSON, ``grpby_list`` is the list of grouping columns selected by
            ``mergeProdg1`` and ``merge_operno`` is a list, or None when the key is absent
    raises ValueError: when ``df`` has no rows, or when ``mergeProdg1`` is neither '1' nor '0'
    """
    # Fail with a clear message instead of an IndexError from `.values[0]` on an empty frame.
    if len(df) == 0:
        raise ValueError("get_some_info: the message DataFrame is empty")

    request_id = df["requestId"].values[0]
    request_params = df["requestParam"].values[0]
    # JSON only accepts double quotes, so single quotes are normalised first.
    request_params = request_params.replace('\'', "\"")
    parse_dict = json.loads(request_params)
    first_request = parse_dict[0]
    merge_prodg1 = first_request['mergeProdg1']

    # 'mergeOperno' is optional; its absence is reported as None.
    try:
        merge_operno = list(first_request['mergeOperno'])
    except KeyError:
        merge_operno = None

    if merge_prodg1 == '1':
        grpby_list = ['OPER_NO', 'TOOL_NAME']
    elif merge_prodg1 == '0':
        grpby_list = ['PRODG1', 'OPER_NO', 'TOOL_NAME']
    else:
        raise ValueError(
            "get_some_info: mergeProdg1 must be '1' or '0', got {!r}".format(merge_prodg1))
    return parse_dict, request_id, grpby_list, merge_operno

In [4]:
df_pandas = pd.read_csv("D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/uva_algorithm/CASE1_DATA/DWD_POC_CASE_FD_UVA_DATA_CASE1_PROCESSED1.csv")
df_pandas.shape

(550911, 16)

In [5]:
df_pandas['TOOL_NAME'].unique()

array(['EKT72_PM1', 'EKT72_PM2'], dtype=object)

In [None]:
# df_pandas[df_pandas['TOOL_NAME']  == 'EKT72_PM2']

In [None]:
# df_pandas[df_pandas['label']==1]['WAFER_ID'].unique()

In [6]:
df1 = ps.from_pandas(df_pandas).to_spark()
df1.count()

  fields = [
  for column, series in pdf.iteritems():


550911

In [7]:
df1.show()

+---------+-------+------+--------+--------------+--------+---------+----------+--------------------+--------+-------------------+--------------------+----------+------+----------------+-----+
| WAFER_ID|TOOL_ID|RUN_ID|EQP_NAME|    PRODUCT_ID|  PRODG1|TOOL_NAME|    LOT_ID|         RECIPE_NAME| OPER_NO|         START_TIME|     parametric_name| CASE_INFO|STATUS|STATISTIC_RESULT|label|
+---------+-------+------+--------+--------------+--------+---------+----------+--------------------+--------+-------------------+--------------------+----------+------+----------------+-----+
|NGE186-07|  11341|149770|   EKT72|AFKN2J01N.0U01|L11CD02A|EKT72_PM1|NGE186.000|NEW-DRM/P1/110NM/...|1F.EEK10|2023-06-16 02:15:06|PROCESS_GAS_10_CO...|2023-06-16|NORMAL|             0.0|    0|
|NGE186-07|  11341|149770|   EKT72|AFKN2J01N.0U01|L11CD02A|EKT72_PM1|NGE186.000|NEW-DRM/P1/110NM/...|1F.EEK10|2023-06-16 02:15:06|PROCESS_GAS_3_C4F...|2023-06-16|NORMAL|             0.0|    0|
|NGE186-07|  11341|149770|   EKT72|

In [8]:
type(df1)

pyspark.sql.dataframe.DataFrame

In [9]:
############################################################################
##################################FDC数据预处理#############################
############################################################################
def _pre_process(df):
    """
    Clean the rows of one case and keep only the latest run of every wafer.

    param df: Spark DataFrame read for one case; must contain the columns selected below
    return: Spark DataFrame restricted to the needed columns, without null STATISTIC_RESULT
            values, without fully duplicated rows, and limited to the rows whose RUN_ID is
            the maximum for their (WAFER_ID, OPER_NO, TOOL_ID) combination
    """
    # Keep only the columns that later steps use, drop missing results, then drop rows
    # that are duplicated across all remaining columns.
    cleaned = (df.select('WAFER_ID', 'TOOL_ID', 'RUN_ID', 'EQP_NAME', 'PRODUCT_ID', 'PRODG1', 'TOOL_NAME',
                         'OPER_NO', 'parametric_name', 'STATISTIC_RESULT', 'label')
                 .filter(col('STATISTIC_RESULT').isNotNull())
                 .dropDuplicates())

    # Latest run per wafer/operation/tool. `max` is pyspark.sql.functions.max here (see the
    # file's imports). The aggregate already yields one row per key combination, so no
    # further de-duplication is needed before joining.
    run_keys = ['WAFER_ID', 'OPER_NO', 'TOOL_ID']
    latest_run = cleaned.groupBy(*run_keys).agg(max('RUN_ID').alias('RUN_ID'))

    return cleaned.join(latest_run, on=run_keys + ['RUN_ID'], how='inner')



def commonality_analysis(df_run, grpby_list):
    """
    Count wafers per group and keep the groups with the most bad wafers.

    param df_run: pre-processed Spark DataFrame (needs WAFER_ID, label and the grouping columns)
    param grpby_list: columns that define a group
    return: Spark DataFrame with the grouping columns plus wafer_count, good_num and bad_num.
            When there is exactly one group it is returned as is; otherwise only groups with
            bad_num > 0 whose rank by descending bad_num is at most 10 are returned.
    """
    # The `when` markers are null for rows of the other label, which keeps those rows out
    # of the corresponding distinct count.
    good_marker = when(df_run['label'] == 0, 1)
    bad_marker = when(df_run['label'] == 1, 1)
    summary = df_run.groupBy(grpby_list).agg(
        countDistinct('WAFER_ID').alias('wafer_count'),
        countDistinct('WAFER_ID', good_marker).alias('good_num'),
        countDistinct('WAFER_ID', bad_marker).alias('bad_num'),
    ).orderBy('bad_num', ascending=False)

    # A single group (one station + one chamber) needs no ranking.
    if summary.count() == 1:
        return summary

    with_bad = summary.filter(summary['bad_num'] > 0)
    by_bad_desc = Window().orderBy(col("bad_num").desc())
    return (with_bad
            .withColumn("rank", rank().over(by_bad_desc))
            .filter(col("rank") <= 10)
            .drop("rank"))

In [10]:
df_run = _pre_process(df1)
print(df_run.count())

550896


In [None]:
grpby_list =['PRODG1', 'TOOL_NAME', 'OPER_NO']

In [35]:
grpby_list =['TOOL_NAME', 'OPER_NO']

In [50]:
grpby_list =['EQP_NAME', 'OPER_NO']

In [71]:
grpby_list =['PRODUCT_ID', 'EQP_NAME', 'PRODG1', 'OPER_NO']

In [95]:
grpby_list =['OPER_NO']

In [131]:
grpby_list =['OPER_NO', 'PRODUCT_ID']

In [132]:
common_res = commonality_analysis(df_run, grpby_list)
common_res.show()

+--------+--------------+-----------+--------+-------+
| OPER_NO|    PRODUCT_ID|wafer_count|good_num|bad_num|
+--------+--------------+-----------+--------+-------+
|1F.EEK10|AFKNBM01N.0B01|       1000|       0|   1000|
|1F.EEK10|AFKN2J01N.0U01|        959|     234|    725|
|1F.EEK10|AFKN4X01N.0B01|        474|     175|    299|
|1F.EEK10|AMKNGW01N.0C01|        224|       0|    224|
|1F.EEK10|AFGN5101N.0S01|        101|       0|    101|
|1F.EEK10|AFKN6201N.0A01|         75|       0|     75|
|1F.EEK10|AFGN4201N.0B01|         76|       1|     75|
|1F.EEK10|AFGN1501N.0C02|        175|     100|     75|
|1F.EEK10|AMKNXY01N.0A01|         75|       1|     74|
|1F.EEK10|AMKNS301N.0A01|         25|       0|     25|
+--------+--------------+-----------+--------+-------+



In [None]:
# grpby_list1 = ['PRODG1', 'OPER_NO', 'TOOL_NAME']
# common_res1 = commonality_analysis(df_run, grpby_list1)
# common_res1.show()

In [133]:
###########################################################################
#################################获取样本数据#########################
############################################################################
def get_data_list(common_res, grpby_list, big_or_small='big'):
    """
    Pick the group combinations that qualify as large-sample or small-sample candidates.

    param common_res: result of the commonality analysis (needs good_num, bad_num, wafer_count)
    param grpby_list: grouping columns to return for each selected combination
    param big_or_small: 'big' keeps groups with good_num >= 3 and bad_num >= 3;
                        'small' keeps groups with bad_num >= 1 and wafer_count >= 2
    return: list of dicts (at most 10), one per selected combination, keyed by grpby_list
    """
    assert big_or_small in ['big', 'small'], "只能选择big或者small, 请检查拼写"

    sample_filters = {
        'big': "good_num >= 3 AND bad_num >= 3",
        'small': "bad_num >= 1 AND wafer_count >=2",
    }
    candidates = common_res.filter(sample_filters[big_or_small])

    # Most bad wafers first, ties broken by wafer count and then by good-wafer count.
    top_groups = candidates.orderBy(
        col("bad_num").desc(), col("wafer_count").desc(), col("good_num").desc()
    ).limit(10)

    return [row.asDict() for row in top_groups[grpby_list].collect()]


def get_train_data(df_run, data_dict_list):
    """
    Filter the pre-processed data down to the selected group combinations.

    param df_run: pre-processed data; must offer ``filter(condition_string)`` and the
                  filtered results must offer ``union``
    param data_dict_list: non-empty list of dicts mapping column name -> required value
    return: union of the rows matching each dict, in the order of data_dict_list
    """
    def _as_condition(key_values):
        # One "COLUMN == 'value'" clause per entry, all joined with AND.
        return " AND ".join("{} == '{}'".format(name, key_values[name]) for name in key_values)

    # The first combination seeds the result.
    first_condition = _as_condition(data_dict_list[0])
    print("conditions1", first_condition)
    selected = df_run.filter(first_condition)

    # Every further combination is filtered separately and appended.
    for key_values in data_dict_list[1:]:
        next_condition = _as_condition(key_values)
        print("conditions2", next_condition)
        selected = selected.union(df_run.filter(next_condition))
    return selected

In [134]:
data_dict_list_bs = get_data_list(common_res, grpby_list, big_or_small='big')
data_dict_list_bs

[{'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN2J01N.0U01'},
 {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFKN4X01N.0B01'},
 {'OPER_NO': '1F.EEK10', 'PRODUCT_ID': 'AFGN1501N.0C02'}]

In [135]:
df_run_bs = get_train_data(df_run, data_dict_list_bs)
df_run_bs.count()

conditions1 OPER_NO == '1F.EEK10' AND PRODUCT_ID == 'AFKN2J01N.0U01'
conditions2 OPER_NO == '1F.EEK10' AND PRODUCT_ID == 'AFKN4X01N.0B01'
conditions2 OPER_NO == '1F.EEK10' AND PRODUCT_ID == 'AFGN1501N.0C02'


113455

In [118]:
############################################################################
#########################获取传入的整个数据中的所有bad_wafer个数############
############################################################################
def get_all_bad_wafer_num(df):
    """
    Count the distinct wafers labelled as bad.

    param df: filtered data with the columns ``label`` and ``WAFER_ID``
    return: number of distinct WAFER_ID values among the rows where label == 1
    """
    bad_rows = df.filter("label == 1")
    bad_wafer_ids = bad_rows.select('WAFER_ID').distinct()
    return bad_wafer_ids.count()

In [136]:
bad_wafer_num_big_sample = get_all_bad_wafer_num(df_run_bs)
bad_wafer_num_big_sample

1099

In [137]:
grpby_list

['OPER_NO', 'PRODUCT_ID']

In [138]:
############################################################################
#####################对good>=3和bad>=3的数据，用rf建模######################
############################################################################
def get_pivot_table(df, grpby_list):
    """
    Pivot the long-format rows into one row per wafer and one column per parameter.

    param df: pandas DataFrame of one large-sample group (long format)
    param grpby_list: grouping columns; together with parametric_name they form the columns
    return: pandas DataFrame with WAFER_ID, label and one column per
            'STATISTIC_RESULT#<group values>#<parametric_name>'; missing cells are
            filled with the column mean
    """
    complete_rows = df.dropna(axis=0)
    wide = complete_rows.pivot_table(
        index=['WAFER_ID', 'label'],
        columns=[*grpby_list, 'parametric_name'],
        values=['STATISTIC_RESULT'],
    )
    # Flatten the multi-level column labels into a single '#'-separated string.
    wide.columns = wide.columns.map(lambda levels: '#'.join(levels))
    column_means = wide.mean()
    return wide.fillna(column_means).reset_index(drop=False)


def fit_rf_big_sample(df, grpby_list):
    """
    Fit a random-forest classifier per group and collect scores and feature importances.

    param df: Spark DataFrame with the rows of the selected large-sample combinations
    param grpby_list: grouping columns; the model is fitted once per distinct combination
    return: result of applying the grouped-map UDF below. For every group it contains the
            feature rows (only ``features`` and ``importance`` filled) concatenated with the
            summary rows (grouping columns, ``bad_wafer`` and ``roc_auc_score`` filled).
    """
    # Build the output schema dynamically: grouping columns first, then the result columns.
    struct_fields  = [StructField(col, StringType(), True) for col in grpby_list]
    struct_fields.extend([StructField("bad_wafer", IntegerType(), True),
                          StructField("roc_auc_score", FloatType(), True),
                          StructField("features", StringType(), True),
                          StructField("importance", FloatType(), True)])
    schema_all = StructType(struct_fields)

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_model_result(df_run):
        # Pivot the group's long-format rows into one row per wafer.
        df_pivot = get_pivot_table(df=df_run, grpby_list=grpby_list)

        # Features are every pivoted column except the identifiers; the target is the label.
        X_train = df_pivot[df_pivot.columns.difference(['WAFER_ID', 'label']).tolist()]
        y_train = df_pivot[['label']]

        # Share of each label value. When the two shares differ by more than 0.7 the
        # training data is undersampled with ClusterCentroids.
        # NOTE(review): the lookups with 0 and 1 presumably fail when a group contains
        # only one label value - confirm every group always has both.
        z_ratio = y_train.value_counts(normalize=True)
        good_ratio = z_ratio[0]
        bad_ratio = z_ratio[1]
        if abs(good_ratio - bad_ratio) > 0.7:
            undersampler = ClusterCentroids(random_state=101)
            X_train, y_train = undersampler.fit_resample(X_train, y_train)

        # Grid search over the number of trees and the maximum depth, scored by ROC AUC
        # with 3-fold cross-validation. Missing values are imputed with the constant -999
        # and the features are standardised before the forest is fitted.
        pipe = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value=-999)),
            ('scaler', StandardScaler()),
            ('model', RandomForestClassifier(random_state=2024))])
        param_grid = {'model__n_estimators': [*range(50, 100, 10)],
                      'model__max_depth': [*range(10, 50, 10)]}
        grid = GridSearchCV(estimator=pipe, scoring='roc_auc', param_grid=param_grid, cv=3, n_jobs=-1)
        grid.fit(X_train.values, y_train.values.ravel())
        roc_auc_score_ = grid.best_score_

        # Feature importances come from the third pipeline step (the forest) of the best
        # estimator, paired with the feature column names.
        small_importance_res = pd.DataFrame({
            'features': X_train.columns,
            'importance': grid.best_estimator_.steps[2][1].feature_importances_})
        
        # Summary: bad_wafer is the sum of the labels in the pivoted frame (computed from
        # df_pivot, i.e. before any undersampling) plus the best cross-validated score and
        # the unique grouping values of this group.
        sample_res_dict = {'bad_wafer': sum(df_pivot['label']),
                           'roc_auc_score': roc_auc_score_}
        sample_res_dict.update({col: df_run[col].unique() for col in grpby_list})
        small_sample_res = pd.DataFrame(sample_res_dict)
    
        # Feature rows and summary rows are stacked; columns missing on either side stay empty.
        return pd.concat([small_importance_res, small_sample_res])
    return df.groupby(grpby_list).apply(get_model_result)

In [139]:
res = fit_rf_big_sample(df=df_run_bs, grpby_list=grpby_list)
res.show()

+-------+----------+---------+-------------+--------------------+-------------+
|OPER_NO|PRODUCT_ID|bad_wafer|roc_auc_score|            features|   importance|
+-------+----------+---------+-------------+--------------------+-------------+
|   null|      null|     null|         null|STATISTIC_RESULT#...|   0.10018185|
|   null|      null|     null|         null|STATISTIC_RESULT#...|  0.014608934|
|   null|      null|     null|         null|STATISTIC_RESULT#...|          0.0|
|   null|      null|     null|         null|STATISTIC_RESULT#...|  0.124218196|
|   null|      null|     null|         null|STATISTIC_RESULT#...|          0.0|
|   null|      null|     null|         null|STATISTIC_RESULT#...|  3.884744E-6|
|   null|      null|     null|         null|STATISTIC_RESULT#...|  3.771675E-4|
|   null|      null|     null|         null|STATISTIC_RESULT#...|   0.08442577|
|   null|      null|     null|         null|STATISTIC_RESULT#...|         0.02|
|   null|      null|     null|         n

In [123]:
respp = res.toPandas()

In [None]:
feature_importance_table = respp[['features', 'importance']].dropna(axis=0)
feature_importance_table

In [None]:
feature_importance_table['features'].iloc[0].split("#")

In [None]:
grpby_list

In [None]:
# Scratch version of the feature-splitting steps: one new column per grouping field, then
# parametric_name, step and stats, each taken from a '#'-separated part of `features`.
# NOTE(review): `df` is not assigned at top level in the visible cells and `split_features`
# is only defined in a later cell, so this cell presumably fails on Restart & Run All.
# The same steps exist as get_split_feature_importance_table - consider deleting this cell.
n_feats = len(grpby_list)
for i in range(n_feats):
    df[grpby_list[i]] = split_features(df, i + 1)
    
df['parametric_name'] = split_features(df, n_feats + 1)
df['step'] = split_features(df, n_feats + 2)
df['stats'] = split_features(df, n_feats + 3)

In [None]:
df

In [124]:
#####################################################################################
#########################对good>=3和bad>=3建模后的结果进行整合############################
#####################################################################################
def split_score_big_sample(df, grpby_list):
    """
    Extract the per-group score rows from the random-forest results.

    param df: Spark DataFrame returned by the random-forest modelling step
    param grpby_list: grouping columns
    return: Spark DataFrame with the grouping columns, bad_wafer and roc_auc_score,
            restricted to complete rows whose roc_auc_score is above 0.6
    """
    # Output schema: the grouping columns as strings followed by the two score columns.
    schema_all = StructType(
        [StructField(name, StringType(), True) for name in grpby_list]
        + [StructField("bad_wafer", IntegerType(), True),
           StructField("roc_auc_score", FloatType(), True)]
    )

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_result(model_results):
        # Rows with any missing value in the selected columns are dropped first.
        score_rows = model_results[[*grpby_list, 'bad_wafer', 'roc_auc_score']].dropna(axis=0)
        good_enough = score_rows['roc_auc_score'] > 0.6
        return score_rows[good_enough]

    return df.groupby(grpby_list).apply(get_result)


def split_features(df: pd.DataFrame, index: int) -> pd.Series:
    """
    Take one '#'-separated part out of every value in the ``features`` column.

    param df: feature-importance table with a string column ``features``
    param index: position of the part to extract after splitting on '#'
    return: pandas Series (same index as ``df``) holding the selected part of each value
    raises IndexError: when a value has fewer than ``index + 1`` parts
    """
    return df['features'].apply(lambda x: x.split('#')[index])


def get_split_feature_importance_table(df, grpby_list):
    """
    Split the ``features`` strings into separate columns.

    Each ``features`` value is split on '#'. Part 0 is ignored, parts 1..n fill the n
    grouping columns, and the next three parts become parametric_name, step and stats.

    param df: feature-importance table with a string column ``features``; it is not modified
    param grpby_list: names of the grouping columns encoded in ``features``, in order
    return: new pandas DataFrame without ``features``, with the split columns appended and
            a fresh 0..n-1 index
    raises IndexError: when a ``features`` value has fewer parts than required
    """
    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    # Split every string once and reuse the parts for all target columns.
    parts = result['features'].apply(lambda text: text.split('#'))

    def _part(position):
        return parts.apply(lambda pieces: pieces[position])

    n_feats = len(grpby_list)
    for offset, column in enumerate(grpby_list, start=1):
        result[column] = _part(offset)

    result['parametric_name'] = _part(n_feats + 1)
    result['step'] = _part(n_feats + 2)
    result['stats'] = _part(n_feats + 3)
    return result.drop(['features'], axis=1).reset_index(drop=True)


def add_feature_stats(df, grpby_list):
    """
    Collect, per parameter and step, all statistic names into one '#'-joined string.

    param df: split feature-importance table (needs parametric_name, step, stats and the
              grouping columns)
    param grpby_list: grouping columns
    return: pandas DataFrame with the grouping columns, ``parametric_name`` rewritten as
            '<parametric_name>#<step>', and ``stats`` holding the unique statistic names
            joined with '#'
    """
    group_keys = [*grpby_list, 'parametric_name', 'step']
    collected = df.groupby(group_keys)['stats'].unique().reset_index()
    # Each cell holds an array of unique statistic names; turn it into one string.
    collected['stats'] = collected['stats'].map(lambda names: "#".join(names.tolist()))
    collected['parametric_name'] = collected['parametric_name'] + '#' + collected['step']
    return collected.drop('step', axis=1)
    
    
def split_calculate_features_big_sample(df, grpby_list):
    """
    Turn the random-forest feature rows into per-parameter importance totals.

    param df: Spark DataFrame returned by the random-forest modelling step
    param grpby_list: grouping columns
    return: result of applying the grouped-map UDF below, with the grouping columns,
            ``parametric_name`` (as '<parametric_name>#<step>'), the summed ``importance``
            and ``stats`` (the '#'-joined statistic names of that parameter)
    """
    # Build the output schema dynamically: grouping columns first, then the result columns.
    struct_fields  = [StructField(col, StringType(), True) for col in grpby_list]
    struct_fields.extend([StructField("parametric_name", StringType(), True),
                          StructField("importance", FloatType(), True),
                          StructField("stats", StringType(), True)])
    schema_all = StructType(struct_fields)

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_result(model_results):
        # Keep only the rows of the model result where both features and importance are present.
        feature_importance_table = model_results[['features', 'importance']].dropna(axis=0)

        # Split the features strings into grouping columns, parametric_name, step and stats.
        feature_importance_res_split = get_split_feature_importance_table(df=feature_importance_table, grpby_list=grpby_list)

        # Drop the features whose importance is 0.
        feature_importance_res_split_drop = feature_importance_res_split.query("importance > 0").reset_index(drop=True)

        # Per combination keep the int(60%) most important rows, or the single row when
        # the combination has only one.
        feature_importance_res_split_nlargest = (feature_importance_res_split_drop.groupby(by=grpby_list)
                                            .apply(lambda x: x.nlargest(int(x.shape[0]*0.6), 'importance') if x.shape[0]>1 else x.nlargest(int(x.shape[0]*1), 'importance'))
                                            .reset_index(drop=True))

        # Statistic names per parameter, built from all non-zero rows (not only the kept ones).
        feature_stats = add_feature_stats(df=feature_importance_res_split_drop, grpby_list=grpby_list)

        # Sum the importance of the kept rows per combination, parameter and step, then
        # fold the step into parametric_name so it matches the key used by feature_stats.
        feature_importance_groupby = (feature_importance_res_split_nlargest.groupby(grpby_list + ['parametric_name', 'step'])['importance']
                                                                           .sum().reset_index())
        feature_importance_groupby = (feature_importance_groupby.assign(parametric_name=lambda x: x['parametric_name'] + str('#') + x['step'])
                                                                .drop('step', axis=1))

        # Join the statistic names with the summed importances (pandas default merge type).
        grpby_stats = pd.merge(feature_stats, feature_importance_groupby, on=grpby_list + ['parametric_name']).dropna().reset_index(drop=True)
        return grpby_stats
    return df.groupby(grpby_list).apply(get_result)

    

def get_finall_results_big_sample(s_res, f_res, grpby_list, bad_wafer_num):
    """
    Combine group scores and parameter importances into normalised weights.

    param s_res: Spark DataFrame with the grouping columns, bad_wafer and roc_auc_score
    param f_res: Spark DataFrame with the grouping columns, parametric_name, importance and stats
    param grpby_list: grouping columns used as the join key
    param bad_wafer_num: total number of bad wafers in the data
    return: Spark DataFrame with the grouping columns, parametric_name, weight and stats,
            ordered by descending weight
    """
    # Share of every group's ROC AUC in the total, and its share of all bad wafers.
    total_auc = s_res.agg({"roc_auc_score": "sum"}).collect()[0][0]
    scored = (s_res
              .withColumn("roc_auc_score_ratio", col("roc_auc_score") / total_auc)
              .withColumn("bad_ratio", col("bad_wafer") / bad_wafer_num))

    # Attach the parameter importances and compute the unnormalised weight.
    weighted = (scored
                .join(f_res, on=grpby_list, how='left')
                .withColumn('weight_original',
                            col('roc_auc_score_ratio') * col('bad_ratio') * col('importance')))

    # Normalise once more so the weights are expressed relative to their total.
    total_weight = weighted.agg({"weight_original": "sum"}).collect()[0][0]
    output_columns = grpby_list + ['parametric_name', 'weight', 'stats']
    return (weighted
            .withColumn("weight", col("weight_original") / total_weight)
            .select(output_columns)
            .orderBy('weight', ascending=False))

In [140]:
grpby_list

['OPER_NO', 'PRODUCT_ID']

In [141]:
s_res = split_score_big_sample(df=res, grpby_list=grpby_list)
s_res.show()

+--------+--------------+---------+-------------+
| OPER_NO|    PRODUCT_ID|bad_wafer|roc_auc_score|
+--------+--------------+---------+-------------+
|1F.EEK10|AFGN1501N.0C02|       75|   0.99627453|
|1F.EEK10|AFKN2J01N.0U01|      725|    0.8621095|
|1F.EEK10|AFKN4X01N.0B01|      299|          1.0|
+--------+--------------+---------+-------------+



In [142]:
f_res = split_calculate_features_big_sample(df=res, grpby_list=grpby_list)
f_res.show()

+--------+--------------+--------------------+------------+----------+
| OPER_NO|    PRODUCT_ID|     parametric_name|  importance|     stats|
+--------+--------------+--------------------+------------+----------+
|1F.EEK10|AFGN1501N.0C02|APC_POSITION#AOTU...| 0.009099345|MEAN#RANGE|
|1F.EEK10|AFGN1501N.0C02|APC_POSITION#AOTU...|0.0069474624|MEAN#RANGE|
|1F.EEK10|AFGN1501N.0C02|CENTER_GAS_PRESSU...|0.0115505885|      MEAN|
|1F.EEK10|AFGN1501N.0C02|CHAMBER_PRESSURE#...|0.0072822515|     RANGE|
|1F.EEK10|AFGN1501N.0C02|CHAMBER_PRESSURE#...|0.0033773556|MEAN#RANGE|
|1F.EEK10|AFGN1501N.0C02|EDGE_GAS_PRESSURE...|0.0036432669|      MEAN|
|1F.EEK10|AFGN1501N.0C02|EDGE_GAS_PRESSURE...|0.0064301947|      MEAN|
|1F.EEK10|AFGN1501N.0C02|EDGE_HE_FLOW#AOTU...| 0.006049666|      MEAN|
|1F.EEK10|AFGN1501N.0C02|EDGE_HE_FLOW#AOTU...|0.0051699295|      MEAN|
|1F.EEK10|AFGN1501N.0C02|EDGE_HE_FLOW#AOTU...| 0.012448553|      MEAN|
|1F.EEK10|AFGN1501N.0C02|ESC_CURRENT#AOTU_...|  0.11816658|  MAX#MEAN|
|1F.EE

In [143]:
model_res_bs = get_finall_results_big_sample(s_res=s_res, f_res=f_res, grpby_list=grpby_list, bad_wafer_num=bad_wafer_num_big_sample)
model_res_bs.show()

+--------+--------------+--------------------+--------------------+----------+
| OPER_NO|    PRODUCT_ID|     parametric_name|              weight|     stats|
+--------+--------------+--------------------+--------------------+----------+
|1F.EEK10|AFKN2J01N.0U01|LO_RF_VPP#AOTU_ST...| 0.10312683627148242|MEAN#SLOPE|
|1F.EEK10|AFKN2J01N.0U01|LO_RF_VPP#STEP2_MINI| 0.09257263611968515|      MEAN|
|1F.EEK10|AFKN2J01N.0U01|LO_RF_VPP#STEP2_M...| 0.07835981532533769|      MEAN|
|1F.EEK10|AFKN2J01N.0U01|LO_RF_REF_POWER#A...|0.049023116038479884|  MAX#MEAN|
|1F.EEK10|AFKN2J01N.0U01|PROCESS_GAS_8_O2#...| 0.04796866126622622|      MEAN|
|1F.EEK10|AFKN4X01N.0B01|LO_RF_POWER#AOTU_...| 0.04223486028603683|      MEAN|
|1F.EEK10|AFKN4X01N.0B01|CENTER_GAS_PRESSU...|  0.0416582018575752|      MEAN|
|1F.EEK10|AFKN2J01N.0U01|LOWER_TEMPERATURE...|0.038791287698234575|      MEAN|
|1F.EEK10|AFKN4X01N.0B01|APC_POSITION#AOTU...| 0.03359729720614707|MEAN#RANGE|
|1F.EEK10|AFKN4X01N.0B01|ESC_CURRENT#AOTU_...| 0.032

In [129]:
#####################################################################################
#############################将建模后的结果增加特定的列####################################
#####################################################################################
def add_certain_column(df, by, request_id, grpby_list):
    """
    Append request id, percentage weight and rank columns to the final modelling result.

    param df: final modelling result; must contain a helper column named 'add'
    param by: column(s) to group by (the manually added 'add' column)
    param request_id: request id written into every row
    param grpby_list: grouping columns, used to build the output schema
    return: Spark DataFrame with the grouping columns, stats, parametric_name, weight,
            request_id, weight_percent and index_no; rows with weight <= 0 are removed
    """
    # Output schema: the grouping columns as strings followed by the fixed result columns.
    schema_all = StructType(
        [StructField(name, StringType(), True) for name in grpby_list]
        + [StructField("stats", StringType(), True),
           StructField("parametric_name", StringType(), True),
           StructField("weight", FloatType(), True),
           StructField("request_id", StringType(), True),
           StructField("weight_percent", FloatType(), True),
           StructField("index_no", IntegerType(), True)]
    )

    @pandas_udf(returnType=schema_all, functionType=PandasUDFType.GROUPED_MAP)
    def get_result(final_res):
        ranked = (final_res
                  .assign(weight=final_res['weight'].astype(float))
                  .query("weight > 0")
                  .assign(request_id=request_id)
                  .assign(weight_percent=lambda frame: frame['weight'] * 100)
                  .sort_values('weight', ascending=False))
        # 1-based position after sorting by descending weight.
        ranked['index_no'] = list(range(1, len(ranked) + 1))
        return ranked.drop('add', axis=1)

    return df.groupby(by).apply(get_result)

In [130]:
request_id = 'sf'
final_res_bs = model_res_bs.withColumn('add', lit(0))
final_res_add_columns = add_certain_column(df=final_res_bs, by='add', request_id=request_id, grpby_list=grpby_list)
final_res_add_columns.show()

+--------------+--------+----------+--------------------+-----------+----------+--------------+--------+
|    PRODUCT_ID| OPER_NO|     stats|     parametric_name|     weight|request_id|weight_percent|index_no|
+--------------+--------+----------+--------------------+-----------+----------+--------------+--------+
|AFKN2J01N.0U01|1F.EEK10|MEAN#SLOPE|LO_RF_VPP#AOTU_ST...| 0.10312684|        sf|     10.312684|       1|
|AFKN2J01N.0U01|1F.EEK10|      MEAN|LO_RF_VPP#STEP2_MINI| 0.09257264|        sf|      9.257263|       2|
|AFKN2J01N.0U01|1F.EEK10|      MEAN|LO_RF_VPP#STEP2_M...| 0.07835981|        sf|     7.8359814|       3|
|AFKN2J01N.0U01|1F.EEK10|  MAX#MEAN|LO_RF_REF_POWER#A...|0.049023118|        sf|      4.902312|       4|
|AFKN2J01N.0U01|1F.EEK10|      MEAN|PROCESS_GAS_8_O2#...| 0.04796866|        sf|      4.796866|       5|
|AFKN4X01N.0B01|1F.EEK10|      MEAN|LO_RF_POWER#AOTU_...| 0.04223486|        sf|      4.223486|       6|
|AFKN4X01N.0B01|1F.EEK10|      MEAN|CENTER_GAS_PRESSU..

In [145]:
final_res_add_columns.columns

['OPER_NO',
 'PRODUCT_ID',
 'stats',
 'parametric_name',
 'weight',
 'request_id',
 'weight_percent',
 'index_no']

--------------------------------------------- 

In [None]:
def fit_big_data_model(df_run, data_dict_list_bs, grpby_list, request_id):
    """
    Run the complete large-sample modelling pipeline.

    The helper functions are called with the keyword names they actually declare
    (``grpby_list``), and the same ``grpby_list`` is used for every step.

    param df_run: pre-processed Spark DataFrame
    param data_dict_list_bs: selected large-sample combinations (list of dicts)
    param grpby_list: grouping columns
    param request_id: request id copied into the result and into error messages
    return: tuple ``(df1, df2)``. On failure ``df1`` is a one-row Spark DataFrame with
            ``code`` = 1, ``msg`` and ``requestId`` and ``df2`` is None; on success ``df1``
            is None and ``df2`` is the final result with the added columns.
    """
    def _failure(msg):
        # One-row status frame paired with None, built identically for every failure exit.
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        return spark.createDataFrame(df_kafka), None

    # 1. Select the large-sample rows used for modelling.
    df_run_bs = get_train_data(df_run, data_dict_list_bs)
    if df_run_bs.count() == 0:
        return _failure('数据库中暂无此类数据!')

    # 2. Count all bad wafers; at least 3 are required.
    bad_wafer_num_big_sample = get_all_bad_wafer_num(df_run_bs)
    if bad_wafer_num_big_sample < 3:
        return _failure('数据库中实际BAD_WAFER数量小于3片, 请提供更多的BAD_WAFER数量!')

    # 3. Fit the random-forest model per group.
    res = fit_rf_big_sample(df=df_run_bs, grpby_list=grpby_list)
    if res.count() == 0:
        return _failure('算法内部暂时异常!')

    # 4. Split the model output into scores and feature importances.
    s_res = split_score_big_sample(df=res, grpby_list=grpby_list)
    if s_res.count() == 0:
        return _failure('算法运行评分结果较低, 暂无输出, 建议增加BAD_WAFER数量')

    f_res = split_calculate_features_big_sample(df=res, grpby_list=grpby_list)
    if f_res.count() == 0:
        return _failure('算法结果求和暂时异常')

    # 5. Combine scores and importances into normalised weights.
    model_res_bs = get_finall_results_big_sample(s_res=s_res, f_res=f_res, grpby_list=grpby_list,
                                                 bad_wafer_num=bad_wafer_num_big_sample)
    if model_res_bs.count() == 0:
        return _failure('算法结果拼接暂时异常')

    # 6. Add the request-specific columns; the constant 'add' column puts all rows in one group.
    final_res_bs = model_res_bs.withColumn('add', lit(0))
    final_res_add_columns = add_certain_column(df=final_res_bs, by='add', request_id=request_id,
                                               grpby_list=grpby_list)
    if final_res_add_columns.count() == 0:
        return _failure('算法结果增加列暂时异常')
    return None, final_res_add_columns

In [None]:
# Main program.
# On every failure path `df1` is replaced by a one-row status frame (code 1 + message) and a
# ValueError is raised purely to jump to the end; the `except ValueError` below therefore
# does nothing on purpose.
# NOTE(review): a ValueError raised by any library call inside the try block is swallowed the
# same way, and in that case `df1` is not replaced by a status frame - confirm this is acceptable.
# NOTE(review): `df1`, `request_id`, `merge_operno`, `integrate_operno` and
# `fit_small_data_model` are not defined in this cell; presumably they come from other cells
# or modules - verify before running on a fresh kernel.
try:
    # Load the data from the database (currently disabled; `df1` from an earlier cell is used).
#     df1 = get_data_from_doris(select_condition_list=select_condition_list, table_name=table_name)
#     print(df1.count())
#     if df1.count() == 0:
#         msg = '解析SQL获取数据异常: 数据库中可能没有数据!'
#         df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
#         df1 = spark.createDataFrame(df_kafka)
#         raise ValueError

    # 1. Merge operation numbers and pre-process the data.
    df1 = integrate_operno(df=df1, merge_operno_list=merge_operno)
    print(df1.count())
    df_run = _pre_process(df1)
    print(df_run.count())
    if df_run.count() == 0:
        msg = '该条件下数据库中暂无数据，请检查！'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        raise ValueError

    # 2. Run the commonality analysis.
    common_res = commonality_analysis(df_run, grpby_list)
    common_res.show()
    if common_res.count() == 0:
        msg = '共性分析结果异常!'
        df_kafka = pd.DataFrame({"code": 1, "msg": f'{msg}', "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)
        raise ValueError

    # 3. Select the combinations with at least 3 good and at least 3 bad wafers; when there
    #    are none, fall back to the small-sample algorithm.
    data_dict_list_bs = get_data_list(common_res, grpby_list, big_or_small='big')
    print("data_dict_list_bs:", data_dict_list_bs)
    if len(data_dict_list_bs) != 0:
        print("****************大样本算法调用****************")
        df1, final_res_add_columns = fit_big_data_model(df_run, data_dict_list_bs, grpby_list, request_id)
    else:        
        print("****************小样本算法调用****************")
        df1, final_res_add_columns = fit_small_data_model(df_run, common_res, grpby_list, request_id)
    

    # A non-None df1 is the error status frame returned by the model function.
    if df1 is not None:
        raise ValueError
    else:
        # final_res_add_columns is the final result and is meant to be written back to the
        # database. The disabled code below sketches that step; the password was removed
        # from the notebook - read credentials from the environment, never hard-code them.
        # ddd = final_res_add_columns.toPandas()
        # user ="root"
        # host = "10.52.199.81"
        # password = os.environ["DORIS_PASSWORD"]
        # db = "etl"
        # port = 9030
        # engine = create_engine("mysql+pymysql://{user}:{password}@{host}:{port}/{db}".format(user = user,
        #                                                                                     password = password,
        #                                                                                     host = host,
        #                                                                                     port = port,
        #                                                                                     db = db))
        # doris_stream_load_from_df(ddd, engine, "results")

        # On success a status frame with code 0 is produced.
        print("运行成功")
        df_kafka = pd.DataFrame({"code": 0, "msg": "运行成功", "requestId": request_id}, index=[0])
        df1 = spark.createDataFrame(df_kafka)

# Deliberately empty: df1 already holds the error status frame set before the raise.
except ValueError as ve:
    pass

# Any other failure is reported in a status frame that carries the exception text.
except Exception as e:
    df_kafka = pd.DataFrame({"code": 1, "msg": f"主程序发生异常: {str(e)}", "requestId": request_id}, index=[0])
    df1 = spark.createDataFrame(df_kafka)

In [171]:
# path = 'D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/uva_algorithm/small_samples_data/SMALL_SAMPLE_UVA_ORIGINAL_DATA.csv'
# dfee = pd.read_csv(path)
# print(dfee.shape)

# bad_ = ['NBX255-05', 'NBX255-12', 'NBX255-15', 'NBX255-25', 'NAZ749-25', 'NAZ909-16', 'NAZ909-17', 'NAZ909-20']
# good_ = ['NBX255-03', 'NAZ749-08', 'NAZ749-09', 'NAZ749-10', 'NAZ909-06']

# def label_wafer(wafer_id):
#     if wafer_id in bad_:
#         return 1  
#     elif wafer_id in good_:
#         return 0  
#     else:
#         return None  

# dfee['label'] = dfee['WAFER_ID'].apply(label_wafer)

# dfee['label'].value_counts()

# dfee.to_csv('D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/uva_algorithm/small_samples_data/SMALL_SAMPLE_UVA_ORIGINAL_DATA_labeled.csv.go', index=0)