In [1]:
import warnings
# Suppress every Python warning raised from here on in this kernel.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import pyspark.pandas as ps
import requests
from pca import pca
import json

from sqlalchemy import create_engine

from pyspark.sql.functions import pandas_udf, PandasUDFType, max, col, countDistinct, when, rank, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import ClusterCentroids


# from backend_spark.doris_common.doris_client import DorisClient
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional



In [3]:
import findspark
from pyspark.sql import SparkSession

# Initialise findspark before the session is created.
findspark.init()

# Local-mode session ("local[*]") with the SQL session time zone set to Asia/Shanghai.
spark = (
    SparkSession.builder
    .appName("ywj")
    .config('spark.sql.session.timeZone', 'Asia/Shanghai')
    .master("local[*]")
    .getOrCreate()
)

In [4]:
# Load the small-sample CSV into pandas and display it.
# NOTE(review): this is a machine-specific absolute path; the notebook will not run elsewhere as-is.
SMALL_SAMPLE_CSV = "D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisData/small_samples/small2.csv"
df1_pandas = pd.read_csv(SMALL_SAMPLE_CSV)
df1_pandas

Unnamed: 0,TOOL_ID,TOOL_NAME,RUN_ID,EQP_NAME,CASE_INFO,PRODUCT_ID,PRODG1,OPER_NO,LOT_ID,WAFER_ID,...,LOWER_OUTLIER,UPPER_OUTLIER,RULES_ENABLED,ALARM_RULE,RESULT,STATUS,REGION,ERROR_MSG,STATISTIC_RESULT,VERSION
0,9289,PBT01_CGHA_4-34,351230,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-05,...,,,1,,249.9848,NORMAL,LOWER_NORMAL,,249.9848,3
1,9287,PBT01_CGHA_4-14,360329,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-06,...,,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3
2,9279,PBT01_CLHA_4-12,360246,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-06,...,,,1,,150.0075,NORMAL,LOWER_NORMAL,,150.0075,4
3,9287,PBT01_CGHA_4-14,329563,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,,1,,249.98625,NORMAL,LOWER_NORMAL,,249.98625,3
4,9287,PBT01_CGHA_4-14,362271,PBT01,2023-12-20,AFPNR901N.0B0L,L2800Z2N,1V.PPB10,NBX293.200,NBX293-06,...,,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3
5,9279,PBT01_CLHA_4-12,329480,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,,1,,149.937083,NORMAL,LOWER_NORMAL,,149.937083,4
6,9287,PBT01_CGHA_4-14,323876,PBT01,2023-09-28,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX220.150,NBX220-06,...,,,1,,249.9825,NORMAL,LOWER_NORMAL,,249.9825,3
7,9280,PBT01_CLHA_4-21,317196,PBT01,2023-09-25,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX219.130,NBX219-08,...,,,1,,149.998261,NORMAL,LOWER_NORMAL,,149.998261,4
8,9288,PBT01_CGHA_4-24,312691,PBT01,2023-09-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX220.040,NBX220-20,...,,,1,,249.98875,NORMAL,UPPER_NORMAL,,249.98875,3
9,9288,PBT01_CGHA_4-24,355016,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-08,...,,,1,,249.982,NORMAL,LOWER_NORMAL,,249.982,3


In [5]:
# Convert the pandas frame to a Spark DataFrame through the pandas-on-Spark API, then count its rows.
df1 = ps.from_pandas(df1_pandas).to_spark()
df1.count()

24

In [6]:
############################################
######## 1. The customer defines only bad_wafer = [...] ########
############################################
# The BAD_WAFER entries passed in are combined with |, e.g.
# F.col('WAFER_ID').like('NDJ065%') | F.col('WAFER_ID').like('NDJ067%'), and used as the condition that sets the label.
# The isin (exact-match) form is OR-ed into the same condition as well.

def get_label_single(df, bad_wafer):
    """
    Add a 0/1 ``label`` column when only the bad wafers are specified.

    A row gets label 1 when its WAFER_ID equals one of the entries in
    ``bad_wafer`` or matches one of them as a SQL LIKE pattern (so entries such
    as 'NDJ065%' are supported); every other row gets label 0.

    The condition is assembled from Column expressions rather than from Python
    source passed to ``eval``: an empty ``bad_wafer`` therefore no longer raises
    a SyntaxError, and quotes inside a wafer ID can no longer break or inject
    into the expression.

    param df: Spark DataFrame that has a WAFER_ID column
    param bad_wafer: iterable of wafer IDs and/or LIKE patterns marking bad wafers
    return: ``df`` with the additional integer column ``label``
    """
    wafer_id = col('WAFER_ID')
    bad_wafer = list(bad_wafer)

    # Start from the exact-match test and OR in one LIKE test per entry.
    # With an empty list, isin() matches nothing, so every row is labelled 0.
    is_bad = reduce(lambda acc, pattern: acc | wafer_id.like(pattern),
                    bad_wafer,
                    wafer_id.isin(bad_wafer))

    return df.withColumn('label', when(is_bad, 1).otherwise(0))


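############################################
## 2. The customer defines both bad_wafer = [...] and good_wafer = [...] ######
############################################
# The BAD_WAFER entries passed in are combined with |,
# the GOOD_WAFER entries passed in are combined with | as well,
# and the isin (exact-match) form is OR-ed into each condition too.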

def get_label_double(df, bad_wafer, good_wafer):
    """
    Add a 0/1 ``label`` column when both good and bad wafers are specified,
    and drop every row that belongs to neither list.

    A row is good (label 0) when its WAFER_ID equals, or LIKE-matches, an entry
    of ``good_wafer``; otherwise it is bad (label 1) when it equals, or
    LIKE-matches, an entry of ``bad_wafer``. The good list is tested first, so a
    wafer present in both lists is labelled 0.

    The conditions are assembled from Column expressions rather than from Python
    source passed to ``eval``: empty lists no longer raise a SyntaxError and
    quotes inside a wafer ID can no longer break or inject into the expression.
    Unmatched rows get a NULL label and are filtered out, which replaces the
    former magic sentinel value.

    param df: Spark DataFrame that has a WAFER_ID column
    param bad_wafer: iterable of wafer IDs and/or LIKE patterns marking bad wafers
    param good_wafer: iterable of wafer IDs and/or LIKE patterns marking good wafers
    return: the rows of ``df`` matching either list, with the integer column ``label``
    """
    wafer_id = col('WAFER_ID')

    def matches_any(entries):
        # Exact match via isin(), OR-ed with one LIKE test per entry.
        entries = list(entries)
        return reduce(lambda acc, pattern: acc | wafer_id.like(pattern),
                      entries,
                      wafer_id.isin(entries))

    # No otherwise(): rows matching neither list end up with a NULL label.
    label = when(matches_any(good_wafer), 1 - 1).when(matches_any(bad_wafer), 1)
    labelled = df.withColumn('label', label)
    return labelled.filter(col('label').isNotNull())

In [7]:
# Wafer IDs to be labelled good (0) and bad (1).
good = [
    "NBX221-13",
    "NBX220-06",
    "NBX272-19",
    "NBX219-08",
]
bad = [
    "NBX265-05",
    "NBX265-06",
    "NBX265-07",
    "NBX265-08",
    "NBX293-06",
    "NBX293-07",
    "NAZ998-21",
    "NBX220-20",
]

# Attach the label column; wafers found in neither list are dropped.
df1 = get_label_double(df1, bad_wafer=bad, good_wafer=good)

In [8]:
# Row count after labelling (rows whose wafer is in neither list have been removed).
df1.count()

24

In [9]:
# Number of distinct wafers carrying each label (1 = bad, 0 = good).
bad_wafer_num = df1.filter("label == 1").select('WAFER_ID').distinct().count()
good_wafer_num = df1.filter("label == 0").select('WAFER_ID').distinct().count()
print(bad_wafer_num)
print(good_wafer_num)

8
4


In [10]:
############################################################################
############################ FDC data preprocessing ########################
############################################################################
def _pre_process(df):
    """
    Preprocess the data of one CASE read from the database.

    Steps: keep only the columns used downstream, drop rows without a
    STATISTIC_RESULT, remove fully duplicated rows, and keep for every
    (WAFER_ID, OPER_NO, TOOL_ID) only the rows of its largest RUN_ID
    (treated as the latest run). Further steps are expected to be added later.

    param df: labelled Spark DataFrame for one CASE
    return: the preprocessed Spark DataFrame
    """
    used_columns = ['WAFER_ID', 'TOOL_ID', 'RUN_ID', 'EQP_NAME', 'PRODUCT_ID', 'PRODG1',
                    'TOOL_NAME', 'OPER_NO', 'parametric_name', 'STATISTIC_RESULT', 'label']
    run_keys = ['WAFER_ID', 'OPER_NO', 'TOOL_ID', 'RUN_ID']

    # Column subset -> non-null results -> de-duplicated over all columns
    cleaned = (
        df.select(*used_columns)
        .filter(col('STATISTIC_RESULT').isNotNull())
        .dropDuplicates()
    )

    # One row per wafer/operation/tool holding its largest RUN_ID.
    # `max` here is the pyspark.sql.functions aggregate imported at the top of the notebook.
    latest_run = (
        cleaned.groupBy('WAFER_ID', 'OPER_NO', 'TOOL_ID')
        .agg(max('RUN_ID').alias('RUN_ID'))
        .dropDuplicates(subset=run_keys)
    )

    # The inner join keeps only the rows that belong to that latest run.
    return cleaned.join(latest_run, on=run_keys, how='inner')


def commonality_analysis(df_run, grpby_list):
    """
    Summarise, per group, how many distinct wafers it contains in total and how
    many of them are good / bad, then keep the groups with the most bad wafers.

    param df_run: preprocessed data
    param grpby_list: column names to group by
    return: the single summary row when only one group exists; otherwise the
            groups with bad_num > 0 whose rank by descending bad_num is <= 5.
            rank() gives tied groups the same rank, so more than five rows can
            be returned.
    """
    # when() without otherwise() is NULL for non-matching rows, and a distinct count
    # over several columns skips tuples containing NULL, so these count only the
    # wafers that carry the given label.
    good_marker = when(df_run['label'] == 0, 1)
    bad_marker = when(df_run['label'] == 1, 1)

    summary = df_run.groupBy(grpby_list).agg(
        countDistinct('WAFER_ID').alias('wafer_count'),
        countDistinct('WAFER_ID', good_marker).alias('good_num'),
        countDistinct('WAFER_ID', bad_marker).alias('bad_num'),
    )
    summary = summary.orderBy('bad_num', ascending=False)

    # Single operation + single chamber: only one group, nothing to rank.
    if summary.count() == 1:
        return summary

    with_bad = summary.filter(summary['bad_num'] > 0)
    by_bad_desc = Window().orderBy(col("bad_num").desc())
    ranked = with_bad.withColumn("rank", rank().over(by_bad_desc))
    top_groups = ranked.filter(col("rank") <= 5).drop("rank")
    return top_groups

In [11]:
# Preprocess the labelled data and print how many rows remain.
df_run = _pre_process(df1)
print(df_run.count())

24


In [12]:
# Run the commonality analysis grouped by PRODG1, OPER_NO and TOOL_NAME and display the result.
grpby_list = ['PRODG1', 'OPER_NO', 'TOOL_NAME']
common_res = commonality_analysis(df_run, grpby_list)
common_res.show()

+--------+--------+---------------+-----------+--------+-------+
|  PRODG1| OPER_NO|      TOOL_NAME|wafer_count|good_num|bad_num|
+--------+--------+---------------+-----------+--------+-------+
|L2800Z2N|1V.PPB10|PBT01_CGHA_4-14|          4|       3|      1|
|L2800Z2N|1V.PPB10|PBT01_CLHA_4-12|          4|       3|      1|
|L2800Z3N|1V.PPB10|PBT01_CLHA_4-12|          1|       0|      1|
|L2800Z1N|1V.PPB10|PBT01_CGHA_4-14|          1|       0|      1|
|L2800Z3N|1V.PPB10|PBT01_CGHA_4-13|          1|       0|      1|
|L2800Z3N|1V.PPB10|PBT01_CLHA_4-11|          1|       0|      1|
|L2800Z1N|1V.PPB10|PBT01_CLHA_4-12|          1|       0|      1|
|L2800Z3N|1V.PPB10|PBT01_CLHA_4-21|          1|       0|      1|
|L2800Z3N|1V.PPB10|PBT01_CLHA_4-31|          1|       0|      1|
|L2800Z2N|1V.PPB10|PBT01_CLHA_4-11|          1|       0|      1|
|L2800Z2N|1V.PPB10|PBT01_CGHA_4-13|          1|       0|      1|
|L2800Z2N|1V.PPB10|PBT01_CLHA_4-21|          2|       1|      1|
|L2800Z3N|1V.PPB10|PBT01_

In [None]:
# df_cr = common_res.orderBy(col("bad_num").desc(), col("wafer_count").desc(), col("good_num").desc()).limit(5)
# df_cr.show()

In [13]:
############################################################################
############################ Get the small-sample data #####################
############################################################################
def get_data_list_small_sample(common_res, grpby_list):
    """
    Select the groups to be modelled in the small-sample case.

    Only groups with at least one good and at least one bad wafer are kept. To
    avoid too many combinations they are sorted by bad_num, wafer_count and
    good_num (all descending) and the first 5 are taken.

    param common_res: result of the commonality analysis
    param grpby_list: the grouping columns used for the analysis; when it
                      contains 'PRODG1' the returned dicts include that key
    return: list of dicts, one per selected group, keyed by
            ('PRODG1',) 'OPER_NO', 'TOOL_NAME'; None if anything fails (the
            error is printed instead of being discarded silently)
    """
    try:
        mixed_groups = common_res.filter("good_num >= 1 AND bad_num >= 1")
        mixed_groups = mixed_groups.orderBy(col("bad_num").desc(),
                                            col("wafer_count").desc(),
                                            col("good_num").desc()).limit(5)

        if 'PRODG1' in grpby_list:
            key_columns = ['PRODG1', 'OPER_NO', 'TOOL_NAME']
        else:
            key_columns = ['OPER_NO', 'TOOL_NAME']

        return [row.asDict() for row in mixed_groups.select(*key_columns).collect()]
    except Exception as e:
        # Still best-effort (callers receive None), but report why it failed.
        print(str(e))
        return None
    

def get_train_data(df_run, data_dict_list):
    """
    Filter the preprocessed data down to the groups that will be modelled.

    Each dict in ``data_dict_list`` describes one group. If the first dict has
    three entries the groups are matched on PRODG1, OPER_NO and TOOL_NAME,
    otherwise on OPER_NO and TOOL_NAME. The rows of every group are selected
    separately and the selections are unioned in list order.

    The filters are built from Column comparisons rather than formatted SQL
    strings, so values containing quotes cannot break the predicate.

    param df_run: preprocessed data
    param data_dict_list: list of group dicts as returned by get_data_list_small_sample
    return: Spark DataFrame with the rows of all listed groups; None when the
            list is empty/None or when anything fails (the error is printed)
    """
    try:
        if not data_dict_list:
            print("get_train_data: no group combinations were supplied")
            return None

        if len(data_dict_list[0]) == 3:
            key_columns = ['PRODG1', 'OPER_NO', 'TOOL_NAME']
        else:
            key_columns = ['OPER_NO', 'TOOL_NAME']

        def rows_of(group):
            # AND together one equality test per key column.
            predicate = reduce(lambda left, right: left & right,
                               [col(key) == group[key] for key in key_columns])
            return df_run.filter(predicate)

        return reduce(DataFrame.union, [rows_of(group) for group in data_dict_list])
    except Exception as e:
        # Still best-effort (callers receive None), but report why it failed.
        print(str(e))
        return None

In [14]:
# Pick the groups to model (those with both good and bad wafers) as a list of dicts.
data_dict_list_ss = get_data_list_small_sample(common_res=common_res, grpby_list=grpby_list)
data_dict_list_ss

[{'PRODG1': 'L2800Z2N', 'OPER_NO': '1V.PPB10', 'TOOL_NAME': 'PBT01_CGHA_4-14'},
 {'PRODG1': 'L2800Z2N', 'OPER_NO': '1V.PPB10', 'TOOL_NAME': 'PBT01_CLHA_4-12'},
 {'PRODG1': 'L2800Z2N', 'OPER_NO': '1V.PPB10', 'TOOL_NAME': 'PBT01_CLHA_4-21'},
 {'PRODG1': 'L2800Z2N', 'OPER_NO': '1V.PPB10', 'TOOL_NAME': 'PBT01_CGHA_4-24'}]

In [None]:
# Restrict the preprocessed data to the selected groups and print the resulting row count.
df_run_ss = get_train_data(df_run=df_run, data_dict_list=data_dict_list_ss)
print(df_run_ss.count())

In [16]:
def fit_pca_small_sample(df, by):
    """
    Fit a PCA per group and return the best feature of each retained component.

    For every group defined by ``by`` the rows are pivoted to one row per
    (WAFER_ID, label) and one column per parameter combination, missing cells
    are filled with the column mean, and a PCA (n_components=0.8) is fitted on
    the parameter columns. From the model's ``topfeat`` table the rows of type
    'best' are kept.

    Spark evaluates lazily: this function only builds the plan. Anything raised
    inside the per-group function surfaces when an action (count, toPandas, ...)
    is executed on the returned DataFrame, so it is NOT caught by the
    try/except below, which only covers errors raised while the plan is built.

    :param df: Spark DataFrame holding the small-sample group data
    :param by: list of grouping columns; three entries means
               PRODG1/OPER_NO/TOOL_NAME, otherwise OPER_NO/TOOL_NAME
    :return: Spark DataFrame with columns feature (string), loading (float,
             absolute value) and bad_wafer (int, sum of the labels in the
             group); None if building the plan fails
    """
    try:
        schema_all = StructType(
            [StructField("feature", StringType(), True),
             StructField("loading", FloatType(), True),
             StructField("bad_wafer", IntegerType(), True)])

        if len(by) == 3:
            pivot_columns = ['PRODG1', 'OPER_NO', 'TOOL_NAME', 'parametric_name']
        else:
            pivot_columns = ['OPER_NO', 'TOOL_NAME', 'parametric_name']

        def get_model_results(df_run):
            # df_run is the pandas DataFrame of a single group.
            df_pivot = df_run.dropna(axis=0).pivot_table(index=['WAFER_ID', 'label'],
                                                         columns=pivot_columns,
                                                         values=['STATISTIC_RESULT'])

            # Flatten the MultiIndex column labels into '#'-joined names.
            df_pivot.columns = df_pivot.columns.map('#'.join)
            df_pivot = df_pivot.fillna(df_pivot.mean()).reset_index(drop=False)

            # Independent variables: everything except the identifier and the label
            x_train = df_pivot[df_pivot.columns.difference(['WAFER_ID', 'label']).tolist()]

            # Fit the model
            model = pca(n_components=0.8, verbose=None)
            results = model.fit_transform(x_train)
            res_top = results['topfeat']

            # .copy() so the assignments below modify an independent frame, not a slice of res_top
            res_top_select = res_top.loc[res_top['type'] == 'best', ['feature', 'loading']].copy()
            res_top_select['loading'] = abs(res_top_select['loading'])
            res_top_select['bad_wafer'] = sum(df_pivot['label'])
            return res_top_select

        # applyInPandas replaces the deprecated pandas_udf(..., GROUPED_MAP) + apply() combination.
        return df.groupby(by).applyInPandas(get_model_results, schema=schema_all)
    except Exception as e:
        print(str(e))
        return None

In [17]:
# Build the per-group PCA result; count() is the action that actually executes the grouped function.
res = fit_pca_small_sample(df=df_run_ss, by=grpby_list)
res.count()

Py4JJavaError: An error occurred while calling o315.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 135.0 failed 1 times, most recent failure: Lost task 0.0 in stage 135.0 (TID 392) (IKAS-NB-203.oa.ikasinfo.com executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:599)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:581)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:107)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:88)
	... 21 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:599)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:581)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:107)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:50)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.hashAgg_doAggregateWithoutKey_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage18.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
Caused by: java.io.EOFException
	at java.io.DataInputStream.readInt(DataInputStream.java:392)
	at org.apache.spark.sql.execution.python.PythonArrowOutput$$anon$1.read(PythonArrowOutput.scala:88)
	... 21 more


In [None]:
# Collect the PCA result to the driver as a pandas DataFrame.
res_pandas = res.toPandas()

In [None]:
# Collect the selected-group data to the driver as a pandas DataFrame and display it.
df_run_ss_pandas = df_run_ss.toPandas()
df_run_ss_pandas

In [None]:
# Pull the rows of one selected group (list index 3) out of the pandas copy.
i = 3
combo = data_dict_list_ss[i]
prod = combo['PRODG1']
oper = combo['OPER_NO']
tool = combo['TOOL_NAME']
group_query = "PRODG1 == '{}' & OPER_NO == '{}' & TOOL_NAME == '{}'".format(prod, oper, tool)
df_run_ss_pandas1 = df_run_ss_pandas.query(group_query)
df_run_ss_pandas1

In [None]:
# Reshape to one row per (WAFER_ID, label) and one column per
# (PRODG1, OPER_NO, TOOL_NAME, parametric_name) combination.
pivot_columns = ['PRODG1', 'OPER_NO', 'TOOL_NAME', 'parametric_name']
df_pivot = df_run_ss_pandas1.dropna(axis=0).pivot_table(
    index=['WAFER_ID', 'label'],
    columns=pivot_columns,
    values=['STATISTIC_RESULT'],
)

# Flatten the MultiIndex column labels into '#'-joined names.
df_pivot.columns = df_pivot.columns.map('#'.join)

# Fill missing cells with the column mean, then turn WAFER_ID/label back into ordinary columns.
column_means = df_pivot.mean()
df_pivot = df_pivot.fillna(column_means).reset_index(drop=False)
df_pivot

In [None]:
# Independent variables: every column except the identifier and the label
x_train = df_pivot[df_pivot.columns.difference(['WAFER_ID', 'label']).tolist()]

# Fit the model
model = pca(n_components=0.8, verbose=None)
results = model.fit_transform(x_train)
res_top = results['topfeat']
# Keep the 'best' rows; .copy() so the assignments below modify an independent
# frame rather than a slice of res_top (avoids SettingWithCopy problems).
res_top_select = res_top.loc[res_top['type'] == 'best', ['feature', 'loading']].copy()
res_top_select['loading'] = abs(res_top_select['loading'])
res_top_select['bad_wafer'] = sum(df_pivot['label'])

In [None]:
# Display the selected features with their absolute loadings and the bad-wafer count.
res_top_select