In [2]:
import warnings
import pandas as pd
import os
warnings.filterwarnings('ignore')
from scipy.stats import ttest_ind, mannwhitneyu, shapiro, levene

In [3]:
import pyspark.pandas as ps
from pyspark.sql.functions import pandas_udf, PandasUDFType, countDistinct, when, col, rank, lit



In [4]:
# Spark cluster mode
from pyspark.sql import SparkSession

# Interpreter path handed to PySpark through the PYSPARK_PYTHON environment variable.
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'

# Build (or reuse) the session; a parenthesised chain replaces backslash continuations.
# NOTE(review): master/driver addresses and resource sizes are hard-coded for one environment.
spark = (
    SparkSession.builder
    .appName("pandas_udf")
    .config('spark.sql.session.timeZone', 'Asia/Shanghai')
    .config("spark.scheduler.mode", "FAIR")
    .config('spark.driver.memory', '4g')
    .config('spark.driver.cores', '8')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.cores', '8')
    .config('spark.cores.max', '8')
    .config('spark.driver.host', '192.168.22.28')
    .master("spark://192.168.12.47:7077,192.168.12.48:7077")
    .getOrCreate()
)

In [171]:
# Load the defect inspection records from a local CSV export.
# NOTE(review): absolute, machine-specific path; this cell only runs on the author's
# workstation. Consider reading from a configurable data directory instead.
df_run_pandas = pd.read_csv('C:/Users/yang.wenjun/Desktop/晶合FDC-freedemo资料/df_defect.csv')
# Display the loaded frame as the cell output.
df_run_pandas

Unnamed: 0,WAFER_ID,OPER_NO,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,INSPECTION_TIME,label
0,NA0298-24,5FP10,NA02.000,3831272,EMNDE015FP10,6,2023/5/4 23:52,0
1,NA0302-24,5FP10,NA02.000,3828393,EMNDE015FP10,10,2023/5/5 23:52,1
2,NA0304-13,5FP10,NA02.000,3825514,EMNDE015FP10,3,2023/5/6 23:52,0
3,NA0298-25,5FP10,NA02.000,3822635,EMNDE015FP10,9,2023/5/7 23:52,1
4,NA0302-25,5FP10,NA12.000,3819756,EMNDE015FP10,3,2023/5/8 23:52,0
5,NA0304-14,5FP10,NA12.000,3816877,EMNDE015FP10,9,2023/5/9 23:52,1
6,NA0298-26,5FP10,NA12.000,3813998,EMNDE015FP10,1,2023/5/10 23:52,0
7,NA0302-26,5FP10,NA12.000,3811119,EMNDE015FP10,5,2023/5/11 23:52,0
8,NA0304-15,5FP10,NA12.000,3808240,EMNDE015FP10,9,2023/5/12 23:52,1
9,NA0298-27,5FP10,NA12.000,3805361,EMNH015FP15,8,2023/5/13 23:52,0


In [124]:
# df_run_pandas.loc[df_run_pandas['label'] == 0, 'RANDOM_DEFECTS'].tolist()

In [172]:
# Convert the pandas frame to a Spark DataFrame by way of a pandas-on-Spark frame.
df_run = ps.from_pandas(df_run_pandas).to_spark()
# Show the number of rows in the Spark DataFrame as the cell output.
df_run.count()

46

In [173]:
# Columns that identify one analysis group.
grpby_list = ['OPER_NO', 'RECIPE_ID']

# Per group: distinct wafers overall, distinct wafers with label 0 ("good") and
# distinct wafers with label 1 ("bad"). `when(...)` without `otherwise` yields NULL
# for non-matching rows, and a multi-column distinct count skips rows with a NULL.
grps = (
    df_run
    .groupBy(*grpby_list)
    .agg(
        countDistinct(col('WAFER_ID')).alias('wafer_count'),
        countDistinct(col('WAFER_ID'), when(col('label') == 0, 1)).alias('good_num'),
        countDistinct(col('WAFER_ID'), when(col('label') == 1, 1)).alias('bad_num'),
    )
    # Groups with the most bad wafers first.
    .orderBy(col('bad_num').desc())
)
grps.show()

+-------+------------+-----------+--------+-------+
|OPER_NO|   RECIPE_ID|wafer_count|good_num|bad_num|
+-------+------------+-----------+--------+-------+
|  5FP10| EMNH015FP15|         11|       3|      8|
|  5FP10|EMNDE015FP10|          9|       5|      4|
|  5FP15| EMNH015FP18|         18|      14|      4|
|  5FP15|EMNH015FP222|          4|       1|      3|
|  5FP14|EMNH015FP222|          4|       3|      1|
+-------+------------+-----------+--------+-------+



In [161]:
# Operation / recipe pair to inspect in detail.
oper, recipe = '5FP15', 'EMNH015FP222'
# Build the predicate from Column expressions rather than interpolating the values
# into a SQL string, so values containing quotes cannot break (or alter) the filter.
df_specific = df_run.filter((col('OPER_NO') == oper) & (col('RECIPE_ID') == recipe))
df_specific.show()

+---------+-------+--------+----------+------------+--------------+---------------+-----+
| WAFER_ID|OPER_NO|  LOT_ID|RECIPE_KEY|   RECIPE_ID|RANDOM_DEFECTS|INSPECTION_TIME|label|
+---------+-------+--------+----------+------------+--------------+---------------+-----+
|NA0302-30|  5FP15|NA12.000|   3721870|EMNH015FP222|           757|2023/6/11 23:52|    1|
|NA0298-31|  5FP15|NA12.000|   3718991|EMNH015FP222|           767|2023/6/12 23:52|    1|
|NA0302-31|  5FP15|NA12.000|   3716112|EMNH015FP222|           777|2023/6/13 23:52|    1|
|NA0298-32|  5FP15|NA12.000|   3713233|EMNH015FP222|           787|2023/6/14 23:52|    0|
+---------+-------+--------+----------+------------+--------------+---------------+-----+



In [None]:
# Collect the filtered Spark rows to the driver as a pandas DataFrame.
df_specific_pandas = df_specific.toPandas()

In [167]:
# def get_model_result(df_run):

#     good_wafers = df_run.loc[df_run['label'] == 0, 'RANDOM_DEFECTS'].tolist()
#     bad_wafers = df_run.loc[df_run['label'] == 1, 'RANDOM_DEFECTS'].tolist()
#     print(good_wafers)
#     print(bad_wafers)

#     if len(good_wafers) == 0 or len(bad_wafers) == 0:
#         return pd.DataFrame()

#     p_shapiro_good, p_shapiro_bad, p_levene = do_normality_tests(good_wafers, bad_wafers)
#     statistic, p_value = get_difference_results(good_wafers, bad_wafers, p_shapiro_good, p_shapiro_bad, p_levene)

#     importance_dict = {'statistic': statistic, 'importance': 1 - p_value}
#     importance_dict.update({col_: df_run[col_].values[0] for col_ in grpby_list})
#     importance_res = pd.DataFrame(importance_dict, index=[0])
#     return importance_res

# res = get_model_result(df_specific_pandas)
# type(res)

In [148]:
# good_wafers = df_specific.filter(col("label") == 0).select("RANDOM_DEFECTS").rdd.flatMap(lambda x: x).collect()
# bad_wafers = df_specific.filter(col("label") == 1).select("RANDOM_DEFECTS").rdd.flatMap(lambda x: x).collect()

In [174]:
def extend_wafers(good_wafers, bad_wafers, min_size=3, fill_value=0):
    """Pad each sample up to ``min_size`` values without touching the inputs.

    Samples shorter than ``min_size`` are right-padded with ``fill_value``;
    samples that are already long enough are returned as unchanged copies.

    Parameters
    ----------
    good_wafers, bad_wafers : iterable of numbers
        Measurements of the two groups. Any iterable is accepted; the
        caller's objects are never modified.
    min_size : int, default 3
        Minimum length of each returned list.
    fill_value : number, default 0
        Value used for padding.

    Returns
    -------
    tuple of (list, list)
        New lists for the good and the bad group, in that order.

    Notes
    -----
    NOTE(review): padding with artificial values changes the data that the
    downstream statistical tests see; confirm this is acceptable for groups
    with fewer than ``min_size`` wafers.
    """
    def _pad(values):
        # Copy first so the caller's list is never mutated by the padding.
        padded = list(values)
        # A non-positive multiplier yields an empty list, i.e. no padding.
        padded.extend([fill_value] * (min_size - len(padded)))
        return padded

    return _pad(good_wafers), _pad(bad_wafers)

def do_normality_tests(good_wafers, bad_wafers):
    """Run the assumption checks that decide which difference test is valid.

    Both samples are first passed through ``extend_wafers``; the returned
    samples are then checked with Shapiro-Wilk (normality, one test per group)
    and Levene (homogeneity of variance across the two groups).

    Returns
    -------
    tuple
        ``(p_shapiro_good, p_shapiro_bad, p_levene)`` - the p-values of the
        three tests, in that order.
    """
    padded_good, padded_bad = extend_wafers(good_wafers=good_wafers, bad_wafers=bad_wafers)

    # Normality assumption: element 1 of each result is the p-value.
    p_shapiro_good = shapiro(padded_good)[1]
    p_shapiro_bad = shapiro(padded_bad)[1]

    # Homogeneity-of-variance assumption.
    p_levene = levene(padded_good, padded_bad)[1]

    return p_shapiro_good, p_shapiro_bad, p_levene

def get_difference_results(good_wafers, bad_wafers, p_shapiro_good, p_shapiro_bad, p_levene, alpha=0.05):
    """Test whether the good and bad samples differ, choosing the test from
    the assumption-check p-values.

    Test selection:

    * both Shapiro p-values > ``alpha`` and ``p_levene > alpha``:
      Student's t-test (``equal_var=True``);
    * both Shapiro p-values > ``alpha`` and ``p_levene <= alpha``:
      Welch's t-test (``equal_var=False``);
    * otherwise: Mann-Whitney U test. This also covers NaN p-values, because
      every comparison against NaN is false.

    Parameters
    ----------
    good_wafers, bad_wafers : list of numbers
        Measurements of the two groups; passed through ``extend_wafers``
        before testing.
    p_shapiro_good, p_shapiro_bad : float
        Shapiro-Wilk p-values of the two groups.
    p_levene : float
        Levene p-value for equality of variances.
    alpha : float, default 0.05
        Significance level applied to the assumption checks.

    Returns
    -------
    tuple
        ``(statistic, p_value)`` of the selected test.
    """
    good_wafers, bad_wafers = extend_wafers(good_wafers=good_wafers, bad_wafers=bad_wafers)

    both_normal = p_shapiro_good > alpha and p_shapiro_bad > alpha

    if both_normal and p_levene > alpha:
        statistic, p_value = ttest_ind(good_wafers, bad_wafers, equal_var=True)

    # `<=` (not `<`): with p_levene exactly equal to alpha the data are still
    # treated as normal, so the choice is Welch rather than falling through to
    # the non-parametric test.
    elif both_normal and p_levene <= alpha:
        statistic, p_value = ttest_ind(good_wafers, bad_wafers, equal_var=False)

    else:
        statistic, p_value = mannwhitneyu(good_wafers, bad_wafers)
    return statistic, p_value

In [175]:
# Smoke test of the two helpers on small hand-made samples (three values per group).
good_wafers = [1,2, 3]
bad_wafers = [3, 4, 0]
# Assumption checks first; their p-values select the difference test below.
p_shapiro_good, p_shapiro_bad, p_levene = do_normality_tests(good_wafers, bad_wafers)
# The last expression is shown as the cell output: (statistic, p_value).
get_difference_results(good_wafers, bad_wafers, p_shapiro_good, p_shapiro_bad, p_levene)

(-0.25000000000000006, 0.8149020114591812)

In [177]:
from pyspark.sql.types import StringType, IntegerType, FloatType, StructType, StructField
def fit_defect_model(df, grpby_list):
    """Compare RANDOM_DEFECTS of good vs. bad wafers within every group.

    The Spark DataFrame is grouped by ``grpby_list``; each group is handed to
    pandas, where wafers with ``label == 0`` (good) and ``label == 1`` (bad)
    are compared through ``do_normality_tests`` and ``get_difference_results``.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Must contain the columns in ``grpby_list`` plus ``label`` and
        ``RANDOM_DEFECTS``.
    grpby_list : list of str
        Grouping columns. They are declared as ``StringType`` in the output
        schema. NOTE(review): non-string grouping columns are not converted
        here; confirm callers only group by string columns.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per group that has at least one good and one bad wafer, with
        the grouping columns, ``statistic`` and ``importance`` (``1 - p_value``).
        Both numeric columns are ``FloatType`` (32-bit). Groups missing either
        class contribute no row.
    """
    struct_fields = [StructField(col_, StringType(), True) for col_ in grpby_list]
    struct_fields.extend([StructField("statistic", FloatType(), True),
                          StructField("importance", FloatType(), True)])
    schema_all = StructType(struct_fields)
    # Column order of every frame returned to Spark, taken from the schema.
    output_columns = schema_all.fieldNames()

    def get_model_result(df_run):
        """Compute the one-row result for a single group (pandas DataFrame)."""
        good_wafers = df_run.loc[df_run['label'] == 0, 'RANDOM_DEFECTS'].tolist()
        bad_wafers = df_run.loc[df_run['label'] == 1, 'RANDOM_DEFECTS'].tolist()

        if not good_wafers or not bad_wafers:
            # Nothing to compare: return zero rows that still carry the schema
            # columns, rather than a column-less frame.
            return pd.DataFrame(columns=output_columns)

        p_shapiro_good, p_shapiro_bad, p_levene = do_normality_tests(good_wafers, bad_wafers)
        statistic, p_value = get_difference_results(good_wafers, bad_wafers, p_shapiro_good, p_shapiro_bad, p_levene)

        # The grouping values are constant within a group, so take the first row's.
        row = {col_: df_run[col_].iloc[0] for col_ in grpby_list}
        row['statistic'] = statistic
        row['importance'] = 1 - p_value
        # `columns=` fixes the column order to match the declared schema.
        return pd.DataFrame([row], columns=output_columns)

    # applyInPandas is the current replacement for the deprecated
    # pandas_udf(..., PandasUDFType.GROUPED_MAP) + groupby().apply() form.
    return df.groupby(grpby_list).applyInPandas(get_model_result, schema=schema_all)

In [178]:
# Run the per-group good-vs-bad comparison for every (OPER_NO, RECIPE_ID) combination.
res = fit_defect_model(df=df_run, grpby_list=['OPER_NO', 'RECIPE_ID'])
# Collect the per-group results to the driver and display them.
res.toPandas()

Unnamed: 0,OPER_NO,RECIPE_ID,statistic,importance
0,5FP10,EMNDE015FP10,0.0,0.982549
1,5FP10,EMNH015FP15,0.0,0.987879
2,5FP14,EMNH015FP222,9.0,0.923477
3,5FP15,EMNH015FP18,0.0,0.996865
4,5FP15,EMNH015FP222,3.0,0.342095


#### 检验

In [57]:
# Shapiro-Wilk normality test on the smallest accepted sample.
shapiro([1,2, 30])  #  Data must be at least length 3.

ShapiroResult(statistic=0.7758302092552185, pvalue=0.058016655848620036)

In [67]:
levene([1, 2, 9], [3, 200, 56])  # Levene test for homogeneity of variance; preferably three values per group as well

LeveneResult(statistic=2.2387189771872653, pvalue=0.2089273956185376)

#### equal_var=True的情况，必须其中一个的数量>=2

In [17]:
# Student's t-test (equal_var=True) with group sizes 2 and 1.
ttest_ind([1, 0], [3], equal_var=True)

TtestResult(statistic=-2.886751345948129, pvalue=0.21229561500965655, df=1.0)

In [23]:
# Student's t-test (equal_var=True) with group sizes 1 and 2.
ttest_ind([1], [3, 4], equal_var=True)

TtestResult(statistic=-2.886751345948129, pvalue=0.21229561500965655, df=1.0)

#### equal_var=False的情况，两个的数量都要>=2

In [25]:
# Welch's t-test (equal_var=False) with two values in each group.
ttest_ind([1, 0], [3, 5], equal_var=False)

TtestResult(statistic=-3.1304951684997055, pvalue=0.12904783554828336, df=1.4705882352941178)

#### mannwhitneyu检验，如果都只有一个值，结果永远是pvalue=1.0，无效, 最好两个以上

In [53]:
# Mann-Whitney U test on two samples of two values each.
mannwhitneyu([-111196, -56], [200, 3])

MannwhitneyuResult(statistic=0.0, pvalue=0.3333333333333333)

In [52]:
# 进行 Mann-Whitney U 检验, P值越小越显著