In [18]:
import numpy as np
import pyspark
import pandas as pd
from imblearn.under_sampling import ClusterCentroids
from pca import pca
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType
from pyspark.sql.functions import pandas_udf, PandasUDFType, lit, col, countDistinct, when, monotonically_increasing_id, explode, split 
from typing import List, Dict, Union

import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
import os
import pyspark.pandas as ps
from pyspark.sql import SparkSession

os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'
warnings.filterwarnings('ignore')

spark = SparkSession.builder \
    .appName("pandas_udf222") \
    .config('spark.sql.session.timeZone', 'Asia/Shanghai') \
    .config("spark.scheduler.mode", "FAIR") \
    .config('spark.driver.memory', '8g') \
    .config('spark.driver.cores', '12') \
    .config('spark.executor.memory', '8g') \
    .config('spark.executor.cores', '12') \
    .config('spark.cores.max', '12') \
    .config('spark.driver.host', '192.168.22.28') \
    .master("spark://192.168.12.47:7077,192.168.12.48:7077") \
    .getOrCreate()



In [18]:
df_run_pandas = pd.read_csv('D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/defect_algorithm/defect_by_wafer_labeled_6.csv')
df_run_pandas['INSPECTION_TIME'] = pd.to_datetime(df_run_pandas['INSPECTION_TIME'])

df_run_pandas

Unnamed: 0,WAFER_ID,OPE_NO,PRODG1,PRODUCT_ID,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,DEFECTS,ADDER_DEFECTS,CLUSTERS,ADDER_RANDOM_DEFECTS,ADDER_CLUSTERS,INSPECTION_TIME,label
0,NA0299-02,5FP10,KLKL,AJ.KLL,NA0299.000,3828393,EMNDE015FP10,6.0,643.0,574.0,4.0,536.0,4.0,2023-04-10 12:30:00,1
1,NA0301-13,5FP10,KLKL,AJ.KLL,NA0301.000,3827258,EMNDE015FP10,3.0,422.0,647.0,3.0,597.0,0.0,2023-04-10 12:30:00,1
2,NA0303-02,5FP10,KLKL,AJ.KLL,NA0303.000,3831272,EMNDE015FP10,6.0,635.0,104.0,7.0,231.0,3.0,2023-04-10 07:43:00,1
3,NA0304-24,5FP10,KLKL,AJ.KLL,NA0304.000,3827258,EMNDE015FP10,9.0,514.0,220.0,2.0,408.0,2.0,2023-04-10 13:22:00,1
4,NA0305-02,5FP10,KLKL,AJ.KLL,NA0305.000,3827258,EMNDE015FP10,11.0,805.0,299.0,9.0,753.0,2.0,2023-04-10 13:14:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,NAZ541-15,1CG40,DFDF,VB.JJJJJJJ1000,NAZ541.300,5766658,FYZK7011CG40S5,2262.0,812.0,257.0,3.0,814.0,0.0,2023-09-13 07:53:00,0
2236,NAZ541-15,1CZ10,DFDF,VB.JJJJJJJ1000,NAZ541.300,5112663,FYZK7011CZ1007,809.0,938.0,77.0,6.0,327.0,4.0,2023-08-30 21:00:00,0
2237,NAZ541-15,1CG40,DFDF,VB.JJJJJJJ1000,NAZ541.300,4769683,FYZK7011CG40,,129.0,766.0,2.0,613.0,1.0,2023-08-27 23:52:00,0
2238,NAZ541-15,1CG20,DFDF,VB.JJJJJJJ1000,NAZ541.300,4871798,FYZK7011CG20,,25.0,83.0,6.0,214.0,2.0,2023-08-26 20:36:00,0


In [26]:
grpby_list = ['PRODG1', 'PRODUCT_ID']
prodg = 'KLKL'
product_id = 'Ab.KMM01'
df_run_select1 = df_run_pandas.query(f"PRODG1 == '{prodg}' & PRODUCT_ID == '{product_id}'")
df_run_select1

Unnamed: 0,WAFER_ID,OPE_NO,PRODG1,PRODUCT_ID,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,DEFECTS,ADDER_DEFECTS,CLUSTERS,ADDER_RANDOM_DEFECTS,ADDER_CLUSTERS,INSPECTION_TIME,label
192,NAY602-03,BHP20,KLKL,Ab.KMM01,NAY602.320,4295777,EMNC301BHP20,32.0,260.0,557.0,7.0,401.0,3.0,2023-04-29 02:51:00,1
193,NAY602-03,BHP10,KLKL,Ab.KMM01,NAY602.320,4288839,EMNC301BHP10,68.0,415.0,56.0,2.0,976.0,1.0,2023-04-28 23:41:00,1
194,NAY602-03,GRP40,KLKL,Ab.KMM01,NAY602.320,4342788,EMNC301GRP40,67.0,778.0,267.0,2.0,97.0,3.0,2023-04-21 17:50:00,1
195,NAY602-03,GRP40,KLKL,Ab.KMM01,NAY602.320,4338705,EMNC301GRP40,115.0,415.0,319.0,1.0,635.0,3.0,2023-04-21 14:53:00,1
196,NAY602-03,GRP30,KLKL,Ab.KMM01,NAY602.320,4367615,EMNC301GRP30,461.0,464.0,578.0,1.0,616.0,0.0,2023-04-20 13:44:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351,NAZ283-09,1FG10,KLKL,Ab.KMM01,NAZ283.070,3616503,EMN8M011FG10,5.0,,,2.0,,1.0,2023-01-05 15:09:00,0
1352,NAZ283-15,1FG10,KLKL,Ab.KMM01,NAZ283.220,2329057,EMND7011FG10,27.0,,,2.0,,0.0,2023-02-15 20:26:00,0
1353,NAZ289-14,1NG10,KLKL,Ab.KMM01,NAZ289.120,3468978,FPNR9011NG10_SP2ETCH_1211,225.0,,,5.0,,3.0,2023-03-09 13:43:00,0
1354,NAZ289-14,2NG10,KLKL,Ab.KMM01,NAZ289.120,4108127,FPNR9012NG10_2NREWORK0307,945.0,461.0,162.0,0.0,,0.0,2023-03-07 17:19:00,0


In [24]:
wafer = 'NAZ283-09'
df_run_select1.query(f"WAFER_ID == '{wafer}'").sort_values("INSPECTION_TIME")

Unnamed: 0,WAFER_ID,OPE_NO,PRODG1,PRODUCT_ID,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,DEFECTS,ADDER_DEFECTS,CLUSTERS,ADDER_RANDOM_DEFECTS,ADDER_CLUSTERS,INSPECTION_TIME,label
1351,NAZ283-09,1FG10,KLKL,Ab.KMM01,NAZ283.070,3616503,EMN8M011FG10,5.0,,,2.0,,1.0,2023-01-05 15:09:00,0


In [14]:
idx = df_run_select1.groupby(['WAFER_ID', 'label'] + grpby_list)['INSPECTION_TIME'].idxmax()

In [25]:
df_sort = df_run_select1.loc[idx, :]

wafer = 'NAZ283-09'
df_sort.query(f"WAFER_ID == '{wafer}'").sort_values("INSPECTION_TIME")

Unnamed: 0,WAFER_ID,OPE_NO,PRODG1,PRODUCT_ID,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,DEFECTS,ADDER_DEFECTS,CLUSTERS,ADDER_RANDOM_DEFECTS,ADDER_CLUSTERS,INSPECTION_TIME,label
1351,NAZ283-09,1FG10,KLKL,Ab.KMM01,NAZ283.070,3616503,EMN8M011FG10,5.0,,,2.0,,1.0,2023-01-05 15:09:00,0


In [27]:
df_sort

Unnamed: 0,WAFER_ID,OPE_NO,PRODG1,PRODUCT_ID,LOT_ID,RECIPE_KEY,RECIPE_ID,RANDOM_DEFECTS,DEFECTS,ADDER_DEFECTS,CLUSTERS,ADDER_RANDOM_DEFECTS,ADDER_CLUSTERS,INSPECTION_TIME,label
198,NAY602-03,BH90WA12S1,KLKL,Ab.KMM01,NAY602.320,4323337,C90WA12A@@BH90WA12S1,1.0,251.0,420.0,0.0,670.0,3.0,2023-04-29 13:43:00,1
205,NAY602-09,BSP20,KLKL,Ab.KMM01,NAY602.300,4236823,EMNC301BSP20,434.0,445.0,999.0,3.0,931.0,2.0,2023-06-16 17:47:00,1
217,NAY602-15,BTP50,KLKL,Ab.KMM01,NAY602.300,3368402,EMNC301BTP50,,855.0,320.0,1.0,144.0,0.0,2023-07-10 02:35:00,1
241,NAY602-17,HEFEI,KLKL,Ab.KMM01,NAY602.250,4020594,Z-QBBC-50,0.0,552.0,634.0,9.0,71.0,4.0,2023-02-24 11:39:00,1
246,NAY602-21,HEFEI,KLKL,Ab.KMM01,NAY602.280,4076943,Z-QBBC-50,0.0,233.0,616.0,3.0,405.0,2.0,2023-03-03 13:23:00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1346,NAZ282-11,2MP15,KLKL,Ab.KMM01,NAZ282.140,2979318,GGNBK012MP15,11.0,,,7.0,,0.0,2023-02-10 12:34:00,0
1347,NAZ282-13,NKP30,KLKL,Ab.KMM01,NAZ282.100,3020000,GGNBK01NKP30,33.0,,,7.0,,0.0,2023-05-05 10:06:00,0
1351,NAZ283-09,1FG10,KLKL,Ab.KMM01,NAZ283.070,3616503,EMN8M011FG10,5.0,,,2.0,,1.0,2023-01-05 15:09:00,0
1352,NAZ283-15,1FG10,KLKL,Ab.KMM01,NAZ283.220,2329057,EMND7011FG10,27.0,,,2.0,,0.0,2023-02-15 20:26:00,0


In [3]:
# 定义数据结构
schema = StructType([
    StructField("Name", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Sex", StringType(), True)
])

# 创建数据
data = [("Alice", 34, 'M'), ("Bob", None, 'F'), ("Charlie", 27, 'f')]

# 使用 createDataFrame 函数创建数据框
df = spark.createDataFrame(data, schema)
df.show()

+-------+----+---+
|   Name| Age|Sex|
+-------+----+---+
|  Alice|  34|  M|
|    Bob|null|  F|
|Charlie|  27|  f|
+-------+----+---+



In [4]:
df.filter("algorithm_satisfied==False").show()

AnalysisException: Column 'algorithm_satisfied' does not exist. Did you mean one of the following? [Age, Name, Sex]; line 1 pos 0;
'Filter ('algorithm_satisfied = false)
+- LogicalRDD [Name#0, Age#1, Sex#2], false


In [40]:
df.dropna(subset=['Name', 'Age', 'Sex'], how='any').show()

+-------+---+---+
|   Name|Age|Sex|
+-------+---+---+
|  Alice| 34|  M|
|Charlie| 27|  f|
+-------+---+---+



In [19]:
data = [("John", ["apple", "banana", "orange"]),
        ("Anna", ["grape", "peach"]),
        ("Mike", ["watermelon", "strawberry", "kiwi"])]

df = spark.createDataFrame(data, ["name", "fruits"])
df.show()

fixed_list = ["ll", "kk", "jj"]
df_with_list = df.withColumn("hung", lit("ll, kk, jj"))
df_with_list.show()

df_with_array = df_with_list.withColumn("hung", split("hung", ","))
df_with_array.show()

+----+--------------------+
|name|              fruits|
+----+--------------------+
|John|[apple, banana, o...|
|Anna|      [grape, peach]|
|Mike|[watermelon, stra...|
+----+--------------------+

+----+--------------------+----------+
|name|              fruits|      hung|
+----+--------------------+----------+
|John|[apple, banana, o...|ll, kk, jj|
|Anna|      [grape, peach]|ll, kk, jj|
|Mike|[watermelon, stra...|ll, kk, jj|
+----+--------------------+----------+

+----+--------------------+--------------+
|name|              fruits|          hung|
+----+--------------------+--------------+
|John|[apple, banana, o...|[ll,  kk,  jj]|
|Anna|      [grape, peach]|[ll,  kk,  jj]|
|Mike|[watermelon, stra...|[ll,  kk,  jj]|
+----+--------------------+--------------+



In [20]:
exploded_df = df_with_array.select("name", explode("hung").alias("fruhungit"))

exploded_df.show()

+----+---------+
|name|fruhungit|
+----+---------+
|John|       ll|
|John|       kk|
|John|       jj|
|Anna|       ll|
|Anna|       kk|
|Anna|       jj|
|Mike|       ll|
|Mike|       kk|
|Mike|       jj|
+----+---------+



In [6]:
exploded_df = df.select("name", explode("fruits").alias("fruit"))

exploded_df.show()

+----+----------+
|name|     fruit|
+----+----------+
|John|     apple|
|John|    banana|
|John|    orange|
|Anna|     grape|
|Anna|     peach|
|Mike|watermelon|
|Mike|strawberry|
|Mike|      kiwi|
+----+----------+

