In [1]:
import pandas as pd
import pyspark.pandas as ps
import requests
import json

from sqlalchemy import create_engine
from pca import pca
from pyspark.sql.functions import pandas_udf, PandasUDFType, max, col, countDistinct, when, rank, lit
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.window import Window

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import ClusterCentroids

# from backend_spark.doris_common.doris_client import DorisClient
from functools import reduce
from pyspark.sql import DataFrame
from typing import Optional



In [2]:
import warnings
import os
import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession

# Interpreter path handed to PySpark through the PYSPARK_PYTHON environment
# variable, and blanket suppression of Python warnings for this session.
os.environ['PYSPARK_PYTHON'] = '/usr/local/python-3.9.13/bin/python3'
warnings.filterwarnings('ignore')

# Build (or reuse) the Spark session. The settings are applied in the same
# order as before; only the chaining syntax changed (parentheses instead of
# backslash continuations).
# NOTE(review): driver host and master URLs are hard-coded cluster addresses.
spark = (
    SparkSession.builder
    .appName("pandas_udf")
    .config('spark.sql.session.timeZone', 'Asia/Shanghai')
    .config("spark.scheduler.mode", "FAIR")
    .config('spark.driver.memory', '4g')
    .config('spark.driver.cores', '10')
    .config('spark.executor.memory', '4g')
    .config('spark.executor.cores', '10')
    .config('spark.cores.max', '10')
    .config('spark.driver.host', '192.168.22.28')
    .master("spark://192.168.12.47:7077,192.168.12.48:7077")
    .getOrCreate()
)

In [3]:
# Read the labelled sample CSV with pandas and print its (rows, columns) shape.
# NOTE(review): absolute, machine-specific Windows path — this cell cannot run
# on another machine as written; consider a configurable data directory.
df_pandas = pd.read_csv(
    "D:/Jupyterfiles/晶合MVAFDC_general开发/MVAanlysisDevelop/uva_algorithm/small_samples_data/small2_labeled.csv")
print(df_pandas.shape)

# Convert pandas -> pandas-on-Spark -> Spark DataFrame, then preview the rows.
df1_ = ps.from_pandas(df_pandas).to_spark()
df1_.show()

(24, 39)
+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+---------------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+------+------------+---------+----------------+-------+-----+
+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+---------------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+-

In [4]:
# json_config = {
#     "requestId": "uva",
#     "requestParam": [
#         {'dateRange': [{'start': "2023-12-01 00:00:00", 'end': "2024-01-15 00:00:00"}],
#          'lot': [],
#          'operNo': ["1G.EEG1R", "1G.PPB10"],
#          'prodg1': [],
#          'productId': [],
#          'eqp': [],
#          'tool': [],
#          'recipeName': [],
#          'waferId': {'good': ["NBX392-15", "NBX392-20", "NBX392-24", "NBX391-24", "NBX391-25", "NBX548-09",
#                               "NBX391-01", "NBX391-02", "NBX391-13", "NBX391-17"],
#                      'bad': ["NBX500-10", "NBX500-01", "NBX500-09"]},
#          'uploadId': '20240110170016023',
#          'grpby_list': ['PRODUCT_ID', 'EQP_NAME'],
#          'mergeOperno': [],
#          'mergeProdg1': [{"L2800Z3N_L2800Z2N": ["L2800Z3N", "L2800Z2N"]}],
#          'mergeProductId': [{"AFPNR901N.0B0L_AFPNR901N.0B0J": ["AFPNR901N.0B0L", "AFPNR901N.0B0J"]}],
#          'mergeEqp': [],
#          'mergeChamber': [{"PBT01_CGHA_4-14_PBT01_CLHA_4-12": ["PBT01_CGHA_4-14", "PBT01_CLHA_4-12"]},
#                          {"PBT01_CGHA_4-34_PBT01_CLHA_4-21": ["PBT01_CGHA_4-34", "PBT01_CLHA_4-21"]}],
#          }
#     ]
# }

# Request payload for the "uva" job. Every merge* list is empty here, so this
# configuration carries no merge rules.
json_config = {
    "requestId": "uva",
    "requestParam": [
        {
            "dateRange": [{"start": "2023-12-01 00:00:00", "end": "2024-01-15 00:00:00"}],
            "lot": [],
            "operNo": ["1G.EEG1R", "1G.PPB10"],
            "prodg1": [],
            "productId": [],
            "eqp": [],
            "tool": [],
            "recipeName": [],
            "waferId": {
                "good": [
                    "NBX392-15", "NBX392-20", "NBX392-24", "NBX391-24", "NBX391-25",
                    "NBX548-09", "NBX391-01", "NBX391-02", "NBX391-13", "NBX391-17",
                ],
                "bad": ["NBX500-10", "NBX500-01", "NBX500-09"],
            },
            "uploadId": "20240110170016023",
            "grpby_list": ["PRODUCT_ID", "EQP_NAME"],
            "mergeOperno": [],
            "mergeProdg1": [],
            "mergeProductId": [],
            "mergeEqp": [],
            "mergeChamber": [],
        }
    ],
}

# One-row frame: the request id as-is, the parameter list serialised to JSON text.
df_info_ = pd.DataFrame(
    [
        {
            "requestId": json_config["requestId"],
            "requestParam": json.dumps(json_config["requestParam"]),
        }
    ]
)
df_info_

Unnamed: 0,requestId,requestParam
0,uva,"[{""dateRange"": [{""start"": ""2023-12-01 00:00:00..."


In [5]:
# Pull the single request row back out of the frame and decode its JSON payload.
request_id = df_info_["requestId"].iloc[0]
request_params = df_info_["requestParam"].iloc[0]
parse_dict = json.loads(request_params)

In [6]:
# Display the decoded request parameters (a list holding one dict).
parse_dict

[{'dateRange': [{'start': '2023-12-01 00:00:00',
    'end': '2024-01-15 00:00:00'}],
  'lot': [],
  'operNo': ['1G.EEG1R', '1G.PPB10'],
  'prodg1': [],
  'productId': [],
  'eqp': [],
  'tool': [],
  'recipeName': [],
  'waferId': {'good': ['NBX392-15',
    'NBX392-20',
    'NBX392-24',
    'NBX391-24',
    'NBX391-25',
    'NBX548-09',
    'NBX391-01',
    'NBX391-02',
    'NBX391-13',
    'NBX391-17'],
   'bad': ['NBX500-10', 'NBX500-01', 'NBX500-09']},
  'uploadId': '20240110170016023',
  'grpby_list': ['PRODUCT_ID', 'EQP_NAME'],
  'mergeOperno': [],
  'mergeProdg1': [],
  'mergeProductId': [],
  'mergeEqp': [],
  'mergeChamber': []}]

In [7]:
# Look up the five merge-rule lists of the first request entry in one pass;
# a key that is absent yields None, exactly as with individual .get() calls.
merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber = (
    parse_dict[0].get(rule_key)
    for rule_key in ('mergeOperno', 'mergeProdg1', 'mergeProductId', 'mergeEqp', 'mergeChamber')
)

In [8]:
# NOTE(review): despite its name, this variable holds the PRODG1 merge rules.
merge_operno_list = merge_prodg1

# For every rule dict, take its first value: the list of raw values to be merged.
values_to_replace = [list(rule.values())[0] for rule in merge_operno_list]

# The merged label of a rule is its raw values joined with "_".
merged_values = ["_".join(value_group) for value_group in values_to_replace]
merged_values

[]

In [21]:
# Apply each PRODG1 merge rule in turn: rows whose PRODG1 is one of `values`
# receive the joined label, every other row keeps its current value.
# NOTE(review): this rebinds df1_ in place and the cell's execution count (21)
# is out of sequence with its neighbours — looks like leftover exploratory
# state; integrate_columns() implements the same logic.
for values, replacement_value in zip(values_to_replace, merged_values):
    df1_ = df1_.withColumn("PRODG1", when(col("PRODG1").isin(values), replacement_value).otherwise(col("PRODG1")))

In [22]:
# Collect the Spark frame to the driver as a pandas DataFrame for display.
# NOTE(review): the saved output shows merged PRODG1 labels, which the active
# json_config (empty mergeProdg1) would not produce — presumably a stale result
# from a run with the commented-out config; confirm with Restart & Run All.
df1_.toPandas()

Unnamed: 0,TOOL_ID,TOOL_NAME,RUN_ID,EQP_NAME,CASE_INFO,PRODUCT_ID,PRODG1,OPER_NO,LOT_ID,WAFER_ID,...,UPPER_OUTLIER,RULES_ENABLED,ALARM_RULE,RESULT,STATUS,REGION,ERROR_MSG,STATISTIC_RESULT,VERSION,label
0,9289,PBT01_CGHA_4-34,351230,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N_L2800Z2N,1V.PPB10,NBX265.000,NBX265-05,...,,1,,249.9848,NORMAL,LOWER_NORMAL,,249.9848,3,1
1,9287,PBT01_CGHA_4-14,360329,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N_L2800Z2N,1V.PPB10,NBX265.000,NBX265-06,...,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3,1
2,9279,PBT01_CLHA_4-12,360246,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N_L2800Z2N,1V.PPB10,NBX265.000,NBX265-06,...,,1,,150.0075,NORMAL,LOWER_NORMAL,,150.0075,4,1
3,9287,PBT01_CGHA_4-14,329563,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z3N_L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,1,,249.98625,NORMAL,LOWER_NORMAL,,249.98625,3,0
4,9287,PBT01_CGHA_4-14,362271,PBT01,2023-12-20,AFPNR901N.0B0L,L2800Z3N_L2800Z2N,1V.PPB10,NBX293.200,NBX293-06,...,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3,1
5,9279,PBT01_CLHA_4-12,329480,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z3N_L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,1,,149.937083,NORMAL,LOWER_NORMAL,,149.937083,4,0
6,9287,PBT01_CGHA_4-14,323876,PBT01,2023-09-28,AFPNR901N.0B0J,L2800Z3N_L2800Z2N,1V.PPB10,NBX220.150,NBX220-06,...,,1,,249.9825,NORMAL,LOWER_NORMAL,,249.9825,3,0
7,9280,PBT01_CLHA_4-21,317196,PBT01,2023-09-25,AFPNR901N.0B0J,L2800Z3N_L2800Z2N,1V.PPB10,NBX219.130,NBX219-08,...,,1,,149.998261,NORMAL,LOWER_NORMAL,,149.998261,4,0
8,9288,PBT01_CGHA_4-24,312691,PBT01,2023-09-13,AFPNR901N.0B0J,L2800Z3N_L2800Z2N,1V.PPB10,NBX220.040,NBX220-20,...,,1,,249.98875,NORMAL,UPPER_NORMAL,,249.98875,3,1
9,9288,PBT01_CGHA_4-24,355016,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N_L2800Z2N,1V.PPB10,NBX265.000,NBX265-08,...,,1,,249.982,NORMAL,LOWER_NORMAL,,249.982,3,1


In [9]:
def _apply_merge_rules(df, column_name, merge_rules):
    """Rewrite `column_name` so that values grouped by a merge rule share one label.

    :param df: DataFrame exposing ``withColumn`` (a Spark DataFrame in this notebook).
    :param column_name: Name of the column whose values are rewritten.
    :param merge_rules: ``None`` or a list of dicts mapping a label to the list of
           raw values to merge, e.g. ``[{'A_B': ['A', 'B']}]``.
    :return: ``df`` itself when nothing has to be merged, otherwise a new DataFrame.
    """
    if not merge_rules:
        return df

    for rule in merge_rules:
        # Walk every entry of the rule dict. An empty dict contributes nothing
        # (it used to raise IndexError) and extra entries are no longer ignored.
        for values in rule.values():
            if not values:
                # Nothing to match against; skip instead of adding a no-op column rewrite.
                continue
            # The merged label is derived from the raw values, not from the dict key.
            merged_label = "_".join(values)
            df = df.withColumn(
                column_name,
                when(col(column_name).isin(values), merged_label).otherwise(col(column_name)),
            )
    return df


def integrate_columns(df, merge_operno_list, merge_prodg1_list, merge_product_list, merge_eqp_list, merge_chamber_list):
    """
    Integrate columns in the DataFrame based on the provided merge rules.

    Each rule list is applied to its own column; rules are applied one after
    another in list order, and columns are processed in the order
    OPER_NO, PRODG1, PRODUCT_ID, EQP_NAME, TOOL_NAME. A rule list that is
    ``None`` or empty leaves its column untouched.

    :param df: The input DataFrame.
    :param merge_operno_list: A list of dictionaries where each dictionary contains values to be merged
           in the 'OPER_NO' column.
           Example: [{'2F.CDS10_XX.TDS01': ['2F.CDS10', 'XX.TDS01']},
                     {'2F.CDS20_XX.CDS20': ['2F.CDS20', 'XX.CDS20']}]
           Rows holding any of the listed values receive the values joined with "_".
    :param merge_prodg1_list: A list of dictionaries for merging 'PRODG1' column in a similar fashion.
    :param merge_product_list: A list of dictionaries for merging 'PRODUCT_ID' column in a similar fashion.
    :param merge_eqp_list: A list of dictionaries for merging 'EQP_NAME' column in a similar fashion.
    :param merge_chamber_list: A list of dictionaries for merging 'TOOL_NAME' column in a similar fashion.

    :return: DataFrame with 'OPER_NO' and other specified columns integrated according to the merge rules.
    """
    # Column -> rule list, in the same order the original branches ran.
    rules_by_column = (
        ("OPER_NO", merge_operno_list),
        ("PRODG1", merge_prodg1_list),
        ("PRODUCT_ID", merge_product_list),
        ("EQP_NAME", merge_eqp_list),
        ("TOOL_NAME", merge_chamber_list),
    )
    for column_name, merge_rules in rules_by_column:
        df = _apply_merge_rules(df, column_name, merge_rules)

    return df

In [10]:
# Re-read the five merge-rule lists from the first request entry right before
# they are passed to integrate_columns(); a missing key yields None.
merge_operno, merge_prodg1, merge_product, merge_eqp, merge_chamber = (
    parse_dict[0].get(rule_key)
    for rule_key in ('mergeOperno', 'mergeProdg1', 'mergeProductId', 'mergeEqp', 'mergeChamber')
)

In [11]:
# Run every merge-rule list from the request against the loaded frame,
# then preview the resulting Spark DataFrame.
df_merge = integrate_columns(
    df=df1_,
    merge_operno_list=merge_operno,
    merge_prodg1_list=merge_prodg1,
    merge_product_list=merge_product,
    merge_eqp_list=merge_eqp,
    merge_chamber_list=merge_chamber,
)
df_merge.show()

+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+---------------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+------+------------+---------+----------------+-------+-----+
+-------+---------------+------+--------+----------+--------------+--------+--------+----------+---------+--------------------+----------+---------------+----------+--------------------+-------------------+------------+-------------+--------------+-------+---------------+-------------------+-------------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+----------+-------------+------+---

In [13]:
# Collect the merged Spark frame to the driver as a pandas DataFrame for display.
df_merge.toPandas()

Unnamed: 0,TOOL_ID,TOOL_NAME,RUN_ID,EQP_NAME,CASE_INFO,PRODUCT_ID,PRODG1,OPER_NO,LOT_ID,WAFER_ID,...,UPPER_OUTLIER,RULES_ENABLED,ALARM_RULE,RESULT,STATUS,REGION,ERROR_MSG,STATISTIC_RESULT,VERSION,label
0,9289,PBT01_CGHA_4-34,351230,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-05,...,,1,,249.9848,NORMAL,LOWER_NORMAL,,249.9848,3,1
1,9287,PBT01_CGHA_4-14,360329,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-06,...,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3,1
2,9279,PBT01_CLHA_4-12,360246,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-06,...,,1,,150.0075,NORMAL,LOWER_NORMAL,,150.0075,4,1
3,9287,PBT01_CGHA_4-14,329563,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,1,,249.98625,NORMAL,LOWER_NORMAL,,249.98625,3,0
4,9287,PBT01_CGHA_4-14,362271,PBT01,2023-12-20,AFPNR901N.0B0L,L2800Z2N,1V.PPB10,NBX293.200,NBX293-06,...,,1,,249.984583,NORMAL,LOWER_NORMAL,,249.984583,3,1
5,9279,PBT01_CLHA_4-12,329480,PBT01,2023-10-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX221.100,NBX221-13,...,,1,,149.937083,NORMAL,LOWER_NORMAL,,149.937083,4,0
6,9287,PBT01_CGHA_4-14,323876,PBT01,2023-09-28,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX220.150,NBX220-06,...,,1,,249.9825,NORMAL,LOWER_NORMAL,,249.9825,3,0
7,9280,PBT01_CLHA_4-21,317196,PBT01,2023-09-25,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX219.130,NBX219-08,...,,1,,149.998261,NORMAL,LOWER_NORMAL,,149.998261,4,0
8,9288,PBT01_CGHA_4-24,312691,PBT01,2023-09-13,AFPNR901N.0B0J,L2800Z2N,1V.PPB10,NBX220.040,NBX220-20,...,,1,,249.98875,NORMAL,UPPER_NORMAL,,249.98875,3,1
9,9288,PBT01_CGHA_4-24,355016,PBT01,2023-12-16,AFPZM801N.0A01,L2800Z3N,1V.PPB10,NBX265.000,NBX265-08,...,,1,,249.982,NORMAL,LOWER_NORMAL,,249.982,3,1
