### Statistical Features VM

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import mean_squared_error
from scipy.stats import kurtosis, skew

In [11]:
# add the latest MRR column
def get_orig_data(dataset, wafer_len=316, data_root='./data'):
    """Load the train/test CSVs of ``dataset`` and append a ``latest_mrr`` column.

    The last column of each CSV is copied, shifted by a whole number of
    wafers, into a new ``latest_mrr`` column so that every row carries the
    last-column value of an earlier wafer.

    Parameters
    ----------
    dataset : str
        Sub-directory name under ``data_root`` containing ``train.csv`` and
        ``test.csv``.
    wafer_len : int, default 316
        Number of consecutive rows that make up one wafer.
    data_root : str, default './data'
        Directory that holds the per-dataset folders.

    Returns
    -------
    addmrr_sampled_orig : pandas.DataFrame
        Train rows from the third wafer on; ``latest_mrr`` is the last-column
        value from two wafers earlier.
    addmrr_unsampled_orig : pandas.DataFrame
        Train rows from the second wafer on; ``latest_mrr`` is the last-column
        value from one wafer earlier.
    addmrr_test_orig : pandas.DataFrame
        All test rows; ``latest_mrr`` is the last-column value from one wafer
        earlier, with the first test wafer taking it from the last train wafer.

    Raises
    ------
    ValueError
        If ``wafer_len`` is not positive.
    """
    # A zero length would turn ``iloc[:-0]`` into an empty slice and silently
    # yield empty frames, so reject it up front.
    if wafer_len <= 0:
        raise ValueError(f"wafer_len must be a positive integer, got {wafer_len!r}")

    # The first CSV column is a stored index; discard it and renumber rows from 0.
    train_orig = pd.read_csv(f'{data_root}/{dataset}/train.csv', sep=',', index_col=0).reset_index(drop=True)
    test_orig = pd.read_csv(f'{data_root}/{dataset}/test.csv', sep=',', index_col=0).reset_index(drop=True)

    def _with_lagged_target(lag_wafers):
        """Drop the first ``lag_wafers`` wafers of train and attach the lagged last column."""
        lag_rows = wafer_len * lag_wafers
        shortened = train_orig.iloc[lag_rows:, :].reset_index(drop=True)
        lagged = train_orig.iloc[:-lag_rows, -1].rename('latest_mrr').reset_index(drop=True)
        return pd.concat([shortened, lagged], axis=1)

    addmrr_sampled_orig = _with_lagged_target(2)
    addmrr_unsampled_orig = _with_lagged_target(1)

    # The first test wafer has no predecessor inside the test set, so borrow
    # the last train wafer; ignore_index already renumbers the result from 0.
    latest_mrr_test = pd.concat(
        [train_orig.iloc[-wafer_len:, -1], test_orig.iloc[:-wafer_len, -1]],
        ignore_index=True,
    ).rename('latest_mrr')
    addmrr_test_orig = pd.concat([test_orig, latest_mrr_test], axis=1)

    return addmrr_sampled_orig, addmrr_unsampled_orig, addmrr_test_orig

# transform the per-wafer rows into model input features
def get_stats_features(data, wafer_len=316):
    """Collapse each wafer's rows into one vector of summary statistics.

    ``data`` is read positionally: every column except the last two is a
    feature, the second-to-last column is the target, and the last column is
    the ``latest_mrr`` value. Rows are grouped into consecutive blocks of
    ``wafer_len`` rows (one block per wafer).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose row count is a multiple of ``wafer_len``.
    wafer_len : int, default 316
        Number of consecutive rows per wafer.

    Returns
    -------
    data_X : numpy.ndarray, shape (n_wafers, 7 * n_features + 1)
        For each wafer: mean, std, median, min, max, kurtosis and skew of
        every feature (grouped by statistic), followed by ``latest_mrr``.
    data_y : list
        Target value taken from the first row of each wafer.

    Raises
    ------
    ValueError
        If ``wafer_len`` is not positive or the row count of ``data`` is not
        a multiple of ``wafer_len``.
    """
    if wafer_len <= 0:
        raise ValueError(f"wafer_len must be a positive integer, got {wafer_len!r}")
    n_rows = len(data)
    if n_rows % wafer_len != 0:
        raise ValueError(
            f"data has {n_rows} rows, which is not a multiple of wafer_len={wafer_len}"
        )
    n_wafers = n_rows // wafer_len

    # (n_rows, n_features) -> (n_wafers, wafer_len, n_features)
    orig_X = data.iloc[:, :-2].to_numpy()
    orig_X = orig_X.reshape(n_wafers, wafer_len, orig_X.shape[1])

    # Target and latest MRR are read from the first row of every wafer.
    data_y = data.iloc[::wafer_len, -2].tolist()
    latest_mrr = np.array(data.iloc[::wafer_len, -1].tolist()).reshape(-1, 1)

    # Per-wafer statistics along the within-wafer axis.
    stats_X = np.hstack([
        np.mean(orig_X, axis=1),
        np.std(orig_X, axis=1),
        np.median(orig_X, axis=1),
        np.min(orig_X, axis=1),
        np.max(orig_X, axis=1),
        kurtosis(orig_X, axis=1),
        skew(orig_X, axis=1),
    ])
    # Replace any NaN statistics (e.g. from kurtosis/skew) with 0.
    stats_X = np.nan_to_num(stats_X, nan=0.0)

    # Append the latest MRR as the final feature column.
    data_X = np.concatenate((stats_X, latest_mrr), axis=1)

    return data_X, data_y

# fit different ML models
def fit_XGB(X_train, y_train):
    """Fit an XGBoost regressor on the training data and return the fitted model."""
    import xgboost as xgb

    regressor = xgb.XGBRegressor(
        objective='reg:squarederror',  # regression with squared-error loss
        max_depth=3,                   # maximum depth of each tree
        learning_rate=0.01,            # learning rate
        n_estimators=200,              # number of trees
    )
    regressor.fit(X_train, y_train)
    return regressor

def fit_RF(X_train, y_train):
    """Fit a random-forest regressor (100 trees, fixed seed) and return it."""
    from sklearn.ensemble import RandomForestRegressor

    settings = {'n_estimators': 100, 'random_state': 2}
    forest = RandomForestRegressor(**settings)
    forest.fit(X_train, y_train)
    return forest

def fit_KNN(X_train, y_train):
    """Fit a 5-nearest-neighbours regressor on the training data and return it."""
    from sklearn.neighbors import KNeighborsRegressor

    k = 5
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    return knn

def fit_MLP(X_train, y_train):
    """Fit a two-hidden-layer MLP regressor (128 and 16 units, fixed seed) and return it."""
    from sklearn.neural_network import MLPRegressor

    network = MLPRegressor(
        hidden_layer_sizes=(128, 16),
        activation='relu',
        solver='adam',
        max_iter=2000,
        random_state=2,
    )
    network.fit(X_train, y_train)
    return network

# generate the VM MRR prediction
def get_VM_predition(dataset, model_type):  # model_type: 'XGB'/'RF'/'KNN'/"MLP"
    """Train one statistical-feature VM model and predict on four evaluation sets.

    The model is trained once on every other wafer of the sampled training
    frame and then evaluated on each evaluation set; the MSE of every set is
    printed.

    Parameters
    ----------
    dataset : str
        Dataset name forwarded to ``get_orig_data``.
    model_type : str
        One of ``'XGB'``, ``'RF'``, ``'KNN'`` or ``'MLP'``.

    Returns
    -------
    dict
        Maps each mode (``'unsampled'``, ``'test'``, ``'all'``, ``'extend'``)
        to the model's predictions for that set.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    wafer_len = 316  # rows per wafer, matching the slicing used in get_orig_data

    # Resolve the trainer first so an unsupported name fails before any file
    # I/O, instead of surfacing later as an unbound ``model`` variable.
    fitters = {'XGB': fit_XGB, 'RF': fit_RF, 'KNN': fit_KNN, 'MLP': fit_MLP}
    if model_type not in fitters:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {sorted(fitters)}"
        )

    addmrr_sampled_orig, addmrr_unsampled_orig, addmrr_test_orig = get_orig_data(dataset)

    # retain only sampled section for training: every other wafer, starting at the first
    sampled_train_orig = pd.concat(
        [addmrr_sampled_orig.iloc[i:i + wafer_len]
         for i in range(0, len(addmrr_sampled_orig), wafer_len * 2)],
        ignore_index=True,
    )
    # NOTE(review): addmrr_sampled_orig starts at train row 2*316 and
    # addmrr_unsampled_orig at train row 316 (see get_orig_data), so the blocks
    # picked here (offset 316, stride 632) cover the same train rows as
    # sampled_train_orig; only the latest_mrr column differs. Confirm whether
    # the start offset was meant to be 0.
    unsampled_train_orig = pd.concat(
        [addmrr_unsampled_orig.iloc[i:i + wafer_len]
         for i in range(wafer_len, len(addmrr_unsampled_orig), wafer_len * 2)],
        ignore_index=True,
    )
    all_test_orig = pd.concat([unsampled_train_orig, addmrr_test_orig], ignore_index=True)
    # last four wafers of the unsampled frame followed by the test set
    extend_test_orig = pd.concat(
        [addmrr_unsampled_orig.iloc[-wafer_len * 4:], addmrr_test_orig],
        ignore_index=True,
    )

    # The training data is the same for every mode, so fit once outside the loop.
    X_train, y_train = get_stats_features(sampled_train_orig)
    model = fitters[model_type](X_train, y_train)

    # unsampled : unsampled wafer in training set / test : testing set /
    # all : 'unsampled and test' / extend : 'testing set extended to past wafers' (for equipment state model)
    data_modes = {
        'unsampled': unsampled_train_orig,
        'test': addmrr_test_orig,
        'all': all_test_orig,
        'extend': extend_test_orig,
    }

    y_pred = {}
    for mode, data in data_modes.items():
        X_test, y_test = get_stats_features(data)
        pred = model.predict(X_test)
        mse = mean_squared_error(y_test, pred)

        print('------------------------------------------------')
        print(mode)
        print(f"Statistical Features + {model_type} VM test loss", round(mse, 3))
        y_pred[mode] = pred  # reuse the prediction rather than predicting twice

    return y_pred

A456

In [None]:
# Dataset A456: statistical features + XGBoost
dataset, model_type = 'A456', 'XGB'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + XGB VM test loss 8.82
------------------------------------------------
test
Statistical Features + XGB VM test loss 14.96
------------------------------------------------
all
Statistical Features + XGB VM test loss 11.659
------------------------------------------------
extend
Statistical Features + XGB VM test loss 15.013


In [None]:
# Dataset A456: statistical features + random forest
dataset, model_type = 'A456', 'RF'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + RF VM test loss 4.381
------------------------------------------------
test
Statistical Features + RF VM test loss 12.192
------------------------------------------------
all
Statistical Features + RF VM test loss 7.993
------------------------------------------------
extend
Statistical Features + RF VM test loss 12.179


In [None]:
# Dataset A456: statistical features + k-nearest neighbours
dataset, model_type = 'A456', 'KNN'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + KNN VM test loss 9.44
------------------------------------------------
test
Statistical Features + KNN VM test loss 17.586
------------------------------------------------
all
Statistical Features + KNN VM test loss 13.207
------------------------------------------------
extend
Statistical Features + KNN VM test loss 17.657


In [None]:
# Dataset A456: statistical features + multilayer perceptron
dataset, model_type = 'A456', 'MLP'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + MLP VM test loss 7.886
------------------------------------------------
test
Statistical Features + MLP VM test loss 18.699
------------------------------------------------
all
Statistical Features + MLP VM test loss 12.886
------------------------------------------------
extend
Statistical Features + MLP VM test loss 18.517


B456

In [None]:
# Dataset B456: statistical features + XGBoost
dataset, model_type = 'B456', 'XGB'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + XGB VM test loss 12.319
------------------------------------------------
test
Statistical Features + XGB VM test loss 25.443
------------------------------------------------
all
Statistical Features + XGB VM test loss 18.396
------------------------------------------------
extend
Statistical Features + XGB VM test loss 25.273


In [None]:
# Dataset B456: statistical features + random forest
dataset, model_type = 'B456', 'RF'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + RF VM test loss 5.976
------------------------------------------------
test
Statistical Features + RF VM test loss 17.214
------------------------------------------------
all
Statistical Features + RF VM test loss 11.18
------------------------------------------------
extend
Statistical Features + RF VM test loss 17.186


In [None]:
# Dataset B456: statistical features + k-nearest neighbours
dataset, model_type = 'B456', 'KNN'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + KNN VM test loss 14.48
------------------------------------------------
test
Statistical Features + KNN VM test loss 30.465
------------------------------------------------
all
Statistical Features + KNN VM test loss 21.882
------------------------------------------------
extend
Statistical Features + KNN VM test loss 30.326


In [None]:
# Dataset B456: statistical features + multilayer perceptron
dataset, model_type = 'B456', 'MLP'
vm_pred = get_VM_predition(dataset=dataset, model_type=model_type)

------------------------------------------------
unsampled
Statistical Features + MLP VM test loss 13.016
------------------------------------------------
test
Statistical Features + MLP VM test loss 18.81
------------------------------------------------
all
Statistical Features + MLP VM test loss 15.699
------------------------------------------------
extend
Statistical Features + MLP VM test loss 18.759
