In [3]:
#basic
import pandas as pd
import numpy as np
from collections import defaultdict
#sax
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize
from saxpy.paa import paa
#sklearn
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,roc_curve, auc
from sklearn.model_selection import cross_validate, cross_val_predict
#關掉warning
import warnings
warnings.filterwarnings('ignore')
#draw
import matplotlib.pyplot as plt
import time

# Load the raw sensor readings and the abnormality labels.
# Forward slashes are accepted by Python on Windows as well as on POSIX
# systems, whereas a "Data\\..." path only resolves on Windows.
original_df = pd.read_excel("Data/sensor.xlsx")
is_abormal = pd.read_excel("Data/is_abormal.xlsx")

print(original_df.shape)
original_df.head()

(163283, 8)


Unnamed: 0,run_wafer,record,sensor_6,sensor_7,sensor_8,sensor_11,sensor_12,sensor_15
0,1549_1,1,3,24,10,2,-11,-1
1,1549_1,2,3,25,10,2,-11,-1
2,1549_1,3,3,24,10,2,-11,0
3,1549_1,4,3,25,10,2,-11,-1
4,1549_1,5,3,25,477,2,-11,0


## 標準化

In [2]:
# Standardize every sensor column separately within each run_wafer, working
# on a copy so the raw frame stays untouched.
df = original_df.copy()
sensors = ['sensor_6', 'sensor_7', 'sensor_8', 'sensor_11','sensor_12', 'sensor_15']
# The readings appear to be integers in the preview; cast up front so the
# scaled floats can be written back without an incompatible-dtype assignment.
df[sensors] = df[sensors].astype(float)
scaler = StandardScaler()
for run in df.run_wafer.unique():
    mask = df["run_wafer"] == run
    # The scaler is re-fitted on each wafer, so the scaling is per wafer.
    df.loc[mask,sensors] = scaler.fit_transform(df.loc[mask,sensors].values)

# Sanity check: count wafers whose per-sensor mean is not numerically zero.
# abs() is required so that negative deviations are caught as well, and the
# sensor columns are selected before averaging so other columns are ignored.
(df.groupby("run_wafer")[sensors].mean().abs() > 0.0000001).sum()

sensor_6     0
sensor_7     0
sensor_8     0
sensor_11    0
sensor_12    0
sensor_15    0
dtype: int64

## 使用PAA降維

In [4]:
sensors = ['sensor_6', 'sensor_7', 'sensor_8', 'sensor_11','sensor_12', 'sensor_15']
# Compression factor: each wafer keeps len(series) // ratio PAA segments.
ratio = 2

s = time.time()
d = defaultdict(list)
# groupby(sort=False) visits the wafers in order of first appearance (the same
# order unique() gives) without re-scanning the whole frame for every wafer.
for run, tmp in df.groupby("run_wafer", sort=False):
    # Computed once per wafer; never ask for zero segments when a wafer has
    # fewer than `ratio` records.
    n_segments = max(1, tmp.shape[0] // ratio)
    for sensor in sensors:
        compress = paa(tmp[sensor].values, n_segments).tolist()
        d[sensor].extend(compress)
    # One label per compressed record, independent of the inner loop variable.
    d["run_wafer"].extend([run] * n_segments)
print(time.time()-s)

new_df = pd.DataFrame(d)
print(new_df.head())

121.19681215286255
   sensor_6  sensor_7  sensor_8  sensor_11  sensor_12  sensor_15 run_wafer
0 -1.585434  1.260826 -2.105993  -1.465557  -1.566471   1.487811    1549_1
1 -1.585434  1.261043 -2.095084  -1.465557  -1.566471   1.498482    1549_1
2 -1.585434  1.260935 -1.303938  -1.465557  -1.566471   1.509153    1549_1
3 -1.585434  1.260499  0.637462  -1.465557  -1.566471   1.509153    1549_1
4 -1.585434  1.260826  1.478703  -1.465557  -1.566471   1.509153    1549_1


## 計算每一筆紀錄的統計量
1. max
2. min
3. 25%
4. 50%
5. 75%
6. mean
7. std

In [10]:
# Flatten the per-sensor series into one row of summary statistics per run_wafer
def flattern(df, sensors=("sensor_6","sensor_7","sensor_8","sensor_11","sensor_12","sensor_15")):
    """Summarise every sensor of every wafer with describe() statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "run_wafer" column and one column per entry of
        ``sensors``.
    sensors : sequence of str, optional
        Sensor columns to summarise. The text after the last "_" of each
        name is used as the suffix of the generated columns. Defaults to the
        six sensors used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        One row per wafer, in order of first appearance. Columns are
        "run_wafer" followed, for each sensor, by max_, min_, 25%_, 50%_,
        75%_, mean_ and std_ columns carrying the sensor suffix.
    """
    stat_names = ("max", "min", "25%", "50%", "75%", "mean", "std")
    sensors = list(sensors)
    statistics = defaultdict(list)
    # A single groupby pass replaces a full boolean-mask scan per wafer;
    # sort=False keeps the wafers in order of first appearance.
    for run, group in df.groupby("run_wafer", sort=False):
        # describe() is restricted to the columns that are actually read.
        sta = group[sensors].describe()
        statistics["run_wafer"].append(run)
        for sensor in sensors:
            th = sensor.split("_")[-1]
            for stat in stat_names:
                statistics[f"{stat}_{th}"].append(sta.loc[stat, sensor])
    return pd.DataFrame(statistics)

# Compute the summary statistics
df_sta = flattern(new_df)
df_sta.head()

Unnamed: 0,run_wafer,max_6,min_6,25%_6,50%_6,75%_6,mean_6,std_6,max_7,min_7,...,75%_12,mean_12,std_12,max_15,min_15,25%_15,50%_15,75%_15,mean_15,std_15
0,1549_1,0.82029,-1.587454,-1.499202,0.608105,0.611391,-3.951088e-16,0.979275,3.005177,-0.696804,...,0.580351,-1.169e-15,0.98041,1.509153,-0.791218,-0.76972,-0.718248,1.427826,-3.526591e-16,0.990688
1,1549_2,0.847689,-1.500817,-1.496973,0.63467,0.637641,8.913058e-16,0.985517,1.481557,-0.745496,...,0.611518,3.690319e-16,0.987463,1.449295,-0.812871,-0.791119,-0.719628,1.427543,9.225797e-16,0.990544
2,1549_4,0.905271,-1.460575,-1.457195,0.654202,0.656616,-3.330669e-16,0.990588,1.555107,-0.795116,...,0.645328,6.167906e-18,0.992692,1.442394,-0.921343,-0.76929,-0.741644,1.414748,3.330669e-16,0.995215
3,1549_6,0.884996,-1.444958,-1.441226,0.661702,0.664989,5.119362e-16,0.981266,1.398198,-0.779656,...,0.618106,-6.137066e-16,0.984316,1.388516,-0.858014,-0.816411,-0.76613,1.36793,-4.502571e-16,0.991135
4,1549_7,0.842025,-1.499659,-1.496977,0.635691,0.637651,-1.351032e-15,0.985491,1.606777,-0.769984,...,0.718632,-2.408089e-16,0.989204,1.433565,-0.867655,-0.792259,-0.748017,1.411485,-5.691848e-16,0.990884


## 訓練模型

In [15]:
def cross(models,X,y,cv):
    """Cross-validate each model and collect averaged metrics plus a confusion matrix.

    Parameters
    ----------
    models : iterable of estimators
        Each one is passed to ``cross_validate`` and ``cross_val_predict``.
    X, y : features and target, forwarded unchanged to both calls.
    cv : forwarded unchanged as the ``cv`` argument of both calls.

    Returns
    -------
    dict
        Keyed by the estimator's class name. Each value is the
        ``cross_validate`` result with "fit_time", "score_time" and
        "test_score" replaced by their means, plus "conf_mat", the confusion
        matrix of ``y`` against the ``cross_val_predict`` predictions.
        Two models of the same class share a key; the later one wins.
    """
    d = {}
    for model in models:
        # The class name is taken directly; slicing str(model) at find("(")
        # would silently drop the last character when no "(" is present.
        algorithm = type(model).__name__
        # Copy the result so the averaged values do not overwrite the arrays
        # of the dict returned by cross_validate.
        result = dict(cross_validate(model,X,y,cv=cv))
        for key in ("fit_time", "score_time", "test_score"):
            result[key] = np.mean(result[key])
        y_pred = cross_val_predict(model, X, y, cv=cv)
        result["conf_mat"] = confusion_matrix(y, y_pred)
        d[algorithm] = result
    return d

# Attach the label to each wafer's statistics row.
df_merged = df_sta.merge(is_abormal,on="run_wafer")

# Feature matrix: everything except the wafer identifier and the label.
X = df_merged.drop(columns=["run_wafer","is_abnormal"])
# Target vector.
y = df_merged["is_abnormal"].values

# Candidate classifiers; the seeded ones use random_state=0.
svc_model = SVC(gamma='auto')
xgb_model = XGBClassifier(random_state=0)
RF_model = RandomForestClassifier(random_state = 0)

# 10-fold evaluation of every candidate.
score = cross([RF_model,xgb_model,svc_model],X,y,cv=10)

In [16]:
# Show the per-model averaged timings, mean test score and confusion matrix.
print(score)

{'RandomForestClassifier': {'fit_time': 0.024325227737426756, 'score_time': 0.0015018463134765625, 'test_score': 0.9832485875706215, 'conf_mat': array([[1063,    4],
       [  16,  111]], dtype=int64)}, 'XGBClassifier': {'fit_time': 0.40300090312957765, 'score_time': 0.002600836753845215, 'test_score': 0.9882768361581921, 'conf_mat': array([[1064,    3],
       [  11,  116]], dtype=int64)}, 'SVC': {'fit_time': 0.017253637313842773, 'score_time': 0.0020904302597045898, 'test_score': 0.8986581920903955, 'conf_mat': array([[1067,    0],
       [ 121,    6]], dtype=int64)}}


## 計算每筆sensor之間的距離
1. 歐幾里得

In [19]:
def get_euclidean_distance(arr1,arr2):
    """Return the Euclidean (L2) distance between two equal-length sequences.

    Parameters
    ----------
    arr1, arr2 : array-like
        Numeric sequences of the same shape. Lists are accepted as well as
        numpy arrays.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((arr1 - arr2) ** 2))`` for 1-D input. For higher
        dimensional input the sum runs over the first axis only.
    """
    # Work in floating point so that small or unsigned integer dtypes cannot
    # wrap around when subtracting or squaring.
    diff = np.asarray(arr1, dtype=float) - np.asarray(arr2, dtype=float)
    # Vectorised reduction instead of Python's builtin sum over array items;
    # axis=0 keeps the first-axis semantics the builtin had.
    return np.sqrt(np.sum(np.square(diff), axis=0))

# Pairwise distances between the sensor series, one row per run_wafer
def cal_euclidean_distance_groupby_run_wafer(df, sensors=("sensor_6","sensor_7","sensor_8","sensor_11","sensor_12","sensor_15")):
    """Compute the Euclidean distance between every pair of sensors for each wafer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "run_wafer" column and one column per entry of
        ``sensors``.
    sensors : sequence of str, optional
        Sensor columns to compare. Every unordered pair is evaluated once, in
        the order given. Defaults to the six sensors used throughout this
        notebook.

    Returns
    -------
    pandas.DataFrame
        One row per wafer, in order of first appearance. Columns are
        "run_wafer" followed by "dis_<a>_<b>" for each sensor pair, where
        <a> and <b> are the text after the last "_" of the sensor names.
    """
    sensors = list(sensors)
    dis = defaultdict(list)
    # A single groupby pass replaces a full boolean-mask scan per wafer;
    # sort=False keeps the wafers in order of first appearance.
    for run, tmp in df.groupby("run_wafer", sort=False):
        dis["run_wafer"].append(run)
        # Pull each series out once instead of once per pair.
        series = {sensor: tmp[sensor].values for sensor in sensors}
        for i, first in enumerate(sensors):
            a = first.split("_")[-1]
            # Only later sensors, so each pair appears exactly once.
            for second in sensors[i + 1:]:
                b = second.split("_")[-1]
                dis[f"dis_{a}_{b}"].append(
                    get_euclidean_distance(series[first], series[second])
                )
    return pd.DataFrame(dis)

# Pairwise sensor distances per wafer, computed from new_df.
df_dis = cal_euclidean_distance_groupby_run_wafer(new_df)
# Statistics, distances and label joined on the wafer identifier.
df_merged = df_sta.merge(df_dis, on="run_wafer").merge(is_abormal,on="run_wafer")

# Feature matrix: everything except the wafer identifier and the label.
X = df_merged.drop(columns=["run_wafer","is_abnormal"])
# Target vector.
y = df_merged["is_abnormal"].values

# Same three candidate classifiers as before, rebuilt from scratch.
svc_model = SVC(gamma='auto')
xgb_model = XGBClassifier(random_state=0)
RF_model = RandomForestClassifier(random_state = 0)

# 10-fold evaluation of every candidate on the extended feature set.
score = cross([RF_model,xgb_model,svc_model],X,y,cv=10)

In [20]:
# Show the per-model results for the statistics + distance feature set.
print(score)

{'RandomForestClassifier': {'fit_time': 0.029525566101074218, 'score_time': 0.0014802217483520508, 'test_score': 0.9840395480225987, 'conf_mat': array([[1060,    7],
       [  12,  115]], dtype=int64)}, 'XGBClassifier': {'fit_time': 0.5137918949127197, 'score_time': 0.002063941955566406, 'test_score': 0.9882627118644066, 'conf_mat': array([[1065,    2],
       [  12,  115]], dtype=int64)}, 'SVC': {'fit_time': 0.01745030879974365, 'score_time': 0.0023001670837402345, 'test_score': 0.9513276836158193, 'conf_mat': array([[1064,    3],
       [  55,   72]], dtype=int64)}}
