In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import warnings
from sklearn.metrics import r2_score,roc_auc_score
import random
from sklearn.linear_model import LogisticRegression,HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler,PowerTransformer
from sklearn.metrics import roc_auc_score
from imblearn.under_sampling import NearMiss, ClusterCentroids
from imblearn.over_sampling import SMOTE
import feature_engine as fe
from feature_engine.encoding import WoEEncoder

from colorama import Fore, Back, Style

sns.set()

warnings.filterwarnings("ignore")

#### Read the data

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
display(train.head())

print(f'Train data shape is = {train.shape}')
print(f'Test data shape is = {test.shape}')

print("\n"*1)

print(f'Train data missing value is = {format(100* train.isna().sum().sum()/(len(train)*25))}')
print(f'Test data missing value is  = {format(100* test.isna().sum().sum()/(len(test)*25))}')

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


Train data shape is = (26570, 26)
Test data shape is = (20775, 25)


Train data missing value is = 3.052013549115544
Test data missing value is  = 3.0245968712394706


In [3]:
cat_feat = list()
num_feat = list()
for col in train.columns:
    if col in ['failure','id']:
        continue
    if train[col].dtype=="O":
        cat_feat.append(col)
    else:
        num_feat.append(col)

### Loading Feature 

The loading feature seems to have right skewed distribution.

Let's apply log transformation to make the distribution more normal.

In [4]:
target = train.pop('failure')
target_mean = np.mean(target)
print(f"target mean --> {target_mean}")

target mean --> 0.21260820474219044


In [5]:
data = pd.concat([train, test])
train.shape,test.shape

((26570, 25), (20775, 25))

In [6]:
data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
data['area'] = data['attribute_2'] * data['attribute_3']

data['loading'] = np.log(data['loading'])
data['count_null'] = data.isnull().sum(axis=1)

feature = [f for f in test.columns if f.startswith('measurement') or f=='loading']

In [7]:
full_fill_dict ={}
full_fill_dict['measurement_17'] = {
    'A': ['measurement_5','measurement_6','measurement_8','measurement_7'],
    'B': ['measurement_4','measurement_5','measurement_7','measurement_9'],
    'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
    'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
    'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
    'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
    'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
    'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
    'I': ['measurement_3','measurement_7','measurement_8','measurement_9']
}

#收集按相關性排序的下 10 個最佳測量row的名稱：
col = [col for col in test.columns if 'measurement' not in col]+ ['loading','m3_missing','m5_missing']
a = []
b =[]

for x in range(3,17):
    corr = np.absolute(data.drop(col, axis=1).corr()[f'measurement_{x}']).sort_values(ascending=False)
    a.append(np.round(np.sum(corr[1:4]),3)) #我們添加相關值的前 3 column以獲得最相關
    b.append(f'measurement_{x}')
c = pd.DataFrame()
c['Selected columns'] = b
c['correlation total'] = a

In [8]:
c = c.sort_values(by = 'correlation total',ascending=False).reset_index(drop = True)
print(f'Columns selected by correlation sum of the 3 first rows : ')
display(c.head(10))

Columns selected by correlation sum of the 3 first rows : 


Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.448
1,measurement_11,0.395
2,measurement_5,0.376
3,measurement_6,0.359
4,measurement_7,0.33
5,measurement_4,0.328
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225


In [9]:

#計算每個測量row的前 4 個相關特徵 w.r.t 的每個產品代碼
for i in range(10):
    measurement_col = 'measurement_' + c.iloc[i,0][12:] # 選擇下一個最佳相關的row
    fill_dict = {}
    for x in data.product_code.unique() : 
        corr = np.absolute(data[data.product_code == x].drop(col, axis=1)
                           .corr()[measurement_col]).sort_values(ascending=False)

        measurement_col_dic = {}
        measurement_col_dic[measurement_col] = corr[1:5].index.tolist()
        fill_dict[x] = measurement_col_dic[measurement_col]

    full_fill_dict[measurement_col] =fill_dict

In [10]:
feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']
nullValue_cols = [col for col in train.columns if train[col].isnull().sum()!=0]

In [11]:
#使用 HuberRegressor 和 KNNImputer 填充 NA 值


for code in data.product_code.unique():
    total_na_filled_by_linear_model = 0
    print(f'\n-------- Product code {code} ----------\n')
    print(f'filled by linear model :')
    for measurement_col in list(full_fill_dict.keys()):
        tmp = data[data.product_code == code]
        column = full_fill_dict[measurement_col][code]
        tmp_train = tmp[column+[measurement_col]].dropna(how='any')
        tmp_test = tmp[(tmp[column].isnull().sum(axis=1)==0)&(tmp[measurement_col].isnull())]

        model = HuberRegressor(epsilon=1.9)
        model.fit(tmp_train[column], tmp_train[measurement_col])
        data.loc[(data.product_code==code)&(data[column].isnull().sum(axis=1)==0)
                 &(data[measurement_col].isnull()),measurement_col] = model.predict(tmp_test[column])
        print(f'{measurement_col} : {len(tmp_test)}')
        total_na_filled_by_linear_model += len(tmp_test)
        
    # 其他 NA columns:
    NA = data.loc[data["product_code"] == code,nullValue_cols ].isnull().sum().sum()
    model1 = KNNImputer(n_neighbors=3)
    data.loc[data.product_code==code, feature] = model1.fit_transform(data.loc[data.product_code==code, feature])
    print(f'\n{total_na_filled_by_linear_model} filled by linear model ') 
    print(f'{NA} filled by KNN ')


-------- Product code A ----------

filled by linear model :
measurement_17 : 374
measurement_8 : 167
measurement_11 : 225
measurement_5 : 113
measurement_6 : 146
measurement_7 : 155
measurement_4 : 78
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2270 filled by linear model 
1579 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 397
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 174
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2404 filled by linear model 
1571 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 189
measurement_11 : 231
measurement_5 : 141
measurement_6 : 150
measurement_7 : 140
measurement_4 : 108
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 330

2604 filled by linear model 
1740 fill

In [12]:
data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)
df_train = data.iloc[:train.shape[0],:]
df_test = data.iloc[train.shape[0]:,:]

woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(df_train, target)
df_train = woe_encoder.transform(df_train)
df_test = woe_encoder.transform(df_test)

In [13]:
df_train['measurement(3*5)'] = df_train['measurement_3'] * df_train['measurement_5']
df_test['measurement(3*5)'] = df_test['measurement_3'] * df_test['measurement_5']

df_train['missing(3*5)'] = df_train['m5_missing'] * (df_train['m3_missing'])
df_test['missing(3*5)'] = df_test['m5_missing'] * (df_test['m3_missing'])

In [14]:

features = ['loading','measurement_17','m3_missing','m5_missing',]

df_train['failure'] = target

### Standard Scaler

In [15]:
def scale(train_data, val_data, test_data, feats):
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(train_data[feats])
    scaled_val = scaler.transform(val_data[feats])
    scaled_test = scaler.transform(test_data[feats])
    new_train = train_data.copy()
    new_val = val_data.copy()
    new_test = test_data.copy()
    new_train[feats] = scaled_train
    new_val[feats] = scaled_val
    new_test[feats] = scaled_test
    return new_train, new_val, new_test

### Logistic Regression

In [16]:
import pickle
N_FOLDS = 6
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=30)
y_oof = np.zeros(df_train[features].shape[0])
y_test = np.zeros(df_test[features].shape[0])
logistic_auc = 0
ix = 0
feature_importance = []
lg_model=[]

sm = SMOTE(random_state = 42, n_jobs = -1)

for train_ind, val_ind in skf.split(df_train[features], df_train[['failure']]):
    print(f"******* Fold {ix} ******* ")
    tr_x, val_x = (
        df_train[features].iloc[train_ind].reset_index(drop=True),
        df_train[features].iloc[val_ind].reset_index(drop=True),
    )
    tr_y, val_y = (
        df_train['failure'].iloc[train_ind].reset_index(drop=True),
        df_train['failure'].iloc[val_ind].reset_index(drop=True),
    )
    
    tr_x,val_x,test_x = scale(tr_x, val_x, df_test[features], features)
    
    tr_x, tr_y = sm.fit_resample(tr_x, tr_y)
    clf = LogisticRegression(max_iter=700, C=0.0001, penalty='l2',solver='newton-cg')
    
    clf.fit(tr_x, tr_y)
    
    feature_importance.append(clf.coef_.ravel())
    preds = clf.predict_proba(val_x)[:,1]
    
    roc_score = roc_auc_score(val_y, preds)
    
    logistic_auc += roc_score/N_FOLDS

    print('VAL_ROC-AUC:', round(roc_score, 6))
    
    y_oof[val_ind] = y_oof[val_ind] + preds

    preds_test = clf.predict_proba(test_x)[:,1]
    lg_model.append(preds_test)
    y_test = y_test + preds_test / N_FOLDS
    ix = ix + 1
    if(ix==6):
        with open('model.pickle', 'wb') as f:
            pickle.dump(clf, f)
    
print(f"{Fore.GREEN}{Style.BRIGHT}Average auc = {round(logistic_auc, 6)}{Style.RESET_ALL}")
print(f"{Fore.BLUE}{Style.BRIGHT}OOF auc = {round(roc_auc_score(df_train[['failure']], y_oof), 6)}{Style.RESET_ALL}")


******* Fold 0 ******* 
VAL_ROC-AUC: 0.589246
******* Fold 1 ******* 
VAL_ROC-AUC: 0.592352
******* Fold 2 ******* 
VAL_ROC-AUC: 0.591732
******* Fold 3 ******* 
VAL_ROC-AUC: 0.589075
******* Fold 4 ******* 
VAL_ROC-AUC: 0.579772
******* Fold 5 ******* 
VAL_ROC-AUC: 0.599812
[32m[1mAverage auc = 0.590331[0m
[34m[1mOOF auc = 0.590205[0m
