In [58]:
import numpy as np
import pandas as pd
import warnings
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from colorama import Fore, Back, Style
from sklearn.preprocessing import StandardScaler
import itertools
from collections import defaultdict
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
import lightgbm as lgb
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.stats import pearsonr, spearmanr, rankdata
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
import pickle

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Lift pandas' display truncation so frames are rendered with all rows and columns.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Load data

* _scale(): 對 data 的 features 進行標準化

In [59]:
def _scale(train_data, val_data, test_data, feats):
    """Standardize the `feats` columns of three frames using training statistics.

    A single StandardScaler is fitted on ``train_data[feats]`` and then applied
    to the same columns of the training, validation and test frames.

    Args:
        train_data: DataFrame the scaler is fitted on.
        val_data: DataFrame transformed with the fitted scaler.
        test_data: DataFrame transformed with the fitted scaler.
        feats: List of column names to standardize; columns not listed are
            carried over unchanged.

    Returns:
        A 3-tuple ``(new_train, new_val, new_test)`` of copies of the inputs
        whose `feats` columns hold the scaled values. The inputs themselves
        are not modified.
    """
    fitted_scaler = StandardScaler().fit(train_data[feats])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        # Work on a copy so the caller's frame keeps its raw values.
        scaled_frame = frame.copy()
        scaled_frame[feats] = fitted_scaler.transform(frame[feats])
        scaled_frames.append(scaled_frame)

    return tuple(scaled_frames)

In [60]:
# Read the training table, the test table and the sample submission file.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './input/new_train.csv',
        './input/new_test.csv',
        './input/sample_submission.csv',
    )
)

# Feature matrix: everything except the id, the target and the
# product_code / attribute_0 / attribute_1 columns.
X = train.drop(columns=['id', 'failure', 'product_code', 'attribute_0', 'attribute_1'])
# Target: the 'failure' column cast to int.
y = train['failure'].astype(int)

## 選擇不同組合的 feature，最後給予不同權重算出 failure

* 選擇 feature 

In [61]:
# Feature set used by the model1-* fold models.
select_feature = [
    'm3_missing',
    'm5_missing',
    'measurement_1',
    'measurement_2',
    'loading',
    'measurement_17',
]

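* K-fold inference: rebuild the 5 stratified folds, scale the test set with each fold's training statistics, load that fold's saved model, and average the predicted failure probabilities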

In [62]:
# Average, over the saved fold models, the predicted probability of class 1
# on the test set.
model_list = ['./models/model1-1.pkl', './models/model1-2.pkl', './models/model1-3.pkl', './models/model1-4.pkl','./models/model1-5.pkl']
lr_test = np.zeros(len(test))

# One fold per saved model, so the split count and the averaging denominator
# both follow len(model_list) instead of a hard-coded 5.
# NOTE(review): presumably this reproduces the split the models were trained
# on (same n_splits / shuffle / random_state) — confirm against the training code.
kf = StratifiedKFold(n_splits=len(model_list), shuffle=True, random_state=0)
for model_path, (train_idx, val_idx) in zip(model_list, kf.split(X, y)):
    # _scale fits the scaler on this fold's training rows and returns copies,
    # so `test` can be passed directly; only the scaled test frame is needed.
    x_test = _scale(X.iloc[train_idx], X.iloc[val_idx], test, select_feature)[-1]

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # from a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    lr_test += model.predict_proba(x_test[select_feature])[:, 1] / len(model_list)

In [63]:
# Keep the averaged test predictions of the first feature set as column 'lr0'.
submission = submission.assign(lr0=lr_test)

In [64]:
# Feature set used by the model2-* fold models.
select_feature = [
    'measurement_1',
    'measurement_2',
    'loading',
    'measurement_17',
]

In [65]:
# Average, over the saved fold models, the predicted probability of class 1
# on the test set.
model_list = ['./models/model2-1.pkl', './models/model2-2.pkl', './models/model2-3.pkl', './models/model2-4.pkl','./models/model2-5.pkl']
lr_test = np.zeros(len(test))

# One fold per saved model, so the split count and the averaging denominator
# both follow len(model_list) instead of a hard-coded 5.
# NOTE(review): presumably this reproduces the split the models were trained
# on (same n_splits / shuffle / random_state) — confirm against the training code.
kf = StratifiedKFold(n_splits=len(model_list), shuffle=True, random_state=0)
for model_path, (train_idx, val_idx) in zip(model_list, kf.split(X, y)):
    # _scale fits the scaler on this fold's training rows and returns copies,
    # so `test` can be passed directly; only the scaled test frame is needed.
    x_test = _scale(X.iloc[train_idx], X.iloc[val_idx], test, select_feature)[-1]

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # from a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    lr_test += model.predict_proba(x_test[select_feature])[:, 1] / len(model_list)

In [66]:
# Keep the averaged test predictions of the second feature set as column 'lr1'.
submission = submission.assign(lr1=lr_test)

In [67]:
# Feature set used by the model3-* fold models.
select_feature = [
    'loading',
    'measurement_17',
    'm3_17_avg',
]

In [68]:
# Average, over the saved fold models, the predicted probability of class 1
# on the test set.
model_list = ['./models/model3-1.pkl', './models/model3-2.pkl', './models/model3-3.pkl', './models/model3-4.pkl','./models/model3-5.pkl']
lr_test = np.zeros(len(test))

# One fold per saved model, so the split count and the averaging denominator
# both follow len(model_list) instead of a hard-coded 5.
# NOTE(review): presumably this reproduces the split the models were trained
# on (same n_splits / shuffle / random_state) — confirm against the training code.
kf = StratifiedKFold(n_splits=len(model_list), shuffle=True, random_state=0)
for model_path, (train_idx, val_idx) in zip(model_list, kf.split(X, y)):
    # _scale fits the scaler on this fold's training rows and returns copies,
    # so `test` can be passed directly; only the scaled test frame is needed.
    x_test = _scale(X.iloc[train_idx], X.iloc[val_idx], test, select_feature)[-1]

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # from a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    lr_test += model.predict_proba(x_test[select_feature])[:, 1] / len(model_list)

In [69]:
# Keep the averaged test predictions of the third feature set as column 'lr2'.
submission = submission.assign(lr2=lr_test)

In [70]:
# Preview the first five rows with the three per-model prediction columns.
submission.head(n=5)

Unnamed: 0,id,failure,lr0,lr1,lr2
0,26570,0.0,0.208906,0.208996,0.208252
1,26571,0.0,0.201129,0.201224,0.201619
2,26572,0.0,0.204884,0.204971,0.206438
3,26573,0.0,0.206612,0.206694,0.203775
4,26574,0.0,0.23922,0.239314,0.242482


In [71]:
# Convert each model's averaged prediction into its rank across the test rows
# (scipy's rankdata with its default tie handling), one rank column per model.
submission = submission.assign(
    rank0=rankdata(submission['lr0']),
    rank1=rankdata(submission['lr1']),
    rank2=rankdata(submission['lr2']),
)

In [72]:
# Final score: weighted sum of the three rank columns.
# NOTE(review): the weights add up to 1.05, not 1.0, and the result is a
# weighted rank rather than a probability — presumably fine for a
# rank-based metric; confirm this is intended.
submission['failure'] = (
    submission['rank0'].mul(0.70)
    .add(submission['rank1'].mul(0.05))
    .add(submission['rank2'].mul(0.30))
)

In [73]:
# Preview the first five rows, now including the rank columns and the blended 'failure' score.
submission.head(n=5)

Unnamed: 0,id,failure,lr0,lr1,lr2,rank0,rank1,rank2
0,26570,9014.85,0.208906,0.208996,0.208252,8721.0,8715.0,8248.0
1,26571,4998.65,0.201129,0.201224,0.201619,4711.0,4685.0,4889.0
2,26572,7067.9,0.204884,0.204971,0.206438,6523.0,6500.0,7256.0
3,26573,7376.15,0.206612,0.206694,0.203775,7457.0,7431.0,5949.0
4,26574,21084.9,0.23922,0.239314,0.242482,20018.0,20084.0,20227.0


In [74]:
# Write only the id and the blended score, without the index column.
submission.to_csv('submission.csv', columns=['id', 'failure'], index=False)