# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide settings: notebook ids, data directories, CV setup, column names."""

    # Id of this notebook; the files it writes are named nb<NB>.csv.
    NB = "301"
    # Id of the notebook whose processed train/test pickles are loaded.
    dataset_NB = "108"
    # Ids of the notebooks whose saved predictions become stacking features.
    stacking_NB = ["212", "213", "214"]

    # Directory layout, relative to the notebook's working directory.
    raw_data_dir = "../data/raw/"
    processed_data_dir = "../data/processed/"
    interim_dir = "../data/interim/"
    submission_dir = "../data/submission/"

    # Seed value and number of cross-validation folds.
    random_seed = 42
    n_folds = 5

    # Names of the row identifier column and of the target column.
    row_id = "id"
    target = "failure"

## Import libraries

In [2]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Shared plotly layout: dark theme, common font and a default figure size.
# Stored under the 'layout' key so it can be passed either as a template
# or, via plotly_template['layout'], as a figure layout.
plotly_template = {
    'layout': go.Layout(
        template='plotly_dark',
        font={'family': "Franklin Gothic", 'size': 12},
        height=500,
        width=1000,
    ),
}


# Named colour lists used by the plots: two colours for binary splits,
# five for per-fold curves.
color_palette = dict(
    Bin=['#016CC9', '#E876A3'],
    Cat5=['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
)

In [4]:
import random
import joblib
import itertools
from itertools import combinations

from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import lightgbm as lgb
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score, roc_curve, auc
from lightgbm import LGBMClassifier, early_stopping

## Load and check data

In [5]:
# Load the zip-compressed train/test pickles written by notebook Config.dataset_NB.
# NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
df_train = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_train.pkl', compression='zip')
df_test = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_test.pkl', compression='zip')

# NOTE(review): header=None makes pandas read the first line as data; if the CSV
# has a header row it ends up as row 0 - confirm this is intended. `submission`
# is not referenced in the cells visible below.
submission = pd.read_csv(Config.raw_data_dir + 'sample_submission.csv', header=None)

# Show (rows, columns) of the training frame.
df_train.shape

(26570, 35)

In [6]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,product_code,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure,missing_loading,missing_measurement_3,missing_measurement_4,missing_measurement_5,missing_measurement_9,ohe0_5,ohe0_7,ohe1_5,ohe1_6,ohe1_7,ohe1_8
0,0,A,80.1,9,5,7.0,8.0,11.0,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,14.537333,13.034,14.684,764.1,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
1,1,A,84.89,9,5,14.0,3.0,11.0,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
2,2,A,82.43,9,5,12.0,1.0,11.0,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,19.391,13.798,16.711,18.631,14.094,17.946,663.376,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
3,3,A,101.07,9,5,13.0,2.0,11.0,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0
4,4,A,188.06,9,5,9.0,2.0,11.0,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0.0,False,False,False,False,False,0.0,1.0,0.0,0.0,0.0,1.0


In [7]:
# Print column dtypes and non-null counts of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     26570 non-null  int64  
 1   product_code           26570 non-null  object 
 2   loading                26570 non-null  float64
 3   attribute_2            26570 non-null  int64  
 4   attribute_3            26570 non-null  int64  
 5   measurement_0          26570 non-null  float64
 6   measurement_1          26570 non-null  float64
 7   measurement_2          26570 non-null  float64
 8   measurement_3          26570 non-null  float64
 9   measurement_4          26570 non-null  float64
 10  measurement_5          26570 non-null  float64
 11  measurement_6          26570 non-null  float64
 12  measurement_7          26570 non-null  float64
 13  measurement_8          26570 non-null  float64
 14  measurement_9          26570 non-null  float64
 15  me

## Stacking Setting

In [8]:
# Add one stacking feature per upstream notebook: column nb<id> on both frames.
for i in Config.stacking_NB:
    # Train side: predictions stored under the column nb<id>
    # (presumably out-of-fold predictions - confirm against the upstream notebook).
    df_train_NB = pd.read_csv(Config.interim_dir + f'nb{i}.csv')
    # Test side: the upstream submission file, predictions under the target column.
    df_test_NB = pd.read_csv(Config.submission_dir + f'nb{i}.csv')

    # Fail fast instead of silently filling NaN when a file does not match the frame.
    assert len(df_train_NB) == len(df_train), \
        f'nb{i}: train predictions have {len(df_train_NB)} rows, expected {len(df_train)}'
    assert len(df_test_NB) == len(df_test), \
        f'nb{i}: test predictions have {len(df_test_NB)} rows, expected {len(df_test)}'

    # Assign by position. This replaces the per-iteration reset_index/set_index
    # round trip on df_test, so the frames' own indexes are left untouched.
    # Assumes the files keep the row order of df_train / df_test - TODO confirm.
    df_train[f'nb{i}'] = df_train_NB[f'nb{i}'].to_numpy()
    df_test[f'nb{i}'] = df_test_NB[Config.target].to_numpy()

df_test

Unnamed: 0_level_0,id,product_code,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,missing_loading,missing_measurement_3,missing_measurement_4,missing_measurement_5,missing_measurement_9,ohe0_5,ohe0_7,ohe1_5,ohe1_6,ohe1_7,ohe1_8,nb212,nb213,nb214
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
26570,26570,F,119.57,6,4,6.0,9.0,11.0,19.305,10.178000,17.534,18.168000,11.598,18.654,10.802,15.909000,18.070000,13.772,13.659,16.825,13.742,17.710000,634.612,False,False,False,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.246832,0.204737,0.211843
26571,26571,F,113.51,6,4,11.0,8.0,11.0,17.883,11.927000,17.228,16.033000,11.179,19.368,12.032,13.998000,19.959333,12.473,17.468,16.708,14.776,14.102000,537.037,False,False,False,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.188590,0.190704,0.199458
26572,26572,F,112.16,6,4,8.0,12.0,11.0,18.475,10.481000,16.619,18.189000,12.126,17.774,11.743,17.046000,18.086000,10.907,13.363,15.737,17.065,16.021000,658.995,False,False,False,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.175761,0.197098,0.203392
26573,26573,F,112.72,6,4,8.0,11.0,11.0,16.518,10.888000,15.293,18.592000,11.304,18.948,11.790,18.165000,16.163000,10.933,15.501,15.667,12.620,16.111000,594.301,False,False,False,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.206644,0.192616,0.203340
26574,26574,F,208.00,6,4,14.0,16.0,11.0,17.808,12.693000,17.678,15.814000,13.431,19.141,12.370,14.578000,17.849000,11.941,16.070,16.183,13.324,17.150000,801.044,False,False,False,False,False,1.0,0.0,0.0,1.0,0.0,0.0,0.294080,0.333894,0.240819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47340,47340,I,144.74,9,5,0.0,4.0,11.0,18.465,12.570000,18.146,17.089000,11.204,18.573,11.691,15.664333,19.771000,11.562,17.246,15.131,15.209,16.027667,696.466,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0,0.188165,0.236309,0.210572
47341,47341,I,74.53,9,5,4.0,8.0,11.0,18.900,9.896000,18.288,18.713333,11.747,17.917,10.980,16.027000,15.694000,13.564,15.494,15.296,13.812,16.501000,613.249,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0,0.195007,0.157130,0.192247
47342,47342,I,67.73,9,5,10.0,11.0,11.0,18.656,12.439667,18.242,17.910000,12.081,19.630,10.436,16.137000,20.612000,11.134,16.519,15.525,14.175,17.728000,783.349,False,False,True,False,False,0.0,1.0,1.0,0.0,0.0,0.0,0.193357,0.159318,0.196690
47343,47343,I,126.15,9,5,8.0,16.0,11.0,16.536,11.226000,18.144,17.250000,12.692,19.575,12.672,15.422000,19.496000,9.319,15.817,17.403,16.437,15.179000,745.210,False,False,False,False,False,0.0,1.0,1.0,0.0,0.0,0.0,0.283514,0.218416,0.211684


## Training

In [9]:
def seed_everything(seed):
    """Seed the global RNGs of `random` and NumPy and export PYTHONHASHSEED.

    Args:
        seed: Seed value; passed to both RNGs and written, as a string,
            to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

In [10]:
# Model inputs: every training column except the row id, the target and the
# product code (the product code is used as the CV group, not as a feature).
non_feature_cols = {Config.row_id, Config.target, 'product_code'}
features = [name for name in df_train.columns if name not in non_feature_cols]
features

['loading',
 'attribute_2',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17',
 'missing_loading',
 'missing_measurement_3',
 'missing_measurement_4',
 'missing_measurement_5',
 'missing_measurement_9',
 'ohe0_5',
 'ohe0_7',
 'ohe1_5',
 'ohe1_6',
 'ohe1_7',
 'ohe1_8',
 'nb212',
 'nb213',
 'nb214']

In [11]:
# Runtime note from the original author (translated): about 40 minutes.
#
# Cross-validated training of the stacking model: a StandardScaler followed by an
# L1-penalised logistic regression. Folds are grouped by product_code, so a
# product code never appears on both sides of a split.

# Accumulator for the fold-averaged test predictions
test_predictions = np.zeros(len(df_test))

# Out-of-fold predictions: each training row is filled by the fold that held it out
oof_predictions = np.zeros(len(df_train))

# One column of fitted coefficients per fold, indexed by feature name
feature_importance_df = pd.DataFrame(index=features)
# Per-fold validation labels / scores, consumed by the ROC plot further down
y_valids, val_preds = [], []

# The test features are the same for every fold, so select them once
x_test = df_test[features]

kfold = GroupKFold(n_splits=Config.n_folds) # must be 5 because of the 5 product codes
for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_train[Config.target], df_train['product_code'])):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold+1} with {len(features)} features...')

    x_train, x_val = df_train[features].iloc[train_idx], df_train[features].iloc[valid_idx]
    y_train, y_val = df_train[Config.target].iloc[train_idx], df_train[Config.target].iloc[valid_idx]

    # NOTE(review): random_state is hard-coded to 1 while Config.random_seed is
    # not used anywhere; left as is so the published scores stay reproducible.
    model = make_pipeline(StandardScaler(),
                          LogisticRegression(penalty='l1', C=0.01,
                                             solver='liblinear', random_state=1))

    model.fit(x_train, y_train)

    print(f'================================== training {fold+1} fin. ==================================')

    # Predict validation data (probability of the positive class)
    print(f'================================== validation-data predicting ... ==================================')
    val_pred = model.predict_proba(x_val)[:, 1]
    oof_predictions[valid_idx] = val_pred

    # Predict test data; each fold contributes 1/n_folds of the final average
    print(f'================================== test-data predicting ... ==================================')
    test_pred = model.predict_proba(x_test)[:, 1]
    test_predictions += test_pred / Config.n_folds

    # Save results
    y_valids.append(y_val)
    val_preds.append(val_pred)
    feature_importance_df["Importance_Fold"+str(fold+1)]=model.named_steps['logisticregression'].coef_.ravel()

    # Compute fold metric directly on the label Series and the score array
    auc_score = roc_auc_score(y_val, val_pred)

    print(f'Fold {fold+1} CV result')
    print(f' ROC metric : {auc_score}')

    del x_train, x_val, y_train, y_val
    _ = gc.collect()

# Compute out of folds metric.
# oof_predictions is rebound to a one-column DataFrame here (kept for
# compatibility with code that reads oof_predictions['prediction']).
oof_predictions = pd.DataFrame(data={'prediction': oof_predictions})

print(' ')
print('-'*50)
print(f'TOTAL AUC socre : {roc_auc_score(df_train[Config.target], oof_predictions["prediction"])}')
print('-'*50)

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({Config.row_id: df_train[Config.row_id], Config.target: df_train[Config.target], 'prediction': oof_predictions['prediction']})

# Create a dataframe to store test prediction
test_df = pd.DataFrame({Config.row_id: df_test[Config.row_id], Config.target: test_predictions})

 
--------------------------------------------------
Training fold 1 with 35 features...
Fold 1 CV result
 ROC metric : 0.5874933722880484
 
--------------------------------------------------
Training fold 2 with 35 features...
Fold 2 CV result
 ROC metric : 0.5819556127646698
 
--------------------------------------------------
Training fold 3 with 35 features...
Fold 3 CV result
 ROC metric : 0.5908256602365421
 
--------------------------------------------------
Training fold 4 with 35 features...
Fold 4 CV result
 ROC metric : 0.5973682553956834
 
--------------------------------------------------
Training fold 5 with 35 features...
Fold 5 CV result
 ROC metric : 0.5937036342129236
 
--------------------------------------------------
TOTAL AUC socre : 0.5892554740380043
--------------------------------------------------


In [12]:
# Preview the out-of-fold predictions next to the true target.
oof_df.head()

Unnamed: 0,id,failure,prediction
0,0,0.0,0.164892
1,1,0.0,0.16365
2,2,0.0,0.160343
3,3,0.0,0.189466
4,4,0.0,0.272837


In [13]:
# Save results
oof_df_tmp = oof_df.drop(columns=[Config.target])
oof_df_tmp.columns = [Config.row_id, f'nb{Config.NB}']
oof_df_tmp.to_csv(Config.interim_dir + f'nb{Config.NB}.csv', index=False)
oof_df_tmp

Unnamed: 0,id,nb301
0,0,0.164892
1,1,0.163650
2,2,0.160343
3,3,0.189466
4,4,0.272837
...,...,...
26565,26565,0.252087
26566,26566,0.247302
26567,26567,0.198035
26568,26568,0.189474


In [14]:
def plot_roc(y_val, y_prob):
    """Draw one ROC curve per cross-validation fold on a single plotly figure.

    Args:
        y_val: Sequence with one entry per fold, each holding that fold's true labels.
        y_prob: Sequence parallel to ``y_val`` holding the predicted scores.

    Raises:
        ValueError: If ``y_val`` and ``y_prob`` do not have the same number of folds.
    """
    if len(y_val) != len(y_prob):
        raise ValueError(
            f'y_val has {len(y_val)} folds but y_prob has {len(y_prob)}')

    fig = go.Figure(layout=plotly_template['layout'])
    # Dotted diagonal: the ROC curve of a random classifier, kept out of the legend.
    fig.add_trace(go.Scatter(x=np.linspace(0,1,11), y=np.linspace(0,1,11), name='Random Chance', mode='lines', showlegend=False, line=dict(color="Black", width=1, dash="dot")))

    fold_colors = color_palette['Cat5']
    for i, (y, prob) in enumerate(zip(y_val, y_prob)):
        fpr, tpr, _ = roc_curve(y, prob)
        roc_auc = auc(fpr,tpr)
        # Cycle through the palette so more folds than colours does not raise IndexError.
        fig.add_trace(go.Scatter(x=fpr, y=tpr, line=dict(color=fold_colors[i % len(fold_colors)], width=3),
                                 hovertemplate = 'True positive rate = %{y:.3f}<br>False positive rate = %{x:.3f}',
                                 name='Fold {}: AUC = {:.3f}'.format(i+1, roc_auc)))

    fig.update_layout(template=plotly_template, title="Cross-Validation ROC Curves",
                      hovermode="x unified", width=700, height=600,
                      xaxis_title='False Positive Rate (1 - Specificity)',
                      yaxis_title='True Positive Rate (Sensitivity)',
                      legend=dict(orientation='v', y=.07, x=1, xanchor="right",
                                  bordercolor="black", borderwidth=.5))
    fig.show()

plot_roc(y_valids, val_preds)

In [15]:
# Lollipop chart of the fold-averaged logistic-regression coefficients.
top = 50

# Average the per-fold columns only, so re-running this cell never feeds a
# previously computed 'avg' column back into the mean.
fold_cols = [c for c in feature_importance_df.columns if c != 'avg']
feature_importance_df['avg'] = feature_importance_df[fold_cols].mean(axis=1)
# NOTE(review): nlargest ranks by signed value, so strongly negative coefficients
# are the first to be cut once there are more than `top` features - confirm intended.
feature_importance_top = feature_importance_df.avg.nlargest(top).sort_values(ascending=True)

# `pal` keeps its 65 shades because the next cell reuses it for the pie outline.
pal=sns.color_palette("YlGnBu", 65).as_hex()
shades = pal[::-1]
fig=go.Figure()
# Iterate over the values by position: the Series is indexed by feature name, and
# integer access through [] on such an index is deprecated in pandas.
for i, coef in enumerate(feature_importance_top.to_numpy()):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=coef,
                       line_color=shades[i],opacity=0.8,line_width=4))

# Coefficients are small fractions, so show four decimals instead of rounding to 0.
fig.add_trace(go.Scatter(x=feature_importance_top, y=feature_importance_top.index, mode='markers',
                         marker_color=shades[:len(feature_importance_top)], marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.4f}<extra></extra>'))

# The model is a logistic-regression pipeline, so the title names it accordingly.
fig.update_layout(template=plotly_template,title=f'Logistic Regression Coefficients<br>Top {top}',
                  margin=dict(l=150,t=80),
                  xaxis=dict(title='Importance', zeroline=False),
                  yaxis_showgrid=False, height=1000, width=800)
fig.show()

In [16]:
# Share of test rows whose averaged prediction is above 0.25 ('Positive')
# versus at or below it ('Negative'), drawn as a donut chart.
df = (
    test_df[Config.target]
    .gt(0.25)
    .astype('int64')
    .rename('Target')
    .value_counts(normalize=True)
    .rename(index={1: 'Positive', 0: 'Negative'})
)

fig = go.Figure()

# Slice outlines reuse `pal`, the palette built in the coefficient-plot cell.
slice_marker = dict(colors=color_palette['Bin'], line=dict(color=pal, width=2.5))
fig.add_trace(
    go.Pie(
        labels=df.index,
        values=df*100,
        hole=.45,
        showlegend=True,
        sort=False,
        marker=slice_marker,
        hovertemplate="%{label}: %{value:.2f}%<extra></extra>",
    )
)

fig.update_layout(
    template=plotly_template,
    title='Predicted Target Distribution',
    legend=dict(traceorder='reversed', y=1.05, x=0),
    uniformtext_minsize=15,
    uniformtext_mode='hide',
    width=700,
)
fig.show()

In [17]:
# Display the fold-averaged test predictions (row id and predicted target).
test_df

Unnamed: 0_level_0,id,failure
index,Unnamed: 1_level_1,Unnamed: 2_level_1
26570,26570,0.204277
26571,26571,0.190271
26572,26572,0.196653
26573,26573,0.192179
26574,26574,0.333285
...,...,...
47340,47340,0.236817
47341,47341,0.157514
47342,47342,0.159693
47343,47343,0.218900


In [18]:
# Summary statistics of the predicted values, as a quick sanity check of their range.
test_df[Config.target].describe()

count    20775.000000
mean         0.218890
std          0.050268
min          0.120694
25%          0.183727
50%          0.209205
75%          0.242484
max          0.593576
Name: failure, dtype: float64

In [19]:
# Show the notebook id that is used in the names of the files this notebook writes.
Config.NB

'301'

In [20]:
# Write the test predictions to <submission_dir>/nb<NB>.csv; index=False keeps
# the frame's index out of the file so only the id and target columns are saved.
test_df.to_csv(Config.submission_dir + f'nb{Config.NB}.csv', index=False)

## 検証メモ