# TPS-Aug-2022

In [1]:
class Config:
    NB = '206'
    dataset_NB = '106'

    raw_data_dir = '../data/raw/'
    processed_data_dir = '../data/processed/'
    interim_dir = '../data/interim/'
    submission_dir = '../data/submission/'

    random_seed = 42
    n_folds = 5

    row_id = 'id'
    target = 'failure'

## Import libralies

In [2]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plotly_template = dict(
    layout=go.Layout(
        template='plotly_dark',
        font=dict(
            family="Franklin Gothic",
            size=12
        ),
        height=500,
        width=1000,
    )
)


color_palette = {
    'Bin': ['#016CC9','#E876A3'],
    'Cat5': ['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
}

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc
from scipy.stats import mode
import time

import tensorflow as tf
from tensorflow.keras import datasets
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import optimizers

## Load and check data

In [5]:
df_train = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_train.pkl', compression='zip')
df_test = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_test.pkl', compression='zip')

submission = pd.read_csv(Config.raw_data_dir + 'sample_submission.csv')

df_train.shape

(26570, 335)

## Parameter Setting

In [6]:
features = [col for col in df_train.columns if col not in [Config.row_id, Config.target]]
features

['loading',
 'attribute_2',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17',
 'attribute_0_material_5',
 'attribute_0_material_7',
 'attribute_1_material_5',
 'attribute_1_material_6',
 'attribute_1_material_7',
 'attribute_1_material_8',
 'mes_0x1',
 'mes_0-1',
 'mes_0x2',
 'mes_0-2',
 'mes_0x3',
 'mes_0-3',
 'mes_0x4',
 'mes_0-4',
 'mes_0x5',
 'mes_0-5',
 'mes_0x6',
 'mes_0-6',
 'mes_0x7',
 'mes_0-7',
 'mes_0x8',
 'mes_0-8',
 'mes_0x9',
 'mes_0-9',
 'mes_0x10',
 'mes_0-10',
 'mes_0x11',
 'mes_0-11',
 'mes_0x12',
 'mes_0-12',
 'mes_0x13',
 'mes_0-13',
 'mes_0x14',
 'mes_0-14',
 'mes_0x15',
 'mes_0-15',
 'mes_0x16',
 'mes_0-16',
 'mes_0x17',
 'mes_0-17',
 'mes_1x2',
 'mes_1-2',
 'mes_1x3',
 '

## Validation data Setting

In [7]:
X_test = df_test[features]

'''
for c in TARGET_ENCODING_CATEGORY:
    data_tmp = pd.DataFrame({c: df_train[c], 'target': df_train[TARGET]})
    target_mean = data_tmp.groupby(c)['target'].mean()
    X_test.loc[:, c] = X_test[c].map(target_mean)
'''

X_test = (X_test.values).astype(np.float32)
X_test.shape

(20775, 333)

## Modeling

### Multi Layer Perceptron
- 隠れ層3層のMLP
- kernel_initializerにHeの初期化を採用
- Batch Normalizationを採用
- 活性化関数にReLUを採用
- Optimizerを採用（SGD、Adamなど。）
- Dropoutを採用
  - DropoutとBatchNormalizationを同時に使うと学習がうまくできない場合がある。
  - その場合、Dropoutを外す
- モデルの順序は、BatchNormalization、活性化関数、Dropoutであることに注意

In [8]:
def root_mean_squared_error(y_true, y_pred):
    return tf.keras.backend.sqrt(tf.keras.backend.mean(tf.keras.backend.square(y_pred - y_true)))

def setup_model():
    activation = 'relu'
    kernel_initializer = 'he_normal'

    model = Sequential()

    model.add(Dense(256, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(0.25))

    model.add(Dense(128, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(0.25))

    model.add(Dense(72, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(32, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(8, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(1, activation='sigmoid'))

    optimizer = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=True)
    # optimizer = optimizers.SGD(learning_rate=0.001)

    # model.compile(optimizer=optimizer, loss=root_mean_squared_error, metrics=[root_mean_squared_error])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

    return model


def setup_callbacks():
    es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.7, patience=5, verbose=1)
    callbacks = [es, lr]

    return callbacks


mlp_param = {
    'epochs': 300,
    'batch_size': 100,
    'verbose': 1,
}


### Training & Validation with TargetEncoding

In [9]:
np.random.seed(Config.random_seed)
tf.random.set_seed(Config.random_seed)

# Create a numpy array to store test predictions
test_predictions = np.zeros(len(df_test))

# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(df_train))

feature_importance_df = pd.DataFrame(index=features)
y_valids, val_preds =[],[]
amex_scores = []

kfold = StratifiedKFold(n_splits = Config.n_folds, shuffle = True, random_state = Config.random_seed)

for fold, (train_idx, valid_idx) in enumerate(kfold.split(df_train, df_train[Config.target])):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold+1} with {len(features)} features...')

    X_train, X_val = df_train[features].iloc[train_idx], df_train[features].iloc[valid_idx]
    y_train, y_val = df_train[Config.target].iloc[train_idx], df_train[Config.target].iloc[valid_idx]

    # training
    model = setup_model()
    callbacks = setup_callbacks()
    hist = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=mlp_param['epochs'], batch_size=mlp_param['batch_size'], callbacks=callbacks, verbose=mlp_param['verbose'])

    print(f'================================== training {fold+1} fin. ==================================')

    # Predict validation data
    print(f'================================== validation-data predicting ... ==================================')
    val_pred = np.minimum(np.maximum(model.predict(X_val), 0), 1).reshape(len(X_val))
    oof_predictions[valid_idx] = val_pred

    # Predict test data
    print(f'================================== test-data predicting ... ==================================')
    test_predictions += np.minimum(np.maximum(model.predict(X_test), 0), 1).reshape(len(X_test)) / Config.n_folds

    # save results
    y_valids.append(y_val)
    val_preds.append(val_pred)
    # feature_importance_df["Importance_Fold"+str(fold+1)]=model.feature_importance(importance_type='gain')

    # Compute fold metric
    val_pred = pd.DataFrame(data={'prediction': val_pred})
    y_val = pd.DataFrame(data={'target': y_val.reset_index(drop=True)})
    auc_score = roc_auc_score(y_val, val_pred)

    print(f'Fold {fold+1} CV result')
    print(f' ROC metric : {auc_score}')

    del X_train, X_val, y_train, y_val
    _ = gc.collect()

# Compute out of folds metric
oof_predictions = pd.DataFrame(data={'prediction': oof_predictions})
y_true = pd.DataFrame(data={Config.target: df_train[Config.target]})

print(' ')
print('-'*50)
print(f'TOTAL AUC socre : {roc_auc_score(df_train[Config.target], oof_predictions["prediction"])}')
print('-'*50)

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({Config.row_id: df_train[Config.row_id], Config.target: df_train[Config.target], 'prediction': oof_predictions['prediction']})

# Create a dataframe to store test prediction
test_df = pd.DataFrame({Config.row_id: df_test[Config.row_id], Config.target: test_predictions})

 
--------------------------------------------------
Training fold 1 with 333 features...


2022-08-27 21:46:43.471232: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 00012: ReduceLROnPlateau reducing learning rate to 0.006999999843537807.
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 00017: ReduceLROnPlateau reducing learning rate to 0.004899999825283885.
Epoch 00017: early stopping
Fold 1 CV result
 ROC metric : 0.5539900081801279
 
--------------------------------------------------
Training fold 2 with 333 features...
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.006999999843537807.
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.004899999825283885.
Epoch 00020: early stopping
Fold 2 CV result
 ROC met

In [10]:
# stacking
'''
stacking_preds_valid = np.concatenate(stacking_preds_valid, axis=0)
stacking_idxes_valid = np.concatenate(stacking_idxes_valid)
stacking_order_valid = np.argsort(stacking_idxes_valid)
stacking_preds_valid_sorted = stacking_preds_valid[stacking_order_valid]

df_train_stacking = pd.DataFrame({Config.row_id: df_train[Config.row_id], f'nb{Config.NB}': stacking_preds_valid_sorted.reshape(-1)})
df_test_stacking = pd.DataFrame({Config.row_id: df_test[Config.row_id], f'nb{Config.NB}': preds_test[:, 0]})

df_train_stacking.to_csv(f'../data/interim/nb{Config.NB}_train.csv', index=False)
df_test_stacking.to_csv(f'../data/interim/nb{Config.NB}_test.csv', index=False)
'''

"\nstacking_preds_valid = np.concatenate(stacking_preds_valid, axis=0)\nstacking_idxes_valid = np.concatenate(stacking_idxes_valid)\nstacking_order_valid = np.argsort(stacking_idxes_valid)\nstacking_preds_valid_sorted = stacking_preds_valid[stacking_order_valid]\n\ndf_train_stacking = pd.DataFrame({Config.row_id: df_train[Config.row_id], f'nb{Config.NB}': stacking_preds_valid_sorted.reshape(-1)})\ndf_test_stacking = pd.DataFrame({Config.row_id: df_test[Config.row_id], f'nb{Config.NB}': preds_test[:, 0]})\n\ndf_train_stacking.to_csv(f'../data/interim/nb{Config.NB}_train.csv', index=False)\ndf_test_stacking.to_csv(f'../data/interim/nb{Config.NB}_test.csv', index=False)\n"

## 結果の可視化

In [11]:
def plot_roc(y_val, y_prob):
    #colors=px.colors.qualitative.Prism
    fig = go.Figure(layout=plotly_template['layout'])
    fig.add_trace(go.Scatter(x=np.linspace(0,1,11), y=np.linspace(0,1,11), name='Random Chance', mode='lines', showlegend=False, line=dict(color="Black", width=1, dash="dot")))

    for i in range(len(y_val)):
        y=y_val[i]
        prob=y_prob[i]
        fpr, tpr, _ = roc_curve(y, prob)
        roc_auc = auc(fpr,tpr)
        fig.add_trace(go.Scatter(x=fpr, y=tpr, line=dict(color=color_palette['Cat5'][i], width=3),
                                 hovertemplate = 'True positive rate = %{y:.3f}<br>False positive rate = %{x:.3f}',
                                 name='Fold {}: AUC = {:.3f}'.format(i+1, roc_auc)))

    fig.update_layout(template=plotly_template, title="Cross-Validation ROC Curves",
                      hovermode="x unified", width=700, height=600,
                      xaxis_title='False Positive Rate (1 - Specificity)',
                      yaxis_title='True Positive Rate (Sensitivity)',
                      legend=dict(orientation='v', y=.07, x=1, xanchor="right",
                                  bordercolor="black", borderwidth=.5))
    fig.show()

plot_roc(y_valids, val_preds)

In [12]:
# test_df = pd.DataFrame({Config.row_id: test[Config.row_id], 'prediction': test_predictions})

df = pd.DataFrame(data={'Target':test_df[Config.target].apply(lambda x: 1 if x>0.25 else 0)})
df = df.Target.value_counts(normalize=True)
df.rename(index={1:'Positive', 0:'Negative'}, inplace=True)

pal, color=['#016CC9','#DEB078'], ['#8DBAE2','#EDD3B3']
fig=go.Figure()

fig.add_trace(go.Pie(labels=df.index, values=df*100, hole=.45,
                     showlegend=True,sort=False,
                     marker=dict(colors=color_palette['Bin'],line=dict(color=pal,width=2.5)),
                     hovertemplate = "%{label}: %{value:.2f}%<extra></extra>"))

fig.update_layout(template=plotly_template, title='Predicted Target Distribution',
                  legend=dict(traceorder='reversed',y=1.05,x=0),
                  uniformtext_minsize=15, uniformtext_mode='hide',width=700)
fig.show()

## Submission

In [13]:
test_df.to_csv(Config.submission_dir + f'nb{Config.NB}.csv', index=False)

## 検証メモ