# TPS-Aug-2022

In [1]:
class Config:
    NB = '205'
    dataset_NB = '104'

    raw_data_dir = '../data/raw/'
    processed_data_dir = '../data/processed/'
    interim_dir = '../data/interim/'
    submission_dir = '../data/submission/'

    random_seed = 42
    n_folds = 5

    row_id = 'id'
    target = 'failure'

## Import libralies

In [2]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

In [3]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

plotly_template = dict(
    layout=go.Layout(
        template='plotly_dark',
        font=dict(
            family="Franklin Gothic",
            size=12
        ),
        height=500,
        width=1000,
    )
)


color_palette = {
    'Bin': ['#016CC9','#E876A3'],
    'Cat5': ['#E876A3', '#E0A224', '#63B70D', '#6BCFF6', '#13399E'],
}

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, auc
from scipy.stats import mode
import time

import tensorflow as tf
from tensorflow.keras import datasets
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import optimizers

## Load and check data

In [5]:
df_train = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_train.pkl', compression='zip')
df_test = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_test.pkl', compression='zip')

submission = pd.read_csv(Config.raw_data_dir + 'sample_submission.csv')

df_train.shape

(26570, 29)

## Parameter Setting

In [6]:
feature_list = [col for col in df_train.columns if col not in [Config.row_id, Config.target]]
feature_list

['loading',
 'attribute_2',
 'attribute_3',
 'measurement_0',
 'measurement_1',
 'measurement_2',
 'measurement_3',
 'measurement_4',
 'measurement_5',
 'measurement_6',
 'measurement_7',
 'measurement_8',
 'measurement_9',
 'measurement_10',
 'measurement_11',
 'measurement_12',
 'measurement_13',
 'measurement_14',
 'measurement_15',
 'measurement_16',
 'measurement_17',
 'attribute_0_material_5',
 'attribute_0_material_7',
 'attribute_1_material_5',
 'attribute_1_material_6',
 'attribute_1_material_7',
 'attribute_1_material_8']

## Validation data Setting

In [7]:
X_test = df_test[feature_list]

'''
for c in TARGET_ENCODING_CATEGORY:
    data_tmp = pd.DataFrame({c: df_train[c], 'target': df_train[TARGET]})
    target_mean = data_tmp.groupby(c)['target'].mean()
    X_test.loc[:, c] = X_test[c].map(target_mean)
'''

X_test = (X_test.values).astype(np.float32)
X_test.shape

(20775, 27)

## Modeling

### Multi Layer Perceptron
- 隠れ層3層のMLP
- kernel_initializerにHeの初期化を採用
- Batch Normalizationを採用
- 活性化関数にReLUを採用
- Optimizerを採用（SGD、Adamなど。）
- Dropoutを採用
  - DropoutとBatchNormalizationを同時に使うと学習がうまくできない場合がある。
  - その場合、Dropoutを外す
- モデルの順序は、BatchNormalization、活性化関数、Dropoutであることに注意

In [8]:
def root_mean_squared_error(y_true, y_pred):
    return tf.keras.backend.sqrt(tf.keras.backend.mean(tf.keras.backend.square(y_pred - y_true)))

def setup_model():
    activation = 'relu'
    kernel_initializer = 'he_normal'

    model = Sequential()

    '''
    model.add(Dense(96, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(0.25))

    model.add(Dense(64, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    model.add(Dropout(0.25))
    '''
    model.add(Dense(32, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(16, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(8, kernel_initializer=kernel_initializer))
    model.add(BatchNormalization())
    model.add(Activation(activation))
    # model.add(Dropout(0.25))

    model.add(Dense(1, activation='softmax'))

    optimizer = optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999, amsgrad=True)
    # optimizer = optimizers.SGD(learning_rate=0.001)

    # model.compile(optimizer=optimizer, loss=root_mean_squared_error, metrics=[root_mean_squared_error])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])

    return model

def setup_callbacks():
    es = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    lr = ReduceLROnPlateau(monitor="val_loss", factor=0.7, patience=5, verbose=1)
    callbacks = [es, lr]

    return callbacks


mlp_param = {
    'epochs': 300,
    'batch_size': 100,
    'verbose': 1,
}


### Training & Validation with TargetEncoding

In [23]:
np.random.seed(Config.random_seed)
tf.random.set_seed(Config.random_seed)

kf = KFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.random_seed)
kf_encoding = KFold(n_splits=Config.n_folds, shuffle=True, random_state=Config.random_seed + 1)

#split_series = df_train[split_col]
#split_unique_series = df_train[split_col].unique()

results = {}
preds_test = np.zeros(len(X_test))
stacking_preds_valid, stacking_idxes_valid = [], []

for idx, (idx_train, idx_valid) in enumerate(kf.split(df_train)):
    X_train = df_train.loc[idx_train][feature_list]
    y_train = df_train.loc[idx_train][Config.target]
    X_valid = df_train.loc[idx_valid][feature_list]
    y_valid = df_train.loc[idx_valid][Config.target]

    results[f'Fold{idx+1}'] = {}
    preds_train = np.zeros(len(X_train))
    preds_valid = np.zeros(len(X_valid))

    X_len = len(X_train)
    X_train = (X_train.values).astype(np.float32)
    X_valid = (X_valid.values).astype(np.float32)

    y_train = (y_train.values).astype(np.float32)
    y_valid = (y_valid.values).astype(np.float32)

    # training
    model = setup_model()
    callbacks = setup_callbacks()
    hist = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=mlp_param['epochs'], batch_size=mlp_param['batch_size'], callbacks=callbacks, verbose=mlp_param['verbose'])
    print(f'================================== training {idx + 1} fin. predicting ... ==================================')

    # evaluation
    # train_loss, train_rmse = model.evaluate(X_train, y_train, verbose=0)
    # valid_loss, valid_rmse = model.evaluate(X_valid, y_valid, verbose=0)
    # print(train_loss, train_rmse, valid_loss, valid_rmse)

    print(len(model.predict(X_train)))
    print(model.predict(X_train).shape)
    preds_train = np.minimum(np.maximum(model.predict(X_train), 0), 1)
    preds_valid = np.minimum(np.maximum(model.predict(X_valid), 0), 1)

    auc_train = np.sqrt(roc_auc_score(y_train, preds_train))
    auc_valid = np.sqrt(roc_auc_score(y_valid, preds_valid))

    # save training data
    results[f'Fold{idx + 1}']['datasets'] = [X_train, y_train, X_valid, y_valid]
    results[f'Fold{idx + 1}']['index_train'] = df_train.loc[idx_train].index
    results[f'Fold{idx + 1}']['index_valid'] = df_train.loc[idx_valid].index
    results[f'Fold{idx + 1}']['model'] = model
    results[f'Fold{idx + 1}']['hist'] = hist
    results[f'Fold{idx + 1}']['preds_train'] = preds_train
    results[f'Fold{idx + 1}']['preds_valid'] = preds_valid
    results[f'Fold{idx + 1}']['score_train'] = auc_train
    results[f'Fold{idx + 1}']['score_valid'] = auc_train

    # predict for submission
    preds_test +=  np.minimum(np.maximum(model.predict(X_test), 0), 1) / Config.n_folds

    # stacking
    stacking_preds_valid.append(results[f'Fold{idx + 1}']['preds_valid'])
    stacking_idxes_valid.append(results[f'Fold{idx + 1}']['index_valid'])

# output results
train_score = valid_score = 0
for i in range(Config.n_folds):
    train_score += results[f'Fold{idx+1}']['score_train'] / Config.n_folds
    valid_score += results[f'Fold{idx+1}']['score_valid'] / Config.n_folds

print('')
print(f'Train Score : {train_score:.5f}')
print(f'Valid Score : {valid_score:.5f}')

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.006999999843537807.
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.004899999825283885.
Epoch 00015: early stopping
21256
(21256, 1)
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.006999999843537807.
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 00013: ReduceLROnPlateau reducing learning rate to 0.004899999825283885.
Epoch 00013: early stopping
21256
(21256, 1)
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 00011: ReduceLROnPlateau reducing learning rate to 0.006999999843537807.
Epoch 12/300
Epoch 13/300
Epoch 14/

In [10]:
# stacking
stacking_preds_valid = np.concatenate(stacking_preds_valid, axis=0)
stacking_idxes_valid = np.concatenate(stacking_idxes_valid)
stacking_order_valid = np.argsort(stacking_idxes_valid)
stacking_preds_valid_sorted = stacking_preds_valid[stacking_order_valid]

df_train_stacking = pd.DataFrame({Config.row_id: df_train[Config.row_id], f'nb{Config.NB}': stacking_preds_valid_sorted.reshape(-1)})
df_test_stacking = pd.DataFrame({Config.row_id: df_test[Config.row_id], f'nb{Config.NB}': preds_test[:, 0]})

df_train_stacking.to_csv(f'../data/interim/nb{Config.NB}_train.csv', index=False)
df_test_stacking.to_csv(f'../data/interim/nb{Config.NB}_test.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/interim/nb205_train.csv'

In [None]:
len(preds_test[:, 0])

In [24]:
preds_test

array([1.00000001, 1.00000001, 1.00000001, ..., 1.00000001, 1.00000001,
       1.00000001])

In [11]:
submission

Unnamed: 0,id,failure
0,26570,0.0
1,26571,0.0
2,26572,0.0
3,26573,0.0
4,26574,0.0
...,...,...
20770,47340,0.0
20771,47341,0.0
20772,47342,0.0
20773,47343,0.0


### 検証データの誤差の可視化

In [None]:
for i in range(FOLDS):
    train_loss = results[f'Fold{i + 1}']['hist'].history['loss']
    val_loss = results[f'Fold{i + 1}']['hist'].history['val_loss']

    fig = plt.figure()
    plt.rc('font', family='serif')
    plt.plot(range(len(train_loss)), train_loss, color='blue', linewidth=1, label='train_loss')
    plt.plot(range(len(val_loss)), val_loss, color='red', linewidth=1, label='val_loss')
    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel('loss')
    plt.show()

## Submission

In [None]:
submission.columns = [Config.row_id, Config.target]
submission[Config.target] = preds_test
submission.to_csv(f"../data/submission/nb{Config.NB}.csv", index=False, header=False)
submission

## 結果の可視化

In [None]:
preds = []
preds_index = []
for i in range(FOLDS):
    preds.append(results[f'Fold{i + 1}']['preds_valid'])
    preds_index.append(results[f'Fold{i + 1}']['index_valid'])

preds_index_tmp = np.concatenate(preds_index)
preds_tmp = np.concatenate(preds, axis=0)

order = np.argsort(preds_index_tmp)
preds_sorted = preds_tmp[order]

print(f'valid RMSE : {np.sqrt(mean_squared_error(df_train[TARGET], preds_sorted))}')

df_train_eval = df_train
df_train_eval['preds'] = preds_sorted

df_train_eval = df_train_eval[['id', 'year', 'month', 'day', 'Country', 'City', 'lat', 'lon', 'pm25_mid', 'preds']]

In [None]:
df_train_tmp = pd.read_pickle("../data/processed/nb101_train.pkl", compression='zip')
df_test_tmp = pd.read_pickle("../data/processed/nb101_test.pkl", compression='zip')
df_train_tmp['datetime'] = df_train_tmp['year'] * 10000 + df_train_tmp['month'] * 100 + df_train_tmp['day']
df_train_tmp['datetime'] = df_train_tmp['datetime'].astype(str)
df_train_tmp['datetime'] = pd.to_datetime(df_train_tmp['datetime'])

df_train_eval['datetime'] = df_train_tmp['datetime']
df_train_eval

In [None]:
for city in df_train_eval['City'].unique():
    y_true = df_train_eval[df_train_eval['City'] == city]['pm25_mid']
    y_pred = df_train_eval[df_train_eval['City'] == city]['preds']
    country = df_train_eval[df_train_eval['City'] == city]['Country'].unique()[0]

    city_rmse = np.sqrt(mean_squared_error(y_true, y_pred))

    if city_rmse > 35:
        print(f'{city_rmse} in {city} {country}')

    if city_rmse > 45:
        plt.figure(figsize=(40, 10))
        plt.title(f'PM2.5 prediction RMSE:{city_rmse} in {city}')
        plt.plot(df_train_eval[df_train_eval['City'] == city]['datetime'], y_true, label='y_true')
        plt.plot(df_train_eval[df_train_eval['City'] == city]['datetime'], y_pred, label='y_pred')
        plt.legend()

## 検証メモ

In [None]:
df_train.describe().T

In [None]:
df_train_tmp = pd.read_pickle("../data/processed/nb101_train.pkl", compression='zip')
df_test_tmp = pd.read_pickle("../data/processed/nb101_test.pkl", compression='zip')
df_train_tmp['datetime'] = df_train_tmp['year'] * 10000 + df_train_tmp['month'] * 100 + df_train_tmp['day']
df_train_tmp['datetime'] = df_train_tmp['datetime'].astype(str)
df_train_tmp['datetime'] = pd.to_datetime(df_train_tmp['datetime'])
df_train_tmp['datetime_month'] = df_train_tmp.datetime.dt.month
df_train_tmp

In [None]:
df_train_eval[df_train_eval['City'] == 'Denver']

In [None]:
df_train_eval['diff'] = df_train_eval['pm25_mid'] - df_train_eval['preds']
df_train_eval

In [None]:
df_train_eval[df_train_eval['diff'] > 150]

In [None]:
df_train_eval[df_train_eval['diff'] < -100]

In [None]:
df_tmp.info()

In [None]:
df_tmp1 = df_tmp
df_tmp1['Country'] = df_tmp['Country'].fillna(-1)

In [None]:
df_tmp1[df_tmp1['Country'] < 0]

In [None]:
X_valid = df_train[is_valid][FEATURES]
data_tmp = pd.DataFrame({c: X_train[c], 'target': y_train})

In [None]:
target_mean = data_tmp.groupby(c)['target'].mean()
X_valid.loc[:, c] = X_valid[c].map(target_mean)

In [None]:
X_valid = df_train[is_valid][FEATURES]

In [None]:
X_valid

In [None]:
data_tmp = pd.DataFrame({c: X_train[c], 'target': y_train})