# TPS-Aug-2022

In [1]:
class Config:
    """Notebook-wide settings: notebook ids, data directories and column names."""

    # --- Notebook identifiers ---
    NB: str = '401'            # id of this notebook; names the submission file it writes
    dataset_NB: str = '110'    # id used to locate the processed train pickle
    # Notebooks whose prediction files are combined (attribute name kept as-is,
    # since the later cells reference this exact spelling).
    emsemble_NB: list = ['nb301', 'nb302', 'nb303']

    # --- Data directories (relative to the notebook's working directory) ---
    raw_data_dir: str = '../data/raw/'
    processed_data_dir: str = '../data/processed/'
    interim_dir: str = '../data/interim/'
    submission_dir: str = '../data/submission/'

    # --- Reproducibility / CV settings (not referenced by the visible cells) ---
    random_seed: int = 42
    n_folds: int = 5

    # --- Column names ---
    row_id: str = 'id'         # row identifier column
    target: str = 'failure'    # column holding the label / prediction

## Import libraries

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style='white', context='notebook', palette='deep')

from sklearn.metrics import roc_auc_score

## Load and check data

In [3]:
# Load the train labels and the sample submission, then attach each ensemble
# member's train-set and test-set predictions as one column per notebook.

# Keep only the id and target columns. The explicit copy makes df_train an
# independent frame, since prediction columns are added to it below.
df_train = pd.read_pickle(Config.processed_data_dir + f'nb{Config.dataset_NB}_train.pkl', compression='zip')
df_train = df_train[[Config.row_id, Config.target]].copy()

submission = pd.read_csv(Config.raw_data_dir + 'sample_submission.csv')
submission.columns = [Config.row_id, Config.target]

# df_test starts as a copy so that `submission` itself is not modified here.
df_test = submission.copy()

for nb_name in Config.emsemble_NB:
    df_train_NB = pd.read_csv(Config.interim_dir + f'{nb_name}.csv')
    df_test_NB = pd.read_csv(Config.submission_dir + f'{nb_name}.csv')

    # Column assignment aligns on index labels, so a prediction file with a
    # different number of rows would be silently truncated or padded with NaN.
    assert len(df_train_NB) == len(df_train), (
        f'{nb_name}: train predictions have {len(df_train_NB)} rows, expected {len(df_train)}'
    )
    assert len(df_test_NB) == len(df_test), (
        f'{nb_name}: test predictions have {len(df_test_NB)} rows, expected {len(df_test)}'
    )

    df_train[nb_name] = df_train_NB[nb_name]
    df_test[nb_name] = df_test_NB[Config.target]

    # NaN after assignment means the index labels did not line up, or the
    # prediction file itself contains gaps. Fail here rather than downstream.
    assert df_train[nb_name].notna().all(), f'{nb_name}: NaN in train predictions after alignment'
    assert df_test[nb_name].notna().all(), f'{nb_name}: NaN in test predictions after alignment'

df_test

Unnamed: 0,id,failure,nb301,nb302,nb303
0,26570,0.0,0.204277,0.209574,0.212748
1,26571,0.0,0.190271,0.194931,0.173723
2,26572,0.0,0.196653,0.192234,0.185448
3,26573,0.0,0.192179,0.196530,0.169217
4,26574,0.0,0.333285,0.272826,0.347902
...,...,...,...,...,...
20770,47340,0.0,0.236817,0.217503,0.193377
20771,47341,0.0,0.157514,0.168125,0.157533
20772,47342,0.0,0.159693,0.177581,0.187284
20773,47343,0.0,0.218900,0.203442,0.233752


## Ensemble

In [4]:
# Row-wise blends of the member predictions, added as new columns to both frames.
member_cols = Config.emsemble_NB
blend_stats = ('mean', 'median', 'max', 'min')

for frame in (df_train, df_test):
    members = frame[member_cols]  # member columns only; the blend columns are excluded
    for stat in blend_stats:
        frame[stat] = getattr(members, stat)(axis=1)

# ROC AUC against the train labels: each member first, then each blend.
y_true = df_train[Config.target]
for member in member_cols:
    print(f' {member}: {roc_auc_score(y_true, df_train[member])}')
for stat in blend_stats:
    # Right-align to width 6 so the labels line up exactly as before.
    print(f'{stat:>6}: {roc_auc_score(y_true, df_train[stat])}')

 nb301: 0.5892554740380043
 nb302: 0.570320004202983
 nb303: 0.5492358066972712
  mean: 0.5743839355748842
median: 0.5819631563931816
   max: 0.5723572308099265
   min: 0.5616725393098682


In [None]:
# Display df_test to inspect the per-notebook and blended prediction columns.
df_test

## Submission

In [5]:
# Column of df_test that is written out as the final prediction.
# NOTE(review): in the saved scores above, nb301 alone (0.5893) is higher than
# the median blend (0.5820) -- confirm that 'median' is the intended choice.
best_col = 'median'
submission_path = Config.submission_dir + f'nb{Config.NB}.csv'

# Overwrite the placeholder target with the chosen predictions and save.
submission[Config.target] = df_test[best_col]
submission.to_csv(submission_path, index=False)
submission

Unnamed: 0,id,failure
0,26570,0.209574
1,26571,0.190271
2,26572,0.192234
3,26573,0.192179
4,26574,0.333285
...,...,...
20770,47340,0.217503
20771,47341,0.157533
20772,47342,0.177581
20773,47343,0.218900
