In [3]:
%load_ext autoreload
%autoreload 2

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import prince

from dataprep.pipeline import get_split, get_data
from models.evaluate import get_confusion_matrix
from models.ensemble import Pool
import config



def create_model():
    model = CatBoostClassifier(iterations=80,
                          learning_rate=0.2,
                          depth=10, 
                          loss_function='Logloss', 
                          random_seed=config.RANDOM_SEED)
    return model


In [31]:
df_train = get_data(mode='train')
pool = Pool(df_train, create_model, n_models=11, fac=0.80)
scores = pool.train_and_eval()

0:	learn: 0.6294355	total: 15.7ms	remaining: 1.24s
1:	learn: 0.5871986	total: 31.9ms	remaining: 1.24s
2:	learn: 0.5516084	total: 46.6ms	remaining: 1.2s
3:	learn: 0.5245245	total: 63.4ms	remaining: 1.2s
4:	learn: 0.5056459	total: 79.9ms	remaining: 1.2s
5:	learn: 0.4914612	total: 96.2ms	remaining: 1.19s
6:	learn: 0.4769120	total: 112ms	remaining: 1.17s
7:	learn: 0.4629565	total: 127ms	remaining: 1.15s
8:	learn: 0.4552989	total: 143ms	remaining: 1.13s
9:	learn: 0.4468434	total: 160ms	remaining: 1.12s
10:	learn: 0.4395044	total: 176ms	remaining: 1.1s
11:	learn: 0.4336660	total: 192ms	remaining: 1.08s
12:	learn: 0.4296530	total: 209ms	remaining: 1.08s
13:	learn: 0.4224033	total: 229ms	remaining: 1.08s
14:	learn: 0.4189463	total: 245ms	remaining: 1.06s
15:	learn: 0.4128918	total: 267ms	remaining: 1.07s
16:	learn: 0.4092835	total: 290ms	remaining: 1.08s
17:	learn: 0.4057844	total: 315ms	remaining: 1.09s
18:	learn: 0.4028036	total: 338ms	remaining: 1.08s
19:	learn: 0.4003526	total: 361ms	remai

In [32]:
print(f'Scores : {np.mean(scores)}')

df_preds = pool.predict(df_val)
#print(accuracy_score(df_preds['final_pred'], df_val['Transported']))

Scores : 0.8115312107247591


In [33]:
df_preds.head(20)

Unnamed: 0,PassengerId,Transported,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,final_pred
0,0003_01,0,0,0,0,0,0,0,0,0,0,0,0,False
1,0006_02,1,1,1,1,1,1,1,1,1,1,1,1,True
2,0012_01,0,0,1,0,1,1,1,1,1,1,1,1,True
3,0016_01,1,1,1,1,1,1,1,1,1,1,1,1,True
4,0024_01,1,1,1,1,1,1,1,1,1,1,1,1,True
5,0025_01,0,0,0,0,0,0,0,0,0,0,0,0,False
6,0035_01,0,0,0,0,0,0,0,0,0,0,0,0,False
7,0051_01,0,0,0,0,0,0,0,0,0,0,0,0,False
8,0056_03,1,1,1,1,1,1,1,1,1,1,1,1,True
9,0064_01,1,1,1,1,1,1,1,1,1,1,1,1,True


In [40]:
# Test
df_test = get_data('test')
preds = pool.predict(df_test)
df_test['pred'] = preds['final_pred']

# Make submission
submission = (pd.read_csv(config.PATH_TO_SAMPLE_SUBMISSION)
              .drop('Transported', axis=1)
              .merge(df_test.filter(['PassengerId', 'pred']), 
                     on='PassengerId', 
                     how='left')
              .rename({'pred': 'Transported'}, axis=1)
              .astype({'Transported': bool})
             )
submission.to_csv('../data/output/sub_05.csv', index=False)