In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform
from pprint import pprint
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier


# Fix a bug with the GroupSize feature

- GroupSize feature is meant to be the number of people in the same group. In previous notebooks the parse function does something different
- after correction (and binarisation) the accuracy improves a bit. 

In [9]:
PATH = '../data/raw'
df = pd.read_csv(os.path.join(PATH,'train.csv'))
m = df.shape[0]
df_te = pd.read_csv(os.path.join(PATH,'test.csv'))

def parse(df):
    GroupId = df['PassengerId'].str.split('_', expand=True)[0]
    counts = GroupId.value_counts()
    return (
        df.assign(
            Solo = GroupId.map(counts) == 1,
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

df.pop('Transported')
parse(pd.concat([df,df_te], axis=0))[:m]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Solo,Deck,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,True,B,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,True,A,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,True,G,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S


In [11]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
m = df.shape[0]
df_te = pd.read_csv(os.path.join(PATH,'test.csv'))

y = df.pop('Transported')  # pop first otherwise the label column will be casted as object because df_te has no label column.
X = parse(pd.concat([df,df_te], axis=0))[:m]
assert y.size == m

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder(drop='if_binary')
scaler = StandardScaler()
model = HistGradientBoostingClassifier()

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'Solo', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct, model
)

params = { 
    model.__class__.__name__.lower()+ '__' +'max_bins' : [255, 128, 64],
    model.__class__.__name__.lower()+ '__' +'max_depth': [None, 10, 15, 5],
    model.__class__.__name__.lower()+ '__' +'max_leaf_nodes': [31,21,11],
    model.__class__.__name__.lower()+ '__' +'min_samples_leaf': [20,30,10,5],
    model.__class__.__name__.lower()+ '__' +'learning_rate': loguniform(0.01,1),
    model.__class__.__name__.lower()+ '__' +'l2_regularization': loguniform(0.01, 10)
}


search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=50,
    n_jobs=-1,
    scoring='accuracy',
    random_state=1123
)
search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
display(pd.DataFrame(classification_report(y_pred,y_dev, output_dict=True)).T)
pprint(search.best_params_)

Unnamed: 0,precision,recall,f1-score,support
False,0.777368,0.796974,0.787049,793.0
True,0.826134,0.808668,0.817308,946.0
accuracy,0.803335,0.803335,0.803335,0.803335
macro avg,0.801751,0.802821,0.802178,1739.0
weighted avg,0.803896,0.803335,0.803509,1739.0


{'histgradientboostingclassifier__l2_regularization': 2.4018982155918236,
 'histgradientboostingclassifier__learning_rate': 0.08642180973258734,
 'histgradientboostingclassifier__max_bins': 255,
 'histgradientboostingclassifier__max_depth': 10,
 'histgradientboostingclassifier__max_leaf_nodes': 11,
 'histgradientboostingclassifier__min_samples_leaf': 30}
