In [21]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform

## a bit of feature engineering
- drop name
- create GroupSize from PassengerID
- create Deck and Side from Cabin
- one hot endoding categorical columns
- scale numerical columns
- HistGB (which accepts nan natively)

In [7]:
PATH = '../data/raw'
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [8]:
def parse(df):
    return (
        df.assign(
            GroupSize = df['PassengerId'].str.split('_', expand=True)[1].astype(int),
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

parse(df).nunique()

HomePlanet         3
CryoSleep          2
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Transported        2
GroupSize          8
Deck               8
Side               2
dtype: int64

In [25]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df)

y = df.pop('Transported')
X = df

ohe = OneHotEncoder()
scaler = StandardScaler()
hgb = HistGradientBoostingClassifier()


ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct, hgb
)

param_distributions = {
    'histgradientboostingclassifier__learning_rate': loguniform(0.01,0.1),
    'histgradientboostingclassifier__l2_regularization': loguniform(0.01,10)
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions, 
    n_jobs=-1,
    n_iter=20,
    verbose=True,
    random_state=42
)

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=0)

search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
print(classification_report(y_pred,y_dev))
print(search.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
              precision    recall  f1-score   support

       False       0.79      0.81      0.80       841
        True       0.82      0.80      0.81       898

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739

{'histgradientboostingclassifier__l2_regularization': 0.011527987128232402, 'histgradientboostingclassifier__learning_rate': 0.09330606024425668}


In [18]:
hgb.get_params()

{'categorical_features': None,
 'class_weight': None,
 'early_stopping': 'auto',
 'interaction_cst': None,
 'l2_regularization': 0.0,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_bins': 255,
 'max_depth': None,
 'max_iter': 100,
 'max_leaf_nodes': 31,
 'min_samples_leaf': 20,
 'monotonic_cst': None,
 'n_iter_no_change': 10,
 'random_state': None,
 'scoring': 'loss',
 'tol': 1e-07,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [48]:
df_te = pd.read_csv(os.path.join(PATH,'test.csv'))
df_parsed = parse(df_te)
y_pred = search.predict(df_parsed)
df_te['Transported'] = y_pred
df_sub = df_te[['PassengerId','Transported']]
df_sub.to_csv(os.path.join(PATH,'0.1-xy-submission.csv'), index=False)


In [49]:
df_sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [52]:
!kaggle competitions submit -c spaceship-titanic -f "C:\Users\bxxy002\OneDrive - Brunel University London\AI projects\LearnFromTabularData\data\raw\0.1-xy-submission.csv" -m "HistGB"

Successfully submitted to Spaceship Titanic



  0%|          | 0.00/60.5k [00:00<?, ?B/s]
 26%|██▋       | 16.0k/60.5k [00:00<00:00, 58.1kB/s]
100%|██████████| 60.5k/60.5k [00:00<00:00, 77.1kB/s]


In [54]:
# !kaggle competitions submissions -c spaceship-titanic