In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform

## a bit of feature engineering
- drop `Name`
- create `GroupSize` from `PassengerId`
- create `Deck` and `Side` from `Cabin`
- one-hot encode categorical columns
- scale numerical columns
- fit a `HistGradientBoostingClassifier` (HistGB), which accepts NaN natively

In [None]:
# Directory holding the raw CSV files, relative to the notebook's working directory.
PATH = '../data/raw'

# Read the training split and preview its first three rows.
train_csv = os.path.join(PATH, 'train.csv')
df = pd.read_csv(train_csv)
df.head(n=3)

In [None]:
def parse(df):
    """Derive model features from the raw passenger table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least the string columns ``PassengerId``
        (formatted ``gggg_pp``: group id, then the passenger's number within
        that group), ``Cabin`` (formatted ``deck/num/side``) and ``Name``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with ``Name``, ``PassengerId``
        and ``Cabin`` dropped and three columns added:

        * ``GroupSize`` - number of rows in ``df`` that share the passenger's
          group id (``int``).
        * ``Deck`` - first component of ``Cabin`` (NaN when ``Cabin`` is missing).
        * ``Side`` - third component of ``Cabin`` (NaN when ``Cabin`` is missing).
    """
    # The part before '_' identifies the travelling group. The part after it is
    # only the passenger's index inside the group, so it is NOT the group size;
    # the size is the number of passengers that share the same prefix.
    group_id = df['PassengerId'].str.split('_').str[0]
    group_size = group_id.map(group_id.value_counts()).astype(int)

    # Split the cabin string once and reuse the pieces for both features.
    cabin_parts = df['Cabin'].str.split('/')

    return (
        df.assign(
            GroupSize=group_size,
            Deck=cabin_parts.str[0],
            Side=cabin_parts.str[2],
        )
        .drop(['Name', 'PassengerId', 'Cabin'], axis=1)
    )

# Sanity check: number of distinct non-null values in each column after parsing.
parse(df).nunique()

In [None]:
# Single seed shared by the model, the hyper-parameter search and the train/dev split.
SEED = 1123

# Reload the raw training data so this cell can be re-run on its own:
# `parse` needs the original columns, and `pop` below mutates `df`.
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df)

# Separate the target from the features (`pop` removes the column from `df`).
y = df.pop('Transported')
X = df

# handle_unknown='ignore': a rare category can be absent from a CV training fold
# or appear only in the test set; the default ('error') would make transform raise.
ohe = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()
hgb = HistGradientBoostingClassifier(random_state=SEED)

# NOTE(review): GroupSize is numeric, so besides being one-hot encoded here it
# is also picked up by the dtype-based selector and scaled. Confirm that having
# it in both encodings is intended.
ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
    # Always emit a dense array: the gradient-boosting estimator does not take
    # sparse input, and the default threshold makes the output type depend on
    # how dense the encoded data happens to be.
    sparse_threshold=0,
)

pipe = make_pipeline(
    ct, hgb
)

# Log-uniform priors: both hyper-parameters are searched across orders of magnitude.
param_distributions = {
    'histgradientboostingclassifier__learning_rate': loguniform(0.01,0.1),
    'histgradientboostingclassifier__l2_regularization': loguniform(0.01,10)
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_distributions,
    n_jobs=-1,
    n_iter=20,
    verbose=True,
    random_state=SEED
)

# Hold out 20% of the training data as a dev set that the search never sees.
X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=SEED)

search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the printed table.
print(classification_report(y_dev, y_pred))
print(search.best_params_)

In [None]:
# Score the test split with the fitted search object and write the submission file.
test_csv = os.path.join(PATH, 'test.csv')
submission_csv = os.path.join(PATH, '0.1-xy-submission.csv')

df_te = pd.read_csv(test_csv)
df_parsed = parse(df_te)

# Predictions are attached to the raw test frame so they stay aligned with PassengerId.
y_pred = search.predict(df_parsed)
df_te['Transported'] = y_pred

# Keep only the id and prediction columns for the submission file.
df_sub = df_te.loc[:, ['PassengerId', 'Transported']]
df_sub.to_csv(submission_csv, index=False)


In [None]:
# !kaggle competitions submit -c spaceship-titanic -f ../data/raw/0.1-xy-submission.csv -m "HistGB-1"
# !kaggle competitions submissions -c spaceship-titanic