In [28]:
# !pip install kaggle
PATH = '../data/raw'
NAME = 'spaceship-titanic'
# download_command = f'kaggle competitions download -c {NAME} -p {PATH}'
# unzip_command = f'unzip {PATH}/{NAME}.zip -d {PATH}'
# !{download_command}
# !{unzip_command}

In [29]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform

In [30]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [32]:
df.isna().sum(axis=0)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

## quick benchmark 

- only use numerical cols, home, and destination
- histogram gb model (natively handle nan values)

In [33]:
df[['Transported']]

Unnamed: 0,Transported
0,False
1,True
2,False
3,False
4,True
...,...
8688,False
8689,False
8690,True
8691,False


In [34]:
y = df.pop('Transported')
X = df

ohe = OneHotEncoder()
scaler = StandardScaler()

ct = make_column_transformer(
    (ohe, ['Destination','HomePlanet']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct,HistGradientBoostingClassifier()
)

params = {
    'histgradientboostingclassifier__learning_rate': loguniform(0.001,0.1)
}

search = RandomizedSearchCV(pipe, param_distributions=params, n_jobs=-1)

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2)

search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
print(classification_report(y_pred,y_dev))


              precision    recall  f1-score   support

       False       0.71      0.83      0.77       729
        True       0.86      0.76      0.81      1010

    accuracy                           0.79      1739
   macro avg       0.79      0.80      0.79      1739
weighted avg       0.80      0.79      0.79      1739

