In [17]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
# from scipy.stats import loguniform, uniform

## find performance upper bound
 
Making the assumption that the existing features has all the predictive power, we would like to examine whether dropping nan would improve model performance. In a way this tells us the performance upper bound of a model & whether getting the nan right is the key to success of prediction. 

In [18]:
PATH = '../data/raw'
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [19]:
def parse(df):
    return (
        df.assign(
            GroupSize = df['PassengerId'].str.split('_', expand=True)[1].astype(int),
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

# parse(df).dropna()

## try several models aftering dropna. 

In [20]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df).dropna()

y = df.pop('Transported')
X = df

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder()
scaler = StandardScaler()

hgb = HistGradientBoostingClassifier()
lr = LogisticRegression()
rf = RandomForestClassifier()
svc = SVC()

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

models = [lr, rf, svc, hgb]

for model in models:
    pipe = make_pipeline(
        ct, model
    )
    pipe.fit(X_tr,y_tr)
    y_pred = pipe.predict(X_dev)
    print(model.__class__.__name__, classification_report(y_pred,y_dev))

LogisticRegression               precision    recall  f1-score   support

       False       0.77      0.80      0.79       636
        True       0.82      0.79      0.80       717

    accuracy                           0.79      1353
   macro avg       0.79      0.79      0.79      1353
weighted avg       0.79      0.79      0.79      1353

RandomForestClassifier               precision    recall  f1-score   support

       False       0.83      0.77      0.80       710
        True       0.77      0.82      0.79       643

    accuracy                           0.80      1353
   macro avg       0.80      0.80      0.80      1353
weighted avg       0.80      0.80      0.80      1353

SVC               precision    recall  f1-score   support

       False       0.78      0.82      0.80       635
        True       0.83      0.80      0.81       718

    accuracy                           0.80      1353
   macro avg       0.80      0.81      0.80      1353
weighted avg       0.81     

In [21]:
# df_te = pd.read_csv(os.path.join(PATH,'test.csv'))
# df_parsed = parse(df_te)
# y_pred = search.predict(df_parsed)
# df_te['Transported'] = y_pred
# df_sub = df_te[['PassengerId','Transported']]
# df_sub.to_csv(os.path.join(PATH,'0.2-xy-submission.csv'), index=False)


In [22]:
# !kaggle competitions submit -c spaceship-titanic -f ../data/raw/0.1-xy-submission.csv -m "HistGB-1"
# !kaggle competitions submissions -c spaceship-titanic