In [14]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform
from pprint import pprint
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

## find performance upper bound
 
Making the assumption that the existing features has all the predictive power, we would like to examine whether dropping nan would improve model performance. In a way this tells us the performance upper bound of a model & whether getting the nan right is the key to success of prediction. 

In [4]:
PATH = '../data/raw'
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False


In [5]:
def parse(df):
    return (
        df.assign(
            GroupSize = df['PassengerId'].str.split('_', expand=True)[1].astype(int),
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

# parse(df).dropna()

## try several models aftering dropna. 

In [58]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df).dropna()

y = df.pop('Transported')
X = df

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder()
scaler = StandardScaler()


lr = LogisticRegression()
rf = RandomForestClassifier()
svc = SVC()
hgb = HistGradientBoostingClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()


ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

models = [lr, rf, svc, hgb,xgb,lgb]

for model in models:
    pipe = make_pipeline(
        ct, model
    )
    pipe.fit(X_tr,y_tr)
    y_pred = pipe.predict(X_dev)
    print(model.__class__.__name__)
    # print(classification_report(y_pred,y_dev))
    report = classification_report(y_pred,y_dev, output_dict=True) 
    display(pd.DataFrame(report).T)

LogisticRegression


Unnamed: 0,precision,recall,f1-score,support
False,0.768421,0.803459,0.78555,636.0
True,0.818314,0.785216,0.801423,717.0
accuracy,0.793792,0.793792,0.793792,0.793792
macro avg,0.793368,0.794338,0.793487,1353.0
weighted avg,0.794861,0.793792,0.793962,1353.0


RandomForestClassifier


Unnamed: 0,precision,recall,f1-score,support
False,0.825564,0.767832,0.795652,715.0
True,0.758721,0.818182,0.78733,638.0
accuracy,0.791574,0.791574,0.791574,0.791574
macro avg,0.792142,0.793007,0.791491,1353.0
weighted avg,0.794044,0.791574,0.791728,1353.0


SVC


Unnamed: 0,precision,recall,f1-score,support
False,0.778947,0.815748,0.796923,635.0
True,0.829942,0.795265,0.812233,718.0
accuracy,0.804878,0.804878,0.804878,0.804878
macro avg,0.804445,0.805506,0.804578,1353.0
weighted avg,0.806009,0.804878,0.805048,1353.0


HistGradientBoostingClassifier


Unnamed: 0,precision,recall,f1-score,support
False,0.78797,0.803681,0.795748,652.0
True,0.813953,0.798859,0.806335,701.0
accuracy,0.801183,0.801183,0.801183,0.801183
macro avg,0.800962,0.80127,0.801042,1353.0
weighted avg,0.801432,0.801183,0.801233,1353.0


XGBClassifier


Unnamed: 0,precision,recall,f1-score,support
0,0.786466,0.798473,0.792424,655.0
1,0.80814,0.796562,0.802309,698.0
accuracy,0.797487,0.797487,0.797487,0.797487
macro avg,0.797303,0.797517,0.797367,1353.0
weighted avg,0.797647,0.797487,0.797524,1353.0


[LightGBM] [Info] Number of positive: 2713, number of negative: 2698
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1412
[LightGBM] [Info] Number of data points in the train set: 5411, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501386 -> initscore=0.005544
[LightGBM] [Info] Start training from score 0.005544
LGBMClassifier


Unnamed: 0,precision,recall,f1-score,support
False,0.793985,0.804878,0.799394,656.0
True,0.813953,0.803443,0.808664,697.0
accuracy,0.804139,0.804139,0.804139,0.804139
macro avg,0.803969,0.804161,0.804029,1353.0
weighted avg,0.804272,0.804139,0.80417,1353.0


In [41]:
pd.DataFrame(hgb.get_params(), index = ['values']).T
# pd.DataFrame(xgb.get_params(), index = ['values']).T
# pd.DataFrame(lgb.get_params(), index = ['values']).T

Unnamed: 0,values
categorical_features,
class_weight,
early_stopping,auto
interaction_cst,
l2_regularization,0.0
learning_rate,0.1
loss,log_loss
max_bins,255
max_depth,
max_iter,100


## Tuning more parameters of HistGB (with nan included)

In [61]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df)

y = df.pop('Transported')
X = df

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder()
scaler = StandardScaler()
hgb = HistGradientBoostingClassifier()

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

model = hgb

pipe = make_pipeline(
    ct, model
)

params = { 
    model.__class__.__name__.lower()+ '__' +'max_bins' : [255, 128, 64],
    model.__class__.__name__.lower()+ '__' +'max_depth': [None, 10, 15, 5],
    model.__class__.__name__.lower()+ '__' +'max_leaf_nodes': [31,21,11],
    model.__class__.__name__.lower()+ '__' +'min_samples_leaf': [20,30,10,5],
    model.__class__.__name__.lower()+ '__' +'learning_rate': loguniform(0.01,1),
    model.__class__.__name__.lower()+ '__' +'l2_regularization': loguniform(0.01, 10)
}


search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=50,
    n_jobs=-1,
    scoring='accuracy',
    random_state=1123
)
search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
display(pd.DataFrame(classification_report(y_pred,y_dev, output_dict=True)).T)
pprint(search.best_params_)

Unnamed: 0,precision,recall,f1-score,support
False,0.774908,0.797468,0.786026,790.0
True,0.827214,0.807165,0.817067,949.0
accuracy,0.80276,0.80276,0.80276,0.80276
macro avg,0.801061,0.802317,0.801546,1739.0
weighted avg,0.803452,0.80276,0.802965,1739.0


{'histgradientboostingclassifier__l2_regularization': 2.4018982155918236,
 'histgradientboostingclassifier__learning_rate': 0.08642180973258734,
 'histgradientboostingclassifier__max_bins': 255,
 'histgradientboostingclassifier__max_depth': 10,
 'histgradientboostingclassifier__max_leaf_nodes': 11,
 'histgradientboostingclassifier__min_samples_leaf': 30}


In [39]:
# df_te = pd.read_csv(os.path.join(PATH,'test.csv'))
# df_parsed = parse(df_te)
# y_pred = search.predict(df_parsed)
# df_te['Transported'] = y_pred
# df_sub = df_te[['PassengerId','Transported']]
# df_sub.to_csv(os.path.join(PATH,'0.2-xy-submission.csv'), index=False)


In [38]:
# !kaggle competitions submit -c spaceship-titanic -f ../data/raw/0.2-xy-submission.csv -m "HistGB-2-tuned-with-nan"
# !kaggle competitions submissions -c spaceship-titanic