In [18]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform
import pprint
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

# Tune HGB 

In [13]:
model = HistGradientBoostingClassifier()
pd.DataFrame(model.get_params(), index = ['values']).T

Unnamed: 0,values
categorical_features,
class_weight,
early_stopping,auto
interaction_cst,
l2_regularization,0.0
learning_rate,0.1
loss,log_loss
max_bins,255
max_depth,
max_iter,100


In [14]:
PATH = '../data/raw'

def parse(df):
    return (
        df.assign(
            GroupSize = df['PassengerId'].str.split('_', expand=True)[1].astype(int),
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

In [17]:
df = pd.read_csv(os.path.join(PATH,'train.csv'))
df = parse(df)

y = df.pop('Transported')
X = df

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder()
scaler = StandardScaler()
model = HistGradientBoostingClassifier()

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct, model
)

params = { 
    model.__class__.__name__.lower()+ '__' +'max_bins' : [255, 128, 64],
    model.__class__.__name__.lower()+ '__' +'max_depth': [None, 10, 15, 5],
    model.__class__.__name__.lower()+ '__' +'max_leaf_nodes': [31,21,11],
    model.__class__.__name__.lower()+ '__' +'min_samples_leaf': [20,30,10,5],
    model.__class__.__name__.lower()+ '__' +'learning_rate': loguniform(0.01,1),
    model.__class__.__name__.lower()+ '__' +'l2_regularization': loguniform(0.01, 10)
}


search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=50,
    n_jobs=-1,
    scoring='accuracy',
    random_state=1123
)
search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
display(pd.DataFrame(classification_report(y_pred,y_dev, output_dict=True)).T)
pprint(search.best_params_)

Unnamed: 0,precision,recall,f1-score,support
False,0.774908,0.797468,0.786026,790.0
True,0.827214,0.807165,0.817067,949.0
accuracy,0.80276,0.80276,0.80276,0.80276
macro avg,0.801061,0.802317,0.801546,1739.0
weighted avg,0.803452,0.80276,0.802965,1739.0


{'histgradientboostingclassifier__l2_regularization': 2.4018982155918236,
 'histgradientboostingclassifier__learning_rate': 0.08642180973258734,
 'histgradientboostingclassifier__max_bins': 255,
 'histgradientboostingclassifier__max_depth': 10,
 'histgradientboostingclassifier__max_leaf_nodes': 11,
 'histgradientboostingclassifier__min_samples_leaf': 30}
