In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.naive_bayes import GaussianNB
# from sklearn.svm import SVC
from sklearn.metrics import classification_report
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.experimental import enable_iterative_imputer
# from sklearn.impute import IterativeImputer, SimpleImputer
from scipy.stats import loguniform, uniform
from pprint import pprint
# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Tune LightGBM

- amongst all the models in notebook 0.2 (dropna!), LGB seems to behave the best in terms of predictiong false label, as well as the overall accuracy
- Just as HGB, it handles np.nan natively. 
- parameters details: https://lightgbm.readthedocs.io/en/latest/Parameters.html
- lightGBM paper: https://dl.acm.org/doi/pdf/10.5555/3294996.3295074
- more ref: https://lightgbm.readthedocs.io/en/latest/Features.html#references

More theoretical stuff
- Gradient Boosting paper: https://projecteuclid.org/journals/annals-of-statistics/volume-29/issue-5/Greedy-function-approximation-A-gradient-boosting-machine/10.1214/aos/1013203451.full

In [2]:
model = LGBMClassifier()
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [3]:
PATH = '../data/raw'

def parse(df):
    GroupId = df['PassengerId'].str.split('_', expand=True)[0]
    counts = GroupId.value_counts()
    return (
        df.assign(
            GroupSize = GroupId.map(counts),
            Deck = df['Cabin'].str.split('/',expand=True)[0],
            Side = df['Cabin'].str.split('/',expand=True)[2]
            )
        .drop(['Name','PassengerId','Cabin'],axis=1)
        )

In [4]:
# baseline performance with the correct GroupSize feature, no search

df = pd.read_csv(os.path.join(PATH,'train.csv'))
m = df.shape[0]
df_te = pd.read_csv(os.path.join(PATH,'test.csv'))

y = df.pop('Transported')  # pop first otherwise the label column will be casted as object because df_te has no label column.
X = parse(pd.concat([df,df_te], axis=0))[:m]

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder(drop='if_binary')
scaler = StandardScaler()
model = LGBMClassifier()

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct, model
)

pipe.fit(X_tr,y_tr)
y_pred = pipe.predict(X_dev)
display(pd.DataFrame(classification_report(y_pred,y_dev, output_dict=True)).T)
importance_base = pipe.named_steps['lgbmclassifier'].feature_importances_


[LightGBM] [Info] Number of positive: 3452, number of negative: 3502
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1429
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496405 -> initscore=-0.014380
[LightGBM] [Info] Start training from score -0.014380


Unnamed: 0,precision,recall,f1-score,support
False,0.785978,0.776428,0.781174,823.0
True,0.801296,0.810044,0.805646,916.0
accuracy,0.794135,0.794135,0.794135,0.794135
macro avg,0.793637,0.793236,0.79341,1739.0
weighted avg,0.794046,0.794135,0.794064,1739.0


In [5]:
# random search

df = pd.read_csv(os.path.join(PATH,'train.csv'))
m = df.shape[0]
df_te = pd.read_csv(os.path.join(PATH,'test.csv'))

y = df.pop('Transported')  # pop first otherwise the label column will be casted as object because df_te has no label column.
X = parse(pd.concat([df,df_te], axis=0))[:m]

X_tr, X_dev, y_tr, y_dev = train_test_split(X,y,test_size=0.2, random_state=1123)

ohe = OneHotEncoder(drop='if_binary')
scaler = StandardScaler()
model = LGBMClassifier(n_jobs=-1)

ct = make_column_transformer(
    (ohe, ['HomePlanet','CryoSleep', 'Destination', 'VIP', 'GroupSize', 'Deck','Side']),
    (scaler, make_column_selector(dtype_include=np.number)),
    remainder='drop',
)

pipe = make_pipeline(
    ct, model
)


params = {
    model.__class__.__name__.lower()+ '__' +'boosting_type': ['gbdt', 'dart', 'goss'],
    model.__class__.__name__.lower()+ '__' +'num_leaves': [20, 30, 40],
    model.__class__.__name__.lower()+ '__' + 'learning_rate': loguniform(0.01,1),
    model.__class__.__name__.lower()+ '__' +'n_estimators': [50, 100, 150],
    model.__class__.__name__.lower()+ '__' +'max_depth': [-1, 10, 20],
    model.__class__.__name__.lower()+ '__' +'min_child_samples': [20, 30, 40]
}

search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=50,
    n_jobs=-1,
    scoring='accuracy',
    random_state=1123
)
search.fit(X_tr,y_tr)
y_pred = search.predict(X_dev)
display(pd.DataFrame(classification_report(y_pred,y_dev, output_dict=True)).T)
pprint(search.best_params_)

[LightGBM] [Info] Number of positive: 3452, number of negative: 3502
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1429
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 40
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.496405 -> initscore=-0.014380
[LightGBM] [Info] Start training from score -0.014380


Unnamed: 0,precision,recall,f1-score,support
False,0.792128,0.789216,0.790669,816.0
True,0.814255,0.816901,0.815576,923.0
accuracy,0.80391,0.80391,0.80391,0.80391
macro avg,0.803191,0.803059,0.803123,1739.0
weighted avg,0.803872,0.80391,0.803889,1739.0


{'lgbmclassifier__boosting_type': 'goss',
 'lgbmclassifier__learning_rate': 0.033528041482088565,
 'lgbmclassifier__max_depth': 10,
 'lgbmclassifier__min_child_samples': 20,
 'lgbmclassifier__n_estimators': 150,
 'lgbmclassifier__num_leaves': 20}


In [6]:
importance_search = search.best_estimator_.named_steps['lgbmclassifier'].feature_importances_
print(np.intersect1d(importance_base,importance_search))
# watch out: more features have been created by the model and the feature importances refer to feature index of the model 
best_transform = search.best_estimator_.named_steps['columntransformer']
print(best_transform.transform(X_tr).shape)

[ 0  1  4  5  7 12 13 14 28 48 51]
(6954, 41)
