1. Use any binary classification dataset
2. Define validation strategy and use it for all next steps without changes
3. Train decision tree model and estimate performance on validation
4. Train bagging model with decision tree as a base model and estimate performance on validation
5. Write your own bagging implementation:
  <br>5.1. Define init for our CustomBaggingClassifier
  <br>5.2. Write fit as described in lecture: divide train data on n parts (`n_estimators` in CustomBaggingClassifier), train `base_estimator` on each part and save these models inside class
  <br>5.3. For predictions we should use all saved models and combine their predictions (as voting)
6. Compare performance of sklearn bagging model with your own implementation

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
train = pd.read_csv('../data/spaceship_titanic_train.csv')
test = pd.read_csv('../data/spaceship_titanic_test.csv')

In [3]:
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [125]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [126]:
def prepare_data(data, do_1hot=True):
    group_count = lambda id_: id_[:4]
    data['travel_group'] = data['PassengerId'].apply(lambda id_: id_[:4])
    data['travel_group_n'] = data['travel_group'].apply(lambda group: data.loc[data.travel_group == group].shape[0])
    data['travelling_with_n_kids'] = (-1)*(train['Age'] < 18).astype(int)+ data['travel_group'].apply(lambda group: data.loc[(data.travel_group == group) & (data.Age < 18)].shape[0])
    data['cabin_side'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[-1])
    data['cabin_deck'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[0])
    data['cabin_num'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else int(str(cabin)[2:-2]))
    
    
    drop_ = ['PassengerId','Name', 'Cabin', 'travel_group']
    cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'cabin_side', 'cabin_deck']
    num = ['FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService', 'ShoppingMall', 'cabin_num']
    one_hot = OneHotEncoder(sparse_output=False, drop='if_binary')
   
    if do_1hot:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        one_hot)
    else:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OrdinalEncoder())
    
    transformer = ColumnTransformer([
        ('cat', pipe, cat),
        ('num', SimpleImputer(strategy='mean'), num),
        ('drop_', 'drop', drop_)
    ], remainder='passthrough',
       verbose_feature_names_out=False)

    
    d = transformer.fit_transform(data)
    if do_1hot:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out(), dtype=float)
    else:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out())
    return df

In [6]:
train_prep = prepare_data(train, True)

In [7]:
train_X, train_y = train_prep.drop('Transported_True', axis=1), train_prep.Transported_True

In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from IPython import display

In [77]:
params = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",
    #train-error-mean    0.861325
    #test-error-mean     0.810883
    
    #eta = 0.01 test 0.78, train 80.5
    
    # regularization parameters
    "max_depth": 5,
    "max_leaves": 0,
    "min_child_weight":1,
    "subsample": 0.9,
    "colsample_bytree": 0.9
    
    #train-error-mean    0.848240
    #test-error-mean     0.811689
    
#    "tree_method": "hist",    
#    "grow_policy": "lossguide"
#     train-error-mean    0.827677
#     test-error-mean     0.807776
}
xgb_train = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)
result = xgb.cv(params, xgb_train, folds =skf, num_boost_round=10_000, early_stopping_rounds=10, verbose_eval=10)
1-result.iloc[-1, [0, 2]]

[0]	train-error:0.24140+0.00109	test-error:0.25020+0.00579
[10]	train-error:0.18981+0.00140	test-error:0.20821+0.00452
[20]	train-error:0.18115+0.00137	test-error:0.20246+0.00477
[30]	train-error:0.17169+0.00144	test-error:0.19591+0.00383
[40]	train-error:0.16620+0.00193	test-error:0.19360+0.00493
[50]	train-error:0.16125+0.00164	test-error:0.19096+0.00499
[60]	train-error:0.15383+0.00171	test-error:0.19038+0.00750
[70]	train-error:0.14960+0.00188	test-error:0.18946+0.00611
[74]	train-error:0.14851+0.00176	test-error:0.18877+0.00493


train-error-mean    0.848240
test-error-mean     0.811689
Name: 64, dtype: float64

In [133]:
train_prep_ = prepare_data(train, do_1hot = False)
train__X, train__y = train_prep_.drop('Transported', axis=1), train_prep_.Transported
lgb_train = lgb.Dataset(train__X, label=train__y, 
                        categorical_feature=['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_side', 'cabin_deck'],
                        free_raw_data=False)

In [206]:
import lightgbm as lgb

params = {
    #('train: 0.8529852970335823', 'valid: 0.8153695330371873')
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    #eta 0.01 ('train: 0.8085527468480869', 'valid: 0.7939731642128626')
    #eta 0.03 ('train: 0.8322500349942634', 'valid: 0.8069714549649911')
    
     #regularization
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    #('train: 0.8471183562848146', 'valid: 0.8108826746585971')
    "subsample_freq": 1,
    "min_data_in_leaf": 60,
    #('train: 0.8432359386639986', 'valid: 0.8141035779064327')
    "num_leaves":20,
    #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y, free_raw_data=False)
result = lgb.cv(params, lgb_train, 10_000, folds=skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True)
'train: '+str(1-result['train binary_error-mean'][-1]), 'valid: '+str(1-result['valid binary_error-mean'][-1])

Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train binary_error: 0.191821 + 0.00309633	cv_agg's valid binary_error: 0.205683 + 0.00371206
[20]	cv_agg's train binary_error: 0.183596 + 0.00118703	cv_agg's valid binary_error: 0.19924 + 0.00403496
[30]	cv_agg's train binary_error: 0.176809 + 0.000542577	cv_agg's valid binary_error: 0.193949 + 0.00246604
[40]	cv_agg's train binary_error: 0.16913 + 0.0024848	cv_agg's valid binary_error: 0.192914 + 0.00416128
[50]	cv_agg's train binary_error: 0.162027 + 0.00290216	cv_agg's valid binary_error: 0.186817 + 0.00433977
[60]	cv_agg's train binary_error: 0.15639 + 0.00269836	cv_agg's valid binary_error: 0.188427 + 0.00477547
Early stopping, best iteration is:
[53]	cv_agg's train binary_error: 0.160762 + 0.00295383	cv_agg's valid binary_error: 0.186241 + 0.00496277


('train: 0.8392384710701339', 'valid: 0.8137586182024641')

In [196]:
import catboost as ctb
params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10,
    "logging_level":"Silent",
    
#     train-Accuracy-mean      0.817439
#     test-Accuracy-mean       0.806167
    
    #regularization
    "depth":4,
#     train-Accuracy-mean      0.814132
#     test-Accuracy-mean       0.808697
    "subsample":0.8,
    "rsm":0.7,
    "min_data_in_leaf":50,
    
    #tree
    "grow_policy":"Depthwise"
    
#     train-Accuracy-mean      0.841050
#     test-Accuracy-mean       0.810077
}
ctb_train = ctb.Pool(train_X, train_y)
result = ctb.cv(ctb_train, params, folds=skf, seed=42, verbose_eval=100,plot=False)
result.iloc[-1, [0,3, 1]]

iterations             213.000000
train-Accuracy-mean      0.841050
test-Accuracy-mean       0.810077
Name: 213, dtype: float64

In [209]:
test_prep = prepare_data(test, do_1hot=True)

In [253]:
params = {
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    "subsample_freq": 1,
    "min_data_in_leaf": 60,
    "num_leaves":20
    #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
}

lgb_clf = lgb.LGBMClassifier(**params)
lgb_clf.fit(train_X, train_y)
preds = lgb_clf.predict(test_prep)



In [254]:
df = pd.DataFrame()
df['Transported'] = preds.astype(bool)
df.set_index(test.PassengerId, inplace=True)

In [241]:
df.to_csv('spaceship_preds_lightgb.csv')

Kaggle score: 0.80196