In [61]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from IPython import display
import numpy as np
import pandas as pd
import seaborn as sns

In [215]:
train = pd.read_csv('../data/spaceship_titanic_train.csv')
test = pd.read_csv('../data/spaceship_titanic_test.csv')
def prepare_data(data, do_1hot=True, X_only = False):
    group_count = lambda id_: id_[:4]
    data['travel_group'] = data['PassengerId'].apply(lambda id_: id_[:4])
#     data['travel_group_n'] = data['travel_group'].apply(lambda group: data.loc[data.travel_group == group].shape[0])
#     data['travelling_with_n_kids'] = (-1)*(train['Age'] < 18).astype(int)+ data['travel_group'].apply(lambda group: data.loc[(data.travel_group == group) & (data.Age < 18)].shape[0])
    data['cabin_side'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[-1])
    data['cabin_deck'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[0])
    data['cabin_num'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else int(str(cabin)[2:-2]))
    
    
    drop_ = ['PassengerId','Name', 'Cabin', 'travel_group']
    if not X_only:
        cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'cabin_side', 'cabin_deck']
    else:
        cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_side', 'cabin_deck']
    num = ['FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService', 'ShoppingMall', 'cabin_num']
    one_hot = OneHotEncoder(sparse_output=False, drop='if_binary')
   
    if do_1hot:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        one_hot)
    else:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OrdinalEncoder())
    
    transformer = ColumnTransformer([
        ('cat', pipe, cat),
        ('num', SimpleImputer(strategy='mean'), num),
        ('drop_', 'drop', drop_)
    ], remainder='passthrough',
       verbose_feature_names_out=False)

    
    d = transformer.fit_transform(data)
    if do_1hot:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out(), dtype=float)
    else:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out())
    return df

In [216]:
train_prep = prepare_data(train, True)

In [217]:
train_X, train_y = train_prep.drop('Transported_True', axis=1), train_prep.Transported_True

In [218]:
fixed_skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)

In [219]:
import lightgbm as lgb
lgb_params = {
    #('train: 0.8529852970335823', 'valid: 0.8153695330371873')
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    #eta 0.01 ('train: 0.8085527468480869', 'valid: 0.7939731642128626')
    #eta 0.03 ('train: 0.8322500349942634', 'valid: 0.8069714549649911')
    
     #regularization
    "max_depth":5,
    "colsample_bytree": 0.9,
    "subsample": 0.9,
    #('train: 0.8471183562848146', 'valid: 0.8108826746585971')
    "subsample_freq": 1,
#     "min_data_in_leaf": 60,
    #('train: 0.8432359386639986', 'valid: 0.8141035779064327')
#     "num_leaves":20,
#     #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y, free_raw_data=False)
lgb_result = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result["cvbooster"].best_iteration



Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train binary_error: 0.196566 + 0.00203069	cv_agg's valid binary_error: 0.216036 + 0.00139315
[20]	cv_agg's train binary_error: 0.181468 + 0.001711	cv_agg's valid binary_error: 0.201081 + 0.00634233
[30]	cv_agg's train binary_error: 0.173876 + 0.00268055	cv_agg's valid binary_error: 0.19717 + 0.00477764
[40]	cv_agg's train binary_error: 0.169361 + 0.00303455	cv_agg's valid binary_error: 0.194409 + 0.00435873
[50]	cv_agg's train binary_error: 0.163695 + 0.00331076	cv_agg's valid binary_error: 0.191763 + 0.00312278
[60]	cv_agg's train binary_error: 0.161193 + 0.00228092	cv_agg's valid binary_error: 0.190037 + 0.00415925
[70]	cv_agg's train binary_error: 0.155125 + 0.00248465	cv_agg's valid binary_error: 0.189807 + 0.00382374
[80]	cv_agg's train binary_error: 0.151933 + 0.0021022	cv_agg's valid binary_error: 0.188197 + 0.00504529
[90]	cv_agg's train binary_error: 0.147446 + 0.00209266	cv_agg's valid binary_error: 0.

In [220]:
lgb_result_df = pd.DataFrame()
lgb_result_df['train binary_error-mean'] = lgb_result['train binary_error-mean']
lgb_result_df['valid binary_error-mean'] = lgb_result['valid binary_error-mean']
1-lgb_result_df.iloc[-1]

train binary_error-mean    0.848528
valid binary_error-mean    0.811918
Name: 80, dtype: float64

In [221]:
xgb_params = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",
    #train-error-mean    0.861325
    #test-error-mean     0.810883
    
    #eta = 0.01 test 0.78, train 80.5
    
    # regularization parameters
    "max_depth": 5,
    "max_leaves": 0,
    "min_child_weight":1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "n_estimators": 10_000
    
    #train-error-mean    0.848240
    #test-error-mean     0.811689
    
#    "tree_method": "hist",    
#    "grow_policy": "lossguide"
#     train-error-mean    0.827677
#     test-error-mean     0.807776
}
xgb_train = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns)
xgb_result = xgb.cv(xgb_params, xgb_train, folds =fixed_skf, num_boost_round=10_000, early_stopping_rounds=10, verbose_eval=10)
xgb_params['n_estimators'] = list(xgb_result['test-error-mean']).index(min(list(xgb_result['test-error-mean'])))
1-xgb_result.iloc[-1, [0, 2]]

[0]	train-error:0.22719+0.00428	test-error:0.23939+0.00538
[10]	train-error:0.19056+0.00248	test-error:0.21029+0.00426
[20]	train-error:0.17940+0.00176	test-error:0.20143+0.00329
[30]	train-error:0.17313+0.00177	test-error:0.19901+0.00194
[40]	train-error:0.16706+0.00193	test-error:0.19475+0.00297
[50]	train-error:0.16096+0.00194	test-error:0.19372+0.00446
[60]	train-error:0.15674+0.00204	test-error:0.19176+0.00504
[70]	train-error:0.15280+0.00276	test-error:0.19096+0.00487
[80]	train-error:0.14972+0.00238	test-error:0.18854+0.00492
[90]	train-error:0.14552+0.00261	test-error:0.18831+0.00456
[95]	train-error:0.14256+0.00236	test-error:0.19096+0.00603


train-error-mean    0.852496
test-error-mean     0.812148
Name: 86, dtype: float64

In [222]:
import catboost as ctb
ctb_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10,
    "logging_level":"Silent",
    
#     train-Accuracy-mean      0.817439
#     test-Accuracy-mean       0.806167
    
    #regularization
    "depth":4,
#     train-Accuracy-mean      0.814132
#     test-Accuracy-mean       0.808697
    "subsample":0.8,
    "rsm":0.7,
    "min_data_in_leaf":50,
    
    #tree
    "grow_policy":"Depthwise"
    
#     train-Accuracy-mean      0.841050
#     test-Accuracy-mean       0.810077
}
ctb_train = ctb.Pool(train_X, train_y)
ctb_result = ctb.cv(ctb_train, ctb_params, folds=fixed_skf, seed=42, verbose_eval=100,plot=False)
ctb_params['iterations'] = ctb_result.iloc[-1,0]
ctb_result.iloc[-1, [0,3, 1]]

iterations             153.000000
train-Accuracy-mean      0.827505
test-Accuracy-mean       0.804785
Name: 153, dtype: float64

In [223]:
test_prep = prepare_data(test, do_1hot=True, X_only=True)

Kaggle score on LightGBM after specifying optimal n_iter: 0.80476

Kaggle XGBoost Score: 0.80804

CatBoost: 0.8036

## 1. Mean

In [224]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
lgb_clf = lgb.LGBMClassifier(**lgb_params)
ctb_clf = ctb.CatBoostClassifier(**ctb_params)
xgb_cv_results = []
lgb_cv_results = []
ctb_cv_results = []
valid_y_folds = []
for train, valid in fixed_skf.split(train_X, train_y):
    train_X_fold = train_X.iloc[train]
    train_y_fold = train_y.iloc[train]
    valid_X_fold = train_X.iloc[valid]
    valid_y_fold = train_y.iloc[valid]
    xgb_clf.fit(train_X_fold, train_y_fold)
    lgb_clf.fit(train_X_fold, train_y_fold)
    ctb_clf.fit(train_X_fold, train_y_fold)
    xgb_cv_results += list(xgb_clf.predict_proba(valid_X_fold)[:, 1])
    lgb_cv_results += list(lgb_clf.predict_proba(valid_X_fold)[:, 1])
    ctb_cv_results += list(ctb_clf.predict_proba(valid_X_fold)[:, 1])
    valid_y_folds += valid_y_fold.tolist()
xgb_cv_results = np.array(xgb_cv_results)
lgb_cv_results = np.array(lgb_cv_results)
ctb_cv_results = np.array(ctb_cv_results)
valid_y_folds = np.array(valid_y_folds)



In [225]:
mean = (xgb_cv_results*(1/3) + lgb_cv_results*(1/3) + ctb_cv_results*(1/3))
'accuracy on cv: ' + str(accuracy_score(valid_y_folds, np.vectorize(lambda x: 1 if x > 0.5 else 0)(mean)))

'accuracy on cv: 0.8116875647072357'

In [226]:
train_X

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,cabin_side_S,cabin_deck_A,...,cabin_deck_F,cabin_deck_G,cabin_deck_T,FoodCourt,VRDeck,Spa,Age,RoomService,ShoppingMall,cabin_num
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,39.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,9.0,44.0,549.0,24.0,109.0,25.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,3576.0,49.0,6715.0,58.0,43.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1283.0,193.0,3329.0,33.0,0.0,371.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,70.0,2.0,565.0,16.0,303.0,151.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,6819.0,74.0,1643.0,41.0,0.0,0.0,98.0
8689,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,1499.0
8690,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,26.0,0.0,1872.0,1500.0
8691,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1049.0,3235.0,353.0,32.0,0.0,0.0,608.0


In [227]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(train_X, train_y)
xgb_preds = xgb_clf.predict_proba(test_prep)[:, 1]

In [228]:
lgb_clf = lgb.LGBMClassifier(**lgb_params)
lgb_clf.fit(train_X, train_y)
lgb_preds = lgb_clf.predict_proba(test_prep)
lgb_preds = lgb_preds[:, 1]



In [229]:
ctb_clf = ctb.CatBoostClassifier(**ctb_params)
ctb_clf.fit(train_X, train_y)
ctb_preds = ctb_clf.predict_proba(test_prep)
ctb_preds = ctb_preds[:, 1]

In [230]:
preds = xgb_preds*(1/3) + ctb_preds*(1/3) + lgb_preds*(1/3)

In [14]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.80851

In [127]:
weighted_mean = (xgb_cv_results*(1/2) + lgb_cv_results*(1/8) + ctb_cv_results*(3/8))
'accuracy on cv: ' + str(accuracy_score(valid_y_folds, np.vectorize(lambda x: 1 if x > 0.5 else 0)(weighted_mean)))

'accuracy on cv: 0.8107672840216266'

In [32]:
preds = xgb_preds*(1/2) + ctb_preds*(1/8) + lgb_preds*(3/8)

In [16]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.81061

## 2. Median

In [231]:
d = np.stack([xgb_cv_results, lgb_cv_results, ctb_cv_results])
'accuracy on cv: ' + str(accuracy_score(valid_y_folds, np.vectorize(lambda x: 1 if x > 0.5 else 0)(np.median(d, axis = 0))))

'accuracy on cv: 0.8116875647072357'

In [37]:
preds = np.median(np.c_[xgb_preds,ctb_preds,lgb_preds], axis=1)

In [18]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.81108

## 3. Stacking

1. Adding XGB, LGB and CTB preds to the Extra Trees

In [232]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [233]:
xgb_preds_train = cross_val_predict(xgb_clf, train_X, train_y, cv=fixed_skf, method='predict_proba')[:, 1]
lgb_preds_train = cross_val_predict(lgb_clf, train_X, train_y, cv=fixed_skf, method='predict_proba')[:, 1]
ctb_preds_train = cross_val_predict(ctb_clf, train_X, train_y, cv=fixed_skf, method='predict_proba')[:, 1]



In [234]:
train_X['ctb_preds'] = ctb_preds_train
train_X['lgb_preds'] = lgb_preds_train
train_X['xgb_preds'] = xgb_preds_train

In [235]:
xgb_preds = xgb_clf.predict_proba(test_prep)[:, 1]
lgb_preds = lgb_clf.predict_proba(test_prep)[:, 1]
ctb_preds = ctb_clf.predict_proba(test_prep)[:, 1]
test_prep['xgb_preds'] = xgb_preds
test_prep['lgb_preds'] = lgb_preds
test_prep['ctb_preds'] = ctb_preds

In [180]:
params = {'max_features': [6, 7, 8, 9, 10], 'min_samples_leaf': [ 6, 7, 8, 9, 10, 15], 'max_depth': [18, 19, 20, 21]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=fixed_skf, verbose=3, scoring='accuracy')
grid.fit(train_X, train_y)

In [168]:
grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep[train_X.columns]).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('kaggle.csv')

LGB, kaggle score: 0.80056
<br>cv score: 0.8134133276336346

XGB, kaggle score: 0.80032
<br>cv score: 0.8155981606560655

XGB+LGB, kaggle score: 0.80032, <br> cv score: 0.8154843431439176

XGB+LGB+CTB, Kaggle score: 0.80173, <br>       cv score: 0.8155992194236201

2. Adding overfitted XGB, LGB, CTB:

In [236]:
cols = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'CryoSleep_True', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_True',
       'cabin_side_S', 'cabin_deck_A', 'cabin_deck_B', 'cabin_deck_C',
       'cabin_deck_D', 'cabin_deck_E', 'cabin_deck_F', 'cabin_deck_G',
       'cabin_deck_T', 'FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService',
       'ShoppingMall', 'cabin_num']

In [237]:
import lightgbm as lgb
lgb_params = {
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    #eta 0.01 ('train: 0.8085527468480869', 'valid: 0.7939731642128626')
    #eta 0.03 ('train: 0.8322500349942634', 'valid: 0.8069714549649911')
    
     #regularization
    "max_depth":25,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    #('train: 0.8471183562848146', 'valid: 0.8108826746585971')
    "subsample_freq": 1,
    "min_data_in_leaf": 1,
    #('train: 0.8432359386639986', 'valid: 0.8141035779064327')
    "num_leaves":1000,
#     #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X[cols], label=train_y, free_raw_data=False)
lgb_result_overfitted = lgb.cv(lgb_params, lgb_train, 10_000, folds=fixed_skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = lgb_result_overfitted["cvbooster"].best_iteration



Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train binary_error: 0.00359486 + 0.00028779	cv_agg's valid binary_error: 0.212011 + 0.00733137
[20]	cv_agg's train binary_error: 0.00181183 + 0.000469142	cv_agg's valid binary_error: 0.205568 + 0.00486779
[30]	cv_agg's train binary_error: 0.00129416 + 0.000301685	cv_agg's valid binary_error: 0.203268 + 0.00322293
[40]	cv_agg's train binary_error: 0.00129416 + 0.000301685	cv_agg's valid binary_error: 0.199586 + 0.00406748
[50]	cv_agg's train binary_error: 0.00129416 + 0.000301685	cv_agg's valid binary_error: 0.198665 + 0.0044827
[60]	cv_agg's train binary_error: 0.00129416 + 0.000301685	cv_agg's valid binary_error: 0.198896 + 0.00580805
Early stopping, best iteration is:
[51]	cv_agg's train binary_error: 0.00129416 + 0.000301685	cv_agg's valid binary_error: 0.19694 + 0.00522694


In [238]:
lgb_clf = lgb.LGBMClassifier(**lgb_params)
lgb_preds_train = cross_val_predict(lgb_clf, train_X[cols], train_y, cv=fixed_skf, method='predict_proba')[:, 1]
lgb_clf.fit(train_X[cols], train_y)
lgb_preds_test = lgb_clf.predict_proba(test_prep[cols])[:, 1]



In [239]:
train_X['lgb_overfit'] = lgb_preds_train
test_prep['lgb_overfit'] = lgb_preds_test

In [245]:
xgb_params = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",
    #train-error-mean    0.861325
    #test-error-mean     0.810883
    
    #eta = 0.01 test 0.78, train 80.5
    
    # regularization parameters
    "max_depth": 100,
    "max_leaves": 0,
    "min_child_weight":1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "n_estimators": 10_000
    
    #train-error-mean    0.848240
    #test-error-mean     0.811689
    
#    "tree_method": "hist",    
#    "grow_policy": "lossguide"
#     train-error-mean    0.827677
#     test-error-mean     0.807776
}
xgb_train = xgb.DMatrix(train_X[cols], train_y, feature_names=cols)
result = xgb.cv(xgb_params, xgb_train, folds=fixed_skf, num_boost_round=10_000, early_stopping_rounds=10, verbose_eval=10)
xgb_params['n_estimators'] = list(result['test-error-mean']).index(min(list(result['test-error-mean'])))
1-result.iloc[-1, [0, 2]]

[0]	train-error:0.12093+0.00152	test-error:0.23340+0.00955
[10]	train-error:0.06215+0.00336	test-error:0.19590+0.00284
[20]	train-error:0.03857+0.00248	test-error:0.19475+0.00155
[30]	train-error:0.02344+0.00141	test-error:0.19464+0.00390
[36]	train-error:0.01720+0.00132	test-error:0.19395+0.00495


train-error-mean    0.972765
test-error-mean     0.807777
Name: 27, dtype: float64

In [246]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_overfit_train = cross_val_predict(xgb_clf, train_X[cols], train_y, cv=fixed_skf, method='predict_proba')[:, 1]
xgb_clf.fit(train_X[cols], train_y)
xgb_overfit_test = xgb_clf.predict_proba(test_prep[cols])[:,1]

In [247]:
train_X['xgb_overfit'] = xgb_overfit_train
test_prep['xgb_overfit'] = xgb_overfit_test

In [248]:
import catboost as ctb
ctb_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10,
    "logging_level":"Silent",
    
#     train-Accuracy-mean      0.817439
#     test-Accuracy-mean       0.806167
    
    #regularization
    "depth":16,
#     train-Accuracy-mean      0.814132
#     test-Accuracy-mean       0.808697
    "subsample":0.8,
    "rsm":0.7,
    "min_data_in_leaf":1,
    
    #tree
    "grow_policy":"Depthwise"
    
#     train-Accuracy-mean      0.841050
#     test-Accuracy-mean       0.810077
}
ctb_train = ctb.Pool(train_X[cols], train_y)
result = ctb.cv(ctb_train, ctb_params, folds=fixed_skf, seed=42, verbose_eval=100,plot=False)
# result.iloc[-1, [0,3, 1]]
ctb_params['iterations'] = result.iloc[-1,0]
result.iloc[-1,0]

118

In [249]:
result.iloc[-1, [1, 3]]

test-Accuracy-mean     0.806511
train-Accuracy-mean    0.979035
Name: 118, dtype: float64

In [250]:
ctb_clf = ctb.CatBoostClassifier(**ctb_params)
ctb_preds_train = cross_val_predict(ctb_clf, train_X[cols], train_y, cv=fixed_skf, method='predict_proba')[:, 1]

ctb_clf.fit(train_X[cols], train_y)
ctb_preds_test = ctb_clf.predict_proba(test_prep[cols])[:, 1]

train_X['ctb_overfit'] = ctb_preds_train
test_prep['ctb_overfit'] = ctb_preds_test

In [253]:
logistic = LogisticRegression(penalty='l2',
                                C=0.01,
                              class_weight='balanced',
                              random_state=42,
                              max_iter=1000
                                )
preds = cross_val_predict(logistic, train_X[cols], train_y, cv=fixed_skf, method='predict_proba')[:, 1]
train_X['logistic_reg'] = preds
logistic.fit(train_X[cols], train_y)
test_prep['logistic_reg'] = logistic.predict_proba(test_prep[cols])[:, 1]

In [254]:
'accuracy on cv: ' + str(accuracy_score(train_y, np.vectorize(lambda x: 1 if x > 0.5 else 0)(preds)))

'accuracy on cv: 0.796157828137582'

In [255]:
params = {'max_features': [ 7, 8, 9], 'min_samples_leaf': [5, 6, 7, 8], 'max_depth': [20, 21]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=fixed_skf, verbose=0, scoring='accuracy')
grid.fit(train_X[test_prep.columns], train_y)

In [256]:
grid.best_score_

0.8189344033944088

In [257]:
grid.best_params_

{'max_depth': 21, 'max_features': 9, 'min_samples_leaf': 6}

In [258]:
# grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.80079
<br> cv score: 0.8189344033944088

3. Adding underfitted logistic regressions:

In [264]:
logistic = LogisticRegression(C=0.00000000001)
logistic_severely_underfit_train = cross_val_predict(logistic, train_X[cols], train_y, method='predict_proba', cv=fixed_skf)[:, 1]
logistic.fit(train_X[cols], train_y)
logistic_severely_underfit_test = logistic.predict_proba(test_prep[cols])[:,1]
train_X['logistic_severely_underfit'] = logistic_severely_underfit_train
test_prep['logistic_severely_underfit'] = logistic_severely_underfit_test

In [265]:
accuracy_score(train_y, np.vectorize(lambda p: 1 if p>0.5 else 0)(logistic_severely_underfit_train))

0.5497526745657425

In [266]:
logistic = LogisticRegression(C=0.0000001)
logistic_severely_underfit_train = cross_val_predict(logistic, train_X[cols], train_y, cv=fixed_skf, method='predict_proba')[:, 1]
logistic.fit(train_X[cols], train_y)
logistic_severely_underfit_test = logistic.predict_proba(test_prep[cols])[:,1]
train_X['logistic_underfit'] = logistic_severely_underfit_train
test_prep['logistic_underfit'] = logistic_severely_underfit_test

In [267]:
accuracy_score(train_y, np.vectorize(lambda p: 1 if p>0.5 else 0)(logistic_severely_underfit_train))

0.7728057057402508

In [268]:
train_X.head(2)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,cabin_side_S,cabin_deck_A,...,cabin_num,ctb_preds,lgb_preds,xgb_preds,lgb_overfit,xgb_overfit,ctb_overfit,logistic_reg,logistic_severely_underfit,logistic_underfit
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.733904,0.754392,0.698305,0.743251,0.374629,0.395169,0.761902,0.5,0.722113
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.128062,0.098184,0.142037,0.105581,0.168077,0.143543,0.234017,0.498577,0.396027


In [277]:
params = {'max_features': [5, 6, 7], 'min_samples_leaf': [4, 5, 6, 7], 'max_depth': [18, 19, 20]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=fixed_skf, verbose=0, scoring='accuracy')
grid.fit(train_X[test_prep.columns], train_y)

In [278]:
grid.best_score_

0.8189346019133253

In [279]:
# grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.80009
<br>cv score: 0.8189346019133253