In [1]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from IPython import display
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
train = pd.read_csv('../data/spaceship_titanic_train.csv')
test = pd.read_csv('../data/spaceship_titanic_test.csv')

In [3]:
def prepare_data(data, do_1hot=True, X_only = False):
    group_count = lambda id_: id_[:4]
    data['travel_group'] = data['PassengerId'].apply(lambda id_: id_[:4])
    data['travel_group_n'] = data['travel_group'].apply(lambda group: data.loc[data.travel_group == group].shape[0])
    data['travelling_with_n_kids'] = (-1)*(train['Age'] < 18).astype(int)+ data['travel_group'].apply(lambda group: data.loc[(data.travel_group == group) & (data.Age < 18)].shape[0])
    data['cabin_side'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[-1])
    data['cabin_deck'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else str(cabin)[0])
    data['cabin_num'] = data.Cabin.apply(lambda cabin: np.nan if str(cabin) == 'nan' else int(str(cabin)[2:-2]))
    
    
    drop_ = ['PassengerId','Name', 'Cabin', 'travel_group']
    if not X_only:
        cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'cabin_side', 'cabin_deck']
    else:
        cat = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_side', 'cabin_deck']
    num = ['FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService', 'ShoppingMall', 'cabin_num']
    one_hot = OneHotEncoder(sparse_output=False, drop='if_binary')
   
    if do_1hot:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        one_hot)
    else:
        pipe = make_pipeline(
        SimpleImputer(strategy='most_frequent'),
        OrdinalEncoder())
    
    transformer = ColumnTransformer([
        ('cat', pipe, cat),
        ('num', SimpleImputer(strategy='mean'), num),
        ('drop_', 'drop', drop_)
    ], remainder='passthrough',
       verbose_feature_names_out=False)

    
    d = transformer.fit_transform(data)
    if do_1hot:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out(), dtype=float)
    else:
        df = pd.DataFrame(d, columns = transformer.get_feature_names_out())
    return df

In [83]:
train_prep = prepare_data(train, True)

In [84]:
train_X, train_y = train_prep.drop('Transported_True', axis=1), train_prep.Transported_True

In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)
import lightgbm as lgb
lgb_params = {
    #('train: 0.8529852970335823', 'valid: 0.8153695330371873')
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    #eta 0.01 ('train: 0.8085527468480869', 'valid: 0.7939731642128626')
    #eta 0.03 ('train: 0.8322500349942634', 'valid: 0.8069714549649911')
    
     #regularization
    "max_depth":5,
    "colsample_bytree": 0.9,
    "subsample": 0.9,
    #('train: 0.8471183562848146', 'valid: 0.8108826746585971')
    "subsample_freq": 1,
#     "min_data_in_leaf": 60,
    #('train: 0.8432359386639986', 'valid: 0.8141035779064327')
#     "num_leaves":20,
#     #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X, label=train_y, free_raw_data=False)
result = lgb.cv(lgb_params, lgb_train, 10_000, folds=skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = result["cvbooster"].best_iteration



Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train binary_error: 0.19277 + 0.00192927	cv_agg's valid binary_error: 0.208904 + 0.00359363
[20]	cv_agg's train binary_error: 0.182043 + 0.00121495	cv_agg's valid binary_error: 0.202923 + 0.0028768
[30]	cv_agg's train binary_error: 0.173732 + 0.00267712	cv_agg's valid binary_error: 0.197285 + 0.000977483
[40]	cv_agg's train binary_error: 0.166111 + 0.00274629	cv_agg's valid binary_error: 0.193144 + 0.00315814
[50]	cv_agg's train binary_error: 0.162113 + 0.00184924	cv_agg's valid binary_error: 0.190498 + 0.00241319
[60]	cv_agg's train binary_error: 0.15593 + 0.00218753	cv_agg's valid binary_error: 0.191417 + 0.00392863
Early stopping, best iteration is:
[56]	cv_agg's train binary_error: 0.158662 + 0.00228811	cv_agg's valid binary_error: 0.189692 + 0.00282765


In [7]:
xgb_params = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",
    #train-error-mean    0.861325
    #test-error-mean     0.810883
    
    #eta = 0.01 test 0.78, train 80.5
    
    # regularization parameters
    "max_depth": 5,
    "max_leaves": 0,
    "min_child_weight":1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "n_estimators": 10_000
    
    #train-error-mean    0.848240
    #test-error-mean     0.811689
    
#    "tree_method": "hist",    
#    "grow_policy": "lossguide"
#     train-error-mean    0.827677
#     test-error-mean     0.807776
}
xgb_train = xgb.DMatrix(train_X, train_y, feature_names=train_X.columns)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)
result = xgb.cv(xgb_params, xgb_train, folds =skf, num_boost_round=10_000, early_stopping_rounds=10, verbose_eval=10)
xgb_params['n_estimators'] = list(result['test-error-mean']).index(min(list(result['test-error-mean'])))
1-result.iloc[-1, [0, 2]]

[0]	train-error:0.24140+0.00109	test-error:0.25020+0.00579
[10]	train-error:0.18981+0.00140	test-error:0.20821+0.00452
[20]	train-error:0.18115+0.00137	test-error:0.20246+0.00477
[30]	train-error:0.17169+0.00144	test-error:0.19591+0.00383
[40]	train-error:0.16620+0.00193	test-error:0.19360+0.00493
[50]	train-error:0.16125+0.00164	test-error:0.19096+0.00499
[60]	train-error:0.15383+0.00171	test-error:0.19038+0.00750
[70]	train-error:0.14960+0.00188	test-error:0.18946+0.00611
[73]	train-error:0.14857+0.00175	test-error:0.18935+0.00643


train-error-mean    0.848240
test-error-mean     0.811689
Name: 64, dtype: float64

In [8]:
import catboost as ctb
ctb_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10,
    "logging_level":"Silent",
    
#     train-Accuracy-mean      0.817439
#     test-Accuracy-mean       0.806167
    
    #regularization
    "depth":4,
#     train-Accuracy-mean      0.814132
#     test-Accuracy-mean       0.808697
    "subsample":0.8,
    "rsm":0.7,
    "min_data_in_leaf":50,
    
    #tree
    "grow_policy":"Depthwise"
    
#     train-Accuracy-mean      0.841050
#     test-Accuracy-mean       0.810077
}
ctb_train = ctb.Pool(train_X, train_y)
result = ctb.cv(ctb_train, ctb_params, folds=skf, seed=42, verbose_eval=100,plot=False)
# result.iloc[-1, [0,3, 1]]
ctb_params['iterations'] = result.iloc[-1,0]
result.iloc[-1,0]

213

In [85]:
test_prep = prepare_data(test, do_1hot=True, X_only=True)

In [86]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(train_X, train_y)
xgb_preds = xgb_clf.predict_proba(test_prep)
xgb_preds = xgb_preds[:, 1]

In [87]:
lgb_clf = lgb.LGBMClassifier(**lgb_params)
lgb_clf.fit(train_X, train_y)
lgb_preds = lgb_clf.predict_proba(test_prep)
lgb_preds = lgb_preds[:, 1]



In [88]:
ctb_clf = ctb.CatBoostClassifier(**ctb_params)
ctb_clf.fit(train_X, train_y)
ctb_preds = ctb_clf.predict_proba(test_prep)
ctb_preds = ctb_preds[:, 1]

Kaggle score on LightGBM after specifying optimal n_iter: 0.80476

Kaggle XGBoost Score: 0.80804

CatBoost: 0.8036

## 1. Mean

In [13]:
preds = xgb_preds*(1/3) + ctb_preds*(1/3) + lgb_preds*(1/3)

In [14]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.80851

In [15]:
preds = xgb_preds*(1/2) + ctb_preds*(1/8) + lgb_preds*(3/8)

In [16]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.81061

## 2. Median

In [17]:
preds = np.median(np.c_[xgb_preds,ctb_preds,lgb_preds], axis=1)

In [18]:
df = pd.DataFrame()
df['Transported'] = np.vectorize(lambda p: 1 if p > 0.5 else 0)(preds).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.81108

## 3. Stacking

1. Adding XGB, LGB and CTB preds to the Extra Trees

In [19]:
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [89]:
xgb_preds_train = cross_val_predict(xgb_clf, train_X, train_y, cv=3, method='predict_proba')[:, 1]
lgb_preds_train = cross_val_predict(lgb_clf, train_X, train_y, cv=3, method='predict_proba')[:, 1]
# ctb_preds_train = cross_val_predict(ctb_clf, train_X, train_y, cv=3, method='predict_proba')[:, 1]



In [90]:
# train_X['ctb_preds'] = ctb_preds_train
train_X['lgb_preds'] = lgb_preds_train
train_X['xgb_preds'] = xgb_preds_train

In [91]:
xgb_preds = xgb_clf.predict_proba(test_prep)[:, 1]
lgb_preds = lgb_clf.predict_proba(test_prep)[:, 1]
# ctb_preds = ctb_clf.predict_proba(test_prep)[:, 1]
test_prep['xgb_preds'] = xgb_preds
test_prep['lgb_preds'] = lgb_preds
# test_prep['ctb_preds'] = ctb_preds

In [92]:
params = {'max_features': [5, 6, 7], 'min_samples_leaf': [4, 5, 6, 7, 8], 'max_depth': [18, 19, 20]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=5, verbose=0, scoring='accuracy')
grid.fit(train_X, train_y)

In [93]:
grid.best_estimator_

In [94]:
grid.best_score_

0.7996125572478924

In [95]:
grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep[train_X.columns]).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

XGB, kaggle score: 0.79892

LGB, kaggle score: 0.80102

XGB+LGB, kaggle score: 0.80266

XGB+LGB+CTB, Kaggle score: 0.80126

2. Adding overfitted XGB, LGB, CTB:

In [96]:
cols = ['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'CryoSleep_True', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e', 'VIP_True',
       'cabin_side_S', 'cabin_deck_A', 'cabin_deck_B', 'cabin_deck_C',
       'cabin_deck_D', 'cabin_deck_E', 'cabin_deck_F', 'cabin_deck_G',
       'cabin_deck_T', 'FoodCourt', 'VRDeck', 'Spa', 'Age', 'RoomService',
       'ShoppingMall', 'cabin_num', 'travel_group_n', 'travelling_with_n_kids']

In [97]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)
import lightgbm as lgb
lgb_params = {
    "objective": "binary",
    "learning_rate": 0.1,
    "num_threads": 10,
    "metric": "binary_error",
    "seed": 42,
   "verbose":-1,
    
    #eta 0.01 ('train: 0.8085527468480869', 'valid: 0.7939731642128626')
    #eta 0.03 ('train: 0.8322500349942634', 'valid: 0.8069714549649911')
    
     #regularization
    "max_depth":25,
    "colsample_bytree": 0.8,
    "subsample": 0.8,
    #('train: 0.8471183562848146', 'valid: 0.8108826746585971')
    "subsample_freq": 1,
    "min_data_in_leaf": 1,
    #('train: 0.8432359386639986', 'valid: 0.8141035779064327')
    "num_leaves":1000,
#     #('train: 0.8392384710701339', 'valid: 0.8137586182024641')
    
    "n_estimators":10_000
    
    #categorical features
#     'cat_smooth': 5,
#     'min_data_per_group': 2
#     did not improve the results
    
}
lgb_train = lgb.Dataset(train_X[cols], label=train_y, free_raw_data=False)
result = lgb.cv(lgb_params, lgb_train, 10_000, folds=skf, callbacks=[lgb.early_stopping(10), lgb.log_evaluation(10)], eval_train_metric=True, return_cvbooster=True)
lgb_params['n_estimators'] = result["cvbooster"].best_iteration



Training until validation scores don't improve for 10 rounds
[10]	cv_agg's train binary_error: 0.00293342 + 0.000347686	cv_agg's valid binary_error: 0.21201 + 0.00436324
[20]	cv_agg's train binary_error: 0.00126542 + 0.000525659	cv_agg's valid binary_error: 0.204647 + 0.0028758
[30]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.201312 + 0.00482641
[40]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.200393 + 0.0071966
[50]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.197402 + 0.00621735
[60]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.197976 + 0.00374019
[70]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.194986 + 0.00692065
[80]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_agg's valid binary_error: 0.194641 + 0.00776258
[90]	cv_agg's train binary_error: 0.00086278 + 0.000315081	cv_

In [98]:
lgb_clf = lgb.LGBMClassifier(**lgb_params)
lgb_preds_train = cross_val_predict(lgb_clf, train_X[cols], train_y, cv=5, method='predict_proba')[:, 1]
lgb_clf.fit(train_X[cols], train_y)
lgb_preds_test = lgb_clf.predict_proba(test_prep[cols])[:, 1]



In [99]:
train_X['lgb_overfit'] = lgb_preds_train
test_prep['lgb_overfit'] = lgb_preds_test

In [118]:
logistic = LogisticRegression(penalty='l2',
                                C=0.01,
                              class_weight='balanced',
                              random_state=42,
                              max_iter=1000
                                )
preds = cross_val_predict(logistic, train_X[cols], train_y, cv=3, method='predict_proba')[:, 1]
train_X['logistic_reg'] = preds
logistic.fit(train_X[cols], train_y)
test_prep['logistic_reg'] = logistic.predict_proba(test_prep[cols])[:, 1]

In [101]:
xgb_params = {
    #default
    "objective": "binary:logistic",
    "eta": 0.1,
    "verbosity": 0,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": "error",
    #train-error-mean    0.861325
    #test-error-mean     0.810883
    
    #eta = 0.01 test 0.78, train 80.5
    
    # regularization parameters
    "max_depth": 20,
    "max_leaves": 0,
    "min_child_weight":1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "n_estimators": 10_000
    
    #train-error-mean    0.848240
    #test-error-mean     0.811689
    
#    "tree_method": "hist",    
#    "grow_policy": "lossguide"
#     train-error-mean    0.827677
#     test-error-mean     0.807776
}
xgb_train = xgb.DMatrix(train_X[cols], train_y, feature_names=cols)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=127)
result = xgb.cv(xgb_params, xgb_train, folds =skf, num_boost_round=10_000, early_stopping_rounds=10, verbose_eval=10)
xgb_params['n_estimators'] = list(result['test-error-mean']).index(min(list(result['test-error-mean'])))
1-result.iloc[-1, [0, 2]]

[0]	train-error:0.13436+0.00378	test-error:0.27148+0.00537
[10]	train-error:0.05818+0.00241	test-error:0.20246+0.00363
[20]	train-error:0.03649+0.00203	test-error:0.19855+0.00410
[30]	train-error:0.02108+0.00099	test-error:0.19763+0.00419
[32]	train-error:0.01884+0.00153	test-error:0.19867+0.00528


train-error-mean    0.968768
test-error-mean     0.804210
Name: 23, dtype: float64

In [102]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_overfit_train = cross_val_predict(xgb_clf, train_X[cols], train_y, cv=5, method='predict_proba')[:, 1]
xgb_clf.fit(train_X[cols], train_y)
xgb_overfit_test = xgb_clf.predict_proba(test_prep[cols])[:,1]

In [103]:
train_X['xgb_overfit'] = xgb_overfit_train
test_prep['xgb_overfit'] = xgb_overfit_test

In [104]:
import catboost as ctb
ctb_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "iterations": 1000,
    "learning_rate": 0.1,
    "random_seed": 42,
    "od_wait": 30,
    "od_type": "Iter",
    "thread_count": 10,
    "logging_level":"Silent",
    
#     train-Accuracy-mean      0.817439
#     test-Accuracy-mean       0.806167
    
    #regularization
    "depth":16,
#     train-Accuracy-mean      0.814132
#     test-Accuracy-mean       0.808697
    "subsample":0.8,
    "rsm":0.7,
    "min_data_in_leaf":50,
    
    #tree
    "grow_policy":"Depthwise"
    
#     train-Accuracy-mean      0.841050
#     test-Accuracy-mean       0.810077
}
ctb_train = ctb.Pool(train_X, train_y)
result = ctb.cv(ctb_train, ctb_params, folds=skf, seed=42, verbose_eval=100,plot=False)
# result.iloc[-1, [0,3, 1]]
ctb_params['iterations'] = result.iloc[-1,0]
result.iloc[-1,0]

108

In [105]:
result.iloc[-1, [1, 3]]

test-Accuracy-mean     0.816865
train-Accuracy-mean    0.974030
Name: 108, dtype: float64

In [106]:
ctb_clf = ctb.CatBoostClassifier(**ctb_params)
ctb_preds_train = cross_val_predict(ctb_clf, train_X[cols], train_y, cv=5, method='predict_proba')[:, 1]

ctb_clf.fit(train_X[cols], train_y)
ctb_preds_test = ctb_clf.predict_proba(test_prep[cols])[:, 1]

train_X['ctb_overfit'] = ctb_preds_train
test_prep['ctb_overfit'] = ctb_preds_test

In [120]:
params = {'max_features': [5, 6, 7], 'min_samples_leaf': [4, 5, 6], 'max_depth': [18, 19, 20]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=5, verbose=0, scoring='accuracy')
grid.fit(train_X, train_y)

In [38]:
grid.best_score_

0.8080092456876727

In [39]:
grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep[train_X.columns]).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.79845

3. Adding underfitted logistic regressions:

In [107]:
logistic = LogisticRegression(C=0.00000000001)
logistic_severely_underfit_train = cross_val_predict(logistic, train_X[cols], train_y, method='predict_proba')[:, 1]
logistic.fit(train_X[cols], train_y)
logistic_severely_underfit_test = logistic.predict_proba(test_prep[cols])[:,1]
train_X['logistic_severely_underfit'] = logistic_severely_underfit_train
test_prep['logistic_severely_underfit'] = logistic_severely_underfit_test

In [108]:
accuracy_score(train_y, np.vectorize(lambda p: 1 if p>0.5 else 0)(logistic_severely_underfit_train))

0.5515932359369607

In [109]:
logistic = LogisticRegression(C=0.0000001)
logistic_severely_underfit_train = cross_val_predict(logistic, train_X[cols], train_y, method='predict_proba')[:, 1]
logistic.fit(train_X[cols], train_y)
logistic_severely_underfit_test = logistic.predict_proba(test_prep[cols])[:,1]
train_X['logistic_underfit'] = logistic_severely_underfit_train
test_prep['logistic_underfit'] = logistic_severely_underfit_test

In [110]:
accuracy_score(train_y, np.vectorize(lambda p: 1 if p>0.5 else 0)(logistic_severely_underfit_train))

0.76969975842632

In [111]:
train_X.head(2)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,cabin_side_S,cabin_deck_A,...,travel_group_n,travelling_with_n_kids,lgb_preds,xgb_preds,lgb_overfit,logistic_reg,xgb_overfit,ctb_overfit,logistic_severely_underfit,logistic_underfit
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.021052,0.364029,0.113313,0.725583,0.416498,0.067392,0.5,0.76007
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.006808,0.249655,0.001791,0.288185,0.1923,0.006822,0.498639,0.440897


In [125]:
params = {'max_features': [5, 6, 7], 'min_samples_leaf': [4, 5, 6], 'max_depth': [18, 19, 20]}
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
grid = GridSearchCV(extra_trees, params, cv=5, verbose=0, scoring='accuracy')
grid.fit(train_X, train_y)

In [126]:
grid.best_score_

0.8082398584957161

In [124]:
train_X

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,cabin_side_S,cabin_deck_A,...,travel_group_n,travelling_with_n_kids,lgb_preds,xgb_preds,lgb_overfit,logistic_reg,xgb_overfit,ctb_overfit,logistic_severely_underfit,logistic_underfit
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.021052,0.364029,0.113313,0.725583,0.416498,0.067392,0.500000,0.760070
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.006808,0.249655,0.001791,0.288185,0.192300,0.006822,0.498639,0.440897
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,2.0,0.0,0.001889,0.095474,0.000642,0.000049,0.118349,0.037262,0.487882,0.000030
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,2.0,0.0,0.000782,0.108845,0.001101,0.010386,0.115037,0.008218,0.493448,0.006246
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.007384,0.282802,0.000824,0.252875,0.164430,0.146857,0.498468,0.372061
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.851204,0.626135,0.714163,0.879245,0.770861,0.790302,0.501592,0.868423
8689,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.998676,0.891678,0.120602,0.836636,0.455795,0.118803,0.499986,0.626546
8690,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.987176,0.862881,0.117032,0.812295,0.608523,0.268266,0.500077,0.695083
8691,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,2.0,0.0,0.001518,0.106638,0.001345,0.021834,0.111150,0.016703,0.493881,0.005712


In [127]:
grid.best_estimator_.fit(train_X, train_y)
df = pd.DataFrame()
df['Transported'] = grid.best_estimator_.predict(test_prep[train_X.columns]).astype(bool)
df.set_index(test.PassengerId, inplace=True)
df.to_csv('spaceship_preds_ensemble_mean_3.csv')

Kaggle score: 0.80009