In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [9]:
# Read the preprocessed train/test feature tables and report each one's (rows, columns).
train = pd.read_csv('data/preprocessed_train.csv')
test = pd.read_csv('data/preprocessed_test.csv')

print(train.shape, test.shape, sep='\n')

(891, 42)
(418, 41)


In [10]:
# Split the loaded frames into model inputs and the label column.
target = 'Survived'

# Every training column except the label is used as a feature.
features = list(train.columns)
features.remove(target)

train_x = train[features]
train_y = train[target]
test_x = test[features]

# Number of rows in each split. Axis 0 is the row axis; the previous code read
# axis 1 for the test set, which is the feature count, not the sample count.
train_num = train_x.shape[0]
test_num = test_x.shape[0]

In [11]:
# Show how many training rows fall in each class of the label column.
print(train_y.value_counts())

0.0    549
1.0    342
Name: Survived, dtype: int64


In [12]:
# Empirical class fractions, computed from the labels instead of hard-coded counts.
# On this training set 549 of 891 rows are class 0, so dead_weight equals 549/891.0.
dead_weight = float((train_y == 0).mean())
survived_weight = 1.0 - dead_weight

In [13]:
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

Separate the classifiers into two groups: those that implement `predict_proba` (and can therefore be used in a soft-voting classifier) and those that do not.

### Group 1: Classifiers with a `predict_proba` method

In [None]:
# Logistic-loss SGD with an elastic-net penalty: search the regularisation
# strength (alpha) and the L1/L2 mix (l1_ratio) by 5-fold CV accuracy.
sgd = SGDClassifier(
    loss='log', penalty='elasticnet',
    alpha=0.0005, l1_ratio=0.87,
    learning_rate='optimal', max_iter=100000,
    class_weight='balanced', random_state=3, n_jobs=4,
)
params = dict(
    alpha=[0.05, 0.01, 0.005, 0.001],
    l1_ratio=[1.0, 0.8, 0.6],
)
gs = GridSearchCV(sgd, params, scoring='accuracy', iid=False, cv=5, verbose=1)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [24]:
# SGD with alpha=0.005 and l1_ratio=1.0 fixed: compare 5-fold CV accuracy across seeds.
sgd = SGDClassifier(
    loss='log', penalty='elasticnet',
    alpha=0.005, l1_ratio=1.0,
    learning_rate='optimal', max_iter=100000,
    class_weight='balanced', random_state=3, n_jobs=2,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(sgd, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 3} 0.786822089754


In [16]:
# Class-balanced logistic regression: pick the penalty type by 5-fold CV accuracy.
lr = LogisticRegression(max_iter=100000, class_weight='balanced', random_state=3)

params = dict(penalty=['l1', 'l2'])
gs = GridSearchCV(lr, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'penalty': 'l2'} 0.809243996089


In [17]:
# Logistic regression with the L2 penalty fixed: compare 5-fold CV accuracy across seeds.
lr = LogisticRegression(penalty='l2', max_iter=100000, class_weight='balanced', random_state=3)

params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(lr, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.809243996089


In [None]:
# Single-hidden-layer MLP (20 units): search the L2 penalty `alpha` by 5-fold CV accuracy.
# The lbfgs solver is used because the data set is small.
mlp = MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs', alpha=0.001, random_state=3,
                    max_iter=10000)
params = {
    # Each value is listed once; the earlier grid repeated 0.0005, which only
    # re-fitted an identical candidate and could not change the selected value.
    'alpha': [0.01, 0.005, 0.001, 0.0005, 0.0001],
}
gs = GridSearchCV(estimator=mlp, param_grid=params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

In [45]:
# MLP with alpha=0.005 fixed (lbfgs solver, chosen for the small data set):
# compare 5-fold CV accuracy across seeds.
mlp = MLPClassifier(
    hidden_layer_sizes=(20,),
    solver='lbfgs',
    alpha=0.005,
    max_iter=10000,
    random_state=3,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(mlp, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 2} 0.804774864201


In [42]:
# Class-balanced random forest (1200 trees): search the split / leaf / impurity
# thresholds by 5-fold CV accuracy.
rf = RandomForestClassifier(
    n_estimators=1200,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)
params = dict(
    min_samples_split=[0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5],
    min_weight_fraction_leaf=[0.0, 0.1, 0.2, 0.3, 0.4, 0.49],
    min_impurity_decrease=[0.2, 0.1, 0.05, 0.01, 0.005, 0.001],
)
gs = GridSearchCV(rf, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_samples_split': 0.01, 'min_weight_fraction_leaf': 0.0, 'min_impurity_decrease': 0.001} 0.827221878816


In [46]:
# Random forest with the tuned thresholds fixed: compare 5-fold CV accuracy across seeds.
rf = RandomForestClassifier(
    n_estimators=1200,
    min_samples_split=0.01,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.001,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(rf, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.830586388263


In [73]:
# Gaussian naive Bayes: choose the class-prior vector by 5-fold CV accuracy.
# NOTE(review): priors are given positionally per class. If the classes sort as
# [0, 1], then [survived_weight, dead_weight] gives class 0 the survivor fraction,
# i.e. the priors are swapped relative to the observed frequencies - confirm this
# is intended (it is the candidate the search reports as best).
nb = GaussianNB(priors=[survived_weight, dead_weight])
params = dict(priors=[
    [survived_weight, dead_weight],
    [.5, .5],
    [dead_weight, survived_weight],
])
gs = GridSearchCV(nb, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'priors': [0.38383838383838387, 0.6161616161616161]} 0.790230751472


In [78]:
# XGBoost, stage 1: search the learning rate and the row/column subsampling
# fractions by 5-fold CV accuracy.
xg = xgb.XGBClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=1, colsample_bytree=1,
    min_child_weight=0.0, gamma=0.0,
    reg_alpha=0, reg_lambda=1,
    random_state=0, n_jobs=2,
)

params = dict(
    learning_rate=[0.001, 0.005, 0.01, 0.05],
    subsample=[1.0, 0.8, 0.6],
    colsample_bytree=[1.0, 0.8, 0.6],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'colsample_bytree': 0.8, 'learning_rate': 0.01, 'subsample': 0.8} 0.839550044028


In [80]:
# XGBoost, stage 2 (sampling fractions fixed at 0.8): search min_child_weight
# and gamma by 5-fold CV accuracy.
xg = xgb.XGBClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.8, colsample_bytree=.8,
    min_child_weight=0.0, gamma=0.0,
    reg_alpha=0, reg_lambda=1,
    random_state=0, n_jobs=2,
)

params = dict(
    min_child_weight=[0.01, 0.05, 0.1, 0.175, 0.2, 0.25, 0.3],
    gamma=[0.0, 0.01, 0.05, 0.1, 0.2, 0.4],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_child_weight': 0.175, 'gamma': 0.01} 0.840673639534


In [81]:
# XGBoost, stage 3 (min_child_weight=0.175, gamma=0.01 fixed): search the
# L1/L2 regularisation terms by 5-fold CV accuracy.
xg = xgb.XGBClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.8, colsample_bytree=.8,
    min_child_weight=0.175, gamma=0.01,
    reg_alpha=0, reg_lambda=1,
    random_state=0, n_jobs=2,
)

params = dict(
    reg_alpha=[0.0, .2, .4, .6, .8, 1.],
    reg_lambda=[0.0, .2, .4, .6, .8, 1.],
)
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'reg_lambda': 0.6, 'reg_alpha': 0.0} 0.841797235039


In [69]:
# XGBoost with all tuned values fixed: compare accuracy across seeds.
# This sweep uses 3-fold CV, unlike the 5-fold searches for the same model.
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1200, learning_rate=0.01,
    subsample=.8, colsample_bytree=.8,
    min_child_weight=0.175, gamma=0.01,
    reg_alpha=0, reg_lambda=.6,
    random_state=0, n_jobs=2,
)

params = dict(random_state=[0, 1, 2, 3, 4, 5])
gs = GridSearchCV(xg, params, scoring='accuracy', iid=False, cv=3)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 5} 0.838383838384


In [97]:
# LightGBM, stage 1: search the learning rate and the row/column subsampling
# fractions by 5-fold CV accuracy.
lg = lgb.LGBMClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=1.0, colsample_bytree=1.0,
    min_child_samples=1, min_child_weight=0.0, min_split_gain=0.01,
    reg_alpha=0, reg_lambda=.0,
    random_state=0,
)

params = dict(
    learning_rate=[0.005, 0.01, 0.05],
    subsample=[1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
    colsample_bytree=[1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
)
gs = GridSearchCV(lg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'colsample_bytree': 0.6, 'learning_rate': 0.01, 'subsample': 0.2} 0.840648318472


In [98]:
# LightGBM, stage 2 (subsample=.2, colsample_bytree=.6 fixed): search the
# leaf-size and split-gain constraints by 5-fold CV accuracy.
lg = lgb.LGBMClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.2, colsample_bytree=.6,
    min_child_samples=1, min_child_weight=0.0, min_split_gain=0.01,
    reg_alpha=0, reg_lambda=.0,
    random_state=0,
)

params = dict(
    min_child_samples=[1, 2, 4, 8, 16, 32],
    min_child_weight=[0.2, 0.4, 0.8, 1.6, 3.2],
    min_split_gain=[0.0, 0.01, 0.05, 0.1, 0.2, 0.4],
)
gs = GridSearchCV(lg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_child_weight': 0.4, 'min_child_samples': 4, 'min_split_gain': 0.1} 0.848526112079


In [99]:
# LightGBM, stage 3 (leaf/split constraints fixed): search the L1/L2
# regularisation terms by 5-fold CV accuracy.
lg = lgb.LGBMClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.2, colsample_bytree=.6,
    min_child_samples=4, min_child_weight=0.4, min_split_gain=0.1,
    reg_alpha=0, reg_lambda=.0,
    random_state=0,
)

params = dict(
    reg_alpha=[0.0, .2, .4, .6, .8, 1.],
    reg_lambda=[0.0, .2, .4, .6, .8, 1.],
)
gs = GridSearchCV(lg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'reg_lambda': 0.0, 'reg_alpha': 0.0} 0.848526112079


In [100]:
# LightGBM with all tuned values fixed: compare 5-fold CV accuracy across seeds.
lg = lgb.LGBMClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.2, colsample_bytree=.6,
    min_child_samples=4, min_child_weight=0.4, min_split_gain=0.1,
    reg_alpha=0, reg_lambda=.0,
    random_state=0,
)

params = dict(random_state=[0, 1, 2, 3, 4, 5])
gs = GridSearchCV(lg, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 0} 0.848526112079


In [64]:
# Final Group 1 estimators with their tuned settings. The number above each one
# is its recorded CV accuracy; the models are listed from lowest to highest.
# NOTE(review): the figure recorded for `lr` differs from the 0.809243996089
# printed by the lr searches earlier in this notebook - presumably it comes
# from a different run; confirm which value is current.

# 0.776684338214
lr = LogisticRegression(penalty='l2', max_iter=100000, class_weight='balanced', random_state=1)

# 0.786822089754
sgd = SGDClassifier(
    loss='log', penalty='elasticnet',
    alpha=0.005, l1_ratio=1.0,
    learning_rate='optimal', max_iter=100000,
    class_weight='balanced', random_state=3, n_jobs=2,
)

# 0.790230751472
nb = GaussianNB(priors=[survived_weight, dead_weight])

# 0.804774864201
mlp = MLPClassifier(
    hidden_layer_sizes=(20,), solver='lbfgs',
    alpha=0.005, max_iter=10000, random_state=2,
)

# 0.830586388263
rf = RandomForestClassifier(
    n_estimators=1200,
    min_samples_split=0.01, min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.001,
    class_weight='balanced', random_state=1, n_jobs=2,
)

# 0.841797235039
xg = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=1200, learning_rate=0.01,
    subsample=.8, colsample_bytree=.8,
    min_child_weight=0.175, gamma=0.01,
    reg_alpha=0, reg_lambda=.6,
    random_state=0, n_jobs=2,
)

# 0.848526112079
lg = lgb.LGBMClassifier(
    n_estimators=1200, learning_rate=0.01,
    subsample=.2, colsample_bytree=.6,
    min_child_samples=4, min_child_weight=0.4, min_split_gain=0.1,
    reg_alpha=0, reg_lambda=.0,
    random_state=0,
)

### Group 2: Classifiers without a `predict_proba` method

In [31]:
# Class-balanced ridge classifier on normalised inputs: search the
# regularisation strength by 5-fold CV accuracy.
ridge = RidgeClassifier(
    alpha=1.0,
    normalize=True,
    max_iter=10000,
    class_weight='balanced',
    random_state=3,
)
params = dict(alpha=[1.0, 0.8, 0.6, 0.4, 0.2])
gs = GridSearchCV(ridge, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'alpha': 0.2} 0.777770129446


In [47]:
# Ridge classifier with alpha=0.2 fixed: compare 5-fold CV accuracy across seeds.
ridge = RidgeClassifier(
    alpha=0.2,
    normalize=True,
    max_iter=10000,
    class_weight='balanced',
    random_state=3,
)
params = dict(random_state=[1, 2, 3, 4, 5])
gs = GridSearchCV(ridge, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.777770129446


In [8]:
# Final Group 2 estimator with its tuned settings (recorded CV accuracy below).
# 0.777770129446
ridge = RidgeClassifier(
    alpha=0.2,
    normalize=True,
    max_iter=10000,
    class_weight='balanced',
    random_state=1,
)

## Use Group1 + 2 (cv accuracy < 0.83) for voting classifier

In [9]:
from sklearn.ensemble import VotingClassifier

Use hard voting, since these models are not well calibrated.

In [10]:
# Majority-vote ensemble over four of the Group 1 models.
# NOTE(review): the section heading mentions Group 2 as well, but `ridge` is not
# part of this estimator list - confirm whether it was left out on purpose.
vc = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('sgd', sgd),
        ('nb', nb),
        ('mlp', mlp),
    ],
    voting='hard',
    n_jobs=2,
)

In [11]:
# Per-fold accuracy of the hard-voting ensemble under 5-fold cross-validation.
cross_val_score(vc, train_x, train_y, cv=5, scoring='accuracy')

array([ 0.79888268,  0.76536313,  0.80337079,  0.82022472,  0.8079096 ])

In [115]:
# Mean of the five fold accuracies above (the values are pasted in by hand, so
# this number goes stale if the cross-validation cell is re-run).
np.array([ 0.79888268,  0.76536313,  0.80337079,  0.82022472,  0.8079096 ]).mean()

0.79915018399999993

In [10]:
# Same hard-voting ensemble as defined earlier, declared again with the same
# four estimators and settings.
vc = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('sgd', sgd),
        ('nb', nb),
        ('mlp', mlp),
    ],
    voting='hard',
    n_jobs=2,
)

## Use Group1 + 2 (cv accuracy < 0.83) in the stacking averaged model

In [11]:
from stacking_models_api import StackingAveragedModels
from cross_valid_api import cross_validate
from sklearn.metrics import accuracy_score

In [12]:
# Base learners for the stacking model, keyed by a short name.
supervised_dict = dict(ridge=ridge, lr=lr, sgd=sgd, mlp=mlp, nb=nb)

In [14]:
# Stacking model over the base learners in supervised_dict, with `rf` as the meta-model.
# NOTE(review): StackingAveragedModels comes from the project-local stacking_models_api
# module; how it trains and combines the base models is not visible here.
sam = StackingAveragedModels(sl_base_models_dict=supervised_dict, meta_model=rf, target_col='Survived', eval_func=accuracy_score)

In [16]:
# 5-fold evaluation of the stacking model, scored with accuracy_score.
# cross_validate here is the helper imported from cross_valid_api, not scikit-learn's.
cross_validate(sam, train_x, train_y, 5, scoring=accuracy_score)


 ridge
score= 0.755244755245
score= 0.762237762238
score= 0.781690140845
score= 0.788732394366
score= 0.774647887324
Avg score =  0.772510588004

 lr
score= 0.755244755245
score= 0.762237762238
score= 0.781690140845
score= 0.774647887324
score= 0.781690140845
Avg score =  0.771102137299

 nb
score= 0.769230769231
score= 0.762237762238
score= 0.746478873239
score= 0.788732394366
score= 0.830985915493
Avg score =  0.779533142913

 sgd
score= 0.741258741259
score= 0.755244755245
score= 0.746478873239
score= 0.795774647887
score= 0.802816901408
Avg score =  0.768314783808

 mlp
score= 0.79020979021
score= 0.811188811189
score= 0.788732394366
score= 0.802816901408
score= 0.802816901408
Avg score =  0.799152959716
meta model's training set score=  0.799157303371 

fold  1  valid score:  0.843575418994

 ridge
score= 0.769230769231
score= 0.811188811189
score= 0.734265734266
score= 0.830985915493
score= 0.774647887324
Avg score =  0.7840638235

 lr
score= 0.776223776224
score= 0.804195804196

### The CV result is slightly better when using stacking averaged model than using hard-voting classifier

In [13]:
# Dedicated random forest for the stacking meta-model; its split and impurity
# thresholds differ from those of the stand-alone `rf`.
meta_rf = RandomForestClassifier(
    n_estimators=1200,
    min_samples_split=0.005,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.005,
    class_weight='balanced',
    random_state=1,
    n_jobs=2,
)

In [18]:
# Rebuild the stacking model with meta_rf as the meta-model.
sam = StackingAveragedModels(sl_base_models_dict=supervised_dict, meta_model=meta_rf, target_col='Survived', eval_func=accuracy_score)

In [21]:
# Fit the stacking model on the full training set.
sam.fit(train_x, train_y)


 ridge
score= 0.821229050279
score= 0.752808988764
score= 0.741573033708
score= 0.76404494382
score= 0.803370786517
Avg score =  0.776605360618

 lr
score= 0.810055865922
score= 0.76404494382
score= 0.758426966292
score= 0.747191011236
score= 0.803370786517
Avg score =  0.776617914757

 mlp
score= 0.815642458101
score= 0.820224719101
score= 0.76404494382
score= 0.792134831461
score= 0.808988764045
Avg score =  0.800207143306

 nb
score= 0.804469273743
score= 0.741573033708
score= 0.758426966292
score= 0.786516853933
score= 0.85393258427
Avg score =  0.788983742389

 sgd
score= 0.810055865922
score= 0.769662921348
score= 0.76404494382
score= 0.769662921348
score= 0.825842696629
Avg score =  0.787853869814


In [22]:
# Class predictions of the fitted stacking model for the test features.
sam.predict(test_x)

meta model's training set score=  0.803591470258 



array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [23]:
# Per-class probabilities of the fitted stacking model for the test features.
sam.predict_proba(test_x)

meta model's training set score=  0.803591470258 



array([[ 0.81607696,  0.18392304],
       [ 0.50928251,  0.49071749],
       [ 0.81607696,  0.18392304],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.36810826,  0.63189174],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.81607696,  0.18392304],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.16752006,  0.83247994],
       [ 0.56778112,  0.43221888],
       [ 0.4312676 ,  0.5687324 ],
       [ 0.16752006,  0.83247994],
       [ 0.36810826,  0.63189174],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.16752006,  0.83247994],
       [ 0.81607696,  0.18392304],
       [ 0.36810826,

In [14]:
# Create a new stacking-model instance (meta_rf as meta-model) for use as a
# member of the final voting ensembles.
sam = StackingAveragedModels(sl_base_models_dict=supervised_dict, meta_model=meta_rf, target_col='Survived', eval_func=accuracy_score)

In [19]:
# Soft-voting ensemble of the stacking model and the three tree-based models;
# the per-estimator weights are left unset here.
final_vc = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('xg', xg),
        ('lg', lg),
    ],
    voting='soft',
    n_jobs=2,
)

In [16]:
# Candidate soft-voting weights for (sam, rf, xg, lg): one linear ramp plus
# geometric ramps base ** [0, 1, 2, 3] that increasingly favour the later estimators.
# NOTE(review): the linear ramp is [0, 1, 2, 3], which gives 'sam' zero weight -
# confirm that candidate is intended.
weights = np.arange(4)

params = {
    'weights': [weights] + [
        np.power(base, weights)
        for base in (1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5)
    ]
}

gs = GridSearchCV(final_vc, params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'weights': array([ 1,  3,  9, 27], dtype=int32)} 0.848519905936


In [33]:
# Soft-voting ensemble using the weight vector selected by the weight search.
final_vc_soft = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('xg', xg),
        ('lg', lg),
    ],
    voting='soft',
    weights=[1, 3, 9, 27],
    n_jobs=2,
)

In [34]:
# 5-fold evaluation of the weighted soft-voting ensemble, scored with accuracy_score
# (cross_validate is the helper imported from cross_valid_api).
cross_validate(final_vc_soft, train_x, train_y, 5, accuracy_score)

fold  1  valid score:  0.871508379888
fold  2  valid score:  0.814606741573
fold  3  valid score:  0.85393258427
fold  4  valid score:  0.825842696629
fold  5  valid score:  0.803370786517
5  fold(s) avg. valid score:  0.833852237775


In [50]:
# Fit the weighted soft-voting ensemble on the full training set.
final_vc_soft.fit(train_x, train_y)

VotingClassifier(estimators=[('sam', StackingAveragedModels(eval_func=<function accuracy_score at 0x000000000AFF7048>,
            meta_model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurit...=0, reg_lambda=0.0, silent=True, subsample=0.2,
        subsample_for_bin=50000, subsample_freq=1))],
         flatten_transform=None, n_jobs=2, voting='soft',
         weights=[1, 3, 9, 27])

In [51]:
# Class predictions of the soft-voting ensemble for the test features.
final_vc_soft.predict(test_x)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0,

In [52]:
# Weighted-average class probabilities of the soft-voting ensemble for the test features.
# NOTE(review): the rows in the recorded output sum to 0.975 rather than 1. With
# weights [1, 3, 9, 27] that is 39/40, which suggests the weight-1 estimator
# ('sam') contributed no probability mass - verify what
# StackingAveragedModels.predict_proba returns when called through VotingClassifier.
final_vc_soft.predict_proba(test_x)

array([[ 0.94485294,  0.03014706],
       [ 0.85716361,  0.11783639],
       [ 0.839345  ,  0.135655  ],
       [ 0.70332417,  0.27167583],
       [ 0.43338636,  0.54161364],
       [ 0.89663759,  0.0783624 ],
       [ 0.5667298 ,  0.4082702 ],
       [ 0.88041882,  0.09458118],
       [ 0.11769595,  0.85730405],
       [ 0.95370744,  0.02129256],
       [ 0.94568542,  0.02931458],
       [ 0.94694435,  0.02805564],
       [ 0.01546457,  0.95953543],
       [ 0.94256768,  0.03243232],
       [ 0.03614936,  0.93885064],
       [ 0.03996621,  0.93503379],
       [ 0.87679615,  0.09820385],
       [ 0.68860454,  0.28639546],
       [ 0.22852706,  0.74647294],
       [ 0.45429703,  0.52070296],
       [ 0.9178716 ,  0.0571284 ],
       [ 0.59838973,  0.37661027],
       [ 0.04087199,  0.93412801],
       [ 0.75282841,  0.2221716 ],
       [ 0.02144749,  0.95355251],
       [ 0.95162838,  0.02337162],
       [ 0.01398618,  0.96101382],
       [ 0.81224838,  0.16275162],
       [ 0.50709649,

In [53]:
# Unweighted majority-vote ensemble over the same four estimators.
final_vc_hard = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('xg', xg),
        ('lg', lg),
    ],
    voting='hard',
    n_jobs=2,
)

In [39]:
# 5-fold evaluation of the hard-voting ensemble, scored with accuracy_score
# (cross_validate is the helper imported from cross_valid_api).
cross_validate(final_vc_hard, train_x, train_y, 5, accuracy_score)

fold  1  valid score:  0.871508379888
fold  2  valid score:  0.820224719101
fold  3  valid score:  0.842696629213
fold  4  valid score:  0.85393258427
fold  5  valid score:  0.797752808989
5  fold(s) avg. valid score:  0.837223024292


In [54]:
# Fit the hard-voting ensemble on the full training set.
final_vc_hard.fit(train_x, train_y)

VotingClassifier(estimators=[('sam', StackingAveragedModels(eval_func=<function accuracy_score at 0x000000000AFF7048>,
            meta_model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurit...=0, reg_lambda=0.0, silent=True, subsample=0.2,
        subsample_for_bin=50000, subsample_freq=1))],
         flatten_transform=None, n_jobs=2, voting='hard', weights=None)

In [55]:
# Class predictions of the hard-voting ensemble for the test features.
final_vc_hard.predict(test_x)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [65]:
# Fit the tuned XGBoost model on its own, as a single-model baseline.
xg.fit(train_x, train_y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.01, learning_rate=0.01,
       max_delta_step=0, max_depth=3, min_child_weight=0.175, missing=None,
       n_estimators=1200, n_jobs=2, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=0.6, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8)

In [66]:
# Class predictions of the stand-alone XGBoost model for the test features.
xg.predict(test_x)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

## 3 versions of submissions (soft voting, hard voting, XGBoost alone)

In [44]:
# Load the original (un-preprocessed) test file; the next cell takes PassengerId from it.
orig_test = pd.read_csv('data/test.csv', encoding='utf-8')

In [47]:
# Passenger identifiers used as the key column of every submission file.
# NOTE(review): assumes the rows of data/test.csv are in the same order as test_x - confirm.
ID = orig_test['PassengerId']

In [61]:
# Submission from the soft-voting ensemble: passenger id plus predicted label.
submission_soft = pd.DataFrame({
    'PassengerId': ID,
    'Survived': final_vc_soft.predict(test_x),
})
submission_soft.to_csv('submission_soft.csv', index=False, encoding='utf-8')

In [62]:
# Submission from the hard-voting ensemble: passenger id plus predicted label.
submission_hard = pd.DataFrame({
    'PassengerId': ID,
    'Survived': final_vc_hard.predict(test_x),
})
submission_hard.to_csv('submission_hard.csv', index=False, encoding='utf-8')

In [67]:
# Submission from the stand-alone XGBoost model: passenger id plus predicted label.
submission_xg = pd.DataFrame({
    'PassengerId': ID,
    'Survived': xg.predict(test_x),
})
submission_xg.to_csv('submission_xg.csv', index=False, encoding='utf-8')