In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [9]:
# Load the preprocessed train/test splits and print their (rows, columns)
# shapes, one per line, as a quick sanity check.
train = pd.read_csv('data/preprocessed_train.csv')
test = pd.read_csv('data/preprocessed_test.csv')

print(train.shape, test.shape, sep='\n')

(891, 42)
(418, 41)


In [10]:
# The label column; every other column of `train` is used as a model input.
target = 'Survived'
features = [col for col in train.columns if col != target]

# Design matrices / label vector shared by every model below.
train_x = train[features]
train_y = train[target]
test_x = test[features]

# Number of rows (samples) in each split.  shape[0] is the row count;
# shape[1] would be the number of feature columns.
train_num = train_x.shape[0]
test_num = test_x.shape[0]

In [11]:
# Show how many training rows fall under each value of the label.
print(train_y.value_counts())

0.0    549
1.0    342
Name: Survived, dtype: int64


In [12]:
# Empirical class frequencies, taken from the value counts printed above:
# 549 of the 891 training rows have Survived == 0.
_n_dead, _n_rows = 549, 891.0
dead_weight = _n_dead / _n_rows
survived_weight = 1.0 - dead_weight

In [33]:
from sklearn.linear_model import SGDClassifier, LogisticRegression, RidgeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

Separate the classifiers into two groups: those that can predict class probabilities (and can therefore be used in a soft-voting classifier) and those that cannot.

### Group 1: Classifiers with a `predict_proba` method

In [20]:
# SGD classifier with an elastic-net penalty: search the regularisation
# strength (alpha) and the L1/L2 mix (l1_ratio) with 5-fold CV accuracy.
sgd = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.0005,
    l1_ratio=0.87,
    learning_rate='optimal',
    max_iter=100000,
    class_weight='balanced',
    random_state=3,
    n_jobs=4,
)
params = {
    'alpha': [0.05, 0.01, 0.005, 0.001],
    'l1_ratio': [1.0, 0.8, 0.6],
}
gs = GridSearchCV(
    estimator=sgd,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
    verbose=1,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 31.1min finished


{'l1_ratio': 0.6, 'alpha': 0.005} 0.808082454455


In [21]:
# With alpha / l1_ratio fixed to the best values found above, compare a
# handful of random seeds for the SGD classifier.
sgd = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    alpha=0.005,
    l1_ratio=0.6,
    learning_rate='optimal',
    max_iter=100000,
    class_weight='balanced',
    random_state=3,
    n_jobs=2,
)
params = {'random_state': [1, 2, 3, 4, 5]}
gs = GridSearchCV(
    estimator=sgd,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 3} 0.808082454455


In [22]:
# Logistic regression: compare the L1 and L2 penalties by 5-fold CV accuracy.
lr = LogisticRegression(
    class_weight='balanced',
    random_state=3,
    max_iter=100000,
)
params = {'penalty': ['l1', 'l2']}
gs = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'penalty': 'l2'} 0.809243996089


In [23]:
# Logistic regression with the L2 penalty selected above: compare random seeds.
lr = LogisticRegression(
    penalty='l2',
    class_weight='balanced',
    random_state=3,
    max_iter=100000,
)
params = {'random_state': [1, 2, 3, 4, 5]}
gs = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.809243996089


In [24]:
# use lbfgs for small data set
mlp = MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs', alpha=0.001, random_state=3, 
                    max_iter=10000)
params = {
    'alpha': [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005],
}
gs =  GridSearchCV(estimator=mlp, param_grid = params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'alpha': 0.0005} 0.813776395167


In [25]:
# use lbfgs for small data set
mlp = MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs', alpha=0.0005, random_state=3, 
                    max_iter=10000)
params = {
    'random_state': [1,2,3,4,5]
}
gs =  GridSearchCV(estimator=mlp, param_grid = params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 3} 0.813776395167


In [26]:
# use lbfgs for small data set
rf = RandomForestClassifier(n_estimators = 1200, n_jobs = 2, class_weight='balanced',
                            random_state=3,
                            min_samples_split=2, min_weight_fraction_leaf=0.0,
                            min_impurity_decrease=0.0,)
params = {
    'min_samples_split': [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5],
    'min_weight_fraction_leaf': [0.0, 0.1, 0.2, 0.3, 0.4, 0.49],
    'min_impurity_decrease': [0.2, 0.1, 0.05, 0.01, 0.005, 0.001],
}
gs =  GridSearchCV(estimator=rf, param_grid = params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_impurity_decrease': 0.001, 'min_samples_split': 0.01, 'min_weight_fraction_leaf': 0.0} 0.831697429629


In [39]:
# use lbfgs for small data set
rf = RandomForestClassifier(n_estimators = 1200, n_jobs = 2, class_weight='balanced',
                            random_state=3,
                            min_samples_split=0.01, min_weight_fraction_leaf=0.0,
                            min_impurity_decrease=0.001)
params = {
    'random_state': [1,2,3,4,5]
}
gs =  GridSearchCV(estimator=rf, param_grid = params, scoring='accuracy', iid=False, cv=5)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 3} 0.831697429629


In [37]:
# Multinomial naive Bayes: search the smoothing strength and the class prior.
# The three candidate priors are built from the class frequencies computed
# earlier (survived-first, uniform, dead-first).
# NOTE(review): presumably class_prior follows the sorted class order
# [0, 1] = [dead, survived] -- confirm which of the two orderings is intended.
nb = MultinomialNB()
params = {
    'alpha': [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
    'fit_prior': [True, False],
    'class_prior': [
        [survived_weight, dead_weight],
        [.5, .5],
        [dead_weight, survived_weight],
    ],
}
gs = GridSearchCV(
    estimator=nb,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'class_prior': [0.5, 0.5], 'alpha': 0.2, 'fit_prior': True} 0.707246965817


In [43]:
# XGBoost, stage 1: search the learning rate together with the row
# (subsample) and column (colsample_bytree) sampling fractions.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=1,
    colsample_bytree=1,
    min_child_weight=0.0,
    gamma=0.0,
    reg_alpha=0,
    reg_lambda=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0,
    n_jobs=2,
)
params = {
    'learning_rate': [0.001, 0.005, 0.01, 0.05],
    'subsample': [1.0, 0.8, 0.6],
    'colsample_bytree': [1.0, 0.8, 0.6],
}
gs = GridSearchCV(
    estimator=xg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.01} 0.848513770721


In [45]:
# XGBoost, stage 2: sampling parameters fixed to the stage-1 winners;
# search min_child_weight and gamma.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=.8,
    colsample_bytree=.8,
    min_child_weight=0.0,
    gamma=0.0,
    reg_alpha=0,
    reg_lambda=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0,
    n_jobs=2,
)
params = {
    'min_child_weight': [0.01, 0.1, 0.2, 0.3, 0.4, 0.5],
    'gamma': [0.0, 0.01, 0.05, 0.1, 0.2, 0.4],
}
gs = GridSearchCV(
    estimator=xg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_child_weight': 0.3, 'gamma': 0.2} 0.850748407592


In [46]:
# XGBoost, stage 3: search the L1 (reg_alpha) and L2 (reg_lambda)
# regularisation terms with everything else fixed.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=.8,
    colsample_bytree=.8,
    min_child_weight=0.3,
    gamma=0.2,
    reg_alpha=0,
    reg_lambda=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0,
    n_jobs=2,
)
params = {
    'reg_alpha': [0.0, .2, .4, .6, .8, 1.],
    'reg_lambda': [0.0, .2, .4, .6, .8, 1.],
}
gs = GridSearchCV(
    estimator=xg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'reg_lambda': 1.0, 'reg_alpha': 0.0} 0.850748407592


In [47]:
# XGBoost, stage 4: all tuned hyper-parameters fixed; compare random seeds.
xg = xgb.XGBClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=.8,
    colsample_bytree=.8,
    min_child_weight=0.3,
    gamma=0.2,
    reg_alpha=0,
    reg_lambda=1,
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=0,
    n_jobs=2,
)
params = {'random_state': [0, 1, 2, 3, 4, 5]}
gs = GridSearchCV(
    estimator=xg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 0} 0.850748407592


In [48]:
# LightGBM, stage 1: search the learning rate together with the row
# (subsample) and column (colsample_bytree) sampling fractions.
lg = lgb.LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.01,
    subsample=1.0,
    colsample_bytree=1.0,
    min_child_samples=1,
    min_child_weight=0.0,
    min_split_gain=0.01,
    reg_alpha=0,
    reg_lambda=.0,
    random_state=0,
)
params = {
    'learning_rate': [0.005, 0.01, 0.05],
    'subsample': [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
    'colsample_bytree': [1.0, 0.8, 0.6, 0.4, 0.2, 0.1],
}
gs = GridSearchCV(
    estimator=lg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.005} 0.840667362464


In [49]:
# LightGBM, stage 2: sampling parameters fixed to the stage-1 winners;
# search the leaf-size / leaf-weight / split-gain thresholds.
lg = lgb.LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.005,
    subsample=.4,
    colsample_bytree=.6,
    min_child_samples=1,
    min_child_weight=0.0,
    min_split_gain=0.01,
    reg_alpha=0,
    reg_lambda=.0,
    random_state=0,
)
params = {
    'min_child_samples': [1, 2, 4, 8, 16, 32],
    'min_child_weight': [0.2, 0.4, 0.8, 1.6, 3.2],
    'min_split_gain': [0.0, 0.01, 0.05, 0.1, 0.2, 0.4],
}
gs = GridSearchCV(
    estimator=lg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'min_split_gain': 0.0, 'min_child_weight': 1.6, 'min_child_samples': 1} 0.849649849439


In [50]:
# LightGBM, stage 3: search the L1 (reg_alpha) and L2 (reg_lambda)
# regularisation terms with everything else fixed.
lg = lgb.LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.005,
    subsample=.4,
    colsample_bytree=.6,
    min_child_samples=1,
    min_child_weight=1.6,
    min_split_gain=0.0,
    reg_alpha=0,
    reg_lambda=.0,
    random_state=0,
)
params = {
    'reg_alpha': [0.0, .2, .4, .6, .8, 1.],
    'reg_lambda': [0.0, .2, .4, .6, .8, 1.],
}
gs = GridSearchCV(
    estimator=lg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'reg_lambda': 0.0, 'reg_alpha': 0.0} 0.849649849439


In [51]:
# LightGBM, stage 4: all tuned hyper-parameters fixed; compare random seeds.
lg = lgb.LGBMClassifier(
    n_estimators=1200,
    learning_rate=0.005,
    subsample=.4,
    colsample_bytree=.6,
    min_child_samples=1,
    min_child_weight=1.6,
    min_split_gain=0.0,
    reg_alpha=0,
    reg_lambda=.0,
    random_state=0,
)
params = {'random_state': [0, 1, 2, 3, 4, 5]}
gs = GridSearchCV(
    estimator=lg,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 5} 0.849668822503


In [52]:
# 0.808082454455
sgd = SGDClassifier(loss='log', penalty='elasticnet', class_weight = 'balanced', n_jobs=2,
                    alpha=0.005, l1_ratio=0.6, random_state=3, max_iter=100000, learning_rate='optimal')

# 0.809243996089
lr = LogisticRegression(class_weight = 'balanced', penalty='l2', random_state=1, max_iter=100000)

# 0.813776395167
mlp = MLPClassifier(hidden_layer_sizes=(20,), solver='lbfgs', alpha=0.0005, random_state=3, 
                    max_iter=10000)

# 0.831697429629
rf = RandomForestClassifier(n_estimators = 1200, n_jobs = 2, class_weight='balanced',
                            random_state=3,
                            min_samples_split=0.01, min_weight_fraction_leaf=0.0,
                            min_impurity_decrease=0.001)

# 0.849668822503
lg = lgb.LGBMClassifier(n_estimators=1200,
                        learning_rate=0.005, subsample = .4, colsample_bytree=.6,
                       min_child_samples = 1, min_child_weight=1.6, min_split_gain=0.0,
                       reg_alpha=0, reg_lambda=.0, random_state=5)

# 0.850748407592
xg = xg = xgb.XGBClassifier(n_estimators = 1200, n_jobs=2,
                       learning_rate=0.01, subsample=.8, colsample_bytree=.8,
                      min_child_weight=0.3, gamma=0.2,
                      reg_alpha=0, reg_lambda=1, random_state=0, objective = 'binary:logistic', eval_metric='logloss')

### Group 2: Classifiers without a `predict_proba` method

In [53]:
# Ridge classifier: search the regularisation strength (alpha).
ridge = RidgeClassifier(
    alpha=1.0,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=3,
)
params = {'alpha': [1.0, 0.8, 0.6, 0.4, 0.2]}
gs = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'alpha': 0.6} 0.82046753886


In [54]:
# Ridge classifier with the alpha selected above: compare random seeds.
ridge = RidgeClassifier(
    alpha=0.6,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=3,
)
params = {'random_state': [1, 2, 3, 4, 5]}
gs = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'random_state': 1} 0.82046753886


In [55]:
# Final Group 2 estimator -- best 5-fold CV accuracy 0.82046753886
ridge = RidgeClassifier(
    alpha=0.6,
    normalize=True,
    class_weight='balanced',
    max_iter=10000,
    random_state=3,
)

## Use Group1 + 2 (cv accuracy < 0.83) for voting classifier

In [56]:
from sklearn.ensemble import VotingClassifier

Use hard voting, since these models are not well calibrated.

In [57]:
# Majority-vote ensemble of the four linear / MLP classifiers.
vc = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('sgd', sgd),
        ('ridge', ridge),
        ('mlp', mlp),
    ],
    voting='hard',
    n_jobs=2,
)

In [58]:
# Per-fold accuracy of the hard-voting ensemble under 5-fold cross-validation.
cross_val_score(vc, train_x, train_y, cv=5, scoring='accuracy')

array([ 0.81005587,  0.79329609,  0.81460674,  0.82022472,  0.83615819])

In [59]:
# Mean of the five fold accuracies.
# NOTE(review): the values are copied by hand from the cross_val_score output
# of the previous cell, so they go stale if that cell is re-run.
np.array([ 0.81005587,  0.79329609,  0.81460674,  0.82022472,  0.83615819]).mean()

0.81486832200000003

In [60]:
# Rebuild the same hard-voting ensemble (identical definition to the one
# cross-validated above).
vc = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('sgd', sgd),
        ('ridge', ridge),
        ('mlp', mlp),
    ],
    voting='hard',
    n_jobs=2,
)

## Use Group1 + 2 (cv accuracy < 0.83) in the stacking averaged model

In [61]:
from stacking_models_api import StackingAveragedModels
from cross_valid_api import cross_validate
from sklearn.metrics import accuracy_score

In [62]:
# Base learners for the stacking model, keyed by a short display name.
supervised_dict = dict(
    ridge=ridge,
    lr=lr,
    sgd=sgd,
    mlp=mlp,
)

In [63]:
# Stacking model (project-local class): the four base learners feed a
# random-forest meta model; accuracy is passed as the evaluation function.
sam = StackingAveragedModels(
    sl_base_models_dict=supervised_dict,
    meta_model=rf,
    target_col='Survived',
    eval_func=accuracy_score,
)

In [65]:
# Evaluate the stacking model with the project-local cross_validate helper,
# scoring with accuracy.  The positional 5 is presumably the number of
# folds -- confirm against cross_valid_api.
cross_validate(sam, train_x, train_y, 5, scoring=accuracy_score)


 sgd
score= 0.804195804196
score= 0.776223776224
score= 0.802816901408
score= 0.830985915493
score= 0.838028169014
Avg score =  0.810450113267

 mlp
score= 0.755244755245
score= 0.776223776224
score= 0.760563380282
score= 0.795774647887
score= 0.838028169014
Avg score =  0.78516694573

 lr
score= 0.755244755245
score= 0.776223776224
score= 0.788732394366
score= 0.852112676056
score= 0.838028169014
Avg score =  0.802068354181

 ridge
score= 0.804195804196
score= 0.783216783217
score= 0.781690140845
score= 0.838028169014
score= 0.852112676056
Avg score =  0.811848714666
meta model's training set score=  0.818820224719 

fold  1  valid score:  0.843575418994

 sgd
score= 0.811188811189
score= 0.804195804196
score= 0.762237762238
score= 0.788732394366
score= 0.795774647887
Avg score =  0.792425883975

 mlp
score= 0.783216783217
score= 0.839160839161
score= 0.734265734266
score= 0.830985915493
score= 0.795774647887
Avg score =  0.796680784005

 lr
score= 0.811188811189
score= 0.82517482517

### The CV result is slightly better with the stacking averaged model than with the hard-voting classifier

## Construct the final classifier by combining the stacking averaged model with rf, xgboost and lightGBM

In [73]:
# Soft-voting ensemble of the stacking model and the three tree ensembles;
# its weights are tuned in the next cell.
final_vc = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('lg', lg),
        ('xg', xg),
    ],
    voting='soft',
    n_jobs=2,
)

In [79]:
# Candidate soft-voting weights, listed in estimator order (sam, rf, lg, xg).
# np.arange(4) is [0, 1, 2, 3], i.e. a candidate that gives sam zero weight.
weights = np.arange(4)

params = {
    'weights': [
        weights,
        [.1, .2, .35, .35],
        [.05, .15, .4, .4],
        [.03, .07, .45, .45],
        [.005, .015, .49, .49],
        [.25, .25, .25, .25],
    ],
}

gs = GridSearchCV(
    estimator=final_vc,
    param_grid=params,
    scoring='accuracy',
    iid=False,
    cv=5,
)
gs.fit(train_x, train_y)
print(gs.best_params_, gs.best_score_)

{'weights': [0.03, 0.07, 0.45, 0.45]} 0.846285410919


In [80]:
# Soft-voting ensemble using the weights selected by the grid search above.
final_vc_soft = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('lg', lg),
        ('xg', xg),
    ],
    voting='soft',
    weights=[.03, .07, .45, .45],
    n_jobs=2,
)

In [81]:
# 5-fold evaluation of the weighted soft-voting ensemble with the
# project-local cross_validate helper, scored by accuracy.
cross_validate(final_vc_soft, train_x, train_y, 5, accuracy_score)

fold  1  valid score:  0.871508379888
fold  2  valid score:  0.820224719101
fold  3  valid score:  0.848314606742
fold  4  valid score:  0.837078651685
fold  5  valid score:  0.797752808989
5  fold(s) avg. valid score:  0.834975833281


In [82]:
# Fit the soft-voting ensemble on the full training set before predicting.
final_vc_soft.fit(train_x, train_y)

VotingClassifier(estimators=[('sam', StackingAveragedModels(eval_func=<function accuracy_score at 0x000000000AFFC048>,
            meta_model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurit...      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8))],
         flatten_transform=None, n_jobs=2, voting='soft',
         weights=[0.03, 0.07, 0.45, 0.45])

In [83]:
# Predicted class labels for the test set from the soft-voting ensemble.
final_vc_soft.predict(test_x)

array([ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        1.,  1.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0

In [84]:
# Per-class probabilities for the test set from the soft-voting ensemble.
final_vc_soft.predict_proba(test_x)

array([[ 0.91126272,  0.05873728],
       [ 0.76202499,  0.20797501],
       [ 0.88698991,  0.0830101 ],
       [ 0.72221612,  0.24778388],
       [ 0.39051834,  0.57948166],
       [ 0.88602573,  0.08397426],
       [ 0.41465992,  0.55534008],
       [ 0.82296474,  0.14703527],
       [ 0.12312006,  0.84687994],
       [ 0.9173401 ,  0.05265991],
       [ 0.91994093,  0.05005908],
       [ 0.9069623 ,  0.06303769],
       [ 0.02544993,  0.94455007],
       [ 0.92425085,  0.04574915],
       [ 0.04809482,  0.92190518],
       [ 0.06418925,  0.90581075],
       [ 0.82128989,  0.14871012],
       [ 0.76068599,  0.20931401],
       [ 0.40299299,  0.56700701],
       [ 0.44639423,  0.52360577],
       [ 0.85905791,  0.11094209],
       [ 0.66156198,  0.30843802],
       [ 0.05453049,  0.91546951],
       [ 0.77892372,  0.19107628],
       [ 0.04052555,  0.92947445],
       [ 0.93329621,  0.03670379],
       [ 0.02878268,  0.94121732],
       [ 0.84124129,  0.1287587 ],
       [ 0.55820983,

In [85]:
# Unweighted majority-vote variant of the final ensemble, for comparison
# with the soft-voting version.
final_vc_hard = VotingClassifier(
    estimators=[
        ('sam', sam),
        ('rf', rf),
        ('lg', lg),
        ('xg', xg),
    ],
    voting='hard',
    n_jobs=2,
)

In [86]:
# 5-fold evaluation of the hard-voting ensemble with the project-local
# cross_validate helper, scored by accuracy.
cross_validate(final_vc_hard, train_x, train_y, 5, accuracy_score)

fold  1  valid score:  0.865921787709
fold  2  valid score:  0.831460674157
fold  3  valid score:  0.837078651685
fold  4  valid score:  0.831460674157
fold  5  valid score:  0.803370786517
5  fold(s) avg. valid score:  0.833858514845


In [87]:
# Fit the hard-voting ensemble on the full training set before predicting.
final_vc_hard.fit(train_x, train_y)

VotingClassifier(estimators=[('sam', StackingAveragedModels(eval_func=<function accuracy_score at 0x000000000AFFC048>,
            meta_model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurit...      reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8))],
         flatten_transform=None, n_jobs=2, voting='hard', weights=None)

In [88]:
# Predicted class labels for the test set from the hard-voting ensemble.
final_vc_hard.predict(test_x)

array([ 0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  1.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,
        0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.,
        0.,  1.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  0.,
        1.,  1.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  1.,  1.,  0

## Two versions of the submission

In [89]:
# Re-read the raw test file; it is used below only for its PassengerId column.
orig_test = pd.read_csv('data/test.csv', encoding='utf-8')

In [90]:
# Passenger identifiers written as the first column of each submission file.
ID = orig_test['PassengerId']

In [93]:
# Submission from the soft-voting ensemble: PassengerId plus the integer
# predicted label, written without the index column.
submission_soft = pd.DataFrame(
    {
        'PassengerId': ID,
        'Survived': final_vc_soft.predict(test_x).astype('int'),
    },
    columns=['PassengerId', 'Survived'],
)
submission_soft.to_csv('submission_soft.csv', encoding='utf-8', index=False)

In [95]:
# Accuracy of the soft-voting ensemble on the data it was fitted on
# (an optimistic, in-sample figure).
soft_pred = final_vc_soft.predict(train_x).astype('int')
print("train accuracy = ", accuracy_score(y_true=train_y, y_pred=soft_pred))

train accuracy =  0.902356902357


In [94]:
# Submission from the hard-voting ensemble: PassengerId plus the integer
# predicted label, written without the index column.
submission_hard = pd.DataFrame(
    {
        'PassengerId': ID,
        'Survived': final_vc_hard.predict(test_x).astype('int'),
    },
    columns=['PassengerId', 'Survived'],
)
submission_hard.to_csv('submission_hard.csv', encoding='utf-8', index=False)

In [96]:
# Accuracy of the hard-voting ensemble on the data it was fitted on
# (an optimistic, in-sample figure).
hard_pred = final_vc_hard.predict(train_x).astype('int')
print("train accuracy = ", accuracy_score(y_true=train_y, y_pred=hard_pred))

train accuracy =  0.895622895623


In [100]:
# Load the reference submission under its own name.  Binding it to `test`
# would overwrite the preprocessed test frame loaded at the top of the
# notebook and break any re-run of the cells that build `test_x` from it.
xg_submission = pd.read_csv('xg_submission.csv')
test_pred = xg_submission['Survived']

# Number of passengers on which the reference submission and the soft-voting
# submission disagree (each True in the mask counts as 1).
print((test_pred != submission_soft['Survived']).sum())

17
