In [169]:
# Load in our libraries
import pandas as pd
import numpy as np
import sklearn
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

# import plotly.offline as py
# py.init_notebook_mode(connected=True)
# import plotly.graph_objs as go
# import plotly.tools as tls

# Going to use these 5 base models for the stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [170]:
drop_fields = ['PassengerId','Name', 'Age', 'Ticket', 'Fare', 'Cabin', 'Parch']

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_test = df_test.drop(drop_fields, axis=1)


In [171]:
SEED = 0
NFOLDS = 5
kf = KFold(NFOLDS, random_state=SEED)

class Model(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
    
    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self, x, y):
        return self.clf.fit(x, y)
    
    def feature_importances(self, x, y):
        impt = self.clf.fit(x, y).feature_importances_
        print(self.clf.fit(x, y).feature_importances_)
        return impt

In [172]:
ntrain = df_train.shape[0]
ntest = df_test.shape[0]

def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.train(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    oof_test[:] = oof_test_skf.mean(axis=0)
    
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


In [173]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [174]:
rf = Model(clf=RandomForestClassifier(), seed=SEED, params=rf_params)
et = Model(clf=ExtraTreesClassifier(), seed=SEED, params=et_params)
ada = Model(clf=AdaBoostClassifier(), seed=SEED, params=ada_params)
gb = Model(clf=GradientBoostingClassifier(), seed=SEED, params=gb_params)
svc = Model(clf=SVC(), seed=SEED, params=svc_params)

y_train = df_train['Survived'].ravel()
X_train = df_train.drop(['Survived'], axis=1).values
X_test = df_test.values

In [175]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,X_train, y_train, X_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,X_train, y_train, X_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,X_train, y_train, X_test) # Support Vector Classifier


In [176]:
rf_features = rf.feature_importances(X_train,y_train)
et_features = et.feature_importances(X_train, y_train)
ada_features = ada.feature_importances(X_train, y_train)
gb_features = gb.feature_importances(X_train,y_train)

all_features = [rf_features, et_features, ada_features, gb_features]

[ 0.12682376  0.23132078  0.04545502  0.04854062  0.12886313  0.10228713
  0.17141684  0.05488931  0.07712272  0.01328069]
[ 0.12357328  0.25592101  0.05254109  0.04143566  0.12563654  0.11030774
  0.16147527  0.03869418  0.06660063  0.0238146 ]
[ 0.02  0.02  0.16  0.02  0.08  0.12  0.24  0.08  0.22  0.04]
[ 0.07484368  0.0319191   0.06877165  0.07686062  0.12447173  0.20441011
  0.22295844  0.03177683  0.13812493  0.02586291]


In [177]:
cols = df_train.drop(['Survived'], axis=1).columns.values
# Create a dataframe with features
feature_df = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })
feature_df['mean'] = feature_df.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_df

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,features,mean
0,0.02,0.112723,0.074862,0.088406,Pclass,0.073998
1,0.02,0.289591,0.037234,0.237225,Sex,0.146013
2,0.16,0.046461,0.070212,0.054787,SibSp,0.082865
3,0.02,0.051605,0.076861,0.040714,Embarked,0.047295
4,0.08,0.121742,0.124476,0.112215,CategoricalAge,0.109608
5,0.12,0.109313,0.199129,0.12822,CategoricalFare,0.139166
6,0.24,0.133166,0.217637,0.183274,Title,0.193519
7,0.08,0.059287,0.031759,0.044055,HasCabin,0.053775
8,0.24,0.059413,0.150475,0.095625,FamilySize,0.136378
9,0.02,0.016699,0.017356,0.015479,IsAlone,0.017383


In [189]:
base_predictions_train = pd.DataFrame({
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel()
})
base_predictions_train.head()
df_train

Unnamed: 0,Survived,Pclass,Sex,SibSp,Embarked,CategoricalAge,CategoricalFare,Title,HasCabin,FamilySize,IsAlone
0,0,3,0,1,2,1,0,1,0,2,0
1,1,1,1,1,0,2,4,3,1,2,0
2,1,3,1,0,2,1,1,2,0,1,1
3,1,1,1,1,2,2,4,3,1,2,0
4,0,3,0,0,2,2,1,1,0,1,1
5,0,3,0,0,1,1,1,1,0,1,1
6,0,1,0,0,2,4,4,1,1,1,1
7,0,3,0,3,2,0,2,4,0,5,0
8,1,3,1,0,2,2,2,3,0,3,0
9,1,2,1,1,0,1,3,3,0,2,0


In [179]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [187]:
gbm = XGBClassifier(
#     learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_train)

In [188]:
accuracy = accuracy_score(y_train, predictions)

print("Accuracy on training set: {:.3f}".format(accuracy))

Accuracy on training set: 0.838


In [186]:
res = gbm.predict(x_test)
df_result = pd.DataFrame()
test = pd.read_csv('test.csv')
df_result["PassengerId"] = test["PassengerId"]
df_result["Survived"] = res.ravel()
df_result.to_csv("results.csv", sep=",", index=False)