In [2]:
import pandas as pd
import numpy as np
import re
import sklearn
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

import warnings
warnings.filterwarnings('ignore')

# Going to use these 5 base models for the stacking
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC
from sklearn.cross_validation import KFold



In [3]:
train = pd.read_csv("transformedTrain.csv")
test = pd.read_csv("transformedtest.csv")


In [49]:
train.head()

Unnamed: 0.1,Unnamed: 0,Months since Last Donation,Number of Donations,Months since First Donation,Made Donation in March 2007
0,619,0.027027,1.0,1.0,1
1,664,0.0,0.244898,0.270833,1
2,441,0.013514,0.306122,0.34375,1
3,160,0.027027,0.387755,0.447917,1
4,358,0.013514,0.469388,0.78125,0


In [1]:
test.head()

NameError: name 'test' is not defined

In [51]:
# Some useful parameters which will come in handy later on
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 10 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer

In [52]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [53]:
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth':4,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':600,
    #'max_features': 0.5,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [54]:
# Create 5 objects that represent our 4 models
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)

In [55]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
y_train = train['Made Donation in March 2007'].ravel()
train = train.drop(['Made Donation in March 2007'], axis=1)
x_train = train.values # Creates an array of the train data
x_test = test.values # Creats an array of the test data

In [56]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

In [57]:
print("Training is complete")

Training is complete


In [58]:
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

[ 0.25704423  0.29722802  0.22591438  0.21981337]
[ 0.07165522  0.447296    0.38145393  0.09792818]
[ 0.7    0.058  0.062  0.18 ]
[ 0.58533262  0.09918364  0.12939766  0.18608608]


In [59]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,1.0
3,1.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0


In [60]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)

In [61]:
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 500,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
pred = gbm.predict_proba(x_test)


In [62]:
pred2 = pred[:,1]
pred2

array([ 0.32976717,  0.32976717,  0.17898694,  0.26648009,  0.17898694,
        0.84525752,  0.17898694,  0.26648009,  0.17898694,  0.26648009,
        0.17898694,  0.17898694,  0.17898694,  0.17898694,  0.17898694,
        0.17898694,  0.17898694,  0.17898694,  0.17898694,  0.26839969,
        0.17898694,  0.17898694,  0.17898694,  0.17898694,  0.17898694,
        0.17898694,  0.17898694,  0.26648009,  0.17898694,  0.32976717,
        0.17898694,  0.17898694,  0.17898694,  0.32976717,  0.17898694,
        0.17898694,  0.17898694,  0.26648009,  0.17898694,  0.17898694,
        0.17898694,  0.26648009,  0.26648009,  0.32976717,  0.26648009,
        0.17898694,  0.17898694,  0.26648009,  0.26648009,  0.26839969,
        0.32976717,  0.17898694,  0.26648009,  0.26648009,  0.17898694,
        0.17898694,  0.17898694,  0.32976717,  0.26648009,  0.32976717,
        0.17898694,  0.17898694,  0.17898694,  0.32976717,  0.17898694,
        0.26648009,  0.17898694,  0.17898694,  0.17898694,  0.17

In [64]:
submission=pd.DataFrame({"":test['Unnamed: 0'],"Made Donation in March 2007":pred2})
submission.to_csv("RTransformedPython.csv",index=False)