In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation / chained-assignment warnings raised by
# the cells below — consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors, RadiusNeighborsClassifier
# from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, LinearRegression, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve, RandomizedSearchCV
# from sklearn.tree import DecisionTreeClassifier 
# from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, LabelEncoder
# from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import re
from xgboost import XGBClassifier

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# from scipy.sparse import csr_matrix, hstack, vstack
%matplotlib inline

In [None]:
# Seed passed as `random_state` to the splitters and models in the cells below.
RANDOM_STATE = 5
# Raw train/test tables, indexed by the PassengerId column.
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

In [None]:
# Reload the raw CSV files so this cell can be rerun on its own.
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

# Separate the label from the predictors. The test frame is copied so the raw
# `test` table is not touched by the in-place feature engineering that follows.
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']
X_test = test.copy()

# Both frames receive the same transformations.
X_full = [X_train, X_test]

def get_title(name):
    """Return the honorific contained in a passenger name.

    The title is the first run of ASCII letters that is immediately followed
    by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without the trailing period, or "" when the name
        contains no such pattern.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Training-set statistics used for imputation. They are computed once, *before* the
# loop: the loop overwrites X_train['Age'] and X_train['Fare'] with bin codes, so
# reading them inside the loop would hand the test frame statistics of the bin codes
# instead of the raw values.
fare_median = X_train['Fare'].median()
age_mean = X_train['Age'].mean()
age_std = X_train['Age'].std()

# Seed the global NumPy RNG so the random age imputation is reproducible on rerun.
np.random.seed(RANDOM_STATE)

for df in X_full:
    # Family features.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Port of embarkation: missing values become 'S', then encode as integers.
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['Fare'] = df['Fare'].fillna(fare_median)

    # Fill missing ages with random integers drawn around the training mean
    # (one standard deviation either side).
    age_missing = df['Age'].isnull()
    age_null_random_list = np.random.randint(age_mean - age_std, age_mean + age_std,
                                             size=age_missing.sum())
    # Single .loc assignment instead of chained indexing, so the write always
    # lands in `df` itself rather than in a temporary copy.
    df.loc[age_missing, 'Age'] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

    # Title extracted from the name; uncommon titles are pooled into 'Rare' and
    # spelling variants are merged before the integer encoding.
    df['Title'] = df['Name'].apply(get_title)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)  # titles outside the mapping

    # Mapping Fare into four ordinal bins.
    df.loc[df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare'] = 2
    df.loc[df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)

    # Mapping Age into five ordinal bins.
    df.loc[df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[df['Age'] > 64, 'Age'] = 4

    # In place on purpose: `df` is the very object bound to X_train / X_test.
    df.drop(['Name', 'Ticket', 'Cabin', 'SibSp'], axis=1, inplace=True)

In [None]:
# Keep handles on the complete engineered training set before it is split.
X = X_train
y = y_train

# Hold out a third of the rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=RANDOM_STATE,
)

In [None]:
# Number of non-null labels left in the training part after the hold-out split.
y_train.count()

### My version 

In [None]:
# Level-0 models; their predictions become the input features of the meta-model.
classifiers = {'svc': SVC(random_state=RANDOM_STATE),
               'randomforest': RandomForestClassifier(n_estimators=500,
                                                      random_state=RANDOM_STATE,
                                                      n_jobs=-1,
                                                      max_depth=8,
                                                      min_samples_leaf=1,
                                                      max_features=7)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# One column per base model, aligned with the rows of X_train / X_valid.
new_train = pd.DataFrame(index=X_train.index)
new_valid = pd.DataFrame(index=X_valid.index)

for classifier, clf in classifiers.items():
    # Out-of-fold predictions: every training row is predicted by a model that
    # did not see it during fitting. They are collected in a plain array and
    # assigned as a whole column, because `frame[col].iloc[idx] = ...` is chained
    # assignment and is not guaranteed to write into the frame.
    oof_preds = np.zeros(X_train.shape[0], dtype=int)
    for train_idx, test_idx in skf.split(X_train, y_train):
        X_train_split, y_train_split = X_train.iloc[train_idx], y_train.iloc[train_idx]
        X_test_split = X_train.iloc[test_idx]
        clf.fit(X_train_split, y_train_split)
        oof_preds[test_idx] = clf.predict(X_test_split)
    new_train[classifier] = oof_preds

    # Refit on the whole training part to produce the hold-out features.
    clf.fit(X_train, y_train)
    new_valid[classifier] = clf.predict(X_valid)
                    

In [None]:
# Meta-model: XGBoost with default settings, fitted on the base models'
# out-of-fold predictions and applied to their hold-out predictions.
xgb = XGBClassifier()
xgb.fit(new_train, y_train)
y_valid_preds = xgb.predict(new_valid)


In [None]:

# roc_auc_score takes (y_true, y_score): ground truth first, predictions second.
# NOTE(review): y_valid_preds holds hard class labels from predict(); scoring
# predict_proba output would give a more informative AUC — confirm intent.
print('ROC AUC:', roc_auc_score(y_valid, y_valid_preds))

In [None]:
# Same level-0 models as in the hold-out experiment, now trained on all labelled rows.
classifiers = {'svc': SVC(random_state=RANDOM_STATE),
               'randomforest': RandomForestClassifier(n_estimators=500,
                                                      random_state=RANDOM_STATE,
                                                      n_jobs=-1,
                                                      max_depth=8,
                                                      min_samples_leaf=1,
                                                      max_features=7)}
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# One column per base model, aligned with the rows of X / X_test.
new_train = pd.DataFrame(index=X.index)
new_test = pd.DataFrame(index=X_test.index)

for classifier, clf in classifiers.items():
    # Out-of-fold predictions are gathered in an array and assigned as a whole
    # column; `frame[col].iloc[idx] = ...` is chained assignment and is not
    # guaranteed to write into the frame.
    oof_preds = np.zeros(X.shape[0], dtype=int)
    for train_idx, test_idx in skf.split(X, y):
        X_train_split, y_train_split = X.iloc[train_idx], y.iloc[train_idx]
        X_test_split = X.iloc[test_idx]
        clf.fit(X_train_split, y_train_split)
        oof_preds[test_idx] = clf.predict(X_test_split)
    new_train[classifier] = oof_preds

    # Refit on every labelled row to produce the test-set features.
    clf.fit(X, y)
    new_test[classifier] = clf.predict(X_test)
                    

In [None]:
# Peek at the stacked training features (one column per base model).
new_train.head()

In [None]:
# Meta-model for the submission: fitted on the out-of-fold predictions for all
# labelled rows (with labels y), then applied to the base models' test predictions.
xgb = XGBClassifier()
xgb.fit(new_train, y)
y_preds = xgb.predict(new_test)


### Submission #001

In [None]:
# Predictions as a one-column frame keyed by the test index, followed by the
# predicted class balance as a quick sanity check.
y_fin = pd.DataFrame({'Survived': y_preds}, index=X_test.index)
y_fin['Survived'].value_counts()

In [None]:
# Write the submission; the frame's index is emitted under the 'PassengerId' label.
# NOTE(review): presumably the 'submissions/' directory must already exist — verify.
y_fin.to_csv('submissions/ensemble_001.csv', header=True, index_label='PassengerId')

### arthurtok's version

In [None]:
# Reload the raw CSV files, this time keeping PassengerId as an ordinary column.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the label from the predictors. The test frame is copied so the raw
# `test` table (still needed for its PassengerId column) is left untouched.
X_train = train.drop(columns=['Survived'])
y_train = train['Survived']
X_test = test.copy()

# Both frames receive the same transformations.
X_full = [X_train, X_test]

def get_title(name):
    """Return the honorific contained in a passenger name.

    The title is the first run of ASCII letters that is immediately followed
    by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str
        The matched title without the trailing period, or "" when the name
        contains no such pattern.
    """
    # Raw string: '\.' inside a normal literal is an invalid escape sequence.
    title_search = re.search(r'([A-Za-z]+)\.', name)
    return title_search.group(1) if title_search else ""

# Training-set statistics used for imputation. They are computed once, *before* the
# loop: the loop overwrites X_train['Age'] and X_train['Fare'] with bin codes, so
# reading them inside the loop would hand the test frame statistics of the bin codes
# instead of the raw values.
fare_median = X_train['Fare'].median()
age_mean = X_train['Age'].mean()
age_std = X_train['Age'].std()

# Seed the global NumPy RNG so the random age imputation is reproducible on rerun.
np.random.seed(RANDOM_STATE)

for df in X_full:
    # Family features.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Port of embarkation: missing values become 'S', then encode as integers.
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['Fare'] = df['Fare'].fillna(fare_median)

    # Fill missing ages with random integers drawn around the training mean
    # (one standard deviation either side).
    age_missing = df['Age'].isnull()
    age_null_random_list = np.random.randint(age_mean - age_std, age_mean + age_std,
                                             size=age_missing.sum())
    # Single .loc assignment instead of chained indexing, so the write always
    # lands in `df` itself rather than in a temporary copy.
    df.loc[age_missing, 'Age'] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)

    # Title extracted from the name; uncommon titles are pooled into 'Rare' and
    # spelling variants are merged before the integer encoding.
    df['Title'] = df['Name'].apply(get_title)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don',
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)  # titles outside the mapping

    # Mapping Fare into four ordinal bins.
    df.loc[df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare'] = 2
    df.loc[df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)

    # Mapping Age into five ordinal bins.
    df.loc[df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[df['Age'] > 64, 'Age'] = 4

    # In place on purpose: `df` is the very object bound to X_train / X_test.
    # PassengerId is an ordinary column in this version, so it is dropped as well.
    df.drop(['Name', 'Ticket', 'Cabin', 'SibSp', 'PassengerId'], axis=1, inplace=True)

In [None]:
# Keep handles on the complete engineered training set before it is split.
X = X_train
y = y_train

# Hold out a third of the rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.33,
    random_state=RANDOM_STATE,
)

In [None]:
# Some useful parameters which will come in handy later on
ntrain = X_train.shape[0]  # rows in the training part
ntest = X_test.shape[0]    # rows in the test set
NFOLDS = 5  # number of folds for out-of-fold prediction
# Build the splitter from NFOLDS so the fold count and the constant cannot drift apart.
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=RANDOM_STATE)

# Thin wrapper that gives every sklearn-style estimator the same small interface
class SklearnHelper(object):
    """Instantiate an estimator class and expose train / predict / fit helpers.

    Parameters
    ----------
    clf : type
        Estimator class (not an instance), e.g. ``RandomForestClassifier``.
    random_state : int
        Seed injected into the constructor arguments for every estimator class
        except ``KNeighborsClassifier``.
    params : dict or None
        Keyword arguments for the estimator constructor. ``None`` means no
        extra arguments. The dict passed in is never modified.
    """

    def __init__(self, clf, random_state=RANDOM_STATE, params=None):
        # Work on a copy: the default params=None must not crash on item
        # assignment, and the caller's dict must not silently gain a key.
        params = dict(params) if params is not None else {}
        if clf != KNeighborsClassifier:
            params['random_state'] = random_state
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the wrapped estimator on (x, y) and return its ``feature_importances_``."""
        return self.clf.fit(x, y).feature_importances_

In [None]:

def get_oof(clf, x_train, y_train, x_test, cv=None):
    """Compute out-of-fold predictions of one base model for stacking.

    For every fold the model is trained on the remaining folds, predicts the
    held-out rows of ``x_train`` and predicts the whole of ``x_test``.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : pandas objects
        Training features and labels; rows are selected positionally with ``.iloc``.
    x_test : array-like
        Test features handed to ``clf.predict`` once per fold.
    cv : splitter, optional
        Object with a ``split(x, y)`` method yielding (train_index, test_index)
        pairs. Defaults to the notebook-level ``skf``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Column vectors of shape (len(x_train), 1) and (len(x_test), 1): the
        out-of-fold training predictions and the test predictions averaged
        over the folds.
    """
    if cv is None:
        cv = skf

    # Size the output from the arguments rather than from notebook globals, so
    # the function stays correct for whichever frames it is called with.
    oof_train = np.zeros((x_train.shape[0],))
    fold_test_preds = []

    for train_index, test_index in cv.split(x_train, y_train):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        fold_test_preds.append(clf.predict(x_test))

    # Average the per-fold test predictions row by row.
    oof_test = np.mean(fold_test_preds, axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Constructor arguments for each candidate base model.

# Random Forest
rf_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    verbose=0,
)

# Extra Trees
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost
ada_params = dict(
    n_estimators=500,
    learning_rate=0.75,
)

# Gradient Boosting
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

# Support Vector Classifier
svc_params = dict(
    kernel='linear',
    C=0.025,
)

# K-Nearest Neighbours
knn_params = dict(
    algorithm='auto',
    leaf_size=25,
    n_neighbors=9,
    p=1,
    weights='uniform',
    n_jobs=-1,
)

In [None]:
# Create the 3 base-model wrappers used in this run: random forest, SVC and KNN.
# The Extra Trees / AdaBoost / Gradient Boosting wrappers are currently disabled.
rf = SklearnHelper(clf=RandomForestClassifier, random_state=RANDOM_STATE, params=rf_params)
# et = SklearnHelper(clf=ExtraTreesClassifier, random_state=RANDOM_STATE, params=et_params)
# ada = SklearnHelper(clf=AdaBoostClassifier, random_state=RANDOM_STATE, params=ada_params)
# gb = SklearnHelper(clf=GradientBoostingClassifier, random_state=RANDOM_STATE, params=gb_params)
svc = SklearnHelper(clf=SVC, random_state=RANDOM_STATE, params=svc_params)
knn = SklearnHelper(clf=KNeighborsClassifier, random_state=RANDOM_STATE, params=knn_params)

In [None]:
# Peek at the engineered training features that feed the base models.
X_train.head()

In [None]:
# Create our OOF train and test predictions. These base results will be used as new features.
# Each call returns a pair of column vectors: (out-of-fold train predictions, fold-averaged test predictions).
# et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees (disabled)
rf_oof_train, rf_oof_test = get_oof(rf, X_train, y_train, X_test) # Random Forest
# ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost (disabled)
# gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost (disabled)
svc_oof_train, svc_oof_test = get_oof(svc, X_train, y_train, X_test) # Support Vector Classifier
knn_oof_train, knn_oof_test = get_oof(knn, X_train, y_train, X_test) # KNN Classifier

In [None]:
# Refit the random forest on the whole training part and read its feature importances
# (SklearnHelper.feature_importances calls fit before reading the attribute).
rf_features = rf.feature_importances(X_train, y_train)
# et_feature = et.feature_importances(x_train, y_train)
# ada_feature = ada.feature_importances(x_train, y_train)
# gb_feature = gb.feature_importances(x_train,y_train)


In [None]:
# Feature names paired with the random-forest importances, one row per feature.
cols = X_train.columns.values
feature_dataframe = pd.DataFrame({
    'features': cols,
    'Random Forest feature importances': rf_features,
})

In [None]:
# Scatter plot of the random-forest importances: one fixed-size marker per feature,
# coloured by its importance value.
trace = go.Scatter(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['Random Forest feature importances'].values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 25,
        'color': feature_dataframe['Random Forest feature importances'].values,
        'colorscale': 'Portland',
        'showscale': True,
    },
    text=feature_dataframe['features'].values,
)
data = [trace]

layout = go.Layout(
    autosize=True,
    title='Random Forest Feature Importance',
    hovermode='closest',
    yaxis={
        'title': 'Feature Importance',
        'ticklen': 5,
        'gridwidth': 2,
    },
    showlegend=False,
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

In [None]:
# Render the current `fig` again, this time without a filename.
py.iplot(fig)

In [None]:
# Row-wise mean over the importance columns only. The string 'features' column is
# excluded explicitly (a mean over mixed string/float columns raises on current
# pandas), and so is a 'mean' column left over from an earlier run of this cell,
# which keeps the cell idempotent.
feature_dataframe['mean'] = (
    feature_dataframe
    .drop(columns=['features', 'mean'], errors='ignore')
    .mean(axis=1)
)
feature_dataframe.head(3)

In [None]:
# Bar chart of the mean feature importance. The series are passed inline instead of
# through notebook-level `x` / `y` variables: `y` already holds the full set of
# training labels and must not be overwritten by plotting code.
data = [go.Bar(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['mean'].values,
    width=0.5,
    marker=dict(
        color=feature_dataframe['mean'].values,
        colorscale='Portland',
        showscale=True,
        reversescale=False
    ),
    opacity=0.6
)]

layout = go.Layout(
    autosize=True,
    title='Barplots of Mean Feature Importance',
    hovermode='closest',
    yaxis=dict(
        title='Feature Importance',
        ticklen=5,
        gridwidth=2
    ),
    showlegend=False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')

In [None]:
# Out-of-fold training predictions of every active base model, side by side.
base_predictions_train = pd.DataFrame({
    'RandomForest': rf_oof_train.ravel(),
    'SVC': svc_oof_train.ravel(),
    'knn': knn_oof_train.ravel(),
})
base_predictions_train.head()


In [None]:
# Heatmap of the pairwise correlation between the base models' predictions.
data = [go.Heatmap(
    z=base_predictions_train.astype(float).corr().values,
    x=base_predictions_train.columns.values,
    y=base_predictions_train.columns.values,
    colorscale='Viridis',
    showscale=True,
    reversescale=True,
)]
py.iplot(data, filename='labelled-heatmap')

In [None]:
# Second-level feature matrices: one column per base model (random forest, SVC).
x_train = np.concatenate((rf_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate((rf_oof_test, svc_oof_test), axis=1)

In [None]:
# Second-level model: fit on the stacked out-of-fold predictions (x_train) rather
# than on the raw feature frame X_train, and predict from the matching stacked test
# features. Using the raw frames would leave the base models' output unused.
gbm = XGBClassifier(n_jobs=-1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

### submission #002

In [None]:
# Generate Submission File: passenger ids from the raw test table next to the predictions.
StackingSubmission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})
StackingSubmission.to_csv("submissions/ensemble_002.csv", index=False)

In [None]:
# Sanity check: the two numbers should match (one prediction per test row).
test.shape[0], len(predictions)

In [None]:
# Second-level model: fit on the stacked out-of-fold predictions (x_train) rather
# than on the raw feature frame X_train, and predict from the matching stacked test
# features. Using the raw frames would leave the base models' output unused.
gbm = XGBClassifier(n_jobs=-1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

### submission #002

In [None]:
# Generate Submission File: passenger ids from the raw test table next to the predictions.
StackingSubmission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})
StackingSubmission.to_csv("submissions/ensemble_002.csv", index=False)

In [None]:
# Tuning: exhaustive grid search over XGBoost hyper-parameters, scored by CV accuracy.
# NOTE(review): the grid holds 7 * 3 * 11 * 6 * 6 * 5 = 41,580 combinations, each
# evaluated on every fold of `skf` — expect a very long run.
# NOTE(review): the search runs on the raw feature frame X_train, not on the stacked
# x_train — confirm this is intended.
gbm = XGBClassifier(n_jobs=-1)
gs = GridSearchCV(
    estimator=gbm,
    param_grid={
        'max_depth': [4, 5, 6, 7, 8, 9, 10],
        'min_child_weight': [2, 3, 4],
        'gamma': np.linspace(0, 1., 11),
        'subsample': np.linspace(0.5, 1., 6),
        'colsample_bytree': np.linspace(0.5, 1., 6),
        'scale_pos_weight': np.linspace(0.6, 1., 5),
    },
    scoring='accuracy',
    cv=skf,
)
gs.fit(X_train, y_train);
gs.best_params_

In [None]:
# Compare the raw training table's shape with the engineered training part's shape.
train.shape, X_train.shape

In [None]:
# Tuning: randomized search over the XGBoost hyper-parameter space, scored by CV accuracy.
# random_state pins which candidates are sampled, so best_params_ is reproducible.
# NOTE(review): the search runs on the raw feature frame X_train, not on the stacked
# x_train — confirm this is intended.
gbm = XGBClassifier(n_jobs=-1)
n_iter_search = 100
rs = RandomizedSearchCV(estimator=gbm,
                        param_distributions={'max_depth': [4, 5, 6, 7, 8, 9, 10],
                                             'min_child_weight': [2, 3, 4],
                                             'gamma': np.linspace(0, 1., 11),
                                             'subsample': np.linspace(0.5, 1., 6),
                                             'colsample_bytree': np.linspace(0.5, 1., 6),
                                             'scale_pos_weight': np.linspace(0.6, 1., 5)},
                        scoring='accuracy', n_iter=n_iter_search, cv=skf,
                        random_state=RANDOM_STATE)
rs.fit(X_train, y_train);
rs.best_params_

In [None]:
# Tuning: same randomized search with a larger budget of sampled candidates.
# random_state pins which candidates are sampled, so best_params_ is reproducible.
# NOTE(review): the search runs on the raw feature frame X_train, not on the stacked
# x_train — confirm this is intended.
gbm = XGBClassifier(n_jobs=-1)
n_iter_search = 500
rs = RandomizedSearchCV(estimator=gbm,
                        param_distributions={'max_depth': [4, 5, 6, 7, 8, 9, 10],
                                             'min_child_weight': [2, 3, 4],
                                             'gamma': np.linspace(0, 1., 11),
                                             'subsample': np.linspace(0.5, 1., 6),
                                             'colsample_bytree': np.linspace(0.5, 1., 6),
                                             'scale_pos_weight': np.linspace(0.6, 1., 5)},
                        scoring='accuracy', n_iter=n_iter_search, cv=skf,
                        random_state=RANDOM_STATE)
rs.fit(X_train, y_train);
rs.best_params_

### submission #003

In [None]:
# XGBoost with hand-entered hyper-parameters, fitted on the engineered features.
# NOTE(review): presumably these values come from the searches above — confirm.
params = dict(
    n_estimators=2000,
    subsample=1.0,
    scale_pos_weight=0.8,
    min_child_weight=3,
    max_depth=5,
    gamma=0.9,
    colsample_bytree=0.5,
    n_jobs=-1,
)
gbm = XGBClassifier(**params).fit(X_train, y_train)
predictions = gbm.predict(X_test)

In [None]:
# Submission file: passenger ids from the raw test table next to the predictions.
StackingSubmission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})
StackingSubmission.to_csv("submissions/ensemble_003.csv", index=False)

### submission #004

In [None]:
# Second-level feature matrices: one column per base model (random forest, SVC, KNN).
x_train = np.concatenate((rf_oof_train, svc_oof_train, knn_oof_train), axis=1)
x_test = np.concatenate((rf_oof_test, svc_oof_test, knn_oof_test), axis=1)

In [None]:
# Second-level model: fit on the stacked out-of-fold predictions (x_train) rather
# than on the raw feature frame X_train, and predict from the matching stacked test
# features. Using the raw frames would leave the base models' output unused.
gbm = XGBClassifier(n_jobs=-1).fit(x_train, y_train)
predictions = gbm.predict(x_test)

In [None]:
# Generate Submission File: passenger ids from the raw test table next to the predictions.
StackingSubmission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': predictions,
})
StackingSubmission.to_csv("submissions/ensemble_004.csv", index=False)