In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split,
                                    GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import Imputer
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
import xgboost as xgb

from mlutils import *
%load_ext autoreload
%autoreload 2

# SEED = 42 # Initial results were performed with a SEED of 42, but let's change things up.
SEED = 100

In [2]:
train = pd.read_csv('data/train_final.csv')
test = pd.read_csv('data/test_final.csv')
train = train.drop(['id'], axis=1)
test = test.drop(['id'], axis=1)

# Divide dataset into X and y
y = train.Y
X = train.drop(["Y"], axis=1)
X_test = test

In [3]:
# Impute missing features
from sklearn.preprocessing import Imputer

# train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
train_xform = imp.fit_transform(X)
# TODO: Impute dataframe so that F5 uses median
# and F19 uses mean. For now, we'll impute via mean for both.

X = pd.DataFrame(train_xform, columns=X.columns)
test_xform = imp.transform(X_test)
X_test = pd.DataFrame(test_xform, columns=X_test.columns)

X.head(5)

# Split data.
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.10, random_state=SEED)
X_train = X
y_train = y

# This file demonstrates a VotingClassifier on all of the models.

In [4]:
import xgboost as xgb
from sklearn.ensemble import BaggingClassifier

# For this example, train on all examples with risk of public leaderboard overfitting.

params_xgb = {'max_depth': 5,
         'learning_rate': 0.025, 
         'n_estimators': 724,
         'silent': False,
         'objective': 'binary:logistic',
         'nthread': 4,
         'gamma': 0.6000000000000001, 
         'min_child_weight': 2.0, 
         'subsample': 0.9, 
         'colsample_bytree': 0.5, 
#          'tree_method': 'exact', 
         'seed': SEED,      
#          'booster': 'gbtree', 
#          'eval_metric': 'auc'
}

gbm_model = xgb.XGBClassifier(**params_xgb)
bag_clf = BaggingClassifier(gbm_model, n_estimators=20, random_state=SEED)
# predictions = gbm_model.predict(xgb.DMatrix(X_test))

In [5]:
scores = cross_val_score(
        bag_clf, X, y, scoring='roc_auc', cv=5, n_jobs=1)

In [8]:
print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), 'XGBoost'))

Accuracy: 0.85904 (+/- 0.00806) [XGBoost]


# Best RF (5-fold CV)

In [9]:
from sklearn.ensemble import RandomForestClassifier

params = {'min_samples_leaf': 10, 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 300, 'min_samples_split': 15}

rf = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features']
    )

scores = cross_val_score(
        rf, X, y, scoring='roc_auc', cv=5, n_jobs=-1)

In [11]:
print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), 'RF'))

Accuracy: 0.85950 (+/- 0.00701) [RF]


In [14]:
eclf = VotingClassifier(estimators=[('xgb', gbm_model), ('rf', rf)],voting='soft')
scores = cross_val_score(eclf, X, y, cv=5, scoring='roc_auc')
print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), 'Voting Ensemble'))

Accuracy: 0.86084 (+/- 0.00725) [Voting Ensemble]


In [17]:
from sklearn.neural_network import MLPClassifier

# Standard Scale features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

mlp_clf = MLPClassifier(random_state=SEED)
scores = cross_val_score(
        mlp_clf, X_scaled, y, scoring='roc_auc', cv=5, n_jobs=-1)
print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), 'MLP Classifier'))

Accuracy: 0.82116 (+/- 0.01002) [MLP Classifier]


In [21]:
eclf = VotingClassifier(estimators=[('xgb', gbm_model), ('rf', rf)],voting='soft')
scores = cross_val_score(eclf, X_scaled, y, cv=5, scoring='roc_auc')
print("Accuracy: %0.5f (+/- %0.5f) [%s]" % (scores.mean(), scores.std(), 'Voting Ensemble'))

Accuracy: 0.86108 (+/- 0.00716) [Voting Ensemble]


In [22]:
eclf.fit(X_scaled, y)
predictions = eclf.predict_proba(X_test_scaled)[:, 1]

score = scores.mean()
std = scores.std()
model_name = 'voting_best'
outfile = 'output/{}.csv'.format(model_name)


In [23]:
save_model_and_desc(bag_clf, model_name, params,
                  score, std, SEED)
write_results(outfile, predictions)