In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import (cross_val_score, train_test_split,
                                    GridSearchCV, RandomizedSearchCV)
from sklearn.preprocessing import Imputer
%load_ext autoreload
%autoreload 2

In [2]:
# Read the train/test CSV files and discard the 'id' column from each frame
# in the same expression, so only the remaining columns are carried forward.
train = pd.read_csv('data/train_final.csv').drop(['id'], axis=1)
test = pd.read_csv('data/test_final.csv').drop(['id'], axis=1)

In [3]:
# Separate the target column "Y" from the remaining (feature) columns.
X = train.drop(["Y"], axis=1)
y = train["Y"]
# The test frame carries no "Y" handling here; it is used as-is
# (X_test is the same object as test, not a copy).
X_test = test

In [4]:
# Fill in missing feature values
from sklearn.preprocessing import Imputer

# train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))

# Column-wise (axis=0) mean imputation. The imputer is fitted on X and the
# same fitted statistics are then applied to both X and X_test.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(X)
train_xform = imp.transform(X)
test_xform = imp.transform(X_test)
# TODO: Impute dataframe so that F5 uses median
# and F19 uses mean. For now, we'll impute via mean for both.

# Wrap the transformed values back into DataFrames, re-attaching the
# original column labels (the row index is rebuilt from scratch).
X = pd.DataFrame(data=train_xform, columns=X.columns)
X_test = pd.DataFrame(data=test_xform, columns=X_test.columns)

X.head(5)

Unnamed: 0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27
0,1.0,0.0,0.107576,0.0,0.0,1.0,1.0,1.0,1.0,0.0,...,104.0,22902.0,1.0,0.0,18.0,0.042295,1.0,0.0,27.0,0.02825
1,1.0,0.0,0.142357,0.0,0.0,7.0,1.0,1.0,1.0,1.0,...,144.0,11400.0,1.0,0.0,8.0,0.021417,1.0,0.0,67.0,0.253574
2,1.0,0.0,0.492318,0.0,3.0,4205.0,1.0,1.0,3.0,1.0,...,112.0,4833.0,1.0,0.0,13.0,0.502212,1.0,1.0,35.0,0.373397
3,1.0,0.0,-0.053028,0.0,2.0,2.0,1.0,1.0,5.0,2.0,...,127.0,3250.0,1.0,1.0,8.0,0.0,1.0,0.0,50.0,0.674254
4,1.0,0.0,0.730797,0.0,0.0,11.0,1.0,1.0,1.0,1.0,...,148.0,4000.0,1.0,1.0,5.0,0.787592,1.0,0.0,71.0,0.371157


In [5]:
# Hold out 10% of the rows as a validation set; the fixed random_state
# keeps the split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

In [6]:
 # Parameter dictionary shown for reference only: the expression is not
 # assigned to a name, so the cell just displays it. Several values here
 # (gamma, max_depth, min_child_weight, colsample_bytree) are the ones
 # pinned in the search grid of the following cell.
 # NOTE(review): presumably the result of an earlier tuning run -- the
 # code that produced it is not part of this notebook; confirm provenance.
 {
     'booster': 'gbtree',
     'seed': 42,
     'objective': 'binary:logistic',
     'eval_metric': 'auc',
     'nthread': 4,
     'colsample_bytree': 0.5,
     'gamma': 0.8,
     'subsample': 0.7000000000000001,
     'tree_method': 'exact',
     'min_child_weight': 2.0,
     'n_estimators': 191.0,
     'max_depth': 13,
     'silent': 1,
     'eta': 0.025,
 }

{'booster': 'gbtree',
 'colsample_bytree': 0.5,
 'eta': 0.025,
 'eval_metric': 'auc',
 'gamma': 0.8,
 'max_depth': 13,
 'min_child_weight': 2.0,
 'n_estimators': 191.0,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 42,
 'silent': 1,
 'subsample': 0.7000000000000001,
 'tree_method': 'exact'}

In [9]:
import xgboost as xgb



param_grid = {
              'n_estimators': [100, 191],
              'learning_rate': [0.025],
              'gamma': [0.8], 
              'max_depth': [13], 
              'min_child_weight': [2],
              'subsample': [0.7],
              'colsample_bytree': [0.5], 
              'reg_lambda': [0.01, 0.1, 1.0],
              'reg_alpha': [0, 0.1, 0.5, 1.0],
             }

clf = RandomizedSearchCV(xgb.XGBClassifier(), param_grid, n_iter=24, cv=10, scoring='roc_auc', n_jobs=-1)
%timeit clf.fit(X_train, y_train)

1 loop, best of 3: 31min 34s per loop


In [11]:
# Report the outcome of the hyper-parameter search: the winning parameter
# combination and its mean cross-validated score, then the per-candidate
# mean train/test scores, and finally the score on the held-out validation
# split.
# NOTE(review): on newer scikit-learn releases 'mean_train_score' is only
# present when the search is created with return_train_score=True -- confirm
# against the installed version.
print("Best parameter set found on development set with cv={}:\n".format(10))
print(clf.best_params_)
print("Best holdout score found on development set with cv={}:\n".format(10))
print(clf.best_score_)
print()

# One array per line, exactly as two consecutive print calls would emit them.
print(clf.cv_results_['mean_train_score'],
      clf.cv_results_['mean_test_score'],
      sep='\n')

# Score of the refit best estimator on the validation split.
score = clf.score(X_val, y_val)
print("Validation score:\n", score, sep='\n')

Best parameter set found on development set with cv=10:

{'reg_alpha': 1.0, 'n_estimators': 100, 'min_child_weight': 2, 'gamma': 0.8, 'learning_rate': 0.025, 'reg_lambda': 1.0, 'max_depth': 13, 'subsample': 0.7, 'colsample_bytree': 0.5}
Best holdout score found on development set with cv=10:

0.859015090233

[ 0.96755749  0.96529295  0.93885507  0.96353637  0.9601218   0.93444747
  0.94000513  0.93747654  0.91986913  0.91889493  0.91731437  0.90657852
  0.98358616  0.98252591  0.96820157  0.98181541  0.98021119  0.96583734
  0.97054964  0.96900996  0.95572476  0.95641709  0.95486418  0.94355303]
[ 0.85714719  0.85692049  0.85867739  0.85689685  0.85657678  0.85871784
  0.85806897  0.85842191  0.85829997  0.85788809  0.85797268  0.85901509
  0.8540899   0.8544546   0.85661111  0.85433322  0.85477472  0.85678396
  0.85561551  0.85531921  0.85703611  0.85659831  0.85698273  0.85819794]
Validation score:

0.857435115985


In [12]:
def write_results(outfile, predictions, start_id=49999):
    """Write predictions to ``outfile`` as a two-column ``id,Y`` CSV file.

    A header line ``id,Y`` is written first, followed by one line per
    prediction. Ids are assigned consecutively, starting at ``start_id``,
    in the order the predictions are supplied.

    Every prediction is written. The id range is derived from the input
    instead of being capped at a fixed upper bound, so no rows are silently
    dropped when more predictions than expected are passed in.

    Parameters
    ----------
    outfile : str
        Path of the file to create; an existing file is overwritten.
    predictions : iterable
        Values for the ``Y`` column. Any iterable works (list, array,
        generator); each item is formatted with ``str.format``.
    start_id : int, optional
        Id given to the first prediction. Defaults to 49999, the first id
        this notebook previously hard-coded.
    """
    with open(outfile, 'w') as f:
        f.write('id,Y\n')
        # enumerate numbers the rows from start_id without needing to know
        # how many predictions there are up front.
        f.writelines(
            '{},{}\n'.format(instance, prediction)
            for instance, prediction in enumerate(predictions, start_id)
        )

In [14]:
# Score the test features with the fitted search object and keep only the
# second column (index 1) of the predict_proba output.
# NOTE(review): presumably that column is the probability of the positive
# class of a 0/1 target -- confirm against clf.classes_.
predictions = clf.predict_proba(X_test)[:, 1]

# Destination file; the hyper-parameter values are spelled out by hand in
# the file name.
outfile = 'output/xgb_lambda1.0_alpha1.0_subsample0.7_gamma0.8_lr0.025_colsample0.5_minchildw2_maxdepth13_derp.csv'
write_results(outfile=outfile, predictions=predictions)