In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import extraction
import util
from time import time

X, y, X_holdout, ids = extraction.prepare_data("./data/", drop_categorical=True)

def create_submission(clf, submission_name):
    file_name = submission_name + "_{}.csv".format(time())
    util.note_submission_info("Model: {}".format(clf), file_name)
    util.build_submission(clf, X_holdout, ids, file_name)
    print "Written {}".format(file_name)

In [2]:
print "{} features, {} data points".format(X.shape[1], X.shape[0])

112 features, 114321 data points


In [3]:
# Quickfix: X_holdout is still broken (NaNs)!
import scipy.stats as stats
import numpy as np

col_mean = stats.nanmean(X_holdout,axis=0)
inds = np.where(np.isnan(X_holdout))
X_holdout[inds]=np.take(col_mean,inds[1])

In [4]:
from sklearn import metrics, cross_validation

def check_cv_score(clf, X, y):
    scores = cross_validation.cross_val_score(clf, X, y, scoring="log_loss", cv=5, n_jobs=-1, verbose=1)
    
    mu = scores.mean()
    std = scores.std()
    
    return mu, std

def report_metrics(clf, X_test, y_test):
    y_pred = clf.predict_proba(X_test)[:,0]

    print "Log loss: {}".format(metrics.log_loss(y_test, y_pred))
    print "Accuracy: {}".format(clf.score(X_test, y_test))

In [5]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.1)

## GBT

In [6]:
from sklearn.ensemble import GradientBoostingClassifier

In [7]:
%time
gbt_clf = GradientBoostingClassifier(n_estimators=100, 
                                     max_depth=5, 
                                     subsample=0.8,
                                     learning_rate=0.05,
                                     verbose=1, 
                                     random_state=42)
gbt_clf.fit(X_train, y_train)

Wall time: 0 ns
      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0900           0.0097            5.45m
         2           1.0803           0.0085            4.60m
         3           1.0731           0.0076            4.29m
         4           1.0645           0.0069            4.11m
         5           1.0585           0.0064            3.98m
         6           1.0541           0.0059            3.89m
         7           1.0497           0.0055            3.80m
         8           1.0387           0.0047            3.73m
         9           1.0393           0.0043            3.66m
        10           1.0351           0.0039            3.61m
        20           1.0060           0.0016            3.13m
        30           0.9943           0.0007            2.72m
        40           0.9849           0.0005            2.31m
        50           0.9745           0.0001            1.91m
        60           0.9727           0.0004         

GradientBoostingClassifier(init=None, learning_rate=0.05, loss='deviance',
              max_depth=5, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              random_state=42, subsample=0.8, verbose=1, warm_start=False)

In [8]:
report_metrics(gbt_clf, X_test, y_test)
create_submission(gbt_clf, "gbt")

Log loss: 1.43415375728
Accuracy: 0.77827341905
Written gbt_1454880935.65.csv


In [None]:
from visualization import learning_curve

In [None]:
fig = learning_curve.plot_learning_curve(title="Learning curve of decision stump GBT", 
                                   estimator=gbt_clf, X=X[:5000], y=y[:5000], n_jobs=4, cv=5)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=42, verbose=1)
rf_clf.fit(X_train, y_train)

In [None]:
report_metrics(rf_clf, X_test, y_test)