# Simple Classifier

In this notebook, we train an XGBoost classifier using cross-validated random search on top of the pre-trained features.

In [None]:
%matplotlib inline

# system libraries
import os
from glob import glob
import h5py
from time import time

# numerical,image and plotting stuff
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style but override it to switch the axes grid off.
sns.set_style("whitegrid", {'axes.grid' : False})

In [2]:
# read in data
# The context manager closes the HDF5 file as soon as the arrays are loaded.
# Indexing a dataset with [...] reads it fully into memory, so the arrays
# remain usable after the file handle is closed. Direct indexing (instead of
# .get) raises a KeyError naming the dataset if it is missing.
with h5py.File('data/train.h5', 'r') as f:
    X_raw = f['train/X'][...]
    Y = f['train/Y'][...]
    X_submit_raw = f['submit/X'][...]

print(X_raw.shape)

(1480, 7, 7, 512)


We write a pre-processing function that aggregates the raw features into a smaller dataset.

In [3]:
def preproc_X(X_raw):
    """Collapse axes 1 and 2 of the raw features into summary statistics.

    For every sample and every entry of the last axis, the maximum, mean and
    minimum over axes 1 and 2 are computed, together with the three pairwise
    differences (max - mean, mean - min, max - min). The six resulting
    matrices are stacked side by side.

    Parameters
    ----------
    X_raw : array with four axes, e.g. (1480, 7, 7, 512) for the training data.

    Returns
    -------
    2-D array with six times as many columns as the last axis of `X_raw`,
    ordered as [max, mean, min, max - mean, mean - min, max - min].
    """
    reduced_axes = (1, 2)
    peak, avg, trough = (
        reducer(X_raw, axis=reduced_axes) for reducer in (np.max, np.mean, np.min)
    )
    feature_groups = [peak, avg, trough, peak - avg, avg - trough, peak - trough]
    return np.hstack(feature_groups)

In [4]:
from sklearn.model_selection import train_test_split

# Aggregate the raw features of both the labelled data and the submission data.
X, X_submit = preproc_X(X_raw), preproc_X(X_submit_raw)

# Hold out 20% of the labelled samples; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=290615
)
print('train: %i test: %i' % (len(y_train), len(y_test)))

train: 1184 test: 296


In [5]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# initialize classifier
xgb = XGBClassifier(seed=290615, nthread=4)

# specify parameters and distributions to sample from
# scipy's uniform(loc, scale) covers [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist = {
    "max_depth": randint(1, 8),             # 1 .. 7
    "learning_rate": uniform(0.1, 0.5),     # 0.1 .. 0.6
    "n_estimators": randint(100, 1000),     # 100 .. 999
    "subsample": uniform(0.3, 0.7),         # 0.3 .. 1.0
    "colsample_bytree": uniform(0.3, 0.7),  # 0.3 .. 1.0
}

# run randomized search
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (presumably accuracy for this classifier), while
# the hold-out cell also reports log loss. Confirm which metric should drive
# the search.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,
    random_state=290615,
    verbose=2)

start = time()
random_search.fit(X_train, y_train)
elapsed = time() - start
print('RandomizedSearchCV took %is' % elapsed)




Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489 
[CV]  n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489, total=  32.0s
[CV] n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   32.1s remaining:    0.0s


[CV]  n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489, total=  28.7s
[CV] n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489 
[CV]  n_estimators=807, max_depth=6, colsample_bytree=0.511422742252, subsample=0.665992412225, learning_rate=0.314762301489, total=  31.0s
[CV] n_estimators=887, max_depth=3, colsample_bytree=0.66625095903, subsample=0.668817467924, learning_rate=0.590255056521 
[CV]  n_estimators=887, max_depth=3, colsample_bytree=0.66625095903, subsample=0.668817467924, learning_rate=0.590255056521, total=  28.4s
[CV] n_estimators=887, max_depth=3, colsample_bytree=0.66625095903, subsample=0.668817467924, learning_rate=0.590255056521 
[CV]  n_estimators=887, max_depth=3, colsample_bytree=0.66625095903, subsample=0.668817467924, learning_rate=0.590255056521, total=  28.5s
[CV] n_estimators=887, max_depth=3, colsample_bytree=0.66625095903, s

[Parallel(n_jobs=1)]: Done 150 out of 150 | elapsed: 59.0min finished


RandomizedSearchCV took 3558s


In [6]:
# helper function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping with the entries 'rank_test_score', 'mean_test_score',
        'std_test_score' (arrays) and 'params' (sequence), all aligned by
        candidate index.
    n_top : highest rank to print. Ranks 1..n_top are reported, and every
        candidate sharing a rank is printed.
    """
    for rank in range(1, n_top + 1):
        # All candidates tied at this rank, in index order.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            summary = [
                "Model with rank: {0}".format(rank),
                "Mean validation score: {0:.3f} (std: {1:.3f})".format(
                    results['mean_test_score'][idx],
                    results['std_test_score'][idx]),
                "Parameters: {0}".format(results['params'][idx]),
                "",  # blank separator line after each candidate
            ]
            print("\n".join(summary))
# Summarise the three best ranks found by the randomized search.
report(random_search.cv_results_, n_top=3)

Model with rank: 1
Mean validation score: 0.573 (std: 0.011)
Parameters: {'n_estimators': 150, 'max_depth': 4, 'colsample_bytree': 0.638511462894362, 'subsample': 0.65507257356179638, 'learning_rate': 0.35918710119018804}

Model with rank: 2
Mean validation score: 0.566 (std: 0.008)
Parameters: {'n_estimators': 434, 'max_depth': 5, 'colsample_bytree': 0.51890941336944962, 'subsample': 0.55516747192623139, 'learning_rate': 0.16871925868974971}

Model with rank: 3
Mean validation score: 0.559 (std: 0.014)
Parameters: {'n_estimators': 748, 'max_depth': 4, 'colsample_bytree': 0.42356293048922905, 'subsample': 0.73997854297995791, 'learning_rate': 0.1104641057596317}

Model with rank: 3
Mean validation score: 0.559 (std: 0.020)
Parameters: {'n_estimators': 338, 'max_depth': 2, 'colsample_bytree': 0.65638242422127924, 'subsample': 0.88665979477155776, 'learning_rate': 0.15995335224541932}

Model with rank: 3
Mean validation score: 0.559 (std: 0.015)
Parameters: {'n_estimators': 812, 'max_dep

In [7]:
# evaluate on hold-out set
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

# Class probabilities feed the log loss; hard labels feed the accuracy and
# the confusion matrix.
y_test_pred_prob = random_search.predict_proba(X_test)
y_test_pred = random_search.predict(X_test)

# `eps` is deliberately not passed: 1e-15 was the default in the scikit-learn
# releases that accepted the argument, and it has since been deprecated and
# removed, so spelling it out breaks on current releases.
ll = log_loss(y_true=y_test, y_pred=y_test_pred_prob)
acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print('log loss of %.4f on test set' % ll)
print('accuracy of %.3f on test set' % acc)

# Last expression of the cell: displayed as the cell output.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

log loss of 1.0537 on test set
accuracy of 0.615 on test set


array([[  8,  30,   6],
       [  5, 121,  29],
       [  5,  39,  53]])

In [8]:
# train model on full dataset with the tuned parameters
# best_params_ carries the winning value for each key of the search space
# (n_estimators, max_depth, learning_rate, subsample, colsample_bytree),
# so it is unpacked straight into the constructor.
xgb_final = XGBClassifier(silent=False,
                          seed=290615,
                          **random_search.best_params_)
xgb_final.fit(X, Y, eval_metric='mlogloss', verbose=True)

XGBClassifier(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=0.638511462894362, gamma=0,
       learning_rate=0.35918710119018804, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=150, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=290615, silent=False,
       subsample=0.65507257356179638)

In [9]:
# prepare submission file
PROB_COLS = ['Type_1', 'Type_2', 'Type_3']
SUBMISSION_DIR = 'data/submission'


def _clip_and_renormalize(probs, lo, hi):
    """Clip every probability into [lo, hi], then rescale each row to sum to one."""
    clipped = probs.clip(min=lo, max=hi)
    return clipped / clipped.sum(axis=1, keepdims=True)


def _write_submission(df, probs, filename):
    """Put `probs` into the class columns of `df` and write it as CSV into SUBMISSION_DIR."""
    df[PROB_COLS] = probs
    df.to_csv(os.path.join(SUBMISSION_DIR, filename), index=False)


# Create the output directory up front so to_csv does not fail on a fresh checkout.
if not os.path.isdir(SUBMISSION_DIR):
    os.makedirs(SUBMISSION_DIR)

Y_submit = xgb_final.predict_proba(X_submit)
df_submit = pd.read_csv('data/sample_submission.csv')
# The class columns of the sample submission are used as the prior below; they
# are read here, before the columns are overwritten with predictions.
# `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide.
prior = df_submit[PROB_COLS].values
_write_submission(df_submit, Y_submit, 'vggfeat_xgbcv_n20.csv')  # ~1.9

# clipped submission
Y_clip = _clip_and_renormalize(Y_submit, lo=0.1, hi=0.9)
_write_submission(df_submit, Y_clip, 'vggfeat_xgbcv_n20_clip.csv')  # ~1.2

# multiply with the prior to get posterior probabilities, then clip aggressively
Y_posterior = Y_submit * prior
Y_posterior = Y_posterior / Y_posterior.sum(axis=1, keepdims=True)
Y_clip = _clip_and_renormalize(Y_posterior, lo=0.1, hi=0.5)
_write_submission(df_submit, Y_clip, 'vggfeat_xgbcv_n20_posterior_clip.csv')  # ~1.1

# take the average with the prior and clip aggressively
# (dividing the sum by the row total is the normalised average of the two)
Y_avg = Y_submit + prior
Y_avg = Y_avg / Y_avg.sum(axis=1, keepdims=True)
Y_clip = _clip_and_renormalize(Y_avg, lo=0.1, hi=0.5)
_write_submission(df_submit, Y_avg, 'vggfeat_xgbcv_n20_posterior_avg.csv')  # ~1.1
_write_submission(df_submit, Y_clip, 'vggfeat_xgbcv_n20_posterior_avg_clip.csv')  # ~1.1