#### Things to do
* Learn GridSearchCV

#### Approach
1. Split into train and test datasets
2. Over/Undersample the train dataset
3. Tune the model
    1. Use the crossval function to find the appropriate n_estimators to start with
    2. GridSearchCV the appropriate parameters
5. Test the model with new parameters
6. Repeat until best results

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, 
                             precision_recall_fscore_support, 
                             average_precision_score, 
                             roc_auc_score, 
                             confusion_matrix,
                             precision_recall_curve)

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [3]:
df = pd.read_csv('data/res_anon_working.csv')

In [4]:
feat_lst = list(df)
feat_lst.pop(-1)
X = df[feat_lst]
y = df['Class']

### Defining tuning function and model evaluation functions

In [34]:
def crossval(model, X, y, cv_folds=5, early_stopping_rounds=50):
    xgb_param = model.get_xgb_params()
    xgb_train = xgb.DMatrix(X_train.as_matrix(), label=y_train.as_matrix())
    print 'Running cross validation...'
    cvresult = xgb.cv(xgb_param, xgb_train, num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds, stratified=True, metrics='auc',
                      early_stopping_rounds=early_stopping_rounds,
                      verbose_eval=False, seed=42)
    print cvresult.loc[cvresult.shape[0]-1]
    print 'Best iteration: ' + str(cvresult.shape[0])

In [17]:
def model_eval(X, y, model):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    print(confusion_matrix(y, y_pred))
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    print('Precision: %f' % (float(tp)/(tp + fp)))
    print('Recall: %f' % (float(tp)/(tp + fn)))
    y_proba = model.predict_proba(X)[:,1]
    print('AUC: %f' % roc_auc_score(y, y_proba))

In [18]:
def pr_eval(X, y, model):
    y_proba = model.predict_proba(X)[:,1]
    average_precision = average_precision_score(y, y_proba)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
        
    precision, recall, _ = precision_recall_curve(y, y_proba)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))

In [19]:
def graph_eval(X, y, model):
    p_range = np.arange(0.1, 1, 0.1) 
    p_range
    precisions = []
    recalls = []
    f1s = []
    prob_range = []
    # model.fit(X, y)

    pred = model.predict_proba(X)
    for p in p_range:

        pred_tmp = pred[:, 1].copy()
        pred_tmp[pred_tmp > p] = 1
        pred_tmp[pred_tmp <= p] = 0
        tn, fp, fn, tp = confusion_matrix(y, pred_tmp).ravel()
        if tp == 0: 
            continue
        prec = (float(tp)/(tp + fp))
        rec = (float(tp)/(tp + fn))
        f1 = 2 * prec * rec / (prec + rec)
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        prob_range.append(p)

    # plot the results
    plt.plot(prob_range, precisions, 'k', label='Precision')
    plt.plot(prob_range, recalls, 'g', label='Recall')
    plt.plot(prob_range, f1s, 'b', label='F1')
    plt.xlabel('Prob Range')
    plt.ylabel('Score')
    plt.grid(True)
    plt.legend(bbox_to_anchor=(1.05, 1))

### Splitting dataset for training and testing, oversampling the train dataset

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=21, stratify=y)

In [20]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
ros = RandomOverSampler(random_state=0)
X_ros_resampled, y_ros_resampled = ros.fit_sample(X_train, y_train)
from collections import Counter
print sorted(Counter(y_ros_resampled).items())

[(0.0, 61290), (1.0, 61290)]


In [35]:
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     n_jobs=-1,
                     scale_pos_weight=1,
                     seed=27)
crossval(xgb1, X_ros_resampled, y_ros_resampled,)

Running cross validation...
test-auc-mean     0.741493
test-auc-std      0.006192
train-auc-mean    0.908123
train-auc-std     0.001563
Name: 396, dtype: float64
Best iteration: 397


### Using GridSearchCV to tune max_depth and min_child_weight

In [38]:
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=397,
                                                max_depth=5,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                objective='binary:logistic',
                                                n_jobs=-1,
                                                scale_pos_weight=1,
                                                seed=27
                                               ),
                        param_grid=param_test1,
                        scoring='roc_auc',
                        n_jobs=-1,
                        iid=False,
                        cv=5
                       )
gsearch1.fit(X_ros_resampled, y_ros_resampled)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=397,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'max_depth': [3, 5, 7, 9], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [44]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.80790, std: 0.00196, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.80828, std: 0.00147, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.80876, std: 0.00193, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.91712, std: 0.00078, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.91457, std: 0.00169, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.91471, std: 0.00108, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.97810, std: 0.00099, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.97654, std: 0.00114, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.97336, std: 0.00096, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.99479, std: 0.00042, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.99339, std: 0.00066, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.99187, std: 0.00056, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 9, 'min_child_weight': 1

In [45]:
param_test1 = {
    'max_depth': range(9, 15, 2),
    'min_child_weight': range(1, 6, 2)
}

gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=397,
                                                max_depth=5,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                objective='binary:logistic',
                                                n_jobs=-1,
                                                scale_pos_weight=1,
                                                seed=27
                                               ),
                        param_grid=param_test1,
                        scoring='roc_auc',
                        n_jobs=-1,
                        iid=False,
                        cv=5
                       )
gsearch1.fit(X_ros_resampled, y_ros_resampled)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=397,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'max_depth': [9, 11, 13], 'min_child_weight': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [46]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99479, std: 0.00042, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.99339, std: 0.00066, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.99187, std: 0.00056, params: {'max_depth': 9, 'min_child_weight': 5},
  mean: 0.99733, std: 0.00033, params: {'max_depth': 11, 'min_child_weight': 1},
  mean: 0.99664, std: 0.00041, params: {'max_depth': 11, 'min_child_weight': 3},
  mean: 0.99590, std: 0.00047, params: {'max_depth': 11, 'min_child_weight': 5},
  mean: 0.99787, std: 0.00030, params: {'max_depth': 13, 'min_child_weight': 1},
  mean: 0.99742, std: 0.00040, params: {'max_depth': 13, 'min_child_weight': 3},
  mean: 0.99695, std: 0.00039, params: {'max_depth': 13, 'min_child_weight': 5}],
 {'max_depth': 13, 'min_child_weight': 1},
 0.99787328461358127)

In [48]:
param_test1 = {
    'max_depth': [12, 13, 14],
    'min_child_weight': [1]
}

gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,
                                                n_estimators=397,
                                                max_depth=5,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.8,
                                                colsample_bytree=0.8,
                                                objective='binary:logistic',
                                                n_jobs=-1,
                                                scale_pos_weight=1,
                                                seed=27
                                               ),
                        param_grid=param_test1,
                        scoring='roc_auc',
                        n_jobs=-1,
                        iid=False,
                        cv=5
                       )
gsearch1.fit(X_ros_resampled, y_ros_resampled)

GridSearchCV(cv=5, error_score='raise',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=397,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8),
       fit_params=None, iid=False, n_jobs=-1,
       param_grid={'max_depth': [12, 13, 14], 'min_child_weight': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [49]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.99768, std: 0.00029, params: {'max_depth': 12, 'min_child_weight': 1},
  mean: 0.99787, std: 0.00030, params: {'max_depth': 13, 'min_child_weight': 1},
  mean: 0.99798, std: 0.00031, params: {'max_depth': 14, 'min_child_weight': 1}],
 {'max_depth': 14, 'min_child_weight': 1},
 0.99797544517994985)