In [1]:
#import libraries
import numpy as np
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV



In [None]:
def summary(grid_result, printing=False):
    #summarize results
    
    means = grid_result.cv_results_['mean_test_score']
    stds = grid_result.cv_results_['std_test_score']
    params = grid_result.cv_results_['params']
    
    if printing:
        print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
        for mean, stdev, param in zip(means, stds, params):
            print("%f (%f) with: %r" % (mean, stdev, param))
        
    summary = pd.concat([pd.Series(means, name="mean"), pd.Series(stds, name='std'), pd.DataFrame(params)], axis=1)
    summary = summary.sort_values('mean',ascending=False)
    return summary

## Load Data

In [3]:
#load data
cleaned = pd.read_csv('clean_train_v2.csv', index_col=0)
cleaned_test = pd.read_csv('clean_test_v2.csv', index_col=0)

#create feature/target sets
X_ = cleaned.drop(['OutcomeType','OutcomeSubtype','AnimalID'], axis=1)
y_ = cleaned[['OutcomeType']]

#drop/save test IDs for later
X_TEST = cleaned_test.drop('ID', axis=1)
X_TEST_ids = cleaned_test['ID']

In [4]:
#sanity check -- ensure that train and test datasets have the same features
list(X_.columns) == list(X_TEST.columns)

True

#### Standardize Age

In [5]:
#TRAIN
#scale age
scaler = StandardScaler()
age = scaler.fit_transform(X_['Age'].values.reshape(-1,1))
age = pd.DataFrame(age.reshape(-1))
age.columns = ['age_scaled']

#replace 'Age' with 'age_scaled'
X = pd.concat([X_, age], axis=1) #add scaled age
X = X.drop('Age', axis=1) #drop unscaled age

In [6]:
#TEST
#scale age for TEST
scaler = StandardScaler()
test_age = scaler.fit_transform(X_TEST['Age'].values.reshape(-1,1))
test_age = pd.DataFrame(test_age.reshape(-1))
test_age.columns = ['age_scaled']

#replace 'Age' with 'age_scaled' for TEST
X_test = pd.concat([X_TEST, test_age], axis=1) #add scaled age
X_test = X_test.drop('Age', axis=1) #drop unscaled age

#### Convert to arrays

In [7]:
#format as float numpy arrays
X = np.asarray(X, dtype='float')
X_TEST = np.asarray(X_TEST, dtype='float')

#### Encode target labels

In [8]:
#encode target labels
encodr = LabelEncoder()
y = encodr.fit_transform(y_)

  y = column_or_1d(y, warn=True)


#### Cross-Validation Split

In [9]:
#cross-validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

## Baseline Model

First, we will define a few general parameters.

In [10]:
#set seed
seed = 7

#define scoring metric
scoring = 'neg_log_loss'

#define kfold with k=10
kfold = model_selection.KFold(n_splits=10, random_state=seed) #prepare K-fold CV

Now we will define our first model specific parameter -- the objective function. 

In this case, I am choosing the 'softmax' function, for multi-class classification.

In [11]:
#define independent parameters
indie_params = {'objective': 'multi:softmax'}

In [12]:
#build baseline model
model = XGBClassifier(**indie_params)

#cross-validation
cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring) #cross-validate model using K-fold CV
cv_results = [abs(result) for result in cv_results] #take absolute value b/c scoring is negative log-loss
cv_results = pd.Series(cv_results)

#print CV estimate
msg = "%s: %f (%f)" % ('XGBoost', cv_results.mean(), cv_results.std())
print(msg)

XGBoost: 0.848218 (0.012015)


## Tuning

Now that we are beginning to tune our model, let's adjust our kfold to k=5.

We will be buiding thousands of models here, so reducing k is a more computationally efficient way of getting our estimates.

To be more specific, reducing k to 5 halves the number of models built, and thereby (approx.) halves the time spent waiting on models to train.

In [13]:
#redefine kfold with k=5
kfold = model_selection.KFold(n_splits=5, random_state=seed)

### Tune 'learning_rate' and 'n_estimators'

The first two parameters to tune with XGBoost are essential to this algorithm.

These need to be tuned together, in order to find a decent starting point.

In [14]:
#define independent parameters
indie_params = {'objective': 'multi:softmax'}

In [15]:
#define parameter search grid
param_grid ={'n_estimators':[100, 200, 300, 400, 500],
             'learning_rate':[0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]}

In [30]:
#build model
model = XGBClassifier(**indie_params)

#grid search
grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

#summarize results
summary = summary(grid_result)
summary.sort_values('mean',ascending=False)[:10]

Unnamed: 0,mean,std,learning_rate,n_estimators
25,-0.843,0.006,0.3,100
17,-0.844,0.006,0.1,300
20,-0.844,0.005,0.2,100
21,-0.844,0.007,0.2,200
16,-0.845,0.006,0.1,200
18,-0.845,0.006,0.1,400
19,-0.846,0.007,0.1,500
26,-0.846,0.007,0.3,200
22,-0.847,0.007,0.2,300
15,-0.849,0.005,0.1,100


The top performing combination yielded a neg_log_loss of -0.843 using:

{'learning_rate': 0.3, 'n_estimators': 100}

That being said, we want the learning rate to be as small as possible. By looking at the top models, we see that the second best combination yielded a neg_log_loss of -0.844 using:

{'learning_rate': 0.1, 'n_estimators': 300}

In other words, we can get essentially the same performance using a better learning rate of 0.1 with 300 trees.

### Tune 'max_depth' and 'min_child_weight'

These parameters add constraints on the architecture of the trees.

'max_depth' is the maximum number of nodes allowed from the root to the farthest leaf of a tree. Deeper trees can model more complex relationships by adding more nodes, but as we go deeper, splits become less relevant and are sometimes only due to noise, causing the model to overfit.

'min_child_weight' is the minimum weight (or number of samples if all samples have a weight of 1) required in order to create a new node in the tree. A smaller min_child_weight allows the algorithm to create children that correspond to fewer samples, thus allowing for more complex trees, but again, more likely to overfit.

Thus, those parameters can be used to control the complexity of the trees. It is important to tune them together in order to find a good trade-off between model bias and variance

In [32]:
#update independent parameters
indie_params = {'objective': 'multi:softmax',
                'learning_rate': 0.1,
                'n_estimators': 300}

In [33]:
#define parameter search grid
param_grid = {'max_depth': [1,3,5,7,9],
              'min_child_weight': [1,3,5]}

In [35]:
#build model
model = XGBClassifier(**indie_params)

#grid search
grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=3, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

#summarize results
summary = summary(grid_result)
summary.sort_values('mean',ascending=False)[:10]

Unnamed: 0,mean,std,max_depth,min_child_weight
8,-0.843,0.008,5,5
7,-0.843,0.007,5,3
6,-0.843,0.007,5,1
5,-0.843,0.006,3,5
4,-0.844,0.005,3,3
3,-0.844,0.006,3,1
11,-0.853,0.01,7,5
9,-0.853,0.009,7,1
10,-0.854,0.009,7,3
2,-0.867,0.004,1,5


The top 4 performing combinations of 'max_depth' and 'min_child_weight' all have the same mean; that being said, we can tell that one combo is more consistent, by looking at standard deviation.  

I am choosing the combination that produces the smallest mean and standard deviation at index 5:

{'max_depth': 3, 'min_child_weight': 5}

## Tune 'subsample' and 'colsample'

These parameters control the sampling of the dataset that is done at each boosting round.

Instead of using the whole training set every time, we can build a tree on slightly different data at each step, which makes it less likely to overfit to a single sample or feature.

'subsample' corresponds to the fraction of observations (the rows) to subsample at each step. By default it is set to 1 meaning that we use all rows.

'colsample_bytree' corresponds to the fraction of features (the columns) to use. By default it is set to 1 meaning that we will use all features.

In [36]:
#update independent parameters
indie_params = {'objective': 'multi:softmax',
                'learning_rate': 0.1,
                'n_estimators': 300,
                'max_depth': 3,
                'min_child_weight': 5}

In [37]:
#define parameter grid search
param_grid = {'subsample':[0.7, 0.8, 0.9, 1.0],
              'colsample_bytree':[0.7, 0.8, 0.9, 1.0]}

In [None]:
#build model
model = XGBClassifier(**indie_params)

#grid search
grid_search = GridSearchCV(model, param_grid, scoring=scoring, n_jobs=-1, cv=kfold)
grid_result = grid_search.fit(X_train, y_train)

In [45]:
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
summary = pd.concat([pd.Series(means, name="mean"), pd.Series(stds, name='std'), pd.DataFrame(params)], axis=1)
summary = summary.sort_values('mean',ascending=False)
summary.head()

Unnamed: 0,mean,std,colsample_bytree,subsample
9,-0.841,0.007,0.9,0.8
2,-0.841,0.006,0.7,0.9
13,-0.841,0.007,1.0,0.8
14,-0.841,0.006,1.0,0.9
5,-0.841,0.007,0.8,0.8


Above, we have our top 5 combinations of 'subsample' and 'colsample_bytree'.  

Following the same logic as with the last set of hyperparameters, we want to minimize both mean and std. As such, we are left to choose between two options:

{'subsample': 0.9, 'colsample_bytree': 1.0}

{'subsample': 0.9, 'colsample_bytree': 0.7}

For now, we will go with the first option.  If we see signs of overfitting, I may revisit this decision.

### xgboost_v1

In [36]:
#update independent parameters
indie_params = {'objective': 'multi:softmax',
                'learning_rate': 0.1,
                'n_estimators': 300,
                'max_depth': 3,
                'min_child_weight': 5,
                'subsample': 0.9,
                'colsample_bytree': 1.0}

In [51]:
#build model
model = XGBClassifier(**indie_params)

#fit model
model.fit(X, y)

#predict classes
predict = model.predict_proba(X_TEST)
predict = pd.DataFrame(predict) #reformat
predict.columns = encodr.inverse_transform(predict.columns) #get field names
predict = pd.concat([X_TEST_ids, predict], axis=1) #add IDs
predict = predict.set_index('ID') #set index
predict.head() #preview

#save results
predict.to_csv('xgboost_v1.csv')

Unnamed: 0_level_0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.331,0.001,0.023,0.488,0.157
2,0.043,0.001,0.107,0.782,0.067
3,0.163,0.003,0.021,0.579,0.234
4,0.193,0.006,0.129,0.068,0.603
5,0.206,0.011,0.074,0.132,0.576


### xgboost_v2

In [55]:
#update independent parameters
indie_params = {'objective': 'multi:softmax',
                'learning_rate': 0.1,
                'n_estimators': 300,
                'max_depth': 3,
                'min_child_weight': 5,
                'subsample': 0.9,
                'colsample_bytree': 0.7}

In [56]:
#build model
model = XGBClassifier(**indie_params)

#fit model
model.fit(X, y)

#predict classes
predict = model.predict_proba(X_TEST)
predict = pd.DataFrame(predict) #reformat
predict.columns = encodr.inverse_transform(predict.columns) #get field names
predict = pd.concat([X_TEST_ids, predict], axis=1) #add IDs
predict = predict.set_index('ID') #set index
predict.head() #preview

#save results
predict.to_csv('xgboost_v2.csv')

Unnamed: 0_level_0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.328,0.001,0.018,0.464,0.188
2,0.049,0.002,0.085,0.778,0.086
3,0.23,0.002,0.017,0.533,0.216
4,0.148,0.005,0.128,0.071,0.647
5,0.23,0.007,0.054,0.151,0.558


### Add Early Stopping to Prevent Overfitting

In [59]:
#build model
model = XGBClassifier(**indie_params)

#define evaluation set
eval_set = [(X_test, y_test)]

model.fit(X_train, y_train, eval_metric='mlogloss', eval_set=eval_set, early_stopping_rounds=10)

#predict classes
predict = model.predict_proba(X_TEST)
predict = pd.DataFrame(predict) #reformat
predict.columns = encodr.inverse_transform(predict.columns) #get field names
predict = pd.concat([X_TEST_ids, predict], axis=1) #add IDs
predict = predict.set_index('ID') #set index
predict.head() #preview

[0]	validation_0-mlogloss:1.5296
Will train until validation_0-mlogloss hasn't improved in 10 rounds.
[1]	validation_0-mlogloss:1.45762
[2]	validation_0-mlogloss:1.39083
[3]	validation_0-mlogloss:1.33906
[4]	validation_0-mlogloss:1.29291
[5]	validation_0-mlogloss:1.25134
[6]	validation_0-mlogloss:1.21698
[7]	validation_0-mlogloss:1.18376
[8]	validation_0-mlogloss:1.15489
[9]	validation_0-mlogloss:1.12922
[10]	validation_0-mlogloss:1.10809
[11]	validation_0-mlogloss:1.08962
[12]	validation_0-mlogloss:1.07055
[13]	validation_0-mlogloss:1.05333
[14]	validation_0-mlogloss:1.03721
[15]	validation_0-mlogloss:1.02467
[16]	validation_0-mlogloss:1.01038
[17]	validation_0-mlogloss:0.998386
[18]	validation_0-mlogloss:0.987078
[19]	validation_0-mlogloss:0.977321
[20]	validation_0-mlogloss:0.9694
[21]	validation_0-mlogloss:0.960874
[22]	validation_0-mlogloss:0.952995
[23]	validation_0-mlogloss:0.945632
[24]	validation_0-mlogloss:0.939866
[25]	validation_0-mlogloss:0.934248
[26]	validation_0-mloglos

[224]	validation_0-mlogloss:0.839933
[225]	validation_0-mlogloss:0.839914
Stopping. Best iteration:
[215]	validation_0-mlogloss:0.839863



Unnamed: 0_level_0,Adoption,Died,Euthanasia,Return_to_owner,Transfer
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.329,0.001,0.016,0.436,0.218
2,0.045,0.002,0.096,0.631,0.226
3,0.33,0.001,0.016,0.467,0.185
4,0.278,0.003,0.094,0.077,0.548
5,0.371,0.002,0.035,0.098,0.493


In [60]:
predict.to_csv('xgboost_v3.csv')