In [41]:
import numpy as np
import pandas as pd

import xgboost as xgb

from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Reproducibility: a single seed reused for dataset generation and the train/test split.
seed = 123

In [75]:
# Build a synthetic, imbalanced binary classification problem:
# 5000 samples, 7 features (4 of them informative), and class weights that
# request roughly 90% negatives / 10% positives.
X, y = make_classification(
    n_samples=5000,
    n_features=7,
    n_informative=4,
    n_classes=2,
    weights=[.9, .1],
    shuffle=True,
    random_state=seed,
)

# Labels are 0/1, so their sum is the number of positive samples.
positive_count = y.sum()
print('There are {} positive instances.'.format(positive_count))

There are 528 positive instances.


In [76]:
# Hold out 33% of the samples for evaluation. Stratifying on y keeps the
# class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=seed,
)

# Report how many positives landed in each partition (labels are 0/1).
train_positive_count = y_train.sum()
test_positive_count = y_test.sum()
print('Total number of positive train instances: {}'.format(train_positive_count))
print('Total number of positive test instances: {}'.format(test_positive_count))

Total number of positive train instances: 354
Total number of positive test instances: 174


In [77]:
# Wrap the arrays in XGBoost DMatrix containers. Only the training matrix
# carries labels; the test matrix is used purely for prediction.
dtest = xgb.DMatrix(data=X_test)
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [78]:
# Booster configuration shared by every training run in this notebook.
params = {
    'objective': 'binary:logistic',  # binary classification, probability output
    'max_depth': 1,                  # each tree is a single split (decision stump)
    # `silent` is the deprecated (later removed) XGBoost option; `verbosity`
    # replaces it, and 0 keeps training quiet as `silent: 1` used to.
    'verbosity': 0,
    'eta': 1,                        # learning rate of 1, i.e. no shrinkage
}

# Number of boosting rounds passed to xgb.train.
num_rounds = 100

In [79]:
# Fit the baseline booster on the unweighted training data.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# The logistic objective yields scores in [0, 1]; threshold at 0.5 to get hard 0/1 labels.
positive_scores = bst.predict(dtest)
y_test_pred = (positive_scores > 0.5).astype('int')

In [80]:
# Confusion matrix for the baseline model: rows are true labels, columns are
# predicted labels, and margins=True appends the "All" totals.
actual_labels = pd.Series(y_test, name='Actual')
predicted_labels = pd.Series(y_test_pred, name='Predicted')
pd.crosstab(actual_labels, predicted_labels, margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1464,12,1476
1,54,120,174
All,1518,132,1650


#### Accuracy Paradox

In [81]:
# Print accuracy, precision and recall of the baseline predictions.
# NOTE(review): the `2f` spec is a minimum field width of 2 with the default
# six decimals; if two decimals were intended it should read `.2f`.
metric_templates = (
    ('Accuracy: {0:2f}', accuracy_score),
    ('Precision: {0:2f}', precision_score),
    ('Recall: {0:2f}', recall_score),
)
for template, metric_fn in metric_templates:
    print(template.format(metric_fn(y_test, y_test_pred)))

Accuracy: 0.960000
Precision: 0.909091
Recall: 0.689655


### Custom Weights:

In [86]:
# Per-sample training weights: negatives get weight 1 and positives weight 5,
# so each positive sample counts five times as much as a negative one.
# Any label other than 0/1 would keep weight 0.
weights = np.select([y_train == 0, y_train == 1], [1.0, 5.0], default=0.0)

# Rebuild the matrices; only the training matrix carries the weights.
dtest = xgb.DMatrix(data=X_test)
dtrain = xgb.DMatrix(data=X_train, label=y_train, weight=weights)

In [87]:
# Retrain with the same parameters on the weighted training matrix.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Boolean mask of samples scored above 0.5, cast to 0/1 labels.
is_positive = bst.predict(dtest) > 0.5
y_test_pred = is_positive.astype('int')

In [88]:
# Confusion matrix for the weighted model (true labels as rows, predictions
# as columns, with "All" totals).
pd.crosstab(
    index=pd.Series(y_test, name='Actual'),
    columns=pd.Series(y_test_pred, name='Predicted'),
    margins=True,
)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1398,78,1476
1,35,139,174
All,1433,217,1650


In [89]:
# Score the weighted model's predictions, then print the three metrics.
# NOTE(review): `2f` is a minimum width of 2 with six default decimals;
# `.2f` may have been intended.
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)

print('Accuracy: {0:2f}'.format(accuracy))
print('Precision: {0:2f}'.format(precision))
print('Recall: {0:2f}'.format(recall))

Accuracy: 0.931515
Precision: 0.640553
Recall: 0.798851


### Use scale_pos_weight parameter
Instead of assigning per-sample weights by hand, compute the ratio of negative to positive training instances and pass it to XGBoost as the `scale_pos_weight` parameter.

In [90]:
# Rebuild the matrices without per-sample weights, so the class imbalance is
# handled only through the booster parameters in this section.
dtest = xgb.DMatrix(data=X_test)
dtrain = xgb.DMatrix(data=X_train, label=y_train)

In [91]:
# Read the labels back from the training matrix and count each class.
train_labels = dtrain.get_label()
negative_count = np.sum(train_labels == 0)
positive_count = np.sum(train_labels == 1)

# scale_pos_weight = (#negatives / #positives).
# Note: this mutates the shared `params` dict in place, so any later
# training run that uses `params` also picks up this setting.
ratio = float(negative_count) / positive_count
params.update(scale_pos_weight=ratio)

In [92]:
# Train again; `params` now includes `scale_pos_weight`.
bst = xgb.train(params, dtrain, num_boost_round=num_rounds)

# Convert predicted scores into 0/1 labels with a 0.5 cut-off.
test_scores = bst.predict(dtest)
y_test_pred = (test_scores > 0.5).astype('int')

In [93]:
# Confusion matrix for the scale_pos_weight model, including "All" totals.
confusion_rows = pd.Series(y_test, name='Actual')
confusion_cols = pd.Series(y_test_pred, name='Predicted')
pd.crosstab(index=confusion_rows, columns=confusion_cols, margins=True)

Predicted,0,1,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1355,121,1476
1,30,144,174
All,1385,265,1650


In [74]:
# Print accuracy, precision and recall for the scale_pos_weight model.
# NOTE(review): `2f` is a minimum width of 2 with six default decimals;
# `.2f` may have been intended.
for template, metric_fn in [
    ('Accuracy: {0:2f}', accuracy_score),
    ('Precision: {0:2f}', precision_score),
    ('Recall: {0:2f}', recall_score),
]:
    metric_value = metric_fn(y_test, y_test_pred)
    print(template.format(metric_value))

Accuracy: 0.908485
Precision: 0.543396
Recall: 0.827586
