# Imbalanced Classifiers Show and Tell

The usual imports

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score, recall_score
from collections import Counter
import pandas as pd
import numpy as np

### Reading in some test data and making it imbalanced

In [22]:
# Load the Titanic passenger table; the relative path is resolved against the
# kernel's working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index that was saved into the CSV -- confirm, and consider index_col=0.
df = pd.read_csv('titanic.csv')

In [23]:
# Preview the first rows (head() shows five by default) to see which columns were read.
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S


In [24]:
# Keep the label plus the features used for modelling and drop every row that
# has a missing value in any of them. The explicit .copy() makes X an
# independent frame, so the column assignment below cannot trigger pandas'
# SettingWithCopyWarning.
X = df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare']].dropna().copy()
# Encode sex numerically: 1 for 'female', 0 for any other value.
X['sex'] = (X['sex'] == 'female').astype(int)
# pop() removes 'survived' from X and returns it, so X holds only features afterwards.
y = X.pop('survived').values
# From here on X is a plain NumPy array rather than a DataFrame.
X = X.values

### Ratio of y before

In [25]:
# Count how many samples carry each label in y.
Counter(y)

Counter({0: 424, 1: 290})

In [26]:
# Build the imbalanced data set: every class-0 row, followed by only the first
# 10 class-1 rows. Both right-hand sides are evaluated against the current y
# before either name is rebound.
X, y = (
    np.concatenate([X[y == 0], X[y == 1][:10]], axis=0),
    np.concatenate([y[y == 0], y[y == 1][:10]]),
)

### Class counts of y after making it imbalanced

In [27]:
# Re-count the labels to confirm the class imbalance that was just created.
Counter(y)

Counter({0: 424, 1: 10})

### A function to display my classifier strength

In [28]:
def display_prediction_outputs(y_true, y_predict, y_score=None):
    """Summarize a binary classifier's predictions as a printable report.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_predict : array-like
        Predicted hard class labels.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to ``y_predict``, which
        reflects only the single operating point of the hard labels.

    Returns
    -------
    str
        Accuracy, ROC AUC, true positive rate (recall) and the confusion
        matrix, formatted as multi-line text.
    """
    accuracy = accuracy_score(y_true, y_predict)
    # Prefer real scores for the ranking metric; keep the old behaviour otherwise.
    auc = roc_auc_score(y_true, y_predict if y_score is None else y_score)
    tpr = recall_score(y_true, y_predict)
    confusion = confusion_matrix(y_true, y_predict)
    # The fixed labels require a 2x2 matrix, i.e. exactly two distinct classes
    # across y_true and y_predict (presumably 0 and 1 -- verify against callers).
    confusion_df = pd.DataFrame(confusion, columns=['Pred 0', 'Pred 1'], index=['Act 0', 'Act 1'])
    confusion_str = confusion_df.to_string()
    output = 'Accuracy: {acc}\n\nROC AUC: {auc}\n\nTrue Positive Rate: {tpr}\n\nConfusion Matrix:\n{conf}'
    return output.format(acc=accuracy, auc=auc, tpr=tpr, conf=confusion_str)

### A Basic Random Forest Model Results

In [29]:
# Baseline: a forest with default settings fitted on the imbalanced data.
# random_state is pinned so that re-running the notebook rebuilds the same forest.
rfc = RandomForestClassifier(random_state=0).fit(X, y)

In [30]:
# Parenthesized print works on both Python 2 and Python 3.
# The model is scored on the same X, y it was fitted on, so these numbers are optimistic.
print(display_prediction_outputs(y, rfc.predict(X)))

Accuracy: 0.995391705069

ROC AUC: 0.9

True Positive Rate: 0.8

Confusion Matrix:
       Pred 0  Pred 1
Act 0     424       0
Act 1       2       8


### A Random Forest with Over/Up Sampling

This is sampling with replacement (bootstrap) from the minority class to create more of the minority class.

In [31]:
# Draw 400 positions, with replacement (choice's default), among the class-1 rows.
# A locally seeded RandomState makes the draw repeatable without touching
# NumPy's global random state.
upsample_idx = np.random.RandomState(0).choice(len(y[y == 1]), 400)
# All class-0 rows, followed by the resampled class-1 rows.
X_up = np.vstack((X[y == 0], X[y == 1][upsample_idx]))
y_up = np.hstack((y[y == 0], y[y == 1][upsample_idx]))

In [32]:
# Fit on the up-sampled data; random_state is pinned so the forest is reproducible.
rfc_up = RandomForestClassifier(random_state=0).fit(X_up, y_up)

In [33]:
# Parenthesized print works on both Python 2 and Python 3.
# X_up was built from rows of X, so this evaluation is on data the model has seen.
print(display_prediction_outputs(y, rfc_up.predict(X)))

Accuracy: 0.995391705069

ROC AUC: 0.997641509434

True Positive Rate: 1.0

Confusion Matrix:
       Pred 0  Pred 1
Act 0     422       2
Act 1       0      10


### A Random Forest with Under/Down Sampling
This is random sampling from the majority class to keep fewer majority-class samples.

In [34]:
# Pick 10 distinct class-0 rows, matching the 10 class-1 rows kept earlier.
# replace=False matters: choice() samples with replacement by default, which
# could select the same majority row more than once. The locally seeded
# RandomState keeps the draw repeatable without touching NumPy's global state.
downsample_idx = np.random.RandomState(0).choice(len(y[y == 0]), 10, replace=False)
# The sampled class-0 rows, followed by all class-1 rows.
X_down = np.vstack((X[y == 0][downsample_idx], X[y == 1]))
y_down = np.hstack((y[y == 0][downsample_idx], y[y == 1]))

In [35]:
# Fit on the down-sampled data; random_state is pinned so the forest is reproducible.
rfc_down = RandomForestClassifier(random_state=0).fit(X_down, y_down)

In [36]:
# Parenthesized print works on both Python 2 and Python 3.
# Evaluated on the full X, which includes the rows used to fit rfc_down.
print(display_prediction_outputs(y, rfc_down.predict(X)))

Accuracy: 0.566820276498

ROC AUC: 0.778301886792

True Positive Rate: 1.0

Confusion Matrix:
       Pred 0  Pred 1
Act 0     236     188
Act 1       0      10


### A Random Forest with Cost Weighting
This gives the minority class a larger weight during training, so that misclassifying a minority-class sample costs more than misclassifying a majority-class sample.

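`class_weight='auto'` (superseded by `class_weight='balanced'` in newer scikit-learn releases) sets each class's weight inversely proportional to that class's frequency in `y`, so the rarer class receives the larger weight.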

In [37]:
# 'balanced' weights each class inversely proportional to its frequency in y.
# It replaces the 'auto' option, which was deprecated and later removed from
# scikit-learn. random_state is pinned so the forest is reproducible.
rfc_cost = RandomForestClassifier(class_weight='balanced', random_state=0).fit(X, y)

In [38]:
# Parenthesized print works on both Python 2 and Python 3.
# The model is scored on the same X, y it was fitted on, so these numbers are optimistic.
print(display_prediction_outputs(y, rfc_cost.predict(X)))

Accuracy: 0.990783410138

ROC AUC: 0.897641509434

True Positive Rate: 0.8

Confusion Matrix:
       Pred 0  Pred 1
Act 0     422       2
Act 1       2       8


#### We can also set our own weighting.

In [39]:
# Hand-picked class weights: class 1 (the minority) gets 0.97, class 0 gets 0.03.
# random_state is pinned so the forest is reproducible.
rfc_cost2 = RandomForestClassifier(class_weight={0: 0.03, 1: 0.97}, random_state=0).fit(X, y)

In [40]:
# Parenthesized print works on both Python 2 and Python 3.
# The model is scored on the same X, y it was fitted on, so these numbers are optimistic.
print(display_prediction_outputs(y, rfc_cost2.predict(X)))

Accuracy: 0.997695852535

ROC AUC: 0.95

True Positive Rate: 0.9

Confusion Matrix:
       Pred 0  Pred 1
Act 0     424       0
Act 1       1       9
