In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier

In [2]:
# Load the cleaned Pima Indians diabetes table; the path is relative to the notebook's working directory.
DIABETES_CSV = './Data/pima-indians-diabetes-cleaned.csv'
diabetes = pd.read_csv(DIABETES_CSV)

In [3]:
# Columns used as model inputs, in the order they appear in the feature matrix.
feature_cols = [
    'times_pregnant',
    'glucose_concentration',
    'blood_pressure',
    'Triceps_skinfold_thickness',
    '2_Hour_serum_insulin',
    'BMI',
    'diabetes_pedigree_function',
    'Age',
]
# Column holding the label to predict.
outcome_col = 'Class_Variable'

In [4]:
# Pull the feature matrix and the label vector out of the frame as plain NumPy arrays.
X = diabetes.loc[:, feature_cols].to_numpy()
y = diabetes.loc[:, outcome_col].to_numpy()

In [5]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Standardize the training data.
# The per-feature mean and standard deviation are kept in `means` / `stds`
# so the test split can be scaled with the very same statistics.
# NOTE: this cell overwrites X_train, so re-running it alone standardizes twice.
means = np.mean(X_train, axis=0)
stds = np.std(X_train, axis=0)
X_train = zscore(
    X_train,
    axis=0,                  # one z-score transform per feature column
    ddof=0,                  # population std, the same convention np.std uses for `stds`
    nan_policy='propagate',
)

In [7]:
# Standardize the test data according to the training data: the training-set
# `means` / `stds` are reused so both splits live on the same feature scale.
# Broadcasting applies means[j] / stds[j] to column j in a single step and yields
# a new float array, so nothing is truncated if X_test happens to be integer-typed
# (the previous element-wise in-place assignment would have cast back to the
# array's own dtype).
# NOTE: this cell overwrites X_test, so re-running it alone standardizes twice.
X_test = (X_test - means) / stds

In [8]:
# Define Cost Values
# Naming: loss_<true class><assigned class>.
loss_00 = 0          # class 0 correctly assigned as class 0
loss_01 = 100_000    # class 1 assigned when class 0 was the right answer
loss_10 = 7_500_000  # class 0 assigned when class 1 was the right answer
loss_11 = 100_000    # class 1 correctly assigned as class 1

### Random Forest with our prediction threshold - no reweighting

In [129]:
class RandomForestBinaryMod(RandomForestClassifier):
    """Random forest for 0/1 labels that predicts with a cost-derived threshold.

    The forest is trained exactly like ``RandomForestClassifier``; only
    ``predict`` differs: class 1 is assigned whenever the predicted probability
    of class 1 reaches ``self.threshold``, which is computed from the four
    misclassification losses.

    Parameters
    ----------
    loss_11 : number
        Loss for assigning class 1 when the true class is 1.
    loss_10 : number
        Loss for assigning class 0 when the true class is 1.
    loss_01 : number
        Loss for assigning class 1 when the true class is 0.
    loss_00 : number
        Loss for assigning class 0 when the true class is 0.
    **kwds
        Forwarded unchanged to ``RandomForestClassifier.__init__``.

    Notes
    -----
    ``threshold`` is computed once in ``__init__``; changing a ``loss_*``
    attribute afterwards does not update it.

    NOTE(review): the forest hyper-parameters travel through ``**kwds`` and are
    therefore not part of this ``__init__`` signature; confirm that
    ``get_params()`` / ``sklearn.base.clone`` round-trip them before using this
    class inside cross-validation or grid-search utilities.
    """

    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)

        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00

        # comes from equation 8 in the writeup
        extra_cost_false_positive = loss_01 - loss_00
        extra_cost_false_negative = loss_10 - loss_11
        # Raises ZeroDivisionError for numeric losses whose two differences sum to zero.
        self.threshold = extra_cost_false_positive / (
            extra_cost_false_positive + extra_cost_false_negative
        )
        print(f"self.threshold: {self.threshold}")

    # override the predict function to predict according to our threshold
    def predict(self, X):
        """Return a 0/1 NumPy array: 1 where P(class 1) >= ``self.threshold``.

        Column 1 of ``predict_proba`` is taken as the probability of class 1,
        i.e. the model is assumed to have been fitted on labels 0 and 1.
        """
        probs = self.predict_proba(X)
        return np.where(probs[:, 1] >= self.threshold, 1, 0)

    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        """Return the (optionally weighted) average loss of ``predict(X)`` against ``y``.

        Parameters
        ----------
        X : array-like
            Samples passed to ``predict``.
        y : array-like
            True labels; every entry must equal 0 or 1.
        sample_weight : array-like, optional
            Per-sample weights handed to ``np.average``.

        Raises
        ------
        ValueError
            If there are no samples, or a true / predicted label is not 0 or 1.
        """
        y_true = np.asarray(y).ravel()
        y_pred = np.asarray(self.predict(X)).ravel()

        if y_true.size == 0:
            raise ValueError("score_with_costs needs at least one sample")

        invalid = ~(np.isin(y_true, (0, 1)) & np.isin(y_pred, (0, 1)))
        if invalid.any():
            first = np.flatnonzero(invalid)[0]
            raise ValueError(
                f"Expected y: {y_true[first]} and y_pred: {y_pred[first]} to equal to 0 or 1"
            )

        # loss_table[true, predicted]; building one array gives all four losses a
        # common dtype, so a float loss is never cast to int based on which
        # sample happens to come first.
        loss_table = np.array([
            [self.loss_00, self.loss_01],
            [self.loss_10, self.loss_11],
        ])
        losses = loss_table[y_true.astype(int), y_pred.astype(int)]

        return np.average(losses, weights=sample_weight)

    def give_loss(self, y, y_pred):
        """Return the loss for one (true label, predicted label) pair.

        Raises
        ------
        ValueError
            If either label is not 0 or 1.
        """
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise ValueError(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

In [130]:
# Build the thresholded forest (the constructor prints the derived threshold),
# train it on the standardized training split and predict the test split.
rf_cost_sensitive = RandomForestBinaryMod(
    loss_11,
    loss_10,
    loss_01,
    loss_00,
    random_state=42,
)
rf_cost_sensitive.fit(X_train, y_train)
rf_cost_sensitive_pred = rf_cost_sensitive.predict(X=X_test)


self.threshold: 0.013333333333333334


In [131]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
# labels=[0, 1] pins the 2x2 layout, so the unpacking below stays valid even
# when one of the classes does not occur in y_test or in the predictions.
cm = metrics.confusion_matrix(y_test, rf_cost_sensitive_pred, labels=[0, 1]).ravel()
tn, fp, fn, tp = cm
print(f"confusion matrix: {cm}")
print(f"precision: {tp / (tp + fp)}")
print(f"recall: {tp / (tp + fn)}")

confusion matrix: [ 9 88  1 56]
precision: 0.3888888888888889
recall: 0.9824561403508771


In [132]:
# Average loss per test sample for the thresholded forest.
rf_cost_sensitive.score_with_costs(X=X_test, y=y_test)

142207.7922077922

In [133]:
# Inherited scikit-learn score on the test split for the thresholded forest.
rf_cost_sensitive.score(X=X_test, y=y_test)

0.42207792207792205

### Random Forest with standard prediction threshold of 0.5 - no reweighting

In [14]:
class RandomForestJustScore(RandomForestClassifier):
    """``RandomForestClassifier`` with an added cost-based score.

    Training and ``predict`` are inherited unchanged; the four losses are only
    used by ``score_with_costs`` and ``give_loss``.

    Parameters
    ----------
    loss_11 : number
        Loss for assigning class 1 when the true class is 1.
    loss_10 : number
        Loss for assigning class 0 when the true class is 1.
    loss_01 : number
        Loss for assigning class 1 when the true class is 0.
    loss_00 : number
        Loss for assigning class 0 when the true class is 0.
    **kwds
        Forwarded unchanged to ``RandomForestClassifier.__init__``.

    Notes
    -----
    NOTE(review): the forest hyper-parameters travel through ``**kwds`` and are
    therefore not part of this ``__init__`` signature; confirm that
    ``get_params()`` / ``sklearn.base.clone`` round-trip them before using this
    class inside cross-validation or grid-search utilities.
    """

    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)

        # Stored for scoring only; no decision threshold is derived here.
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00

    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        """Return the (optionally weighted) average loss of ``predict(X)`` against ``y``.

        Parameters
        ----------
        X : array-like
            Samples passed to ``predict``.
        y : array-like
            True labels; every entry must equal 0 or 1.
        sample_weight : array-like, optional
            Per-sample weights handed to ``np.average``.

        Raises
        ------
        ValueError
            If there are no samples, or a true / predicted label is not 0 or 1.
        """
        y_true = np.asarray(y).ravel()
        y_pred = np.asarray(self.predict(X)).ravel()

        if y_true.size == 0:
            raise ValueError("score_with_costs needs at least one sample")

        invalid = ~(np.isin(y_true, (0, 1)) & np.isin(y_pred, (0, 1)))
        if invalid.any():
            first = np.flatnonzero(invalid)[0]
            raise ValueError(
                f"Expected y: {y_true[first]} and y_pred: {y_pred[first]} to equal to 0 or 1"
            )

        # loss_table[true, predicted]; building one array gives all four losses a
        # common dtype, so a float loss is never cast to int based on which
        # sample happens to come first.
        loss_table = np.array([
            [self.loss_00, self.loss_01],
            [self.loss_10, self.loss_11],
        ])
        losses = loss_table[y_true.astype(int), y_pred.astype(int)]

        return np.average(losses, weights=sample_weight)

    def give_loss(self, y, y_pred):
        """Return the loss for one (true label, predicted label) pair.

        Raises
        ------
        ValueError
            If either label is not 0 or 1.
        """
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise ValueError(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

In [44]:
# Baseline forest with the inherited predict: train on the standardized
# training split and predict the test split.
rf_noncost_sensitive = RandomForestJustScore(
    loss_11,
    loss_10,
    loss_01,
    loss_00,
    random_state=42,
)
rf_noncost_sensitive.fit(X_train, y_train)
rf_noncost_sensitive_pred = rf_noncost_sensitive.predict(X=X_test)


In [45]:
# Confusion Matrix (TN, FP, FN, TP) for the non-cost-sensitive forest
# labels=[0, 1] pins the 2x2 layout, so the unpacking below stays valid even
# when one of the classes does not occur in y_test or in the predictions.
cm = metrics.confusion_matrix(y_test, rf_noncost_sensitive_pred, labels=[0, 1]).ravel()
tn, fp, fn, tp = cm
print(f"confusion matrix: {cm}")
print(f"precision: {tp / (tp + fp)}")
print(f"recall: {tp / (tp + fn)}")

confusion matrix: [81 16 23 34]
precision: 0.68
recall: 0.5964912280701754


In [46]:
# Average loss per test sample for the baseline forest.
rf_noncost_sensitive.score_with_costs(X=X_test, y=y_test)

1152597.4025974027

In [47]:
# Inherited scikit-learn score on the test split for the baseline forest.
rf_noncost_sensitive.score(X=X_test, y=y_test)

0.7467532467532467

### Random Forest with standard prediction threshold of 0.5 - reweighting according to Elkan
##### [Ignore this section for now: Elkan claims that reweighting is difficult for tree-based methods; this needs a closer look when there is more time.]

In [138]:
# Class weights handed to the forest: class 0 counts 0.01, class 1 counts 1.
weights = {0: 0.01, 1: 1}
rf_noncost_sensitive_reweight = RandomForestJustScore(
    loss_11,
    loss_10,
    loss_01,
    loss_00,
    class_weight=weights,
    random_state=42,
)
rf_noncost_sensitive_reweight.fit(X_train, y_train)
rf_noncost_sensitive_reweight_pred = rf_noncost_sensitive_reweight.predict(X=X_test)


In [139]:
# Confusion Matrix (TN, FP, FN, TP) for the class-reweighted forest
# labels=[0, 1] pins the 2x2 layout, so the unpacking below stays valid even
# when one of the classes does not occur in y_test or in the predictions.
cm = metrics.confusion_matrix(y_test, rf_noncost_sensitive_reweight_pred, labels=[0, 1]).ravel()
tn, fp, fn, tp = cm
print(f"confusion matrix: {cm}")
print(f"precision: {tp / (tp + fp)}")
print(f"recall: {tp / (tp + fn)}")

confusion matrix: [81 16 26 31]
precision: 0.6595744680851063
recall: 0.543859649122807


In [140]:
# Average loss per test sample for the class-reweighted forest.
rf_noncost_sensitive_reweight.score_with_costs(X=X_test, y=y_test)

1296753.2467532468

In [141]:
# Inherited scikit-learn score on the test split for the class-reweighted forest.
rf_noncost_sensitive_reweight.score(X=X_test, y=y_test)

0.7272727272727273