# Logistic Regression

In [1]:
import numpy as np
import pandas as pd
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.neighbors import KernelDensity
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier

In [2]:
diabetes = pd.read_csv('./Data/pima-indians-diabetes-cleaned.csv')

In [3]:
diabetes

Unnamed: 0,times_pregnant,glucose_concentration,blood_pressure,Triceps_skinfold_thickness,2_Hour_serum_insulin,BMI,diabetes_pedigree_function,Age,Class_Variable
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0
...,...,...,...,...,...,...,...,...,...
763,2,122,70,27,0,36.8,0.340,27,0
764,5,121,72,23,112,26.2,0.245,30,0
765,1,126,60,0,0,30.1,0.349,47,1
766,1,93,70,31,0,30.4,0.315,23,0


In [4]:
feature_cols = ['times_pregnant', 'glucose_concentration', 'blood_pressure', 'Triceps_skinfold_thickness',
              '2_Hour_serum_insulin', 'BMI', 'diabetes_pedigree_function', 'Age']
outcome_col = 'Class_Variable'

In [5]:
X = diabetes[feature_cols].to_numpy()
y = diabetes[outcome_col].to_numpy()

In [6]:
diabetes[diabetes["Class_Variable"] == 1]

Unnamed: 0,times_pregnant,glucose_concentration,blood_pressure,Triceps_skinfold_thickness,2_Hour_serum_insulin,BMI,diabetes_pedigree_function,Age,Class_Variable
1,8,183,64,0,0,23.3,0.672,32,1
3,0,137,40,35,168,43.1,2.288,33,1
5,3,78,50,32,88,31.0,0.248,26,1
7,2,197,70,45,543,30.5,0.158,53,1
8,8,125,96,0,0,0.0,0.232,54,1
...,...,...,...,...,...,...,...,...,...
756,0,123,72,0,0,36.3,0.258,52,1
758,6,190,92,0,0,35.5,0.278,66,1
760,9,170,74,31,0,44.0,0.403,43,1
765,1,126,60,0,0,30.1,0.349,47,1


In [7]:
sum(y_train)

NameError: name 'y_train' is not defined

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Standardize the training data
means = X_train.mean(axis=0)
stds = X_train.std(axis=0)
X_train = zscore(X_train, axis=0, ddof=0, nan_policy='propagate')

In [10]:
# Standardize the test data according to the training data
for i in range(len(X_test)):
    for j in range(len(X_test[i])):
        X_test[i][j] = (X_test[i][j] - means[j]) / stds[j] 
                         
                         

In [11]:
# Train Logistic Regression Model with class weights
lr_weights = LogisticRegression(class_weight="balanced")
lr_weights.fit(X_train, y_train)
lr_pred_weights = lr_weights.predict(X_test)

In [12]:
# Train Logistic Regression Model without class weights
lr_no_weights = LogisticRegression()
lr_no_weights.fit(X_train, y_train)
lr_pred_no_weights = lr_no_weights.predict(X_test)

In [13]:
# Confusion Matrix (TN, FP, FN, TP) for weights
metrics.confusion_matrix(lr_pred_weights, y_test).ravel()

array([74, 16, 23, 41])

In [14]:
# Confusion Matrix (TN, FP, FN, TP) for no weights
metrics.confusion_matrix(lr_pred_no_weights, y_test).ravel()

array([83, 22, 14, 35])

In [15]:
lr_weights.score(X_test, y_test)

0.7467532467532467

In [16]:
lr_no_weights.score(X_test, y_test)

0.7662337662337663

# Logistic Regression with Our Prediction Function

In [17]:
class LogisticRegressionBinaryMod(LogisticRegression):
    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)
        
        # comes from equation 8 in the writeup
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00
        self.threshold = (loss_01 - loss_00) / ((loss_01 - loss_00) + (loss_10 - loss_11))
        print(f"self.threshold: {self.threshold}")
    
    # override the predict function to predict according to our threshold
    def predict(self, X):
        probs = self.predict_proba(X)
        predictions = [1 if probs[i][1] >= self.threshold else 0 for i in range(len(probs))]
        return predictions
    

    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        
        give_loss_vectorized = np.vectorize(self.give_loss)
        losses = give_loss_vectorized(y, y_pred)
        
        return np.average(losses, weights=sample_weight)
        
        
    def give_loss(self, y, y_pred):
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise Exception(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

# Logistic Regression- with additional cost sensitive score function

In [18]:
class LogisticRegressionJustScore(LogisticRegression):
    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)
        
        # comes from equation 8 in the writeup
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00

        
    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        
        give_loss_vectorized = np.vectorize(self.give_loss)
        losses = give_loss_vectorized(y, y_pred)
        
        return np.average(losses, weights=sample_weight)
        
        
    def give_loss(self, y, y_pred):
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise Exception(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

# Dummy Classifier- with additional cost sensitive score function

In [19]:
class DummyClassifierJustScore(DummyClassifier):
    def __init__(self, loss_11, loss_10, loss_01, loss_00, **kwds):
        super().__init__(**kwds)
        
        # comes from equation 8 in the writeup
        self.loss_11 = loss_11
        self.loss_10 = loss_10
        self.loss_01 = loss_01
        self.loss_00 = loss_00

        
    # write a score function that will calculate the score of classifications given our costs
    def score_with_costs(self, X, y, sample_weight=None):
        y_pred = self.predict(X)
        
        give_loss_vectorized = np.vectorize(self.give_loss)
        losses = give_loss_vectorized(y, y_pred)
        
        return np.average(losses, weights=sample_weight)
        
        
    def give_loss(self, y, y_pred):
        if y == 1 and y_pred == 1:
            return self.loss_11
        elif y == 1 and y_pred == 0:
            return self.loss_10
        elif y == 0 and y_pred == 1:
            return self.loss_01
        elif y == 0 and y_pred == 0:
            return self.loss_00
        else:
            raise Exception(f"Expected y: {y} and y_pred: {y_pred} to equal to 0 or 1")

### Penalties for the following classifiers defined once here

In [20]:
loss_11 = 100000 # loss associated with correctly assigning class 1
loss_10 = 7500000 # loss associated with assigning class 0 when we should assign class 1
loss_01 = 100000 # loss associated with assigning class 1 when we should assign class 0
loss_00 = 0 # loss associated with correctly assigning class 0

### Logistic Regression with our prediction threshold- no reweighting

In [21]:
lr_cost_sensitive = LogisticRegressionBinaryMod(loss_11, loss_10, loss_01, loss_00)
lr_cost_sensitive.fit(X_train, y_train)
lr_cost_sensitive_pred = lr_cost_sensitive.predict(X_test)

self.threshold: 0.013333333333333334


In [22]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, lr_cost_sensitive_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [ 2 95  0 57]
precision: 0.375
recall: 1.0


In [23]:
lr_cost_sensitive.score_with_costs(X_test, y_test)

98701.2987012987

In [24]:
lr_cost_sensitive.score(X_test, y_test)

0.38311688311688313

### Logistic Regression with our prediction threshold- reweighting

In [25]:
lr_cost_sensitive_reweight = LogisticRegressionBinaryMod(loss_11, loss_10, loss_01, loss_00, class_weight="balanced")
lr_cost_sensitive_reweight.fit(X_train, y_train)
lr_cost_sensitive_reweight_pred = lr_cost_sensitive_reweight.predict(X_test)

self.threshold: 0.013333333333333334


In [26]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
metrics.confusion_matrix(y_test, lr_cost_sensitive_reweight_pred).ravel()

array([ 2,  0, 95, 57])

In [27]:
lr_cost_sensitive_reweight.score_with_costs(X_test, y_test)

98701.2987012987

In [28]:
lr_cost_sensitive_reweight.score(X_test, y_test)

0.38311688311688313

### Logistic Regression with standard prediction threshold of 0.5- no reweighting

In [29]:
lr = LogisticRegressionJustScore(loss_11, loss_10, loss_01, loss_00)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [30]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, lr_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [83 14 22 35]
precision: 0.7142857142857143
recall: 0.6140350877192983


In [31]:
lr.score_with_costs(X_test, y_test)

1103246.7532467532

In [32]:
lr.score(X_test, y_test)

0.7662337662337663

### Logistic Regression with standard prediction threshold of 0.5- reweighting (according to Elkan)

In [39]:
weights = {0: 0.013 / (1-0.013), 1: 1}
weights2 = {0: 0.25, 1: 1}
lr_reweight = LogisticRegressionJustScore(loss_11, loss_10, loss_01, loss_00, class_weight=weights2)
lr_reweight.fit(X_train, y_train)
lr_reweight_pred = lr_reweight.predict(X_test)

In [40]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, lr_reweight_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [62 35  6 51]
precision: 0.5930232558139535
recall: 0.8947368421052632


In [35]:
lr_reweight.score_with_costs(X_test, y_test)

1533116.8831168832

In [36]:
lr_reweight.score(X_test, y_test)

0.7337662337662337

### Dummy Classifier that ignores features to serve as baseline- always predicts majority class

In [141]:
dummy_classifier_majority = DummyClassifierJustScore(loss_11, loss_10, loss_01, loss_00)
dummy_classifier_majority.fit(X_train, y_train)
dummy_classifier_majority_pred = dummy_classifier_majority.predict(X_test)

In [142]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, dummy_classifier_majority_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [97  0 57  0]
precision: nan
recall: 0.0


  after removing the cwd from sys.path.


In [143]:
dummy_classifier_majority.score_with_costs(X_test, y_test)

2775974.025974026

In [144]:
dummy_classifier_majority.score(X_test, y_test)

0.6298701298701299

### Dummy Classifier that ignores features to serve as baseline- always predict minority class 

In [145]:
dummy_classifier_minority = DummyClassifierJustScore(loss_11, loss_10, loss_01, loss_00, strategy="constant", constant=1)
dummy_classifier_minority.fit(X_train, y_train)
dummy_classifier_minority_pred = dummy_classifier_minority.predict(X_test)


In [146]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, dummy_classifier_minority_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [ 0 97  0 57]
precision: 0.37012987012987014
recall: 1.0


In [147]:
dummy_classifier_minority.score_with_costs(X_test, y_test)

100000.0

In [148]:
dummy_classifier_minority.score(X_test, y_test)

0.37012987012987014

### Dummy Classifier that ignores features to serve as baseline- pick randomly according to prior classes

In [173]:
dummy_classifier_stratified = DummyClassifierJustScore(loss_11, loss_10, loss_01, loss_00, strategy="stratified", random_state=42)
dummy_classifier_stratified.fit(X_train, y_train)
dummy_classifier_stratified_pred = dummy_classifier_stratified.predict(X_test)

In [174]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, dummy_classifier_stratified_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [67 30 39 18]
precision: 0.375
recall: 0.3157894736842105


In [175]:
dummy_classifier_stratified.score_with_costs(X_test, y_test)

1930519.4805194805

In [176]:
dummy_classifier_stratified.score(X_test, y_test)

0.551948051948052

### Dummy Classifier that ignores features to serve as baseline- pick totally randomly

In [177]:
dummy_classifier_random = DummyClassifierJustScore(loss_11, loss_10, loss_01, loss_00, strategy="uniform", random_state=42)
dummy_classifier_random.fit(X_train, y_train)
dummy_classifier_random_pred = dummy_classifier_random.predict(X_test)



In [178]:
# Confusion Matrix (TN, FP, FN, TP) for cost-sensitive
cm = metrics.confusion_matrix(y_test, dummy_classifier_random_pred).ravel()
print(f"confusion matrix: {cm}")
print(f"precision: {cm[3] / (cm[3] + cm[1])}")
print(f"recall: {cm[3] / (cm[3] + cm[2])}")

confusion matrix: [42 55 30 27]
precision: 0.32926829268292684
recall: 0.47368421052631576


In [179]:
dummy_classifier_random.score_with_costs(X_test, y_test)

1514285.7142857143

In [180]:
dummy_classifier_random.score(X_test, y_test)

0.44805194805194803