In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
diabetes_df = pd.read_csv('../week_13/diabetes.csv')
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [2]:
diabetes_df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
#our previous model had a recall of .5
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome',axis = 1)
y = diabetes_df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

#Standardize
sc = StandardScaler()
X_train_scaler = sc.fit_transform(X_train)
X_test_scaler = sc.fit_transform(X_test)

# Oversampling 

In [5]:
#Resample the training data with RandomOversampler
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state = 42)
X_resampled, y_resampled = ros.fit_resample(X_train_scaler, y_train)


In [6]:
#train using the resampled data
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [8]:
#calculate the accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test_scaler)
balanced_accuracy_score(y_test, y_pred)
#increased accuracy from .6 to .74 by using oversampling

0.7403703703703703

In [9]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
confusion_matrix(y_test, y_pred)

array([[74, 26],
       [14, 40]], dtype=int64)

In [10]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.74      0.74      0.79      0.74      0.55       100
          1       0.61      0.74      0.74      0.67      0.74      0.55        54

avg / total       0.76      0.74      0.74      0.74      0.74      0.55       154



# Undersampling 

In [12]:
#clusterCentroids uses k-means to reduce the number of samples. 
from imblearn.under_sampling import ClusterCentroids
rus = ClusterCentroids(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train_scaler, y_train)

In [13]:
model = LogisticRegression(random_state = 42)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=42)

In [14]:
confusion_matrix(y_test, y_pred)

array([[74, 26],
       [14, 40]], dtype=int64)

In [15]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.84      0.74      0.74      0.79      0.74      0.55       100
          1       0.61      0.74      0.74      0.67      0.74      0.55        54

avg / total       0.76      0.74      0.74      0.74      0.74      0.55       154

