In [1]:
# Note: Data should be scaled prior to training because LogisticRegression applies regularization by default.
# In this code, the first iteration is trained without scaling and the second iteration with scaling - this is for discussion purposes.

In [2]:
import pandas as pd
#import numpy as np

import matplotlib.pyplot as plt   
%matplotlib inline 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Read the Pima diabetes dataset from a CSV file (path is relative to the current working directory)

pima_df = pd.read_csv("pima_diabetes.csv")

In [None]:
# Preview the first 10 rows
pima_df.head(10)

In [None]:
# Summarize the columns: dtypes and non-null counts
pima_df.info()  

In [None]:
# Let's analyze the distribution of the various attributes (per-column summary statistics)
pima_df.describe()

In [None]:
# Let us look at the target column, 'Class', to understand how the data is distributed amongst its values

pima_df['Class'].value_counts()
# Most are not diabetic. The ratio is almost 1:2 in favor of class 0, so the model's ability to predict class 0
# will likely be better than its ability to predict class 1.

In [None]:
# Pairwise scatter plots of all columns, colored by 'Class', with KDE curves
# on the diagonal.
sns.pairplot(pima_df, hue="Class", diag_kind="kde")

In [None]:
# Feature matrix: every column except the 'Class' target.
X = pima_df.drop(columns=["Class"])

In [None]:
# Target vector: the 'Class' label column
y = pima_df["Class"]

In [None]:

# Hold out 30% of the rows for testing. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=0,
    stratify=y,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Iteration 1: fit the model on the unscaled training features.
### The 'liblinear' solver works well for small datasets.
### Otherwise use scikit-learn's default solver, 'lbfgs'.
# max_iter is raised to 200 (scikit-learn's default is 100) to give the solver
# more iterations to converge -- presumably needed because the features are unscaled.
model = LogisticRegression(max_iter=200)
model.fit(X_train, y_train)


In [None]:
# Predict class labels for the held-out test features
y_predict = model.predict(X_test)


In [None]:
# Mean accuracy on the training split, then on the held-out test split.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

# Confusion matrix for the test split (rows: true labels, columns: predicted labels).
test_confusion = metrics.confusion_matrix(y_test, y_predict)
print(test_confusion)

In [None]:
# Per-class precision, recall and F1-score on the test split
print(metrics.classification_report(y_test, y_predict))

In [None]:
# Optional: render the confusion matrix as a color-coded plot.
metrics.ConfusionMatrixDisplay.from_predictions(y_test, y_predict, cmap="Blues")

In [None]:
# Format the confusion matrix (old method: plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2)
#from sklearn.metrics import plot_confusion_matrix
#plot_confusion_matrix(model, X_test, y_test, cmap= plt.cm.Blues, values_format='d')


In [None]:
# Iteration 2 - StandardScaler
# -----------
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()


# Iteration 3 - try with MinMaxScaler
# (uncomment the two lines below; they rebind `scaler` in place of the StandardScaler above)
# -----------
#from sklearn.preprocessing import MinMaxScaler 
#scaler = MinMaxScaler() 


# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split, so the test data does not influence the
# scaling parameters.
X_train_scaled = scaler.fit_transform(X_train)  
X_test_scaled = scaler.transform(X_test)  

In [None]:
# Iteration 2: refit on the scaled features. The model is trained on the 70%
# training split and evaluated on the 30% hold-out split.
model = LogisticRegression().fit(X_train_scaled, y_train)
y_predict = model.predict(X_test_scaled)

# Mean accuracy on the scaled training split, then on the scaled test split.
scaled_train_accuracy = model.score(X_train_scaled, y_train)
print(scaled_train_accuracy)
scaled_test_accuracy = model.score(X_test_scaled, y_test)
print(scaled_test_accuracy)

# Confusion matrix for the test split (rows: true labels, columns: predicted labels).
scaled_confusion = metrics.confusion_matrix(y_test, y_predict)
print(scaled_confusion)

In [None]:
# ***********************************************
# NOTE - Scaling is required because of the regularization that Logistic Regression applies by default.
# Do NOT use Logistic Regression without scaling when using the default parameters.
# In this notebook the unscaled fit is done only for discussion purposes.
# ***********************************************