<a href="https://colab.research.google.com/github/vatsalapushkar10/Modeling-and-Simulation-of-Glucose-Insulin-Metabolism/blob/main/Modeling_and_Simulation_of_Glucose_Insulin_Metabolism_using_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import make_pipeline

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

In [None]:
# Read the diabetes CSV into a DataFrame and preview its first five rows.
DATA_PATH = '/content/diabetes.csv'
data = pd.read_csv(DATA_PATH)
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
# Summary statistics per column; a minimum of 0 flags columns where 0 is later
# treated as a missing reading (see the zero-replacement cell).
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# One histogram per feature, laid out on a 2 x 4 grid.
# Maps each plotted column to the title shown above its panel (order = panel order).
histogram_titles = {
    'Pregnancies': '<b>Distribution of Pregnancies</b>',
    'Glucose': '<b>Distribution of Glucose</b>',
    'BloodPressure': '<b>Distribution of Blood Pressure</b>',
    'SkinThickness': '<b>Distribution of Skin Thickness</b>',
    'Insulin': '<b>Distribution of Insulin</b>',
    'BMI': '<b>Distribution of BMI</b>',
    'DiabetesPedigreeFunction': '<b>Distribution of Diabetes Pedigree Func.</b>',
    'Age': '<b>Distribution of Age</b>',
}

fig = make_subplots(rows=2, cols=4, subplot_titles=tuple(histogram_titles.values()))

# Fill the grid row by row: panels 0-3 go to row 1, panels 4-7 to row 2.
for panel_index, column_name in enumerate(histogram_titles):
    fig.add_trace(
        go.Histogram(x=data[column_name]),
        row=panel_index // 4 + 1,
        col=panel_index % 4 + 1,
    )

# Keep the figure as the cell's last expression so it is still displayed.
fig

In [None]:
# Layout shared by the whole histogram figure: fixed size, tight margins, no legend.
layout_options = dict(
    showlegend=False,
    width=1200,
    height=400,
    autosize=False,
    margin=dict(t=15, b=0, l=5, r=5),
    template="simple_white",
)
fig.update_layout(**layout_options)

# Font sizes: colour-bar tick labels and the figure's annotations.
fig.update_coloraxes(colorbar_tickfont_size=10)
fig.update_annotations(font_size=10)

# Make the bars slightly transparent, then render the figure.
fig.update_traces(opacity=0.75)
fig.show()

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each.
data = data.drop_duplicates(keep='first')

In [None]:
# Number of NaN entries in each column.
data.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [None]:
# This notebook treats a 0 in the columns below as a missing reading and imputes it.
# The replacement statistic must be computed from the non-zero readings only:
# including the placeholder zeros drags the mean/median towards 0 (for Insulin the
# median of the raw column is 30.5 only because about half of its entries are 0).
zero_imputation = {
    'Glucose': 'mean',
    'BloodPressure': 'mean',
    # skewed distribution -> median
    'SkinThickness': 'median',
    'Insulin': 'median',
    'BMI': 'median',
}

# Mark placeholder zeros as NaN so pandas excludes them from mean()/median().
observed = data[list(zero_imputation)].replace(0, np.nan)

# Build a new frame instead of assigning column by column; re-running the cell is a
# no-op because no zeros remain after the first run.
data = data.assign(**{
    column: observed[column].fillna(observed[column].agg(statistic))
    for column, statistic in zero_imputation.items()
})
# NOTE(review): the statistics are computed on the full dataset before the
# train/test split, so the test rows influence the fill values — consider moving
# this step after the split.

In [None]:
# Separate the target column from the predictor columns.
y = data['Outcome']
X = data.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fill any remaining NaNs with per-column means learned from the training split.
# The same training means are applied to the test split so that both splits are
# transformed identically and no statistic is derived from the held-out rows.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_test = X_test.fillna(train_feature_means)

In [None]:
# Report the (rows, columns) shape of the features and target for each split.
for header, features, target in (
    ("Train set shape - X:", X_train, y_train),
    ("Test set shape - X:", X_test, y_test),
):
    print(header, features.shape, "y:", target.shape)

Train set shape - X: (614, 8) y: (614,)
Test set shape - X: (154, 8) y: (154,)


In [None]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of ``dataframe`` into categorical, numeric and cardinal groups.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.
    cat_th : int, default 10
        A non-object column with fewer than ``cat_th`` distinct values is
        treated as categorical.
    car_th : int, default 20
        An object column with more than ``car_th`` distinct values is treated
        as "categorical but cardinal".

    Returns
    -------
    tuple of (list, list, list)
        ``cat_cols``: object columns plus the low-cardinality non-object
        columns, minus the cardinal ones.
        ``num_cols``: non-object columns that were not re-classified as
        categorical.
        ``cat_but_car``: the high-cardinality object columns.

    A short summary of the group sizes is printed as a side effect.
    """
    # First pass: separate object-dtype columns from everything else.
    object_cols = []
    non_object_cols = []
    for name in dataframe.columns:
        if dataframe[name].dtypes == "O":
            object_cols.append(name)
        else:
            non_object_cols.append(name)

    # Distinct-value count per column drives both thresholds.
    distinct_counts = {name: dataframe[name].nunique() for name in dataframe.columns}

    num_but_cat = [name for name in non_object_cols if distinct_counts[name] < cat_th]
    cat_but_car = [name for name in object_cols if distinct_counts[name] > car_th]

    # Categorical = object columns + numeric-looking categoricals, without the cardinal ones.
    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    # Numeric = non-object columns that were not re-labelled as categorical.
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car
# Classify the columns of the full frame; `num_cols` is used by the scaling cell.
cat_cols, num_cols, cat_but_car = grab_col_names(data)


Observations: 768
Variables: 9
cat_cols: 1
num_cols: 8
cat_but_car: 0
num_but_cat: 1


In [None]:
# Standardise the numeric columns of `data` in place (zero mean, unit variance).
# `scaler` keeps the fitted per-column mean and scale so the same transformation
# can be applied to other data with `scaler.transform`.
scaler = StandardScaler().fit(data[num_cols])
data[num_cols] = scaler.transform(data[num_cols])
# NOTE(review): X_train / X_test were split off before this cell, so they are not
# scaled by it — only `data` is.
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.639947,0.865276,-0.021044,0.831114,-0.608201,0.167240,0.468492,1.425995,1
1,-0.844885,-1.205989,-0.516583,0.180566,-0.608201,-0.851551,-0.365061,-0.190672,0
2,1.233880,2.015979,-0.681762,-0.469981,-0.608201,-1.331838,0.604397,-0.105584,1
3,-0.844885,-1.074480,-0.516583,-0.469981,-0.006185,-0.633239,-0.920763,-1.041549,0
4,-1.141852,0.503626,-2.663916,0.831114,0.695378,1.549885,5.484909,-0.020496,1
...,...,...,...,...,...,...,...,...,...
763,1.827813,-0.679954,0.309315,2.240633,0.809145,0.065361,-0.908682,2.532136,0
764,-0.547919,0.010468,-0.186224,-0.036283,-0.608201,0.632973,-0.398282,-0.531023,0
765,0.342981,-0.022409,-0.021044,-0.469981,0.164466,-0.909768,-0.685193,-0.275760,0
766,-0.844885,0.141977,-1.012121,-0.469981,-0.608201,-0.342155,-0.371101,1.170732,1


In [None]:
# Readable names for the two values stored in the 'Outcome' column.
outcome_mapping = {0: 'Not Diabetic', 1: 'Diabetic'}

# Add a label column next to the numeric outcome, then count the rows per label.
data['Outcome_Label'] = data['Outcome'].map(outcome_mapping)
outcome_counts = data['Outcome_Label'].value_counts()

# Show the value -> label mapping followed by the per-label counts.
print(outcome_mapping, outcome_counts, sep='\n')

{0: 'Not Diabetic', 1: 'Diabetic'}
Not Diabetic    500
Diabetic        268
Name: Outcome_Label, dtype: int64


In [None]:
# X_train / X_test hold the unscaled features (the split happens before the scaling
# cell, which only rewrites `data`), and fitting LogisticRegression on them directly
# stops at the lbfgs iteration limit without converging. Bundling a StandardScaler
# with the classifier fixes that: the scaler is fitted on the training split only
# and the identical transformation is applied automatically at predict time, so
# `model` keeps accepting raw feature frames.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [None]:
# Print the evaluation metrics computed in the previous cell, one section per line.
print(
    f"Accuracy: {accuracy:.2f}",
    f"Confusion Matrix:\n{conf_matrix}",
    f"Classification Report:\n{classification_rep}",
    sep="\n",
)

Accuracy: 0.77
Confusion Matrix:
[[83 16]
 [19 36]]
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.84      0.83        99
           1       0.69      0.65      0.67        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

