## Insuring that the good python version is installed

In [66]:
import sys
if sys.version_info[0:2] != (3, 12 ):
    raise Exception('Requires python 3.12')

### Importing all useful libraries

In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split , cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
import joblib
import os

### Importing the dataset

In [68]:
csv_path = os.path.join("..","data", 'TAIPEI_diabetes.csv')

Converting the csv file to a parquet file for better performance

In [69]:
pd.read_csv(csv_path).to_parquet("..//data//TAIPEI_diabetes.parquet", index=False)

Transforming the parquet file to a datafarme

In [70]:
df= pd.read_parquet("..//data//TAIPEI_diabetes.parquet", engine='auto')
df.set_index('PatientID', inplace=True)

### Dataset exploration

In [71]:
df.shape

(15000, 9)

In [72]:
print(df.columns)

Index(['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],
      dtype='object')


In [73]:
df.head()

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,0
1147438,8,92,93,47,36,21.240576,0.158365,23,0
1640031,7,115,47,52,35,41.511523,0.079019,23,0
1883350,9,103,78,25,304,29.582192,1.28287,43,1
1424119,1,85,59,27,35,42.604536,0.549542,22,0


In [74]:
df.describe()

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,3.224533,107.856867,71.220667,28.814,137.852133,31.509646,0.398968,30.137733,0.333333
std,3.39102,31.981975,16.758716,14.555716,133.068252,9.759,0.377944,12.089703,0.47142
min,0.0,44.0,24.0,7.0,14.0,18.200512,0.078044,21.0,0.0
25%,0.0,84.0,58.0,15.0,39.0,21.259887,0.137743,22.0,0.0
50%,2.0,104.0,72.0,31.0,83.0,31.76794,0.200297,24.0,0.0
75%,6.0,129.0,85.0,41.0,195.0,39.259692,0.616285,35.0,1.0
max,14.0,192.0,117.0,93.0,799.0,56.034628,2.301594,77.0,1.0


There is no NA in any of the feature values meaning that the data is "relatively" clean.

In [75]:
num_dup=df.index.duplicated().sum()
print(f'Number of duplicated IDs in PatientID: {num_dup}')

Number of duplicated IDs in PatientID: 105


In [76]:
df.loc[df.index[df.index.duplicated()].unique()].head(50)

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1429759,8,92,67,48,171,36.300158,0.153597,39,1
1429759,1,113,65,20,73,45.593722,0.768987,46,1
1149892,0,126,52,43,35,21.173514,0.268624,23,0
1149892,6,148,55,11,22,27.019037,0.300583,55,0
1861748,2,170,64,42,97,26.643396,0.195699,41,1
1861748,10,138,53,45,195,43.715952,0.260733,25,0
1033352,0,157,97,34,221,36.263025,0.182434,60,0
1033352,9,111,100,8,79,31.445866,0.283731,38,1
1806908,1,111,81,31,191,20.20412,0.096086,22,0
1806908,7,97,66,45,178,39.075164,0.112161,26,0


When examining the first 20 duplicates, we observed that only the IDs are duplicated, while the feature values differ. 
This variation indicates that these are not exact duplicates.  

Showing the count and average number of pregnancies for cases where pregnancies exceed 3 in the dataset.

In [77]:
df.loc[df.Pregnancies >= 3].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,6820.0,6820.0,6820.0,6820.0,6820.0,6820.0,6820.0,6820.0,6820.0
mean,6.396774,110.154692,72.158211,29.895455,154.299707,32.576146,0.429397,32.318915,0.578152


Showing the count and average number of pregnancies for cases where pregnancies exceed 11 in the dataset.

In [78]:
df.loc[df.Pregnancies > 11].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,12.819355,115.883871,73.354839,34.129032,194.864516,35.319763,0.517137,36.851613,1.0


Showing the count and average number of pregnancies for cases where number of pregnancies is 0 in the dataset.

In [79]:
df.loc[df.Pregnancies == 0].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,4377.0,4377.0,4377.0,4377.0,4377.0,4377.0,4377.0,4377.0,4377.0
mean,0.0,105.003655,69.937172,27.365776,114.208362,30.065794,0.356197,27.290153,0.012109


Displaying the count and mean BMI for individuals with a BMI below 25 in the dataset.

In [80]:
df.loc[df.BMI<25].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,5416.0,5416.0,5416.0,5416.0,5416.0,5416.0,5416.0,5416.0,5416.0
mean,2.54062,106.175222,70.667836,27.741507,122.049114,20.542919,0.369461,28.196824,0.103582


Displaying the count and mean BMI for individuals with a BMI above 30 in the dataset.

In [81]:
df.loc[df.BMI>=30].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,8248.0,8248.0,8248.0,8248.0,8248.0,8248.0,8248.0,8248.0,8248.0
mean,3.469932,108.252546,71.370999,29.134093,143.344447,39.316327,0.41089,30.740907,0.411857


Displaying the count and mean SerumInsulin for individuals with a SerumInsulin below 16 in the dataset.

In [82]:
df.loc[df.SerumInsulin<16].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0,289.0
mean,2.577855,104.982699,68.951557,28.965398,14.453287,31.327474,0.352801,28.321799,0.134948


Displaying the count and mean SerumInsulin for individuals with a SerumInsulin above 166 in the dataset.

In [83]:
df.loc[df.SerumInsulin>166].describe().loc[['count', 'mean']]

Unnamed: 0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
count,5077.0,5077.0,5077.0,5077.0,5077.0,5077.0,5077.0,5077.0,5077.0
mean,3.61296,108.820563,71.476462,29.180225,283.839472,32.11702,0.418762,31.401615,0.454205


Replacing 0 and 1 in diabetis status by Non Diabetic and Diabetic repectively.

In [84]:
df2=df.copy()
df2["Diabetic"]=df2["Diabetic"].replace([0,1],["Non diabetic","Diabetic"])

In [85]:
df2.head(15)

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic
1883350,9,103,78,25,304,29.582192,1.28287,43,Diabetic
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic
1619297,0,82,92,9,253,19.72416,0.103424,26,Non diabetic
1660149,0,133,47,19,227,21.941357,0.17416,21,Non diabetic
1458769,0,67,87,43,36,18.277723,0.236165,26,Non diabetic
1201647,8,80,95,33,24,26.624929,0.443947,53,Diabetic
1403912,1,72,31,40,42,36.889576,0.103944,26,Non diabetic


In [86]:
plt.pie(df2.Diabetic.value_counts(),
        labels= df2.Diabetic.unique(),
        autopct='%1.2f%%',
        explode=[0.05,0.05],
        shadow =True,
       colors = ["c","y"],
       startangle=60)

plt.title("Distribution of diabetic and non-diabetic in the dataset")
plt.show()

df2.Diabetic.value_counts()


  plt.show()


Diabetic
Non diabetic    10000
Diabetic         5000
Name: count, dtype: int64

In [87]:
diabetic_data = df2[df2["Diabetic"] == "Diabetic"]
non_diabetic_data = df2[df2["Diabetic"] == "Non diabetic"]

pregnancy_values = sorted(df2["Pregnancies"].unique())
bin_edges = np.arange(min(pregnancy_values) - 0.5, 
                      max(pregnancy_values) + 1.5, 1)  # Adjust the bin edges for centering

# Plot histograms for both diabetic and non-diabetic groups
plt.hist(diabetic_data["Pregnancies"], 
         bins=bin_edges, 
         alpha=0.5, 
         label='Diabetic', 
         edgecolor='black', 
         align='mid')
plt.hist(non_diabetic_data["Pregnancies"], 
         bins=bin_edges, 
         alpha=0.5, 
         label='Non-Diabetic', 
         edgecolor='black', 
         align='mid')

plt.xticks(pregnancy_values) 
plt.xlabel('Number of Pregnancies')
plt.ylabel('Frequency')
plt.title('Histogram of Pregnancies Among Diabetic and Non-Diabetic Individuals')
plt.legend()
plt.show()


  plt.show()


In [88]:
df3=df2.copy()

Categorizing BMI values and adding a new column cat_BMI to the dataset (not for cats! 😆).

In [89]:
def categorize_bmi(bmi):
    if 18.5 <= bmi <= 24.9:
        return 'normal'
    elif 25 <= bmi <= 29.9:
        return 'overweight'
    elif bmi >= 30:
        return 'obese'
    else:
        return 'underweight'

df3['cat_BMI'] = df3['BMI'].apply(categorize_bmi)
df3.head()

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,cat_BMI
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic,obese
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic,normal
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic,obese
1883350,9,103,78,25,304,29.582192,1.28287,43,Diabetic,overweight
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic,obese


Categorizing SerumInsulin values and adding a new column cat_Insulin to the dataset.

In [90]:
def categorize_insulin(insulin):
    if insulin < 16:
        return 'low'
    elif 16 <= insulin <= 166:
        return 'normal'
    else:
        return 'high'
df3['cat_Insulin'] = df3['SerumInsulin'].apply(categorize_insulin)
df3.head()

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,cat_BMI,cat_Insulin
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic,obese,normal
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic,normal,normal
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic,obese,normal
1883350,9,103,78,25,304,29.582192,1.28287,43,Diabetic,overweight,high
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic,obese,normal


Categorizing DiastolicBloodPressure values and adding a new column cat_BP to the dataset.

In [91]:
def categorize_bp(bp):
    if bp < 80:
        return 'low'
    elif 80 <= bp <= 120:
        return 'normal'
    else:
        return 'high'
df3['cat_BP'] = df3['DiastolicBloodPressure'].apply(categorize_bp)
df3.head(20)

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,cat_BMI,cat_Insulin,cat_BP
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic,obese,normal,normal
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic,normal,normal,normal
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic,obese,normal,low
1883350,9,103,78,25,304,29.582192,1.28287,43,Diabetic,overweight,high,low
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic,obese,normal,low
1619297,0,82,92,9,253,19.72416,0.103424,26,Non diabetic,normal,high,normal
1660149,0,133,47,19,227,21.941357,0.17416,21,Non diabetic,normal,high,low
1458769,0,67,87,43,36,18.277723,0.236165,26,Non diabetic,underweight,normal,normal
1201647,8,80,95,33,24,26.624929,0.443947,53,Diabetic,overweight,normal,normal
1403912,1,72,31,40,42,36.889576,0.103944,26,Non diabetic,obese,normal,low


Categorizing PlasmaGlucose values and adding a new column cat_PlasmaGlucose to the dataset.

In [92]:
def categorize_plasma_glucose(pg):
    if pg < 140:
        return 'normal'
    elif 140 <= pg <= 199:
        return 'prediabetes'
    else:
        return 'diabetes'

# Apply the function to create a new column for categorized plasma glucose
df3['cat_PlasmaGlucose'] = df3['PlasmaGlucose'].apply(categorize_plasma_glucose)

# Display the first few rows of the dataframe
df3.head(10)

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,cat_BMI,cat_Insulin,cat_BP,cat_PlasmaGlucose
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic,obese,normal,normal,prediabetes
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic,normal,normal,normal,normal
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic,obese,normal,low,normal
1883350,9,103,78,25,304,29.582192,1.28287,43,Diabetic,overweight,high,low,normal
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic,obese,normal,low,normal
1619297,0,82,92,9,253,19.72416,0.103424,26,Non diabetic,normal,high,normal,normal
1660149,0,133,47,19,227,21.941357,0.17416,21,Non diabetic,normal,high,low,normal
1458769,0,67,87,43,36,18.277723,0.236165,26,Non diabetic,underweight,normal,normal,normal
1201647,8,80,95,33,24,26.624929,0.443947,53,Diabetic,overweight,normal,normal,normal
1403912,1,72,31,40,42,36.889576,0.103944,26,Non diabetic,obese,normal,low,normal


Categorizing TricepsThickness values and adding a new column cat_TricepsThickness to the dataset.

In [93]:
def categorize_triceps_thickness(tt):
    if tt < 12:
        return 'low'
    elif 12 <= tt <= 29:
        return 'normal'
    else:
        return 'high'

df3['cat_TricepsThickness'] = df3['TricepsThickness'].apply(categorize_triceps_thickness)

df3
#df3.groupby('Diabetic', group_keys=False).apply(lambda x: x.sample(n=7, random_state=42), include_groups=False).reset_index(drop=True)

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic,cat_BMI,cat_Insulin,cat_BP,cat_PlasmaGlucose,cat_TricepsThickness
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1354778,0,171,80,34,23,43.509726,1.213191,21,Non diabetic,obese,normal,normal,prediabetes,high
1147438,8,92,93,47,36,21.240576,0.158365,23,Non diabetic,normal,normal,normal,normal,high
1640031,7,115,47,52,35,41.511523,0.079019,23,Non diabetic,obese,normal,low,normal,high
1883350,9,103,78,25,304,29.582192,1.282870,43,Diabetic,overweight,high,low,normal,normal
1424119,1,85,59,27,35,42.604536,0.549542,22,Non diabetic,obese,normal,low,normal,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490300,10,65,60,46,177,33.512468,0.148327,41,Diabetic,obese,high,low,normal,high
1744410,2,73,66,27,168,30.132636,0.862252,38,Diabetic,obese,high,low,normal,normal
1742742,0,93,89,43,57,18.690683,0.427049,24,Non diabetic,normal,normal,normal,normal,high
1099353,0,132,98,18,161,19.791645,0.302257,23,Non diabetic,normal,normal,normal,normal,normal


Comparing the mean of each feature in each disease status.

In [94]:
piv_table = df2.pivot_table(values=['Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],index="Diabetic",aggfunc="mean")

piv_table

Unnamed: 0_level_0,Age,BMI,DiabetesPedigree,DiastolicBloodPressure,PlasmaGlucose,Pregnancies,SerumInsulin,TricepsThickness
Diabetic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Diabetic,35.9952,34.414834,0.48999,73.3846,113.6462,5.1778,184.4034,31.9532
Non diabetic,27.209,30.057052,0.353457,70.1387,104.9622,2.2479,114.5765,27.2444


In [95]:
categories = piv_table.columns
diabetic_means = piv_table.iloc[0,:].values
non_diabetic_means = piv_table.iloc[1,:].values

# Bar chart setup
x = np.arange(len(categories))  # X-axis positions for features
width = 0.35  # Width of the bars

plt.figure(figsize=(12, 6))

# Plot bars for diabetic and non-diabetic groups
plt.bar(x - width/2, 
        diabetic_means, 
        width, 
        label='Diabetic', 
        color='y')
plt.bar(x + width/2, 
        non_diabetic_means, 
        width, 
        label='Non-Diabetic',
        color='c')

# Add labels, title, and legend
plt.xlabel('Features')
plt.ylabel('Average values')
plt.title('Comparison of feature averages: Diabetic vs. Non-Diabetic')
plt.xticks(x, categories, rotation=45)  # Add feature names on x-axis
plt.legend()

plt.tight_layout()
plt.show()

  plt.show()


In [96]:
x_prime=x[[0,2]]

plt.bar( x_prime- width/2, 
        diabetic_means[[2,5]], 
        label='Diabetic', 
        color='y')
plt.bar(x_prime + width/2, 
        non_diabetic_means[[2,5]], 
        label='Non-Diabetic', 
        color='c')
plt.xlabel('Features')
plt.ylabel('Average values')
plt.title('Comparison of feature averages: Diabetic vs. Non-Diabetic')
plt.xticks(x_prime, categories[[2,5]], rotation=45)  # Add feature names on x-axis
plt.legend()

plt.tight_layout()
plt.show()

  plt.show()


In [97]:
#lets see if we can see better in correlation matrix now

In [98]:
plt.figure(figsize=(16, 6))
# define the mask to set the values in the upper triangle to True
mask = np.triu(np.ones_like(df.corr()))
heatmap = sns.heatmap(df.corr(), mask=mask, vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Lower triangle correlation heatmap of features')

Text(0.5, 1.0, 'Lower triangle correlation heatmap of features')

In [99]:
plt.figure(figsize=(15, 10))  
for i, feature in enumerate(categories, 1):
    plt.subplot(3, 3, i) 
    sns.boxplot(data=df2, y='Diabetic', x=feature, hue='Diabetic', palette="RdBu", legend=False)
    plt.title(f" Boxlot: {feature} by Diabetes Status")
    plt.xlabel("")
    plt.ylabel(feature)

plt.tight_layout() 
plt.show()

  plt.show()


In [100]:
plt.figure(figsize=(15, 10)) 
for i, feature in enumerate(categories, 1):
    plt.subplot(3, 3, i) 
    sns.violinplot(data=df2, y='Diabetic', x=feature, hue='Diabetic', palette="RdBu", legend=False)
    plt.title(f"Violin Plot: {feature} by Diabetes Status")
    plt.xlabel("")
    plt.ylabel(feature)

plt.tight_layout()  
plt.show()

  plt.show()


# SCALING

In [101]:
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df.drop(columns=['Diabetic'])), 
                         columns=df.drop(columns=['Diabetic']).columns, 
                         index=df.index)

scaled_df.head()

Unnamed: 0_level_0,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1354778,-0.950935,1.974401,0.523884,0.356298,-0.863136,1.229683,2.154424,-0.755853
1147438,1.408315,-0.495823,1.299626,1.249448,-0.765438,-1.052302,-0.636632,-0.590417
1640031,1.113409,0.223356,-1.445306,1.592967,-0.772953,1.024922,-0.846581,-0.590417
1883350,1.703222,-0.151868,0.404539,-0.262036,1.248633,-0.197512,2.338793,1.063938
1424119,-0.656029,-0.714703,-0.729237,-0.124629,-0.772953,1.136926,0.398417,-0.673135


In [102]:
# Displaying the distribution of the  scaled data
plt.figure(figsize=(15, 10)) 
for i, feature in enumerate(scaled_df.columns, 1):
    plt.subplot(3, 3, i) 
    sns.violinplot(data=scaled_df, y=df2['Diabetic'], x=feature, hue=df2['Diabetic'], palette="RdBu", legend=False)
    plt.title(f"Violin Plot: {feature} by Diabetes Status")
    plt.xlabel("")
    plt.ylabel(feature)

plt.tight_layout()  
plt.show()

  plt.show()


# MODELS TRAINING

## Data splitting

In [103]:
X = scaled_df  
y = df['Diabetic'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Logistic regression

In [104]:
lr_model = LogisticRegression(max_iter=1000)

lr_model.fit(X_train, y_train)

y_pred_lr = lr_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred_lr)
labels = lr_model.classes_  

plt.figure(figsize=(2, 2))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.title('Confusion matrix')
plt.show()

print("\nLogistic Regression classification report:\n", classification_report(y_test, y_pred_lr))

print("\nCoefficients:\n", lr_model.coef_)
print("\nIntercept:\n", lr_model.intercept_)


  plt.show()



Logistic Regression classification report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85      3005
           1       0.73      0.57      0.64      1495

    accuracy                           0.79      4500
   macro avg       0.77      0.73      0.74      4500
weighted avg       0.78      0.79      0.78      4500


Coefficients:
 [[0.91743026 0.27913694 0.17059346 0.33980193 0.53074207 0.45943886
  0.37558231 0.69901575]]

Intercept:
 [-1.00090718]


In [105]:
lr_precision, lr_recall, lr_fscore, lr_support = precision_recall_fscore_support(y_test, y_pred_lr)

### Decison tree

In [106]:
dt_model = DecisionTreeClassifier( max_depth=5, random_state=42)

dt_model.fit(X_train, y_train)

y_pred_dt = dt_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred_dt)
labels = dt_model.classes_ 


plt.figure(figsize=(2, 2))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.title('Confusion matrix')
plt.show()

print("\nDecision tree classification Report:\n", classification_report(y_test, y_pred_dt))

print("\nFeature Importances:\n")
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': dt_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)


# Optional: Plot Feature Importances
plt.figure(figsize=(8, 4))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importances')
plt.show()


Decision tree classification Report:
               precision    recall  f1-score   support

           0       0.93      0.90      0.92      3005
           1       0.82      0.87      0.84      1495

    accuracy                           0.89      4500
   macro avg       0.88      0.89      0.88      4500
weighted avg       0.90      0.89      0.89      4500


Feature Importances:

                  Feature  Importance
0             Pregnancies    0.554377
5                     BMI    0.228715
4            SerumInsulin    0.106186
7                     Age    0.097404
1           PlasmaGlucose    0.010806
3        TricepsThickness    0.001562
6        DiabetesPedigree    0.000951
2  DiastolicBloodPressure    0.000000


  plt.show()
  plt.figure(figsize=(8, 4))
  plt.show()


### Random forest

In [107]:
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

cm = confusion_matrix(y_test, y_pred_rf)
labels = rf_model.classes_ 

plt.figure(figsize=(2, 2))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted labels')
plt.ylabel('Actual labels')
plt.title('Confusion matrix')
plt.show()

print("\nRandom Foret classification report:\n", classification_report(y_test, y_pred_rf))

print("\nFeature Importances:\n")
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': rf_model.feature_importances_})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)
print(feature_importances)

plt.figure(figsize=(8, 4))
sns.barplot(x='Importance', y='Feature', data=feature_importances)
plt.title('Feature Importances')
plt.show()


Random Foret classification report:
               precision    recall  f1-score   support

           0       0.94      0.96      0.95      3005
           1       0.91      0.88      0.90      1495

    accuracy                           0.93      4500
   macro avg       0.93      0.92      0.92      4500
weighted avg       0.93      0.93      0.93      4500


Feature Importances:

                  Feature  Importance
0             Pregnancies    0.345571
5                     BMI    0.180496
7                     Age    0.176082
4            SerumInsulin    0.111852
1           PlasmaGlucose    0.087922
3        TricepsThickness    0.039675
2  DiastolicBloodPressure    0.033140
6        DiabetesPedigree    0.025262


  plt.show()
  plt.show()


#### Saving the models

In [108]:
os.makedirs("../models", exist_ok=True)

joblib.dump(rf_model, "../models/random_forest_model.pkl")
joblib.dump(dt_model, "../models/decision_tree_model.pkl")
joblib.dump(lr_model, "../models/logistic_regression_model.pkl")

['../models/logistic_regression_model.pkl']

### Graph results of metrics of the 3 models

In [109]:
metrics = ["Precision", "Recall", "F1-Score", "Accuracy"]
models = ["Logistic Regression", "Decision Tree", "Random Forest"]

#logistic regression metrics

lr_precision = precision_score(y_test, y_pred_lr, average='macro')
lr_recall = recall_score(y_test, y_pred_lr, average='macro')
lr_f1 = f1_score(y_test, y_pred_lr, average='macro')
lr_accuracy = accuracy_score(y_test, y_pred_lr)

# decision tree metrics

dt_precision = precision_score(y_test, y_pred_dt, average='macro')
dt_recall = recall_score(y_test, y_pred_dt, average='macro')
dt_f1 = f1_score(y_test, y_pred_dt, average='macro')
dt_accuracy = accuracy_score(y_test, y_pred_dt)


#random forest metrics

rf_precision = precision_score(y_test, y_pred_rf, average='macro')
rf_recall = recall_score(y_test, y_pred_rf, average='macro')
rf_f1 = f1_score(y_test, y_pred_rf, average='macro')
rf_accuracy = accuracy_score(y_test, y_pred_rf)


lr_values = [lr_precision, lr_recall, lr_f1, lr_accuracy]
dt_values = [dt_precision, dt_recall, dt_f1, dt_accuracy]
rf_values = [rf_precision, rf_recall, rf_f1, rf_accuracy]


In [110]:
metric_values = np.array([lr_values, dt_values, rf_values])
x = np.arange(len(metrics)) 
width = 0.25 

fig, ax = plt.subplots(figsize=(10, 6))

ax.bar(x - width, metric_values[0], width, label="Logistic Regression")
ax.bar(x, metric_values[1], width, label="Decision Tree")
ax.bar(x + width, metric_values[2], width, label="Random Forest")
ax.set_xlabel("Metrics")
ax.set_ylabel("Scores")
ax.set_title("Performance comparison of machine learning models")
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend(loc='center left',fontsize ="x-small")


plt.show()

  plt.show()


In [111]:
cv_f1_scores = cross_val_score(rf_model, X, y, cv=5, scoring='f1_macro')
cv_accuracy_scores = cross_val_score(rf_model, X, y, cv=5, scoring='accuracy')

# Print Cross-validation results
print("Cross-Validation Results:")
print("-" * 80)
print("F1 Macro Scores:", cv_f1_scores)
print("Mean F1 Score:", np.mean(cv_f1_scores))
print("F1 Score Standard Deviation:", np.std(cv_f1_scores))

print("\nAccuracy Scores:", cv_accuracy_scores)
print("Mean Accuracy:", np.mean(cv_accuracy_scores))
print("Accuracy Standard Deviation:", np.std(cv_accuracy_scores))

Cross-Validation Results:
--------------------------------------------------------------------------------
F1 Macro Scores: [0.92426502 0.93176299 0.92188065 0.92279409 0.93237763]
Mean F1 Score: 0.9266160757209825
F1 Score Standard Deviation: 0.004522070085730203

Accuracy Scores: [0.93366667 0.93966667 0.931      0.93166667 0.94033333]
Mean Accuracy: 0.9352666666666666
Accuracy Standard Deviation: 0.003968766950969921


## PIMA dataset exploration

In [112]:
# Visualising the PIMA dataset distributions using violin plots
pima_path = os.path.join("..","data", 'PIMA_diabetes.csv')
pima = pd.read_csv(pima_path)
pima['Outcome'] = pima['Outcome'].replace({0: 'Non-Diabetic', 1: 'Diabetic'})

In [113]:
plt.figure(figsize=(15, 10)) 
for i, feature in enumerate(pima.drop(columns=["Outcome"]).columns, 1):
    plt.subplot(3, 3, i) 
    sns.violinplot(data=pima, y=pima["Outcome"], x=feature, hue=pima["Outcome"], palette="Greens", legend=False)
    plt.title(f"Violin plot: {feature} by Diabetes Status")
    plt.xlabel("")
    plt.ylabel(feature)

plt.tight_layout()  
plt.show()

  plt.show()
