Importing Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

Data Analysis

In [2]:
data = pd.read_csv("diabetes_prediction_project.csv")
data = pd.DataFrame(data)
temp_data = data.copy()

In [3]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
data.shape

(100000, 9)

In [5]:
data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [7]:
data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [8]:
data['diabetes'].value_counts()

diabetes
0    91500
1     8500
Name: count, dtype: int64

Data Preprocessing

In [9]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [10]:
data['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [11]:
# Since there are only 18 rows with gender as "Other" we drop them.
data = data[data["gender"] != "Other"]
data.shape

(99982, 9)

In [12]:
data['smoking_history'].value_counts()

smoking_history
No Info        35810
never          35092
former          9352
current         9286
not current     6439
ever            4003
Name: count, dtype: int64

In [13]:
# former, ever and not-current means the same thing> Hence we replace ever and not-current with former
data['smoking_history'].replace(['not current', 'ever'], 'former', inplace=True)
data['smoking_history'].replace('No Info', float('nan'), inplace=True)

In [14]:
data['smoking_history'].unique()

array(['never', nan, 'current', 'former'], dtype=object)

In [14]:
data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [15]:
data = pd.get_dummies(data, columns=['gender'], prefix='gender', drop_first=True)
data['gender'] = data['gender_Male'].astype(int)
data = data.drop('gender_Male', axis = 1)
data.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,never,25.19,6.6,140,0,0
1,54.0,0,0,,27.32,6.6,80,0,0
2,28.0,0,0,never,27.32,5.7,158,0,1
3,36.0,0,0,current,23.45,5.0,155,0,0
4,76.0,1,1,current,20.14,4.8,155,0,1


In [16]:
data.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,never,25.19,6.6,140,0,0
1,54.0,0,0,,27.32,6.6,80,0,0
2,28.0,0,0,never,27.32,5.7,158,0,1
3,36.0,0,0,current,23.45,5.0,155,0,0
4,76.0,1,1,current,20.14,4.8,155,0,1


In [17]:
data['smoking_history'].value_counts()

smoking_history
never      35092
former     19794
current     9286
Name: count, dtype: int64

In [18]:
data.isnull().sum()

age                        0
hypertension               0
heart_disease              0
smoking_history        35810
bmi                        0
HbA1c_level                0
blood_glucose_level        0
diabetes                   0
gender                     0
dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder

null_mask = data['smoking_history'].isnull()

encoder = LabelEncoder()
data.loc[~null_mask,'smoking_history'] = encoder.fit_transform(data.loc[~null_mask,'smoking_history'])
data.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,2.0,25.19,6.6,140,0,0
1,54.0,0,0,,27.32,6.6,80,0,0
2,28.0,0,0,2.0,27.32,5.7,158,0,1
3,36.0,0,0,0.0,23.45,5.0,155,0,0
4,76.0,1,1,0.0,20.14,4.8,155,0,1


In [20]:
from sklearn.impute import KNNImputer

# Extract the 'smoking_history' column as a Pandas DataFrame
smoking_history_column = data[['smoking_history']]

# Apply KNN imputation
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(smoking_history_column)

# Replace the original 'smoking_history' column with the imputed values
data['smoking_history'] = imputed_data

data.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,2.0,25.19,6.6,140,0,0
1,54.0,0,0,1.402138,27.32,6.6,80,0,0
2,28.0,0,0,2.0,27.32,5.7,158,0,1
3,36.0,0,0,0.0,23.45,5.0,155,0,0
4,76.0,1,1,0.0,20.14,4.8,155,0,1


Making the partitions

In [21]:
condition1 = (data['HbA1c_level'] < 6.7) & (data['blood_glucose_level'] < 210)
partition1 = data[condition1]

In [22]:
partition1.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,2.0,25.19,6.6,140,0,0
1,54.0,0,0,1.402138,27.32,6.6,80,0,0
2,28.0,0,0,2.0,27.32,5.7,158,0,1
3,36.0,0,0,0.0,23.45,5.0,155,0,0
4,76.0,1,1,0.0,20.14,4.8,155,0,1


In [23]:
partition1.shape

(94295, 9)

In [24]:
partition1['diabetes'].unique()

array([0, 1], dtype=int64)

In [25]:
condition2 = (data['HbA1c_level'] < 6.7) & (data['blood_glucose_level'] >= 210)
partition2 = data[condition2]

In [26]:
partition2.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
38,50.0,1,0,0.0,27.32,5.7,260,1,1
87,36.0,0,0,0.0,32.27,6.2,220,1,0
125,77.0,0,0,2.0,31.7,6.5,280,1,0
146,53.0,0,0,0.0,30.8,6.6,280,1,1
199,43.0,0,0,2.0,26.71,6.5,300,1,0


In [27]:
partition2['diabetes'].unique()

array([1], dtype=int64)

In [28]:
partition2.shape

(1792, 9)

In [29]:
condition3 = (data['HbA1c_level'] >= 6.7)
partition3 = data[condition3]

In [30]:
partition3.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
40,73.0,0,0,1.0,25.91,9.0,160,1,1
53,53.0,0,0,1.0,27.32,7.0,159,1,0
55,50.0,0,0,1.0,37.16,9.0,159,1,1
59,67.0,0,0,2.0,63.48,8.8,155,1,0
81,57.0,0,0,1.402138,27.32,8.2,126,1,1


In [31]:
partition3.shape

(3895, 9)

In [32]:
partition3['diabetes'].unique()

array([1], dtype=int64)

As we can see, partition2 and partition3 have only "1" as value in column "diabetes". Hence we will only work on "partition1

In [33]:
partition1.head()

Unnamed: 0,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes,gender
0,80.0,0,1,2.0,25.19,6.6,140,0,0
1,54.0,0,0,1.402138,27.32,6.6,80,0,0
2,28.0,0,0,2.0,27.32,5.7,158,0,1
3,36.0,0,0,0.0,23.45,5.0,155,0,0
4,76.0,1,1,0.0,20.14,4.8,155,0,1


In [34]:
partition1['diabetes'].value_counts()

diabetes
0    91482
1     2813
Name: count, dtype: int64

We can see, that the dataset is imbalanced, hence we will use the concept of oversampling t make the dataset balanced. But before oversampling, we will split the data into training and testing sets.

In [35]:
from sklearn.model_selection import train_test_split
X = partition1.drop(['diabetes'], axis = 1)
y = partition1['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [36]:
# Applying Oversampling
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=2)

# Appling oversampling on X_train and y_train and saving the results in themselves
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [37]:
print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_train_resampled: ", X_train_resampled.shape)
print("X_train_resampled: ", y_train_resampled.shape)

X_train:  (75436, 8)
y_train:  (75436,)
X_train_resampled:  (146376, 8)
X_train_resampled:  (146376,)


Now we have a balanced dataset and we are reasy to make a model

<big>Model 1: Logistic Regression</big>

In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create a Logistic Regression model
logistic_regression_model = LogisticRegression(random_state=42)

# Train the model
logistic_regression_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = logistic_regression_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7868
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.79      0.88     18294
           1       0.10      0.75      0.17       565

    accuracy                           0.79     18859
   macro avg       0.54      0.77      0.53     18859
weighted avg       0.96      0.79      0.86     18859



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<big>Model 2: KNN classifier</big>

In [39]:
from sklearn.neighbors import KNeighborsClassifier

# KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8603
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.87      0.92     18294
           1       0.12      0.56      0.19       565

    accuracy                           0.86     18859
   macro avg       0.55      0.72      0.56     18859
weighted avg       0.96      0.86      0.90     18859



<big>Model 3: XGBoost</big>

In [40]:
import xgboost as xgb

# XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Train the model
xgb_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = xgb_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9640
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18294
           1       0.29      0.14      0.19       565

    accuracy                           0.96     18859
   macro avg       0.63      0.56      0.58     18859
weighted avg       0.95      0.96      0.96     18859



<big>Model 5: Random Forest Classifier</big>

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100,criterion='gini',random_state=42)

# Train the model
rf_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9574
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.98     18294
           1       0.22      0.17      0.19       565

    accuracy                           0.96     18859
   macro avg       0.60      0.57      0.58     18859
weighted avg       0.95      0.96      0.95     18859



<big>Model 4: SVC</big>

In [None]:
from sklearn.svm import SVC

# SVC model
svc_model = SVC(random_state=42)

# Train the model
svc_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = svc_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Print classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))
