In [1]:
### Importing Required Libraries
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pickle

In [2]:
### Importing CSV File
df = pd.read_csv("diabetes_prediction_dataset.csv")
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [3]:
### Duplicate Values Count
df.duplicated().sum()

3854

In [4]:
### Remove Duplicate Values
df.drop_duplicates(inplace=True)

In [5]:
df.shape

(96146, 9)

### EDA is already done in `notebook.ipynb` file

In [6]:
### Doing One-Hot Encoding of 'gender' and 'smoking_history' columns
dummy_df = pd.get_dummies(df[["gender","smoking_history"]], drop_first=True)
dummy_df.head()

Unnamed: 0,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0
2,1,0,0,0,0,1,0
3,0,0,1,0,0,0,0
4,1,0,1,0,0,0,0


In [7]:
### Concatinating One-Hot Encoding columns with other columns
X = pd.concat([df[['age','hypertension','heart_disease','bmi','HbA1c_level','blood_glucose_level']],dummy_df], axis=1)
X.head()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,0,0,0,0,0,1,0
1,54.0,0,0,27.32,6.6,80,0,0,0,0,0,0,0
2,28.0,0,0,27.32,5.7,158,1,0,0,0,0,1,0
3,36.0,0,0,23.45,5.0,155,0,0,1,0,0,0,0
4,76.0,1,1,20.14,4.8,155,1,0,1,0,0,0,0


In [8]:
### Defining y
y = df['diabetes']

In [10]:
### Value Counts of y; is person diabetic or not
y.value_counts()

0    87664
1     8482
Name: diabetes, dtype: int64

We can see our y is imbalanced. So for solving this problem, we oversample our dataset with creating imaginary '1' values in dataset.

In [11]:
### SMOTE
smote = SMOTE(sampling_strategy='minority')
X, y = smote.fit_resample(X, y)

y.value_counts()

0    87664
1    87664
Name: diabetes, dtype: int64

In [12]:
### Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10, stratify=y)

In [22]:
### Training our model
model = RandomForestClassifier(n_estimators=50, criterion='gini')

model.fit(X_train,y_train)

In [23]:
### Predicting for X_test
y_pred = model.predict(X_test)

In [24]:
### Classification Report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     17533
           1       0.98      0.97      0.97     17533

    accuracy                           0.97     35066
   macro avg       0.97      0.97      0.97     35066
weighted avg       0.97      0.97      0.97     35066



In [25]:
### Saving pickle file
with open('model.pickle','wb') as f:
    pickle.dump(model,f)