# 1. Importing Libraries

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn. preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.naive_bayes import CategoricalNB, BernoulliNB
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# 2. Importing Dataset

In [None]:
df = pd.read_csv('Healthcare-Diabetes.csv')

In [None]:
df.rename(columns = {'Outcome' : 'Diabetic'}, inplace = True)
df.head(5)

# 4. Feature Engineering

**Removing the irrelevant variables from the dataset.**

The `Id` doesn't help in predicting the outcome variable. Therefore, we will remove it along with other variables which shows very low or no correlation with the target variable.

In [None]:
df = df.drop(['Id','BloodPressure', 'SkinThickness','Insulin', 'DiabetesPedigreeFunction'], axis = 1)
df.head()

- The variables Glucose (0.46) and BMI (0.28) are normally distributed and highly correlated with diabetes, making them strong potential predictor variables. Age (0.23) and Pregnancies (0.22) also show a notable correlation.
- In addition, DiabetesPedigreeFunction (0.16) and Insulin (0.12) exhibit some correlation with diabetes.
- Given that the outcome variable is a binary categorical variable (diabetic or non-diabetic), we will need classification models to analyze and predict these relationships.

# 5. Naive Bayes

In [None]:
# defining predictor variable
y = df['Diabetic']
y

In [None]:
# defining target variable

x = df.copy()
x = x.drop(columns = [ 'Diabetic' ])
x.head()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.20, stratify = y, random_state = 0)
print(f" x train: {x_train.shape} \n x test:  {x_test.shape} \n y train: {y_train.shape} \n y test:  {y_test.shape}")

In [None]:
nb = BernoulliNB(binarize=0.0)
model = nb.fit(x_train, y_train)
model

In [None]:
y_pred = model.predict(x_test)
y_pred

In [None]:
# evaluating the model using metrics
print(f"Accuracy: {round(accuracy_score(y_test, y_pred),3)}")
print(f"Precision: {round(precision_score(y_test, y_pred),3)}")
print(f"recall: {round(recall_score(y_test, y_pred),3)}")
print(f"F1 Score: {round(f1_score(y_test, y_pred),3)}")


In [None]:
print(classification_report(y_test, y_pred))

In [None]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = nb.classes_)
disp.plot()

The Naive Bayes model predicted the value 0 for all the observations, resulting in a significant number of false negatives and false positives.

# 6. Random Forest Model

In [None]:
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, stratify = y_train, random_state = 0, test_size = 0.20)

In [None]:
# defining the hyperparamters
cv_params = {
    'max_depth' : [8,10,14,20],
    'n_estimators' : [20, 40, 60, 80, 100],
    'min_samples_leaf' : [0.25, 0.5, 1],
    'min_samples_split' : [0.001, 0.01, 0.05],
    'max_features' : ['sqrt'],
    'max_samples' : [.5, .9]
}

In [None]:
split_index = [0 if x in x_val.index else -1 for x in x_train.index]
custom_split = PredefinedSplit(split_index)

In [None]:
clf = RandomForestClassifier(random_state = 0)

In [None]:
rf = GridSearchCV(clf, cv_params, cv = custom_split, refit = 'f1', n_jobs = -1, verbose = 1)

In [None]:
rf.fit(x_train, y_train)

In [None]:
rf.best_params_

In [None]:
# using optimal paramters for grid search cross validation
rf_op = RandomForestClassifier(n_estimators = 20, max_depth = 20, max_features = 'sqrt', max_samples = 0.9, min_samples_leaf = 1, 
                              min_samples_split = 0.001, random_state = 0)

In [None]:
rf_op.fit(x_train, y_train)

In [None]:
y_pred = rf_op.predict(x_test)
y_pred

In [None]:
# evaluating the model using metrics
print(f"Accuracy: {round(accuracy_score(y_test, y_pred),3)}")
print(f"Precision: {round(precision_score(y_test, y_pred),3)}")
print(f"recall: {round(recall_score(y_test, y_pred),3)}")
print(f"F1 Score: {round(f1_score(y_test, y_pred),3)}")

In [None]:
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = rf_op.classes_)
                              
disp.plot()

# 7. Conclusion

In conclusion, the model exhibits strong performance across multiple evaluation metrics:

- Accuracy: 0.984, indicating a high overall correctness in its predictions.
- Precision: 0.995, demonstrating a high ability to correctly identify positive cases without many false positives.
- Recall: 0.958, highlighting the model's capability to capture a significant portion of true positive cases.
- F1 Score: 0.976, representing a balanced measure that combines precision and recall effectively.
- These results suggest that the model is robust and proficient in its task, achieving a high degree of accuracy and precision while maintaining a respectable recall rate.