# Diabetes Prediction Model

In [None]:
"""
Data Analysis inspired by:
- Original notebook: https://www.kaggle.com/code/yousseftaha3/diabetes-prediction-random-forest-97-3
- Author: Youssef Taha
- License: Apache 2.0

Modified and adapted for Intelligent Health Prediction System - university project.
"""

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, \
    classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the raw dataset (relative path) and print column dtypes and non-null counts.
df = pd.read_csv("../data/diabetes_prediction_dataset.csv")
df.info()

In [None]:
# (rows, columns) of the raw frame.
print("Shape:", df.shape)

In [None]:
# Summary statistics as produced by pandas' default describe().
df.describe()

In [None]:
# Five randomly drawn rows; no random_state is passed, so the rows differ between runs.
df.sample(5)

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Count of rows that exactly repeat an earlier row.
print("Duplicates:", df.duplicated().sum())

# Data Cleaning

In [None]:
# Drop rows that exactly repeat an earlier row.
df = df.drop_duplicates()

# Convert the two text columns to the pandas 'category' dtype in one call.
cat_cols = ['gender', 'smoking_history']
df = df.astype({name: 'category' for name in cat_cols})

In [None]:
# Re-count duplicated rows after drop_duplicates().
print("Duplicates:", df.duplicated().sum())

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Column dtypes after the categorical conversion.
df.dtypes

In [None]:
# Five randomly drawn rows of the cleaned frame (unseeded, so they differ between runs).
df.sample(5)

# Data Visualization

In [None]:
# Histogram of the 'age' column with 30 bins, drawn on an explicit axes object.
fig, ax = plt.subplots(figsize=(12, 5))
df['age'].hist(bins=30, ax=ax)
ax.set_title("Age Distribution")
plt.show()

In [None]:
# One count plot per object/category column, each on its own 6x4 figure.
categorical_cols = df.select_dtypes(include=["object", "category"]).columns
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=col, data=df, ax=ax)
    ax.set_title(f"Count of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# Horizontal box plot of the 'bmi' column.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x=df['bmi'], ax=ax)
ax.set_title("BMI Boxplot")
plt.show()

In [None]:
# Scatter of age against BMI; point colour encodes the 'diabetes' label.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='age', y='bmi', hue='diabetes', ax=ax)
ax.set_title("Age vs BMI (colored by diabetes)")
plt.show()

In [None]:
# Scatter of BMI against blood glucose level; point colour encodes the 'diabetes' label.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x="bmi", y="blood_glucose_level", hue="diabetes", data=df, ax=ax)
ax.set_title("BMI vs Glucose")
plt.show()

In [None]:
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
numeric_df = df.select_dtypes(include=[np.number])

# Create an explicit figure, as every other plotting cell does, so the
# annotated matrix is not squeezed into the default figure size.
plt.figure(figsize=(8, 6))
# fmt=".2f" keeps every annotation at two decimals so the cells line up.
sns.heatmap(numeric_df.corr(), annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Feature Engineering & Preprocessing

In [None]:
# Integer-encode the categorical columns on a copy so `df` keeps the original labels.
df_encoded = df.copy()
label_cols = ['gender', 'smoking_history']

# Keep one fitted encoder per column. Refitting a single shared encoder
# overwrites the mapping of every column but the last, which makes it
# impossible to decode or re-apply the 'gender' encoding later.
encoders = {}
for col in label_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    encoders[col] = le
# After the loop `le` is the encoder of the last column in label_cols.

In [None]:
# Split the encoded frame into features and target.
X = df_encoded.drop('diabetes', axis=1)
y = df_encoded['diabetes']

# Rank features on a training portion only. Fitting the ranking forest on all
# rows would let the rows later used for validation/testing influence which
# features are kept (data leakage).
# The split parameters mirror the first train_test_split call used for the
# train/validation/test split in this notebook (test_size=0.3, random_state=42,
# stratified on y).
# NOTE(review): this is expected to reproduce the same 70% training partition -
# confirm if the later split is ever changed.
X_rank, _, y_rank, _ = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_rank, y_rank)

# Impurity-based importances, highest first.
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(importances)

# Keep the five highest-ranked features.
selected_features = importances.head(5).index.tolist()
print("\nSelected Features:", selected_features)

X_selected = X[selected_features]

In [None]:
# Bar chart of the sorted feature importances.
fig, ax = plt.subplots(figsize=(8, 5))
importances.plot(kind='bar', color='teal', ax=ax)
ax.set_title("Feature Importances (Random Forest)")
ax.set_ylabel("Importance")
ax.set_xlabel("Feature")
fig.tight_layout()
plt.show()

## Only the top 5 features contribute meaningfully to the prediction, so only those features are used for model training.

In [None]:
X_final, y_final = X_selected, y

# First cut: 70% training, 30% held out (stratified on the target).
X_train, X_temp, y_train, y_temp = train_test_split(
    X_final, y_final, test_size=0.3, stratify=y_final, random_state=42
)

# Second cut: halve the held-out part into validation and test (stratified again).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

for label, part in (
    ("Train shape:", X_train),
    ("Validation shape:", X_val),
    ("Test shape:", X_test),
):
    print(label, part.shape)

In [None]:
# Five randomly drawn training feature rows (unseeded).
X_train.sample(5)

In [None]:
# Five randomly drawn training labels (unseeded, so not aligned with the rows shown above).
y_train.sample(5)

In [None]:
# Oversample class 1 of the TRAINING split only; validation and test stay untouched.
print("Original:", Counter(y_train))

# Desired number of class-1 training rows after oversampling.
target_minority = 10000

# Vectorised class counts (the builtin sum() would walk the Series element by element).
n_min = int((y_train == 1).sum())
n_maj = int((y_train == 0).sum())

# Clamp the target: never above the majority count, and never below the number
# of class-1 rows already present, because SMOTE can only add rows.
max_minority = max(n_min, min(target_minority, n_maj))

sampling_ratio = max_minority / n_maj

if max_minority > n_min:
    sm = SMOTE(sampling_strategy=sampling_ratio, random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
else:
    # The minority class already meets the target. imbalanced-learn rejects a
    # ratio that asks for no new samples, so pass the data through unchanged.
    X_train_res, y_train_res = X_train.copy(), y_train.copy()

print("Counts after SMOTE:", Counter(y_train_res))
print("Train after SMOTE:", X_train_res.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

In [None]:
# Learn mean/variance from the resampled training set only, then apply the same
# transformation to all three splits.
scaler = StandardScaler().fit(X_train_res)
X_train_res_scaled = scaler.transform(X_train_res)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Model Selection

## 1. Logistic Regression


In [None]:
# Baseline linear model, trained on the scaled, resampled training data.
log_reg = LogisticRegression(random_state=42).fit(X_train_res_scaled, y_train_res)

# Validation-set evaluation.
y_val_pred = log_reg.predict(X_val_scaled)
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Logistic Regression accuracy:", accuracy_score(y_val, y_val_pred))

##  2. Decision Tree

In [None]:
# Single decision tree on the unscaled, resampled training data.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_res, y_train_res)

# Validation-set evaluation.
y_val_pred_dt = dt_model.predict(X_val)
print(classification_report(y_val, y_val_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_dt))
print("Decision Tree accuracy:", accuracy_score(y_val, y_val_pred_dt))

## 3. Random Forest Classifier

In [None]:
# Random forest with library defaults on the unscaled, resampled training data.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_res, y_train_res)

# Validation-set evaluation.
y_val_pred_rf = rf_model.predict(X_val)
print(classification_report(y_val, y_val_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_rf))
print("Random Forest accuracy:", accuracy_score(y_val, y_val_pred_rf))

## 3a. Random Forest with class_weight='balanced'

In [None]:
# 200-tree forest with unrestricted depth and class weights set to 'balanced'.
rf_balanced_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf_balanced = RandomForestClassifier(**rf_balanced_params)
rf_balanced.fit(X_train_res, y_train_res)

# Validation-set evaluation at the default 0.5 decision threshold.
y_val_pred_bal = rf_balanced.predict(X_val)
print(classification_report(y_val, y_val_pred_bal))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred_bal))
print("Random Forest (class_weight='balanced') accuracy:", accuracy_score(y_val, y_val_pred_bal))

### Optimal threshold search

In [None]:
# Predicted probability of class 1 for every validation row.
y_val_probs = rf_balanced.predict_proba(X_val)[:, 1]

# Candidate decision thresholds from 0.10 up to (but excluding) 0.90 in steps of 0.01.
thresholds = np.arange(0.1, 0.9, 0.01)

best_f1, best_threshold = 0, 0
f1_scores = []

for thr in thresholds:
    y_val_pred_thr = np.where(y_val_probs >= thr, 1, 0)
    f1 = f1_score(y_val, y_val_pred_thr, pos_label=1)
    f1_scores.append(f1)

    # Only a strictly better score replaces the current best, so ties keep the lower threshold.
    if f1 <= best_f1:
        continue
    best_f1, best_threshold = f1, thr

print(f"Best threshold for class 1: {best_threshold}")
print(f"Best F1-score for class 1: {best_f1}")

In [None]:
# Re-label the validation set with the F1-optimal threshold found by the search.
threshold = best_threshold
y_val_pred = np.where(y_val_probs >= threshold, 1, 0)

val_acc = accuracy_score(y_val, y_val_pred)
print("Random Forest - Validation Accuracy :", val_acc)
print(classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


## 4. Multi-Layer Perceptron - Neural Network

In [None]:
from sklearn.neural_network import MLPClassifier

# scikit-learn multi-layer perceptron; the settings are collected in one dict
# and unpacked into the constructor.
mlp_params = {
    'hidden_layer_sizes': (128, 64, 32),  # 3 hidden layers
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0001,
    'batch_size': 32,
    'learning_rate': 'adaptive',
    'max_iter': 1000,
    'random_state': 42,
    'early_stopping': True,
    'validation_fraction': 0.1,
}
mlp = MLPClassifier(**mlp_params)

# Trained and evaluated on the scaled features.
mlp.fit(X_train_res_scaled, y_train_res)
y_val_pred_mlp = mlp.predict(X_val_scaled)

print(classification_report(y_val, y_val_pred_mlp))
print("MLP Accuracy:", accuracy_score(y_val, y_val_pred_mlp))


## 5. Deep Neural Network

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Seed TensorFlow's global RNG, matching the random_state=42 used by the other
# models in this notebook, so repeated runs start from the same state.
tf.random.set_seed(42)

# Take the input width from the data instead of hard-coding it, so the network
# still builds if the number of selected features changes.
n_features = X_train_res_scaled.shape[1]

# Fully connected binary classifier: four hidden ReLU layers of decreasing
# width with batch normalisation / dropout, and a single sigmoid output unit.
model = Sequential([
    Dense(256, activation='relu', input_shape=(n_features,)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
)

# Stop once validation loss stalls for 5 epochs (restoring the best weights)
# and halve the learning rate after 3 stalled epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, min_delta=0.001)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7)

history = model.fit(
    X_train_res_scaled, y_train_res,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# The single output unit yields an (n, 1) array; flatten it to the 1-D label
# vector that the scikit-learn metrics expect.
y_val_pred_nn = (model.predict(X_val_scaled) > 0.5).astype(int).ravel()
print(classification_report(y_val, y_val_pred_nn))


## 6. Gradient Boosting Classifier - XGBoost

In [None]:
import xgboost as xgb

# Class counts of the resampled training labels; their ratio weights the positive class.
n_negative = len(y_train_res[y_train_res == 0])
n_positive = len(y_train_res[y_train_res == 1])

xgb_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'scale_pos_weight': n_negative / n_positive,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# The validation split is passed as the evaluation set; per-round output is suppressed.
xgb_model.fit(X_train_res, y_train_res, eval_set=[(X_val, y_val)], verbose=False)

y_val_pred_xgb = xgb_model.predict(X_val)
print(classification_report(y_val, y_val_pred_xgb))
print("XGBoost Accuracy:", accuracy_score(y_val, y_val_pred_xgb))


## 7. K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 11 neighbours, votes weighted by inverse distance; Minkowski with p=2 is Euclidean distance.
knn_params = {
    'n_neighbors': 11,
    'weights': 'distance',
    'metric': 'minkowski',
    'p': 2,
}
knn_model = KNeighborsClassifier(**knn_params)

# Trained and evaluated on the scaled features.
knn_model.fit(X_train_res_scaled, y_train_res)
y_val_pred_knn = knn_model.predict(X_val_scaled)

print(classification_report(y_val, y_val_pred_knn))
print("KNN Accuracy:", accuracy_score(y_val, y_val_pred_knn))

# Model Training: the chosen model is Random Forest with class_weight='balanced'

In [None]:
# Stack the (non-resampled) training and validation splits row-wise into plain
# NumPy arrays for the final fit.
X_train_final = np.concatenate((X_train.to_numpy(), X_val.to_numpy()))
y_train_final = np.concatenate((y_train.to_numpy(), y_val.to_numpy()))

In [None]:
# Final model: same settings as the balanced forest evaluated above, refit on train + validation.
rf_final_params = {
    'n_estimators': 200,
    'max_depth': None,
    'random_state': 42,
    'class_weight': 'balanced',
}
rf_final = RandomForestClassifier(**rf_final_params)
rf_final.fit(X_train_final, y_train_final)

In [None]:
# Class-1 probabilities on the test split. A plain array is passed because the
# final model was fit on NumPy arrays.
y_test_probs = rf_final.predict_proba(X_test.to_numpy())[:, 1]

# Hard-coded decision threshold.
# NOTE(review): presumably taken from the validation threshold search - confirm.
threshold = 0.89
y_test_pred = np.where(y_test_probs >= threshold, 1, 0)

In [None]:
# Accuracy and per-class report of the thresholded test predictions.
test_acc = accuracy_score(y_true=y_test, y_pred=y_test_pred)
print("Random Forest - Test Accuracy:", test_acc)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

## Model Evaluation

In [None]:
# Threshold-dependent metrics use the hard test predictions; ROC-AUC uses the probabilities.
accuracy = accuracy_score(y_test, y_test_pred)
# zero_division=0 makes an undefined score evaluate to 0.
precision, recall, f1 = (
    metric(y_test, y_test_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_test_probs)

print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
print(f"ROC-AUC  : {roc_auc:.4f}\n")
print(classification_report(y_test, y_test_pred))

In [None]:
# ROC curve of the final model on the test split, with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_test, y_test_probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()

## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

# Search space: integer ranges are sampled via scipy distributions, lists are sampled uniformly.
param_distributions = dict(
    n_estimators=randint(100, 500),
    max_depth=[10, 15, 20, 25, None],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=['sqrt', 'log2', None],
)

rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# 20 random configurations, each scored by F1 with 3-fold cross-validation on all cores.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)
random_search.fit(X_train_res, y_train_res)

print("Best params:", random_search.best_params_)
print("Best F1 Score (CV):", random_search.best_score_)

# Best configuration found by the search, checked on the validation split.
best_rf = random_search.best_estimator_

y_val_pred = best_rf.predict(X_val)
print(classification_report(y_val, y_val_pred))


In [None]:
# Class-1 probabilities of the tuned forest on the test split, cut at the fixed 0.89 threshold.
y_proba = best_rf.predict_proba(X_test)[:, 1]
threshold = 0.89
y_pred_thresh = np.where(y_proba >= threshold, 1, 0)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_thresh))

In [None]:
# Accuracy of the tuned forest at its default decision rule (best_rf.score), on train and test.
for label, (features, target) in (
    ('Train Accuracy : ', (X_train, y_train)),
    ('Test Accuracy : ', (X_test, y_test)),
):
    print(label, best_rf.score(features, target))

# Result & Interpretation

In [None]:
# Final report for the tuned forest using the thresholded test predictions.
print(classification_report(y_true=y_test, y_pred=y_pred_thresh))
print("Final Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_thresh))
# Output recorded from an earlier run:
# Final Test Accuracy: 0.9730273193731799

In [None]:
from sklearn.metrics import confusion_matrix

# Unpack the flattened confusion matrix into its four cells.
cm = confusion_matrix(y_test, y_pred_thresh)
TN, FP, FN, TP = cm.ravel()

for label, count in (
    ("True Negatives:", TN),
    ("False Positives:", FP),
    ("False Negatives:", FN),
    ("True Positives:", TP),
):
    print(label, count)

# Share of all test rows that were misclassified.
classification_error = (FP + FN) / (TP + TN + FP + FN)
print(f"Classification Error: {classification_error:.4f}")

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Heatmap-style rendering of the test confusion matrix.
cm = confusion_matrix(y_test, y_pred_thresh)

disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
disp.ax_.set_title("Confusion Matrix", fontsize=16)
plt.show()

### Model saving

In [None]:
import pickle
from pathlib import Path

model_path = "../trained_models/diabetes-random_forest_model.pkl"

# Create the output directory if it does not exist yet. Otherwise open() raises
# FileNotFoundError and the trained model is not written.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Serialise the tuned forest.
# NOTE: pickle files must only be loaded from trusted sources.
with open(model_path, 'wb') as f:
    pickle.dump(best_rf, f)