# Prediction model for diabetes dataset

This notebook uses the cleaned dataset (see notebook `EDA.ipynb`).

In [None]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier


In [None]:

# Load the cleaned diabetes dataset from CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
data = pd.read_csv('../datasets/diabetes_cleaned.csv')


In [None]:

# Separate the label column (y) from the predictor columns (X)
target_feature = 'CLASS'
y = data[target_feature]
X = data.drop(target_feature, axis="columns")


In [None]:

# Standardize the features.
# StandardScaler performs standardization by removing the mean and scaling to
# unit variance, so every feature ends up with a mean of 0 and a standard
# deviation of 1.
# NOTE(review): the scaler is fitted on the full dataset, and the train/test
# splits in this notebook are taken from X_scaled afterwards, so test rows
# contribute to the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Plot the distribution of each feature after standardization.
# The grid height is derived from the number of features so the layout keeps
# working if the dataset gains columns (ceil division without importing math).
n_cols = 4
n_rows = max(1, -(-X_scaled.shape[1] // n_cols))
plt.figure(figsize=(12, 6))
for i, column in enumerate(X.columns):
    plt.subplot(n_rows, n_cols, i + 1)
    sns.histplot(X_scaled[:, i], kde=True)
    plt.title(column)
plt.tight_layout()
plt.show()



In [None]:
# Function to train a chosen model using a chosen splitting method, then use it to predict a single new patient
# Method can be 'holdout' or 'kfold'

def predict_patient(patient_data, model_name='Logistic Regression', split_method='holdout'):
    """Train the chosen model and predict the class of a single patient.

    The estimator stored under ``model_name`` in the module-level ``models``
    dict is refitted in place on every call. The function also reads the
    module-level ``X_scaled``, ``y``, ``scaler`` and (for ``'kfold'``) ``skf``.

    Args:
        patient_data: Object exposing ``.values`` with one raw (unscaled)
            value per feature, in the same order as the columns the scaler
            was fitted on (e.g. a pandas Series).
        model_name: Key into ``models`` selecting the estimator.
        split_method: ``'holdout'`` trains on the 80% training part of a
            stratified split; ``'kfold'`` trains on the training part of the
            first fold produced by ``skf``.

    Returns:
        list: ``[prediction, probability]`` where ``probability`` is the
        model's estimated probability for the predicted class.

    Raises:
        KeyError: If ``model_name`` is not a key of ``models``.
        ValueError: If ``split_method`` is neither ``'holdout'`` nor ``'kfold'``.
    """
    model = models[model_name]
    if split_method == 'holdout':
        # Only the training part is needed here; the test part is discarded.
        X_train, _, y_train, _ = train_test_split(
            X_scaled, y, test_size=0.2, stratify=y, random_state=42
        )
    elif split_method == 'kfold':
        # Just train on the first fold
        train_index, _ = next(iter(skf.split(X_scaled, y)))
        X_train, y_train = X_scaled[train_index], y.iloc[train_index]
    else:
        raise ValueError("Unknown split method. Use 'holdout' or 'kfold'.")
    model.fit(X_train, y_train)

    # Scale the patient input with the same scaler. When the scaler recorded
    # feature names at fit time, pass them along so scikit-learn does not warn
    # about missing feature names; values are still matched by position.
    patient_row = np.asarray(patient_data.values).reshape(1, -1)
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        patient_row = pd.DataFrame(patient_row, columns=feature_names)
    patient_array = scaler.transform(patient_row)

    prediction = model.predict(patient_array)[0]
    # predict_proba columns follow model.classes_, so look up the column of
    # the predicted label instead of assuming the label equals its index.
    class_position = list(model.classes_).index(prediction)
    probability = model.predict_proba(patient_array)[0][class_position]
    output = [prediction, probability]
    print(f"Prediction: {'Diabetic' if output[0] == 1 else 'Non-diabetic'}, Probability: {output[1]*100:.2f}%")

    return output

In [None]:

# Helper function to evaluate models 
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and print its test-set metrics.

    Prints, in order, the confusion matrix, the classification report and
    the accuracy score computed on the test data.

    Args:
        model: The machine learning model to fit and evaluate.
        X_train: Training features.
        X_test: Testing features.
        y_train: Training labels.
        y_test: Testing labels.

    Returns:
        None. The model is left fitted on the training data.
    """
    model.fit(X_train, y_train)
    y_hat = model.predict(X_test)
    # Both reports share the (y_true, y_pred) call shape, so print them in turn.
    for report in (confusion_matrix, classification_report):
        print(report(y_test, y_hat))
    print("Accuracy:", accuracy_score(y_test, y_hat))


In [None]:
# --- SPLIT METHOD 1: SIMPLE HOLDOUT ---
# Stratified 80/20 split of the scaled features, then fit and evaluate every model on it.
print("\n===== Holdout Split Evaluation =====")
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# The decision tree gets a fixed seed, like the splitters in this notebook,
# so that repeated runs produce the same tree and the same metrics.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

for name, model in models.items():
    print(f"\nModel: {name}")
    evaluate_model(model, X_train, X_test, y_train, y_test)


In [None]:
# --- SPLIT METHOD 2: STRATIFIED K-FOLD CROSS-VALIDATION ---
# This method splits the dataset into K folds (here, 5), preserving class distribution in each fold
print("\n===== Stratified K-Fold Cross-Validation =====")
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate models fold-by-fold using evaluate_model, then summarize across folds
for name, model in models.items():
    print(f"\nModel: {name}")
    accuracies = []
    for fold_idx, (train_index, test_index) in enumerate(skf.split(X_scaled, y), start=1):
        print(f"\n--- Fold {fold_idx} ---")
        X_train_fold, X_test_fold = X_scaled[train_index], X_scaled[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]
        evaluate_model(model, X_train_fold, X_test_fold, y_train_fold, y_test_fold)
        # evaluate_model leaves the model fitted on this fold's training data,
        # so it can be reused here to record the fold accuracy.
        accuracies.append(accuracy_score(y_test_fold, model.predict(X_test_fold)))
    print(f"\nMean accuracy over {len(accuracies)} folds: {np.mean(accuracies):.4f} (std {np.std(accuracies):.4f})")


In [None]:
# === VISUALIZE PREDICTIONS ===
# Plot confusion matrix as heatmap for any model

def plot_confusion_matrix(y_true, y_pred, title="Confusion Matrix"):
    """Show the confusion matrix of ``y_true`` vs ``y_pred`` as an annotated heatmap.

    Args:
        y_true: Actual labels.
        y_pred: Predicted labels.
        title: Title displayed above the heatmap.
    """
    counts = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    # heatmap draws on the current axes and returns them; label them directly.
    axes = sns.heatmap(counts, annot=True, fmt='d', cmap='Blues')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('Actual')
    axes.set_title(title)
    plt.show()

In [None]:
# Refit each model on the holdout training split and plot its test-set confusion matrix
for name, model in models.items():
    print(f"\nModel: {name}")
    # fit() returns the estimator itself, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)
    plot_confusion_matrix(y_test, y_pred, title=f"Confusion Matrix - {name}")

In [None]:
# === PREDICTION INTERFACE FUNCTIONS ===

# Function to allow the user to input values for each feature (except the target) to simulate a patient
# Returns a pandas Series that can be used as input for a model

# Function to create a patient input by asking the user, validating suspicious and general values
def create_patient_input(feature_names):
    """Interactively collect one value per feature and return them as a Series.

    For each feature the valid range is taken from the minimum and maximum of
    that column in the module-level ``X``. The user is re-prompted until the
    entry parses as a float and lies within that range (bounds included).

    Args:
        feature_names: Iterable of column names of ``X`` to ask for.

    Returns:
        pandas.Series: Entered values indexed by feature name.
    """
    print("\nPlease enter values for the following features:")
    entered = {}
    for feature in feature_names:
        col_min, col_max = X[feature].min(), X[feature].max()
        print(f"{feature} (valid range: {round(col_min, 2)} - {round(col_max, 2)})")
        while True:
            try:
                value = float(input(f"Enter value for {feature}: "))
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            if col_min <= value <= col_max:
                break
            print(f"Value out of range. Please enter a value between {col_min} and {col_max}.")
        entered[feature] = value
    return pd.Series(entered)


In [None]:

# Example: build a patient interactively from all feature columns and echo it back
print("\n=== Example: Create Patient Input ===")
patient_data = create_patient_input(X.columns)
print("\nPatient Data:", patient_data, sep="\n")
# Predict the class for the created patient data



=== Example: Create Patient Input ===

Please enter values for the following features:
Gender (valid range: 0 - 1)
AGE (valid range: 25.0 - 79.0)
Urea (valid range: 1.1 - 22.0)
Value out of range. Please enter a value between 1.1 and 22.0.
Cr (valid range: 20.0 - 370.0)
HbA1c (valid range: 0.9 - 16.0)
Chol (valid range: 0.0 - 10.3)
TG (valid range: 0.3 - 8.7)
HDL (valid range: 0.4 - 5.0)
LDL (valid range: 0.3 - 9.9)
VLDL (valid range: 0.1 - 3.5)
BMI (valid range: 19.0 - 47.75)

Patient Data:
Gender      1.0
AGE        29.0
Urea       10.0
Cr        200.0
HbA1c      10.0
Chol        9.0
TG          1.0
HDL         2.0
LDL         2.0
VLDL        1.0
BMI        30.0
dtype: float64

Prediction: Diabetic
Prediction result: Diabetic




In [74]:
# Take the first dataset row labelled non-diabetic (CLASS = 0) and strip its label
non_diabetic_patient = (
    data.loc[data[target_feature] == 0]
    .iloc[0]
    .drop(target_feature)
)

# Predict the class for the selected patient.
# NOTE(review): the indexing below relies on the predict_patient variant that
# returns [prediction, probability]; confirm that is the definition in scope.
prediction = predict_patient(
    non_diabetic_patient,
    model_name='Logistic Regression',
    split_method='holdout',
)
print(f"Prediction: {'Diabetic' if prediction[0] == 1 else 'Non-diabetic'}, Probability: {prediction[1]*100:.2f}%")

Prediction: Non-diabetic, Probability: 70.67%




In [None]:

# Function to train a chosen model using a chosen splitting method, then use it to predict a single new patient
# Method can be 'holdout' or 'kfold'

def predict_patient(patient_data, model_name='Logistic Regression', split_method='holdout'):
    """Train the chosen model and return the predicted class for one patient.

    The estimator stored under ``model_name`` in the module-level ``models``
    dict is refitted in place. Module-level ``X_scaled``, ``y``, ``scaler``
    and (for ``'kfold'``) ``skf`` are read as well.

    NOTE(review): an earlier cell defines a function with the same name that
    returns ``[prediction, probability]``; whichever cell ran last is the one
    in effect — confirm which return shape callers expect.

    Args:
        patient_data: Object exposing ``.values`` with the raw feature values.
        model_name: Key into ``models`` selecting the estimator.
        split_method: ``'holdout'`` or ``'kfold'`` (first fold only).

    Returns:
        The predicted class label only.

    Raises:
        ValueError: If ``split_method`` is neither ``'holdout'`` nor ``'kfold'``.
    """
    model = models[model_name]
    if split_method == 'holdout':
        train_X, _, train_y, _ = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)
    elif split_method == 'kfold':
        # Just train on the first fold
        first_train_idx, _ = next(iter(skf.split(X_scaled, y)))
        train_X, train_y = X_scaled[first_train_idx], y.iloc[first_train_idx]
    else:
        raise ValueError("Unknown split method. Use 'holdout' or 'kfold'.")
    model.fit(train_X, train_y)

    # Scale the patient input with the same scaler
    scaled_patient = scaler.transform([patient_data.values])
    prediction = model.predict(scaled_patient)[0]
    print(f"\nPrediction: {'Diabetic' if prediction == 1 else 'Non-diabetic'}")
    return prediction


- **Precision**: Measures how many of the positive predictions were actually correct.
  - Formula: TruePositives / (TruePositives + FalsePositives)
  - High precision means few false positives.
  - Example: If a model predicts 100 patients as diabetic and 90 are truly diabetic, precision is 0.90.

- **Recall (Sensitivity)**: Measures how many actual positive cases were correctly predicted.
  - Formula: TruePositives / (TruePositives + FalseNegatives)
  - High recall means few false negatives.
  - Example: If there are 100 diabetic patients and the model finds 95 of them, recall is 0.95.

- **F1-Score**: Harmonic mean of precision and recall. A balanced metric when classes are imbalanced.
  - Formula: 2 * (Precision * Recall) / (Precision + Recall)
  - Useful when you care equally about precision and recall.

- **Support**: The number of actual samples for each class in the test set.
  - Helps interpret performance per class.

- **Accuracy**: Overall, how often is the classifier correct?
  - Formula: (TruePositives + TrueNegatives) / (Total predictions)

- **Macro avg**: Average of precision, recall, and F1-score calculated independently for each class.
  - Treats all classes equally.

- **Weighted avg**: Same as macro avg, but each class's score is weighted by its support.
  - More informative when classes are imbalanced.
