# Reproducing Paper Experiments on PIMA Dataset + Gradient Boosting Classifier
This notebook reproduces the classification experiments from the research paper using the **Pima Indians Diabetes Dataset**.

Models implemented from the paper:
- Logistic Regression
- KNN
- SVM
- Naive Bayes
- Decision Tree
- Random Forest

### ✔ Additional Model Added
- **Gradient Boosting Classifier** (Extra model requested)

### Metrics Computed
- Accuracy
- Error Rate
- Sensitivity (Recall)
- Specificity
- Precision
- F1 Score
- MCC
- AUC
- 10-fold Cross Validation Accuracy


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline
print('Imports complete.')

In [None]:
# Load Dataset
# Read the CSV from the working directory, then overwrite whatever column labels
# were parsed with the nine names used throughout the rest of the notebook.
# NOTE(review): read_csv treats the first row as a header by default -- confirm
# that pima.csv ships with a header row, otherwise the first record is consumed
# as column labels and dropped from the data.
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]
df = pd.read_csv('pima.csv')
df.columns = column_names
df.head()

In [None]:
# Preprocessing: zeros in these measurement columns are treated as missing
# values and replaced with the column median.
# The median is taken over the non-zero readings only; computing it over the
# whole column would let the placeholder zeros pull the imputed value down.
cols_zero = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in cols_zero:
    nonzero_values = df.loc[df[col] != 0, col]
    if nonzero_values.empty:
        # Nothing observed to impute from; leave the column unchanged.
        continue
    df[col] = df[col].replace(0, nonzero_values.median())
df.describe()

In [None]:
# Train-Test Split
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split BEFORE scaling: fitting the scaler on all rows would let the test set's
# mean/variance leak into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the same
# transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training-set statistics; retained so that
# any code expecting X_scaled keeps working.
X_scaled = scaler.transform(X)
print(X_train.shape, X_test.shape)

In [None]:
# Metric Function
def compute_metrics(y_true, y_pred, y_prob=None):
    """Compute the binary-classification metrics reported in this notebook.

    Labels are assumed to be encoded as 0 (negative) and 1 (positive).

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like, optional
        Scores for the positive class, passed to ``roc_auc_score``.
        When omitted, AUC is reported as ``None``.

    Returns
    -------
    dict
        Keys: 'Accuracy', 'Error', 'Precision', 'Sensitivity', 'Specificity',
        'F1', 'MCC', 'AUC'.
    """
    acc = accuracy_score(y_true, y_pred)
    err = 1 - acc
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    # Pin the label order so the matrix is always 2x2; without `labels`, inputs
    # containing a single class yield a 1x1 matrix and the unpacking below fails.
    tn, fp, _, _ = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    negatives = tn + fp
    # Specificity is undefined when there are no true negatives to score.
    specificity = tn / negatives if negatives else float('nan')

    auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None

    return {
        'Accuracy': acc,
        'Error': err,
        'Precision': precision,
        'Sensitivity': recall,
        'Specificity': specificity,
        'F1': f1,
        'MCC': mcc,
        'AUC': auc
    }

print('Metric function ready.')

In [None]:
# Models to Evaluate
# Every estimator that draws random numbers is seeded so that a fresh
# Restart & Run All reproduces the same results table.
RANDOM_STATE = 42

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'KNN': KNeighborsClassifier(),
    # probability=True enables predict_proba (used for AUC); the internal
    # calibration shuffles data, hence the seed.
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)  # Extra model
}

# Filled by the evaluation loop: model name -> dict of metrics.
results = {}

In [None]:
# Train & Evaluate All Models
# The fold generator is the same for every model, so build it once.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for name, model in models.items():
    print(f'\nTraining: {name}')
    model.fit(X_train, y_train)

    # Hold-out evaluation on the test split.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else None

    metrics = compute_metrics(y_test, y_pred, y_prob)

    # Cross-validate on the unscaled features with the scaler inside the
    # pipeline, so scaling statistics are re-learned from each training fold
    # and never see the corresponding validation fold.
    # cross_val_score clones the estimator per fold, so the model fitted above
    # is left untouched.
    cv_pipeline = make_pipeline(StandardScaler(), model)
    cv_score = cross_val_score(cv_pipeline, X, y, cv=cv, scoring='accuracy').mean()
    metrics['10-Fold CV Accuracy'] = cv_score

    results[name] = metrics

results

In [None]:
# Convert to DataFrame
# Building the frame from the nested dict puts models in columns; transpose so
# each model becomes a row and each metric a column.
metrics_by_model = pd.DataFrame(results)
results_df = metrics_by_model.transpose()
results_df

In [None]:
# Save Summary
# Write the metrics table (model names kept as the index column) to the
# working directory and report the file name.
output_path = 'pima_results_with_gradient_boosting.csv'
results_df.to_csv(output_path, index=True)
print(f'Saved as {output_path}')