In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold

# Read the balanced dataset from disk

balanced_data = pd.read_csv('csvs/balanced_data.csv')

# The target is the 'Diabetes_012' column; every remaining column is a feature

y = balanced_data['Diabetes_012']
X = balanced_data.drop(columns=['Diabetes_012'])

# Hold out 20% of the rows as the test set (seeded so the split is repeatable)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each piece of the split

for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split.shape}")


X_train shape: (512887, 18)
X_test shape: (128222, 18)
y_train shape: (512887,)
y_test shape: (128222,)


In [34]:
# Will need these later!

%store X_train
%store y_test
%store X_test
%store y_test
%store X
%store y

Stored 'X_train' (DataFrame)
Stored 'y_test' (Series)
Stored 'X_test' (DataFrame)
Stored 'y_test' (Series)
Stored 'X' (DataFrame)
Stored 'y' (Series)


Next, let's set up 5-fold cross-validation.

In [35]:
# Shuffled 5-fold splitter; the fixed random_state keeps the folds repeatable

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Report how many training-set rows fall into each fold's train / validation part

for fold_number, (train_rows, val_rows) in enumerate(kf.split(X_train), start=1):
    print(
        f"Fold {fold_number}:",
        f"  Train: {len(train_rows)}",
        f"  Validation: {len(val_rows)}",
        sep="\n",
    )


Fold 1:
  Train: 410309
  Validation: 102578
Fold 2:
  Train: 410309
  Validation: 102578
Fold 3:
  Train: 410310
  Validation: 102577
Fold 4:
  Train: 410310
  Validation: 102577
Fold 5:
  Train: 410310
  Validation: 102577


Now that that is done, it's time to select a model. Since we are predicting whether or not a person has diabetes, this is a classification problem. I'll start with logistic regression as a baseline model.

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Baseline: logistic regression, created and fitted on the training split in one
# chained call (fit returns the estimator itself)

baseline_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test split

y_pred = baseline_model.predict(X_test)

# Score the predictions: plain accuracy and the weighted-average F1

f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

# One print call, one line per argument, so the report layout is unchanged

print(
    "Baseline Model - Logistic Regression:",
    f"  Accuracy: {accuracy:.4f}",
    f"  F1 Score: {f1:.4f}",
    sep="\n",
)


Baseline Model - Logistic Regression:
  Accuracy: 0.5341
  F1 Score: 0.5282


Great, let's move on to model selection and hyperparameter tuning.

I'll train several candidate models (decision tree, random forest, SVM, and XGBoost) and use cross-validation to evaluate their performance.

In [37]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# Re-read the balanced dataset from disk

balanced_data = pd.read_csv('csvs/balanced_data.csv')

# Features are every column except the 'Diabetes_012' target

y = balanced_data['Diabetes_012']
X = balanced_data.drop(columns='Diabetes_012')

# Keep a stratified 5% sample of the data; the other 95% is discarded here.
# NOTE(review): the sample is drawn from the full dataset rather than from a
# training split, so it may include rows that are held out for testing
# elsewhere in this notebook -- confirm this is intended.

X_subset, _, y_subset, _ = train_test_split(
    X, y, train_size=0.05, stratify=y, random_state=42
)

# Candidate classifiers with reduced complexity (faster times); the two
# ensembles use a reduced number of trees (50)

models = {}
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=50, random_state=42)
models["SVM"] = SVC(random_state=42)
models["XGBoost"] = XGBClassifier(n_estimators=50, random_state=42)

# Evaluate models using fewer cross-validation folds

def evaluate_models(models, X_train, y_train, cv=3):
    """Cross-validate each estimator in ``models`` and print its macro-F1 summary.

    Args:
        models: Mapping of display name -> estimator.
        X_train: Features passed to ``cross_val_score``.
        y_train: Targets passed to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score`` (default 3).

    Returns:
        dict: display name -> the scores returned by ``cross_val_score`` for that
        estimator (scoring ``'f1_macro'``), in the iteration order of ``models``.
    """
    results = {}
    for label in models:
        fold_scores = cross_val_score(
            models[label], X_train, y_train, cv=cv, scoring='f1_macro'
        )
        # Summarise the folds as mean and standard deviation
        mean_f1, spread = fold_scores.mean(), fold_scores.std()
        print(f"{label} F1 Score: {mean_f1:.4f} (+/- {spread:.4f})")
        results[label] = fold_scores
    return results

# Compare the candidate models on the subset, using the function's default cv

results = evaluate_models(models=models, X_train=X_subset, y_train=y_subset)


Decision Tree F1 Score: 0.6407 (+/- 0.0018)
Random Forest F1 Score: 0.7399 (+/- 0.0016)
SVM F1 Score: 0.5342 (+/- 0.0025)
XGBoost F1 Score: 0.7376 (+/- 0.0043)


In [39]:
# To use in next notebook: persist the subset features and targets with
# IPython's %store magic (they can be restored there with `%store -r`).

%store X_subset
%store y_subset

Stored 'X_subset' (DataFrame)
Stored 'y_subset' (Series)


The random forest model has the highest F1 score (0.7399), so it is the best-performing of the models tested, narrowly ahead of XGBoost (0.7376).