In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

# Load the data
df = pd.read_csv('data_model.csv')

# Separate features and target
X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data shapes:")
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Data shapes:
Training set shape: (51347, 16)
Testing set shape: (12837, 16)


In [5]:
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier()
}

results = {}

for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'ROC-AUC': roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    }

# Create a DataFrame with results
results_df = pd.DataFrame(results).round(4)
print("\
Model Performance Metrics:")
print(results_df)

Training Logistic Regression...
Training Random Forest...
Training SVM...
Training KNN...
Model Performance Metrics:
           Logistic Regression  Random Forest     SVM     KNN
Accuracy                0.9753            1.0  0.9874  0.9818
Precision               0.9817            1.0  1.0000  0.9901
Recall                  0.7918            1.0  0.8861  0.8446
F1-Score                0.8766            1.0  0.9396  0.9116
ROC-AUC                 0.9628            1.0  0.9921  0.9673


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


X = df.drop('diabetes', axis=1)
y = df['diabetes']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Train the model
print("Training Random Forest...")
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

# Evaluate the model
results_rf = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1-Score': f1_score(y_test, y_pred_rf),
    'ROC-AUC': roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
}

# Create a DataFrame with results
results_rf_df = pd.DataFrame([results_rf], index=['Random Forest']).round(4)
print("Random Forest Performance Metrics:")
print(results_rf_df)


import joblib

# Specify the filename for saving the model
joblib_filename = 'rf_model.joblib'

# Save the trained model as a .joblib file
joblib.dump(rf, joblib_filename)

print(f"Model saved as {joblib_filename}")



Training Random Forest...
Random Forest Performance Metrics:
               Accuracy  Precision  Recall  F1-Score  ROC-AUC
Random Forest       1.0        1.0     1.0       1.0      1.0
Model saved as rf_model.joblib
