# In [1]:
# Install required libraries for imbalanced learning and XGBoost
!pip install -q imbalanced-learn xgboost
# Import necessary libraries
import warnings

import joblib
import numpy as np
import pandas as pd
from google.colab import files
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Suppress the specific UserWarning from sklearn
warnings.filterwarnings("ignore", category=UserWarning, module='sklearn.base')
# Load Dataset
# The diabetes CSV is fetched over HTTP from the plotly/datasets repository.
url = 'https://raw.githubusercontent.com/plotly/datasets/master/diabetes.csv'
try:
    data = pd.read_csv(url)
except Exception as e:
    # Every later step needs `data`; report the cause and stop here instead
    # of continuing into an unrelated NameError further down the script.
    print("Error loading data:", e)
    raise

# Data Preprocessing
# Handle missing values and separate features/target.
# In these columns a 0 is treated as "not recorded". The zeros are masked to
# NaN *before* taking the median so the placeholders do not drag the fill
# value down; masking also promotes integer columns to float, so a
# fractional median can be stored without a dtype conflict.
for column in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    observed = data[column].mask(data[column] == 0)
    data[column] = observed.fillna(observed.median())

# Everything except the label column is a feature.
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the dataset into training and testing sets
# The split happens first so that the held-out rows never influence the
# scaler statistics or the synthetic samples SMOTE creates (no data leakage),
# and so the test set contains only real observations.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
# Fit on the training portion only; the test portion reuses those statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Balance classes using SMOTE
# Applied to the (already scaled) training data only, so neighbour distances
# are not dominated by the features with the largest raw ranges.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Check the shapes of the training and testing sets
# (printed explicitly: a bare expression is discarded when run as a script)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Define Models for Comparison
# Maps a display name to an unfitted estimator. The estimators that draw
# random numbers are seeded with the same value used for SMOTE, the
# train/test split and KFold, so the comparison (and therefore the selected
# "best" model) is reproducible from run to run.
models = {
    'Logistic Regression': LogisticRegression(max_iter=200),
    'Random Forest': RandomForestClassifier(random_state=42),
    # probability=True enables predict_proba, which the prediction step uses.
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# Model Training and Evaluation with K-Fold Cross Validation
# `results` maps model name -> {'Accuracy': pct, 'Precision': pct}, each the
# mean over the 5 shuffled folds, expressed as a percentage.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for model_name, model in models.items():
    with np.errstate(divide='ignore', invalid='ignore'):
        # A single multi-metric run fits each fold once and scores both
        # metrics on the same fitted estimator, instead of refitting the
        # model separately for every metric.
        cv_scores = cross_validate(
            model, X_train, y_train, cv=kf, scoring=['accuracy', 'precision']
        )
    results[model_name] = {
        'Accuracy': cv_scores['test_accuracy'].mean() * 100,
        'Precision': cv_scores['test_precision'].mean() * 100,
    }

# Report the cross-validated scores of every candidate
for candidate in results:
    print(f"\n{candidate} Performance:")
    for metric_label, metric_value in results[candidate].items():
        print(f"{metric_label}: {metric_value:.2f}")

# Pick the winner: highest accuracy, with precision used only to break ties
best_model_name = max(
    results,
    key=lambda candidate: tuple(results[candidate][m] for m in ('Accuracy', 'Precision')),
)
best_model_metrics = results[best_model_name]
print(f"\nBest Model: {best_model_name}")
print(
    f"Accuracy: {best_model_metrics['Accuracy']:.2f}%",
    f"Precision: {best_model_metrics['Precision']:.2f}%",
)

# Refit the winning estimator on the complete training set; the fitted
# object is what the interactive prediction below uses.
best_model = models[best_model_name]
best_model.fit(X_train, y_train)

# Prompts for one value per feature.
# NOTE(review): the order is assumed to match the feature column order the
# scaler was fitted on -- confirm against the dataset header.
feature_prompts = [
    "Pregnancies (0-17): ",
    "Glucose (0-199): ",
    "Blood Pressure (0-122): ",
    "Skin Thickness (0-99): ",
    "Insulin (0-846): ",
    "BMI (0-67): ",
    "Diabetes Pedigree Function (0.078-2.42): ",
    "Age (21-81): ",
]

try:
    # Collect user inputs for each feature.
    # Only the parsing is guarded, so a ValueError raised later by the scaler
    # or the model is not misreported as a typing mistake.
    user_input = []
    for prompt in feature_prompts:
        value = float(input(prompt))
        # float() happily parses "nan" and "inf"; treat those as invalid too.
        if not np.isfinite(value):
            raise ValueError(f"non-finite value: {value}")
        user_input.append(value)
except ValueError:
    print("Invalid input. Please enter numeric values.")
else:
    # Convert to a single-row 2-D array and apply the training-time scaling
    user_input_scaled = scaler.transform(np.array([user_input]))

    # Make prediction
    with np.errstate(divide='ignore', invalid='ignore'):
        prediction = best_model.predict(user_input_scaled)
        prediction_proba = best_model.predict_proba(user_input_scaled)

    # Output prediction and probability (column 1 is the positive class)
    print(f"\nSample Prediction (0 = No Diabetes, 1 = Diabetes): {prediction[0]}")
    print(f"Probability of Diabetes: {prediction_proba[0][1]:.2f}")
# Persist the fitted model together with the scaler it was trained with;
# both are needed to reproduce predictions later.
artifacts = {
    'diabetes_model.pkl': best_model,
    'scaler.pkl': scaler,
}

# Write every artifact to disk first ...
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# ... then hand each file to the Colab browser download helper.
for artifact_path in artifacts:
    files.download(artifact_path)