
 **Diabetes Prediction Project - Compare Models**


In [44]:
# 1️⃣ Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import joblib

In [45]:
# 2️⃣ Mount Google Drive (Colab only) at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# 2️⃣ Load Dataset
# NOTE(review): the CSV is read from /content directly, not from the Drive
# mount point (/content/drive) — confirm the file is placed there before running.
data = pd.read_csv('/content/diabetes.csv')
print("Dataset Loaded Successfully!")
print(data.head())  # preview of the first rows

Dataset Loaded Successfully!
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0          NaN      148             72             35        0  33.6   
1          1.0       85             66             29        0  26.6   
2          8.0      183             64              0        0  23.3   
3          1.0       89             66             23       94  28.1   
4          0.0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [None]:
# Disabled alternative: upload the CSV through the Colab file picker.
# The code is wrapped in a string literal, so it is not executed.
'''
#file upload
from google.colab import files
uploaded = files.upload() '''

'\n#file upload\nfrom google.colab import files\nuploaded = files.upload() '

In [None]:
# Disabled alternative: read the CSV by relative path from the working directory.
# The code is wrapped in a string literal, so it is not executed.
'''
print("Dataset Loaded Successfully!")
data = pd.read_csv('diabetes.csv')
print(data.head()) '''

'\nprint("Dataset Loaded Successfully!")\ndata = pd.read_csv(\'diabetes.csv\')\nprint(data.head()) '

In [47]:
# 3️⃣ Handle Missing/Zero Values
# In these columns a 0 is treated as a missing reading, so it is turned into NaN.
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols_with_zero] = data[cols_with_zero].replace(0, np.nan)   # zero → NaN

# ✅ Impute Missing Values (every NaN becomes the median of its column)
# NOTE(review): the imputer is fitted on all rows, i.e. before the train/test
# split, so held-out rows contribute to the medians.
y = data['Outcome']
imputer = SimpleImputer(strategy="median")
X = imputer.fit_transform(data.drop(columns='Outcome'))

In [48]:
# 4️⃣ Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [49]:
# 5️⃣ Feature Scaling for Logistic Regression
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# 6️⃣ Train Models
# Logistic Regression: trained on the scaled features
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Decision Tree: trained on the unscaled features
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
tree_model  # fitted estimator stays the cell's displayed value

In [50]:
# 7️⃣ Evaluate Models
# Each model is scored on the held-out split, using the feature matrix
# (scaled vs. unscaled) it was trained on.
log_accuracy = accuracy_score(y_test, log_model.predict(X_test_scaled))
print("✅ Logistic Regression Accuracy:", log_accuracy)

tree_accuracy = accuracy_score(y_test, tree_model.predict(X_test))
print("✅ Decision Tree Accuracy:", tree_accuracy)

✅ Logistic Regression Accuracy: 0.7532467532467533
✅ Decision Tree Accuracy: 0.7207792207792207


In [51]:
# 8️⃣ Save Models and Scaler
# Persist every fitted object to its own file in the working directory.
artifacts = [
    ('imputer.pkl', imputer),
    ('scaler.pkl', scaler),
    ('log_model.pkl', log_model),
    ('tree_model.pkl', tree_model),
]
for artifact_path, fitted_object in artifacts:
    joblib.dump(fitted_object, artifact_path)
print("Models and Scaler Saved Successfully!")

Models and Scaler Saved Successfully!


In [52]:
# 9️⃣ Prediction Function - Compare Both Models
def predict_diabetes_compare():
    """Prompt for one patient's measurements and print both models' predictions.

    Loads the saved imputer, scaler, logistic-regression and decision-tree
    objects from the working directory, reads the eight feature values from
    standard input, and applies the same preprocessing that was used for
    training: a 0 in Glucose, BloodPressure, SkinThickness, Insulin or BMI is
    treated as missing and filled by the fitted imputer, and the logistic
    regression additionally receives scaled features. The two predictions are
    printed side by side.

    Returns:
        None. The comparison table is printed.

    Raises:
        ValueError: if an entered value cannot be parsed as a float.
    """
    # Order matters: it must match the column order the models were trained on.
    feature_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                     'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
    # Columns where training replaced 0 with NaN before imputing.
    zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

    # Load preprocessing objects and models.
    # NOTE: joblib.load unpickles arbitrary objects; only load files produced
    # by this notebook or another trusted source.
    imputer = joblib.load('imputer.pkl')
    scaler = joblib.load('scaler.pkl')
    log_model = joblib.load('log_model.pkl')
    tree_model = joblib.load('tree_model.pkl')

    # Input from user (one prompt per feature, in training order)
    print("\nEnter Patient Data:")
    entered_values = [float(input(name + ": ")) for name in feature_names]

    # Reproduce the training-time cleaning so the models see data from the same
    # distribution: zero -> NaN in the affected columns, then impute.
    # A named DataFrame is used because the imputer was fitted on one.
    patient_frame = pd.DataFrame([entered_values], columns=feature_names)
    patient_frame[zero_means_missing] = patient_frame[zero_means_missing].replace(0, np.nan)
    patient_data = np.asarray(imputer.transform(patient_frame))

    # Scale for Logistic Regression (the tree was trained on unscaled features)
    patient_scaled = scaler.transform(patient_data)

    # Predictions
    pred_log = log_model.predict(patient_scaled)[0]
    pred_tree = tree_model.predict(patient_data)[0]

    # Output side-by-side
    result_dict = {
        'Model': ['Logistic Regression', 'Decision Tree'],
        'Prediction': [pred_log, pred_tree],
        'Outcome': ['Diabetes (1)' if pred_log == 1 else 'No Diabetes (0)',
                    'Diabetes (1)' if pred_tree == 1 else 'No Diabetes (0)']
    }

    result_df = pd.DataFrame(result_dict)
    print("\nPrediction Results Comparison:")
    print(result_df)

In [53]:
# 1️⃣1️⃣ Test Prediction
# Runs the interactive prompt; comment out the call below to skip it.
predict_diabetes_compare()


Enter Patient Data:
Pregnancies: 0
Glucose: 80
BloodPressure: 140
SkinThickness: 35
Insulin: 18
BMI: 33
DiabetesPedigreeFunction: .267
Age: 25

Prediction Results Comparison:
                 Model  Prediction          Outcome
0  Logistic Regression           0  No Diabetes (0)
1        Decision Tree           0  No Diabetes (0)
