 1. Load Required Libraries

In [28]:
from pathlib import Path

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score , classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

2. Load the Diabetes Dataset

In [29]:
# Resolve the dataset location through kagglehub rather than a machine-specific
# absolute path, so the notebook runs on any machine after a kernel restart.
# dataset_download() returns the local directory holding the dataset files
# (downloading them first if they are not cached yet). The handle pins
# version 1, which is the version the previous hard-coded cache path used.
dataset_dir = Path(kagglehub.dataset_download("uciml/pima-indians-diabetes-database/versions/1"))
dataset_path = dataset_dir / "diabetes.csv"

# Load the dataset
df = pd.read_csv(dataset_path, encoding="ISO-8859-1")

# Display the first few rows
print(df.head())


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


 3. Preprocess the Data

In [30]:
# Divide the frame into the label column (y) and the model inputs (X)

label_column = 'Outcome'

y = df[label_column]  # Label: 0 = No Diabetes, 1 = Diabetes

X = df.drop(label_column, axis=1)  # Every column except the label is a feature

In [31]:
# Hold out 20% of the rows for testing; the other 80% are used for training.
# stratify=y asks for the same class proportions in both partitions, and the
# fixed random_state makes the split repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

In [32]:
# Bring every feature onto a comparable scale.
# The scaler learns its statistics from the training split only; the same
# fitted scaler is then applied to both the training and the test split.

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

 4. Train a Logistic Regression Model

In [33]:
# Fit a logistic regression classifier (library defaults) on the scaled training split

model = LogisticRegression().fit(X_train_scaled, y_train)

# Predict class labels for the scaled test split

y_pred = model.predict(X_test_scaled)

5. Evaluate Model Performance

In [34]:
# Compute the evaluation metrics on the test split first, then report them.

accuracy = accuracy_score(y_test, y_pred)  # Model accuracy
test_confusion = confusion_matrix(y_test, y_pred)  # Confusion matrix
test_report = classification_report(y_test, y_pred)  # Precision, recall, F1-score

print(f"Model Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)


Model Accuracy: 0.7143
Confusion Matrix:
 [[82 18]
 [26 28]]
Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.82      0.79       100
           1       0.61      0.52      0.56        54

    accuracy                           0.71       154
   macro avg       0.68      0.67      0.67       154
weighted avg       0.71      0.71      0.71       154



6. Make a Real Prediction

In [35]:
# Example patient data: [Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age]

patient_data = np.array([[2, 160, 70, 30, 120, 28.0, 0.5, 45]])  # Example Patient

# Wrap the raw values in a DataFrame that carries the training column names
patient_df = pd.DataFrame(patient_data, columns=X.columns)  # Preserve feature names

# Standardize the input data (same scaling as training).
# Pass the DataFrame rather than the bare array: the scaler was fitted on a
# DataFrame, so giving it named columns lets scikit-learn check that the
# features match the training columns and avoids the "X does not have valid
# feature names" warning. The scaled values themselves are unchanged.
patient_df_scaled = scaler.transform(patient_df)



In [36]:
# Score the example patient with the trained model

class_probabilities = model.predict_proba(patient_df_scaled)
probability = class_probabilities[0][1]  # First (only) row, second column: probability of having diabetes

prediction = model.predict(patient_df_scaled)  # Final prediction (0 or 1), returned as an array


# Report both values

print(f"Predicted Probability of Diabetes: {probability:.4f}")
print(f"Final Prediction (0 = No Diabetes, 1 = Diabetes): {prediction}")

Predicted Probability of Diabetes: 0.5399
Final Prediction (0 = No Diabetes, 1 = Diabetes): [1]
