In [1]:
# --- Import Libraries ---
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

# --- (a) Load Dataset ---
warnings.filterwarnings('ignore')
df = pd.read_csv("pima-indians-diabetes.csv")

print("--- Data Head ---")
print(df.head())

# --- (b) Data Preprocessing ---
# Rename columns if needed (depends on your CSV)
df.columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
]

# Check for missing values
print("\n--- Missing Values ---")
print(df.isnull().sum())

# Fill missing numeric values (if any)
df.fillna(df.mean(numeric_only=True), inplace=True)

# --- (c) Define Features (X) and Target (y) ---
X = df.drop(columns=['Outcome'])
y = df['Outcome']

# --- (d) Split Data into Train and Test ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- (e) Apply Gaussian Na√Øve Bayes Classifier ---
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)

# --- (f) Evaluate the Model ---
y_pred = nb_clf.predict(X_test)

print("\n--- Model Evaluation ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


--- Data Head ---
   6  148  72  35    0  33.6  0.627  50  1
0  1   85  66  29    0  26.6  0.351  31  0
1  8  183  64   0    0  23.3  0.672  32  1
2  1   89  66  23   94  28.1  0.167  21  0
3  0  137  40  35  168  43.1  2.288  33  1
4  5  116  74   0    0  25.6  0.201  30  0

--- Missing Values ---
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

--- Model Evaluation ---
Accuracy: 0.67

Confusion Matrix:
[[75 25]
 [26 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.75      0.75       100
           1       0.53      0.52      0.52        54

    accuracy                           0.67       154
   macro avg       0.64      0.63      0.63       154
weighted avg       0.67      0.67      0.6