<a href="https://colab.research.google.com/github/trivedidharmik/cs3735/blob/main/Trivedi_Dharmik_Asmt5Part1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Naïve Bayes Classifier on Three Datasets
This notebook applies the Naïve Bayes classifier to the Breast Cancer, Car Evaluation, and Mushroom datasets.


## 1. Setup Environment
Install required packages and import libraries.

In [None]:
!pip install ucimlrepo
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, CategoricalNB
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# 2. Breast Cancer Dataset
**Target Variable**: Diagnosis (Malignant/Benign)

In [None]:
# Fetch the Breast Cancer dataset (UCI repository id 17).
breast_cancer = fetch_ucirepo(id=17)
X_bc = breast_cancer.data.features

# Encode the 'Diagnosis' target column as integer class labels. The fitted
# encoder is kept in `le` so the integer codes can be mapped back to the
# original labels through le.classes_.
le = LabelEncoder()
y_bc = le.fit_transform(breast_cancer.data.targets['Diagnosis'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_bc,
    y_bc,
    test_size=0.3,
    random_state=42,
)

# Fit Gaussian Naive Bayes on the training rows, then predict the held-out rows.
nb = GaussianNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Report accuracy, per-class precision/recall/F1, and the confusion matrix.
bc_accuracy = accuracy_score(y_test, y_pred)
bc_confusion = confusion_matrix(y_test, y_pred)
print("Breast Cancer Results:")
print(f"Accuracy: {bc_accuracy:.4f}")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", bc_confusion)

Breast Cancer Results:
Accuracy: 0.9415
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       108
           1       0.93      0.90      0.92        63

    accuracy                           0.94       171
   macro avg       0.94      0.93      0.94       171
weighted avg       0.94      0.94      0.94       171

Confusion Matrix:
 [[104   4]
 [  6  57]]


# 3. Car Evaluation Dataset
**Target Variable**: Class (Car Acceptability)

In [None]:
# Fetch the Car Evaluation dataset (UCI repository id 19).
car_eval = fetch_ucirepo(id=19)
X_car = car_eval.data.features
y_car = car_eval.data.targets

# Turn every feature value into an integer category code (CategoricalNB
# expects non-negative integer-coded features), and the 'class' target
# column into integer labels.
encoder = OrdinalEncoder().fit(X_car)
X_car_encoded = encoder.transform(X_car).astype(int)
y_car_encoded = LabelEncoder().fit_transform(y_car['class'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_car_encoded,
    y_car_encoded,
    test_size=0.3,
    random_state=42,
)

# Fit categorical Naive Bayes on the training rows, then predict the held-out rows.
nb = CategoricalNB().fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Report accuracy, per-class precision/recall/F1, and the confusion matrix.
car_accuracy = accuracy_score(y_test, y_pred)
car_confusion = confusion_matrix(y_test, y_pred)
print("\nCar Evaluation Results:")
print(f"Accuracy: {car_accuracy:.4f}")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", car_confusion)


Car Evaluation Results:
Accuracy: 0.8324
              precision    recall  f1-score   support

           0       0.65      0.60      0.62       118
           1       0.45      0.26      0.33        19
           2       0.89      0.97      0.93       358
           3       1.00      0.33      0.50        24

    accuracy                           0.83       519
   macro avg       0.75      0.54      0.60       519
weighted avg       0.83      0.83      0.82       519

Confusion Matrix:
 [[ 71   5  42   0]
 [ 14   5   0   0]
 [  9   1 348   0]
 [ 16   0   0   8]]


# 4. Mushroom Dataset
**Target Variable**: Poisonous (Edible/Poisonous)

In [None]:
# Fetch the Mushroom dataset (UCI repository id 73).
mushroom = fetch_ucirepo(id=73)
y_mush = mushroom.data.targets

# Handle missing values: treat '?' as missing.
# replace() is called WITHOUT inplace=True so it returns a new frame instead of
# mutating the object handed back by ucimlrepo. The in-place form is what
# raised pandas' SettingWithCopyWarning, and it also made the cell depend on
# hidden state when re-run.
X_mush = mushroom.data.features.replace('?', np.nan)

# Fill each column's missing entries with that column's most frequent value.
# NOTE(review): the imputer and the encoder below are fit on all rows before
# the train/test split, so test rows influence the preprocessing. Confirm this
# is acceptable for the assignment.
imputer = SimpleImputer(strategy='most_frequent')
X_mush_imputed = pd.DataFrame(imputer.fit_transform(X_mush), columns=X_mush.columns)

# Encode features as integer category codes and the 'poisonous' target column
# as integer labels.
encoder = OrdinalEncoder()
X_mush_encoded = encoder.fit_transform(X_mush_imputed).astype(int)
y_mush_encoded = LabelEncoder().fit_transform(y_mush['poisonous'])

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_mush_encoded, y_mush_encoded, test_size=0.3, random_state=42)

# Train and evaluate
nb = CategoricalNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

print("\nMushroom Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Mushroom Results:
Accuracy: 0.9459
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1257
           1       0.99      0.90      0.94      1181

    accuracy                           0.95      2438
   macro avg       0.95      0.94      0.95      2438
weighted avg       0.95      0.95      0.95      2438

Confusion Matrix:
 [[1247   10]
 [ 122 1059]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_mush.replace('?', np.nan, inplace=True)
