In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

In [2]:
# Step 1: Load the dataset and attach column headers.
# The file is read with header=None, so its first row is treated as data and
# the names listed here are assigned to the columns in order.
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]
df = pd.read_csv(
    "adult.csv",
    names=column_names,
    header=None,
    # Skip the spaces that follow each delimiter so values carry no leading blank.
    skipinitialspace=True,
)

In [3]:
# Step 2: Handle missing values represented by '?'
# Convert every cell that is exactly '?' to NaN, then keep only the rows that
# have no NaN in any column. `df` is rebound to the cleaned frame.
df = df.replace('?', np.nan).dropna()

In [4]:
# Step 3: Encode categorical features
# Every object-dtype column is replaced by integer codes from its own
# LabelEncoder; the fitted encoders are kept in `label_encoders`, keyed by
# column name, so the codes can be mapped back to the original values later.
# NOTE(review): this pass also encodes the "income" target if it is
# object-typed, and the encoders are fitted on the full frame before the
# train/test split — confirm both are intended.
categorical_columns = df.select_dtypes(include='object').columns
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_columns}
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])


In [5]:
# Step 4: Train-test split
# Separate the "income" target from the remaining predictor columns, then
# hold out 30% of the rows for testing. The fixed random_state makes the
# shuffle repeatable across runs.
# NOTE(review): the split is not stratified on y — consider whether
# stratify=y is wanted given the class balance.
y = df["income"]
X = df.drop(columns="income")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Step 5: Train Naive Bayes model
# Create a Gaussian Naive Bayes classifier with default settings and fit it
# on the training split.
# NOTE(review): X_train includes the label-encoded categorical columns from
# Step 3, which this model treats like any other numeric feature — confirm
# that is acceptable for this analysis.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

In [7]:
# Step 6: Make predictions
# Predict one class label per row of the held-out test features.
y_pred = model.predict(X=X_test)

In [9]:
# Step 7: Evaluate performance
# Compare the predictions with the held-out labels. Precision, recall and F1
# are called with scikit-learn's defaults, which for a binary target score
# the class labelled 1.
# NOTE(review): presumably 1 is the higher-income class after Step 3's
# encoding — verify via label_encoders["income"].classes_.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
# Per-class precision/recall/F1 and support, rendered as a text table.
report = classification_report(y_test, y_pred)

print("=== Model Evaluation ===")
# Accuracy is shown as a percentage; the other metrics are printed as raw fractions.
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:\n", report)

=== Model Evaluation ===
Accuracy: 79.32%
Precision: 0.6207302709069493
Recall: 0.4620780359491451
F1 Score: 0.5297813520985172

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.90      0.87      6768
           1       0.62      0.46      0.53      2281

    accuracy                           0.79      9049
   macro avg       0.73      0.68      0.70      9049
weighted avg       0.78      0.79      0.78      9049

