In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.utils import shuffle

In [4]:
# Load the bank-marketing data and build leakage-free train/test matrices.
df = pd.read_csv("../Assignment 9/bank-full.csv", sep=';')   # original file uses semicolon separator

# Encode the yes/no columns (including the target 'y') as 1/0.
binary_cols = ['default', 'housing', 'loan', 'y']
for col in binary_cols:
    df[col] = df[col].map({'yes': 1, 'no': 0})

# Separate features and target
X = df.drop('y', axis=1)
y = df['y'].values

# One-hot encode categorical columns (non-binary)
cat_cols = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']
X_encoded = pd.get_dummies(X, columns=cat_cols, drop_first=True)

# Ensure all features are numeric (cast every column, dummies included, to float)
X_encoded = X_encoded.astype(float)

# Shuffle rows and labels together; same seed as before, so the split is unchanged
X_encoded, y = shuffle(X_encoded, y, random_state=42)

# Train-test split (80:20), done BEFORE scaling so the scaler never sees test rows
split = int(0.8 * len(X_encoded))
X_train_df = X_encoded.iloc[:split].copy()
X_test_df = X_encoded.iloc[split:].copy()
y_train, y_test = y[:split], y[split:]

# Standardize numeric features: fit on the training rows only, then apply the
# same mean/std to the test rows (fitting on all rows leaks test statistics).
scaler = StandardScaler()
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
X_train_df[num_cols] = scaler.fit_transform(X_train_df[num_cols])
X_test_df[num_cols] = scaler.transform(X_test_df[num_cols])

# Convert to NumPy arrays explicitly as float
X_train = np.asarray(X_train_df.values, dtype=float)
X_test = np.asarray(X_test_df.values, dtype=float)

# Keep `X` bound to the full shuffled feature matrix (train rows, then test rows)
X = np.vstack((X_train, X_test))

print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")
print(f"Number of features: {X_train.shape[1]}\n")

Training samples: 36168, Test samples: 9043
Number of features: 42



In [5]:
class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Each feature is modelled, per class, as an independent normal
    distribution. After ``fit`` the instance exposes:

    classes : sorted unique labels seen in ``y``
    mean    : (n_classes, n_features) per-class feature means
    var     : (n_classes, n_features) per-class feature variances (+ 1e-6)
    prior   : (n_classes,) class frequencies in the training data
    """

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        n_features = X.shape[1]
        self.mean = np.zeros((len(self.classes), n_features))
        self.var = np.zeros((len(self.classes), n_features))
        self.prior = np.zeros(len(self.classes))

        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.mean[idx, :] = X_c.mean(axis=0)
            self.var[idx, :] = X_c.var(axis=0) + 1e-6  # add small value to avoid /0
            self.prior[idx] = X_c.shape[0] / X.shape[0]

        return self

    def _gaussian_log_prob(self, class_idx, x):
        """Log-likelihood of one sample ``x`` under class ``class_idx``.

        Sums the log of the univariate normal density over all features.
        """
        mean = self.mean[class_idx]
        var = self.var[class_idx]
        # log of Gaussian probability density
        log_prob = -0.5 * np.sum(np.log(2. * np.pi * var))
        log_prob -= 0.5 * np.sum(((x - mean) ** 2) / var)
        return log_prob

    def _joint_log_likelihood(self, X):
        """Return log prior + log likelihood for every (sample, class) pair.

        The result has shape (n_samples, n_classes).
        """
        X = np.asarray(X, dtype=float)
        # These terms depend only on the fitted parameters, so compute them
        # once instead of once per sample.
        log_norm = -0.5 * np.sum(np.log(2. * np.pi * self.var), axis=1)
        log_prior = np.log(self.prior)

        scores = np.empty((X.shape[0], len(self.classes)))
        for idx in range(len(self.classes)):
            # Vectorised over samples; looping over the few classes keeps
            # memory at O(n_samples * n_features).
            quad = 0.5 * np.sum(((X - self.mean[idx]) ** 2) / self.var[idx], axis=1)
            scores[:, idx] = log_prior[idx] + (log_norm[idx] - quad)
        return scores

    def predict(self, X):
        """Predict the most probable class for each row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,) holding labels drawn from ``classes``.
        """
        scores = self._joint_log_likelihood(X)
        return self.classes[np.argmax(scores, axis=1)]

In [6]:
# Announce the step, then fit the from-scratch classifier on the training split.
print("Training Gaussian Naive Bayes model...\n")
nb = GaussianNaiveBayes()
nb.fit(X=X_train, y=y_train)

# Hard class predictions for the held-out rows; consumed by the evaluation cell.
y_pred = nb.predict(X=X_test)

Training Gaussian Naive Bayes model...



In [7]:
# Report test-set performance of the predictions in `y_pred`.
# The header bullet was mojibake (UTF-8 bytes read as cp1252); restored here.
print("\n🔹 Evaluation Metrics (on Test Set):\n")
print(f"Accuracy  : {accuracy_score(y_test, y_pred):.4f}")
# Precision / recall / F1 use sklearn's default binary averaging (positive label 1).
print(f"Precision : {precision_score(y_test, y_pred):.4f}")
print(f"Recall    : {recall_score(y_test, y_pred):.4f}")
print(f"F1-score  : {f1_score(y_test, y_pred):.4f}")

# Pin the label order so rows/columns and the 'No'/'Yes' names always match
# the 0/1 encoding, even if one class is absent from y_test or y_pred.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[0, 1]))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['No', 'Yes']))


ðŸ”¹ Evaluation Metrics (on Test Set):

Accuracy  : 0.8633
Precision : 0.4280
Recall    : 0.4835
F1-score  : 0.4541

Confusion Matrix:
[[7293  687]
 [ 549  514]]

Classification Report:
              precision    recall  f1-score   support

          No       0.93      0.91      0.92      7980
         Yes       0.43      0.48      0.45      1063

    accuracy                           0.86      9043
   macro avg       0.68      0.70      0.69      9043
weighted avg       0.87      0.86      0.87      9043

