In [3]:
# Implemented from scratch:
# 1. Softmax Logistic Regression for MNIST dataset
# 2. Naive Bayes for Bank dataset


# %% [markdown]
# ### Cell 1: Import Libraries


# %%
import os, time, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
np.random.seed(42)
%matplotlib inline
plt.rcParams['figure.figsize'] = (5,4)

In [4]:
# Cell 2: Define Metric Functions

def confusion_matrix_multiclass(y_true, y_pred, labels=None):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    if labels is None:
        labels = np.unique(np.concatenate([y_true, y_pred]))
    label_to_idx = {l: i for i, l in enumerate(labels)}
    cm = np.zeros((len(labels), len(labels)), dtype=int)
    for t, p in zip(y_true, y_pred):
        cm[label_to_idx[t], label_to_idx[p]] += 1
    return cm, labels


def precision_recall_f1_from_cm(cm):
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    precision = np.where(tp + fp == 0, 0, tp / (tp + fp))
    recall = np.where(tp + fn == 0, 0, tp / (tp + fn))
    f1 = np.where(precision + recall == 0, 0, 2 * precision * recall / (precision + recall))
    return precision, recall, f1


def print_metrics_from_labels(y_true, y_pred, labels=None, name="Model"):
    cm, labels = confusion_matrix_multiclass(np.array(y_true), np.array(y_pred), labels=labels)
    precision, recall, f1 = precision_recall_f1_from_cm(cm)
    acc = cm.diagonal().sum() / cm.sum()
    print(f"\n=== {name} ===")
    print(f"Accuracy: {acc:.4f}")
    for lab, p, r, f in zip(labels, precision, recall, f1):
        print(f"Label {lab}: P={p:.4f}, R={r:.4f}, F1={f:.4f}")


In [6]:
# Cell 3: Load MNIST Dataset

MNIST_TRAIN_PATH = '/mnt/data/train.csv' if os.path.exists('/mnt/data/train.csv') else 'train.csv'
MNIST_TEST_PATH  = '/mnt/data/test.csv' if os.path.exists('/mnt/data/test.csv') else 'test.csv'

if not os.path.exists(MNIST_TRAIN_PATH) or not os.path.exists(MNIST_TEST_PATH):
    print('MNIST files missing!')
else:
    train = pd.read_csv(MNIST_TRAIN_PATH)
    test  = pd.read_csv(MNIST_TEST_PATH)

    # Separate features and labels
    if 'label' in train.columns:
        X_train = train.drop(columns=['label']).values / 255.0
        y_train = train['label'].values
    else:
        X_train = train.iloc[:, 1:].values / 255.0
        y_train = train.iloc[:, 0].values

    if 'label' in test.columns:
        X_test = test.drop(columns=['label']).values / 255.0
        y_test = test['label'].values
    else:
        X_test = test.values / 255.0
        y_test = None

    print(f"Training samples: {X_train.shape[0]}, Testing samples: {X_test.shape[0]}")


Training samples: 42000, Testing samples: 28000


In [9]:
# Cell 4: Train Softmax Logistic Regression

def one_hot(y, K):
    oh = np.zeros((len(y), K))
    oh[np.arange(len(y)), y] = 1
    return oh


def softmax(z):
    z = z - np.max(z, axis=1, keepdims=True)
    ez = np.exp(z)
    return ez / ez.sum(axis=1, keepdims=True)


num_classes = len(np.unique(y_train))
W = np.zeros((X_train.shape[1], num_classes))
b = np.zeros((1, num_classes))

epochs = 100
lr = 0.5

y_oh = one_hot(y_train, num_classes)

for ep in range(1, epochs + 1):
    logits = X_train.dot(W) + b
    probs = softmax(logits)
    error = probs - y_oh
    gradW = X_train.T.dot(error) / X_train.shape[0]
    gradb = error.mean(axis=0, keepdims=True)
    W -= lr * gradW
    b -= lr * gradb

    if ep % 20 == 0:
        preds = np.argmax(softmax(X_train.dot(W) + b), axis=1)
        acc = (preds == y_train).mean()
        print(f"Epoch {ep}: Train Accuracy = {acc:.4f}")


Epoch 20: Train Accuracy = 0.8620
Epoch 40: Train Accuracy = 0.8786
Epoch 60: Train Accuracy = 0.8872
Epoch 80: Train Accuracy = 0.8921
Epoch 100: Train Accuracy = 0.8961


In [12]:
# Cell 5: Evaluate MNIST Model (fixed)
logits = X_test.dot(W) + b
preds = np.argmax(softmax(logits), axis=1)

# If test labels available, print metrics. Otherwise save predictions to CSV.
if 'y_test' in globals() and y_test is not None:
    # Ensure arrays are 1-D and same length
    y_true_arr = np.array(y_test).reshape(-1)
    preds_arr = np.array(preds).reshape(-1)
    if y_true_arr.shape[0] != preds_arr.shape[0]:
        print(f"Mismatch in sizes: y_test has {y_true_arr.shape[0]} rows but preds has {preds_arr.shape[0]} rows.")
    else:
        print_metrics_from_labels(y_true_arr, preds_arr, name="MNIST Logistic Regression")
else:
    print("No test labels found (y_test is None). Saving predictions to 'mnist_test_predictions.csv'.")
    # If test had an 'id' column it could be included; otherwise just save index and predicted label
    out_df = pd.DataFrame({'index': np.arange(len(preds)), 'predicted_label': preds})
    out_df.to_csv('mnist_test_predictions.csv', index=False)
    print("Saved mnist_test_predictions.csv in the current folder.")


No test labels found (y_test is None). Saving predictions to 'mnist_test_predictions.csv'.
Saved mnist_test_predictions.csv in the current folder.


In [14]:
# Cell 6: Display Sample Predictions

def array_to_image(pixel_array):
    plt.imshow(np.array(pixel_array).reshape(28, 28), cmap='gray')
    plt.axis('off')
    plt.show()


def predict_and_show_from_array(pixel_array):
    x = np.array(pixel_array).reshape(1, -1) / 255.0
    probs = softmax(x.dot(W) + b)
    label = int(np.argmax(probs))
    array_to_image(pixel_array)
    print(f"Predicted Label: {label}, Confidence: {probs.max():.4f}")


In [18]:
# Cell 7: Load and Prepare Bank Dataset (Fixed with target detection)

BANK_PATH = '/mnt/data/bank-full.csv' if os.path.exists('/mnt/data/bank-full.csv') else 'bank-full.csv'

if not os.path.exists(BANK_PATH):
    print('Bank dataset missing!')
else:
    # Load and shuffle
    df = pd.read_csv(BANK_PATH).sample(frac=1, random_state=42).reset_index(drop=True)

    # Try to detect target column automatically
    target_col = None
    for possible in ['y', 'deposit', 'output', 'class', 'target']:
        if possible in df.columns:
            target_col = possible
            break

    if target_col is None:
        print("❌ Target column not found! Check your CSV headers.")
    else:
        print(f"✅ Detected target column: '{target_col}'")

        # Split 80:20 for train/test
        split = int(0.8 * len(df))
        df_train = df.iloc[:split].reset_index(drop=True)
        df_test = df.iloc[split:].reset_index(drop=True)

        # Define numeric and categorical columns
        numeric_cols = [c for c in ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous'] if c in df.columns]
        categorical_cols = [c for c in df.columns if c not in numeric_cols + [target_col]]

        # Map target labels to binary
        label_map = {'no': 0, 'yes': 1}
        y_train = df_train[target_col].map(label_map).values
        y_test = df_test[target_col].map(label_map).values

        print(f"Train size: {len(df_train)}, Test size: {len(df_test)}")
        print(f"Numeric columns: {numeric_cols}")
        print(f"Sample categorical columns: {categorical_cols[:5]}")


❌ Target column not found! Check your CSV headers.


In [None]:
# ### Cell 8: Train Naive Bayes Model


# %%
classes = np.unique(y_train)
prior, cont_stats, cat_counts = {}, {c:{} for c in classes}, {c:{} for c in classes}
for c in classes:
sub = df_train[df_train['y'].map(label_map)==c]
prior[c] = len(sub)/len(df_train)
for col in numeric_cols:
vals = sub[col].astype(float).values
mu, var = vals.mean(), vals.var(ddof=0) if vals.var()!=0 else 1e-6
cont_stats[c][col] = (mu,var)
for col in categorical_cols:
cat_counts[c][col] = sub[col].astype(str).value_counts().to_dict()


vocab = {col: pd.concat([df_train[col].astype(str), df_test[col].astype(str)]).unique().tolist() for col in categorical_cols}


def log_gauss(x, mu, var):
return -0.5*(math.log(2*math.pi*var)+((x-mu)**2)/var)


def nb_predict_row(row):
best, bestlp = None, -1e18
for c in classes:
lp = math.log(prior[c]+1e-12)
for col in numeric_cols:
mu,var = cont_stats[c][col]
lp += log_gauss(float(row[col]), mu, var)
for col in categorical_cols:
v = str(row[col])
counts = cat_counts[c].get(col, {})
cnt = counts.get(v,0)
total = sum(counts.values())
V = len(vocab[col])
prob = (cnt+1)/(total+V)
lp += math.log(prob+1e-12)
if lp>bestlp:
bestlp, best = lp, c
return int(best)

In [None]:
# ### Cell 9: Evaluate Naive Bayes


# %%
y_pred_nb = np.array([nb_predict_row(row) for _,row in df_test.iterrows()])
print_metrics_from_labels(y_test, y_pred_nb, labels=[0,1], name="Naive Bayes (bank-full)")


# %% [markdown]
# ### Cell 10: Display Priors and Example Stats


# %%
for c in classes:
print(f"Class {c} prior: {prior[c]:.4f}")
for col in numeric_cols[:4]:
mu,var = cont_stats[1][col]
print(f"{col}: mean={mu:.3f}, var={var:.3f}")