In [273]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

In [274]:
import os
# Run every later cell from the project root so the relative 'data/...' paths resolve.
# The root can be overridden with the CSC311_PROJECT_ROOT environment variable; when it
# is unset the original hard-coded checkout location is used, so existing setups still work.
PROJECT_ROOT = os.environ.get(
    'CSC311_PROJECT_ROOT',
    '/Users/nguyenviet/Desktop/CSC311/group-project',
)
os.chdir(PROJECT_ROOT)  # Change to project root directory

In [275]:
def _read_headerless(csv_path):
    """Read a CSV file that has no header row into a DataFrame."""
    return pd.read_csv(csv_path, header=None)


# Each split provides a feature matrix, one-hot targets and encoded targets.
# Training split
X_train = _read_headerless('data/train/features_train_with_bow.csv')
y_train_one = _read_headerless('data/train/targets_train_one_hot.csv')
y_train = _read_headerless('data/train/targets_train_enc.csv')

# Validation split
X_valid = _read_headerless('data/valid/features_valid_with_bow.csv')
y_valid_one = _read_headerless('data/valid/targets_valid_one_hot.csv')
y_valid = _read_headerless('data/valid/targets_valid_enc.csv')

# Test split
X_test = _read_headerless('data/test/features_test_with_bow.csv')
y_test_one = _read_headerless('data/test/targets_test_one_hot.csv')
y_test = _read_headerless('data/test/targets_test_enc.csv')

In [276]:
# Scratch inspection cell. Only the value of the last expression is shown as the
# cell output; the first two expressions are evaluated and their results discarded.
y_valid_one
# Builds a NumPy array from the training features; the result is not stored.
np.array(X_train)
# Converts an array of two ones to int64 -- this is what the cell displays.
np.int64(np.ones(2))

array([1, 1])

In [277]:
# X_train.insert(0, 'bias', 1)

# Softmax function

In [278]:
def softmax(z):
    """
    Numerically stable softmax.

    z: array-like of logits. A 2-D input of shape (n_samples, n_classes) is
       normalised row by row (along axis 1). A 1-D input is treated as a
       single logit vector and normalised as a whole.

    Returns an ndarray with the same shape as z whose entries along the
    class axis are non-negative and sum to 1.
    """
    z = np.asarray(z)
    # A single logit vector has no axis 1, so normalise over its only axis;
    # every other rank keeps the original axis-1 behaviour.
    class_axis = 0 if z.ndim == 1 else 1

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing on large logits.
    shifted = z - np.max(z, axis=class_axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=class_axis, keepdims=True)

def predict(X, W, b):
    """
    Predict a class index for every row of X.

    X: feature matrix, one sample per row.
    W: weight matrix applied as X @ W.
    b: bias added to the logits.

    Returns the index of the highest softmax probability in each row.
    """
    class_probabilities = softmax(X @ W + b)
    predicted_labels = np.argmax(class_probabilities, axis=1)
    return predicted_labels

# Loss function (Cross-Entropy Loss)

In [279]:
# def cross_entropy_loss(y_true, y_pred):
#     """
#     Calculates the cross-entropy loss.
#     y_true: One-hot encoded true labels.
#     y_pred: Predicted probabilities from softmax.
#     """
#     m = y_true.shape[0]
#     # Avoid log(0) by clipping predicted probabilities
#     y_pred = np.clip(y_pred, 1e-12, 1 - 1e-12)
#     loss = -np.sum(y_true * np.log(y_pred)) / m
#     return loss

def cross_entropy_loss(y_true, y_pred, W=None, lambda_reg=0):
    """
    Mean cross-entropy loss, optionally with an L2 penalty on the weights.

    y_true: One-hot encoded true labels, one row per sample.
    y_pred: Predicted probabilities from softmax, same shape as y_true.
    W: Optional weight matrix; the penalty is added only when it is given.
    lambda_reg: L2 strength; the penalty is added only when it is > 0.

    Returns the data loss, plus (lambda_reg / 2) * sum(W**2) when both W and a
    positive lambda_reg are supplied.
    """
    n_rows = y_true.shape[0]

    # Keep probabilities away from 0 and 1 so log() never produces -inf.
    clipped = np.clip(y_pred, 1e-12, 1 - 1e-12)
    mean_nll = -np.sum(y_true * np.log(clipped)) / n_rows

    penalty_active = W is not None and lambda_reg > 0
    if not penalty_active:
        return mean_nll

    l2_penalty = (lambda_reg / 2) * np.sum(W * W)
    return mean_nll + l2_penalty

# Gradient Descent

In [280]:
# def fit_softmax_regression(X, y_one_hot, learning_rate, num_iterations):
#     np.random.seed(42)
#     X = np.asarray(X)          # safer, works with DataFrame or ndarray
#     y_one_hot = np.asarray(y_one_hot)
#     n_samples, n_features = X.shape
#     n_classes = y_one_hot.shape[1]

#     W = np.random.randn(n_features, n_classes) * 0.01
#     b = np.zeros(n_classes)           # ← this is the missing bias

#     for i in range(num_iterations):
#         logits = X @ W + b            # ← add bias here
#         y_pred = softmax(logits)

#         if i % 100 == 0:
#             loss = cross_entropy_loss(y_one_hot, y_pred)
#             print(f"Iteration {i}, Loss: {loss:.6f}")

#         dW = X.T @ (y_pred - y_one_hot) / n_samples
#         db = np.sum(y_pred - y_one_hot, axis=0) / n_samples   # ← bias gradient

#         W -= learning_rate * dW
#         b -= learning_rate * db

#     return W, b

def fit_softmax_regression(X, y_one_hot, learning_rate, num_iterations, C=1.0,
                           seed=None, verbose=True):
    """
    Fit softmax regression with full-batch gradient descent and an L2 penalty.

    X: feature matrix (DataFrame or ndarray), shape (n_samples, n_features).
    y_one_hot: one-hot targets, shape (n_samples, n_classes).
    learning_rate: gradient descent step size.
    num_iterations: maximum number of gradient steps.
    C: Inverse of L2 regularization strength (lambda = 1 / C)
    - Smaller C = stronger regularization
    - Larger C = weaker regularization
    - C <= 0 disables the penalty
    seed: optional integer. When given, the initial weights are drawn from a
        private RandomState(seed) so the fit is repeatable. When None, the
        global NumPy RNG is used (the original behaviour), so a prior
        np.random.seed(...) call still controls the initialisation.
    verbose: when True (default) print the loss every 100 iterations.

    Returns (W, b): weights of shape (n_features, n_classes) and bias of shape
    (n_classes,). If an update would make the parameters non-finite, training
    stops early and the last finite parameters are returned.
    """
    X = np.asarray(X)
    y_one_hot = np.asarray(y_one_hot)
    n_samples, n_features = X.shape
    n_classes = y_one_hot.shape[1]

    # RandomState(seed).randn yields the same draws as np.random.seed(seed)
    # followed by np.random.randn, without touching the global RNG state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    W = rng.randn(n_features, n_classes) * 0.01
    b = np.zeros(n_classes)

    # Convert C to lambda (regularization strength)
    # C = 1/lambda, so lambda = 1/C
    lambda_reg = 1.0 / C if C > 0 else 0

    for i in range(num_iterations):
        logits = X @ W + b
        y_pred = softmax(logits)

        if verbose and i % 100 == 0:
            # Report the data term and the L2 term separately
            data_loss = cross_entropy_loss(y_one_hot, y_pred)
            reg_loss = (lambda_reg / 2) * np.sum(W * W)  # L2 regularization term
            total_loss = data_loss + reg_loss
            print(f"Iteration {i}, Loss: {total_loss:.6f} (data: {data_loss:.6f}, reg: {reg_loss:.6f})")

        # Gradient of the mean cross-entropy, plus the L2 term for the weights
        # (the bias is not regularized).
        residual = y_pred - y_one_hot
        dW = (X.T @ residual) / n_samples + lambda_reg * W
        db = np.sum(residual, axis=0) / n_samples

        W_next = W - learning_rate * dW
        b_next = b - learning_rate * db

        # The weight-decay part of the step scales W by (1 - learning_rate * lambda);
        # once learning_rate * lambda exceeds 2 that factor has magnitude > 1 and the
        # weights blow up to inf/NaN. Stop instead of iterating on garbage.
        if not (np.all(np.isfinite(W_next)) and np.all(np.isfinite(b_next))):
            print(f"Iteration {i}: update is no longer finite "
                  f"(learning_rate * lambda = {learning_rate * lambda_reg:g}); stopping early")
            break

        W, b = W_next, b_next

    return W, b

In [281]:
def tune_hyperparameters(X_train, y_train_one, X_valid, y_valid_one):
    """
    Grid-search learning rate, iteration count and C by validation accuracy.

    X_train, y_train_one: training features and one-hot targets.
    X_valid, y_valid_one: validation features and one-hot targets.
    All four may be DataFrames or arrays.

    Every combination is trained with fit_softmax_regression and scored on the
    validation split; one line is printed per combination and a summary at the
    end. Returns a dict with keys 'learning_rate', 'num_iterations' and 'C'
    for the combination with the highest validation accuracy (the first one
    encountered wins ties).
    """
    learning_rates = [0.001, 0.005, 0.01, 0.05, 0.1]
    num_iterations_list = [500, 1000, 2000]
    C_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]  # Inverse L2 regularization strength

    X_train_arr = np.asarray(X_train)
    y_train_arr = np.asarray(y_train_one)
    X_valid_arr = np.asarray(X_valid)

    # predict() returns column indices of the one-hot encoding, so the reference
    # labels are recovered from the one-hot targets passed in. This keeps the
    # function self-contained instead of reading a notebook-level variable.
    y_valid_labels = np.argmax(np.asarray(y_valid_one), axis=1)

    # Start below any attainable accuracy so best_params is always filled in,
    # even if every configuration scores 0.
    best_val_acc = -np.inf
    best_params = {}

    for lr in learning_rates:
        for num_iter in num_iterations_list:
            for C in C_values:
                W, b = fit_softmax_regression(X_train_arr, y_train_arr,
                                             learning_rate=lr, num_iterations=num_iter, C=C)
                val_pred = predict(X_valid_arr, W, b)
                val_acc = np.mean(val_pred == y_valid_labels)

                print(f"LR: {lr}, Iter: {num_iter}, C: {C}, Val Acc: {val_acc:.4f}")

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_params = {'learning_rate': lr, 'num_iterations': num_iter, 'C': C}

    print(f"Best Validation Accuracy: {best_val_acc:.4f} with params: {best_params}")
    return best_params

In [282]:
# Keep the search result instead of discarding it; the bare name on the last
# line still shows the selected parameters as the cell output.
best_params = tune_hyperparameters(X_train, y_train_one, X_valid, y_valid_one)
best_params

Iteration 0, Loss: 229.981088 (data: 1.094060, reg: 228.887028)
Iteration 100, Loss: 1.098169 (data: 1.097973, reg: 0.000196)
Iteration 200, Loss: 1.097939 (data: 1.097744, reg: 0.000195)
Iteration 300, Loss: 1.097723 (data: 1.097530, reg: 0.000194)
Iteration 400, Loss: 1.097522 (data: 1.097329, reg: 0.000193)
LR: 0.001, Iter: 500, C: 0.001, Val Acc: 0.3691
Iteration 0, Loss: 24.336271 (data: 1.104655, reg: 23.231616)
Iteration 100, Loss: 1.096432 (data: 1.094505, reg: 0.001927)
Iteration 200, Loss: 1.096221 (data: 1.094304, reg: 0.001917)
Iteration 300, Loss: 1.096022 (data: 1.094115, reg: 0.001907)
Iteration 400, Loss: 1.095837 (data: 1.093939, reg: 0.001897)
LR: 0.001, Iter: 500, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.399387 (data: 1.094307, reg: 2.305080)
Iteration 100, Loss: 1.390478 (data: 1.073561, reg: 0.316917)
Iteration 200, Loss: 1.121749 (data: 1.066936, reg: 0.054813)
Iteration 300, Loss: 1.085733 (data: 1.064707, reg: 0.021026)
Iteration 400, Loss: 1.080848 (data: 

  reg_loss = (lambda_reg / 2) * np.sum(W * W)  # L2 regularization term


Iteration 400, Loss: inf (data: 19.436994, reg: inf)
LR: 0.005, Iter: 500, C: 0.001, Val Acc: 0.3221
Iteration 0, Loss: 24.076361 (data: 1.103207, reg: 22.973153)
Iteration 100, Loss: 1.095660 (data: 1.093772, reg: 0.001888)
Iteration 200, Loss: 1.094944 (data: 1.093096, reg: 0.001848)
Iteration 300, Loss: 1.094432 (data: 1.092616, reg: 0.001816)
Iteration 400, Loss: 1.094065 (data: 1.092275, reg: 0.001790)
LR: 0.005, Iter: 500, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.391403 (data: 1.106836, reg: 2.284567)
Iteration 100, Loss: 1.080092 (data: 1.063650, reg: 0.016442)
Iteration 200, Loss: 1.079705 (data: 1.063378, reg: 0.016327)
Iteration 300, Loss: 1.079474 (data: 1.063301, reg: 0.016172)
Iteration 400, Loss: 1.079300 (data: 1.063258, reg: 0.016042)
LR: 0.005, Iter: 500, C: 0.1, Val Acc: 0.4161
Iteration 0, Loss: 1.331686 (data: 1.100290, reg: 0.231396)
Iteration 100, Loss: 1.091465 (data: 0.985410, reg: 0.106055)
Iteration 200, Loss: 1.018177 (data: 0.941759, reg: 0.076418)
Iter

  dW = (X.T @ (y_pred - y_one_hot)) / n_samples + lambda_reg * W


Iteration 400, Loss: 1.094064 (data: 1.092274, reg: 0.001790)
Iteration 500, Loss: 1.093802 (data: 1.092033, reg: 0.001769)
Iteration 600, Loss: 1.093615 (data: 1.091863, reg: 0.001752)
Iteration 700, Loss: 1.093482 (data: 1.091744, reg: 0.001738)
Iteration 800, Loss: 1.093387 (data: 1.091661, reg: 0.001726)
Iteration 900, Loss: 1.093319 (data: 1.091603, reg: 0.001717)
LR: 0.005, Iter: 1000, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.276099 (data: 1.097766, reg: 2.178333)
Iteration 100, Loss: 1.080059 (data: 1.063588, reg: 0.016471)
Iteration 200, Loss: 1.079683 (data: 1.063370, reg: 0.016314)
Iteration 300, Loss: 1.079457 (data: 1.063297, reg: 0.016161)
Iteration 400, Loss: 1.079287 (data: 1.063255, reg: 0.016032)
Iteration 500, Loss: 1.079159 (data: 1.063236, reg: 0.015924)
Iteration 600, Loss: 1.079063 (data: 1.063231, reg: 0.015832)
Iteration 700, Loss: 1.078991 (data: 1.063236, reg: 0.015755)
Iteration 800, Loss: 1.078936 (data: 1.063247, reg: 0.015689)
Iteration 900, Loss: 1.0

{'learning_rate': 0.05, 'num_iterations': 1000, 'C': 10.0}

In [283]:
# fit_softmax_regression draws its initial weights from NumPy's global RNG.
# Seeding it makes the selected hyperparameters and the reported accuracy
# repeat on every run of this cell.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Tune all hyperparameters including C
best_params = tune_hyperparameters(X_train, y_train_one, X_valid, y_valid_one)

# Re-seed so the final fit does not depend on how many random draws the search consumed.
np.random.seed(RANDOM_SEED)

# Train with best parameters including C
W, b = fit_softmax_regression(X_train.values, y_train_one.values,
                             learning_rate=best_params['learning_rate'],
                             num_iterations=best_params['num_iterations'],
                             C=best_params['C'])
val_pred = predict(X_valid.values, W, b)
# test_pred = predict(X_test.values, W, b)

# Fraction of validation rows whose predicted class equals the encoded target
val_acc = np.mean(val_pred == y_valid.values.ravel())
# test_acc = np.mean(test_pred == y_test.values.ravel())

Iteration 0, Loss: 227.192413 (data: 1.098565, reg: 226.093848)
Iteration 100, Loss: 1.098169 (data: 1.097973, reg: 0.000196)
Iteration 200, Loss: 1.097939 (data: 1.097744, reg: 0.000195)
Iteration 300, Loss: 1.097723 (data: 1.097530, reg: 0.000194)
Iteration 400, Loss: 1.097522 (data: 1.097329, reg: 0.000193)
LR: 0.001, Iter: 500, C: 0.001, Val Acc: 0.3691
Iteration 0, Loss: 23.851482 (data: 1.102282, reg: 22.749200)
Iteration 100, Loss: 1.096432 (data: 1.094505, reg: 0.001927)
Iteration 200, Loss: 1.096220 (data: 1.094304, reg: 0.001917)
Iteration 300, Loss: 1.096022 (data: 1.094115, reg: 0.001907)
Iteration 400, Loss: 1.095836 (data: 1.093939, reg: 0.001897)
LR: 0.001, Iter: 500, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.313281 (data: 1.094921, reg: 2.218360)
Iteration 100, Loss: 1.378691 (data: 1.073257, reg: 0.305434)
Iteration 200, Loss: 1.120135 (data: 1.066731, reg: 0.053404)
Iteration 300, Loss: 1.085497 (data: 1.064607, reg: 0.020890)
Iteration 400, Loss: 1.080798 (data: 

  reg_loss = (lambda_reg / 2) * np.sum(W * W)  # L2 regularization term


Iteration 200, Loss: 1.094945 (data: 1.093097, reg: 0.001848)
Iteration 300, Loss: 1.094432 (data: 1.092616, reg: 0.001816)
Iteration 400, Loss: 1.094065 (data: 1.092275, reg: 0.001790)
LR: 0.005, Iter: 500, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.403429 (data: 1.091719, reg: 2.311710)
Iteration 100, Loss: 1.080138 (data: 1.063629, reg: 0.016509)
Iteration 200, Loss: 1.079739 (data: 1.063390, reg: 0.016349)
Iteration 300, Loss: 1.079500 (data: 1.063309, reg: 0.016190)
Iteration 400, Loss: 1.079319 (data: 1.063262, reg: 0.016057)
LR: 0.005, Iter: 500, C: 0.1, Val Acc: 0.4094
Iteration 0, Loss: 1.327112 (data: 1.097410, reg: 0.229702)
Iteration 100, Loss: 1.091461 (data: 0.986445, reg: 0.105016)
Iteration 200, Loss: 1.018115 (data: 0.942383, reg: 0.075732)
Iteration 300, Loss: 0.993346 (data: 0.921563, reg: 0.071784)
Iteration 400, Loss: 0.984707 (data: 0.911085, reg: 0.073622)
LR: 0.005, Iter: 500, C: 1.0, Val Acc: 0.6309
Iteration 0, Loss: 1.114815 (data: 1.091644, reg: 0.023171)

  dW = (X.T @ (y_pred - y_one_hot)) / n_samples + lambda_reg * W


Iteration 400, Loss: 1.094065 (data: 1.092275, reg: 0.001790)
Iteration 500, Loss: 1.093803 (data: 1.092034, reg: 0.001769)
Iteration 600, Loss: 1.093615 (data: 1.091864, reg: 0.001752)
Iteration 700, Loss: 1.093482 (data: 1.091744, reg: 0.001738)
Iteration 800, Loss: 1.093387 (data: 1.091661, reg: 0.001726)
Iteration 900, Loss: 1.093319 (data: 1.091603, reg: 0.001717)
LR: 0.005, Iter: 1000, C: 0.01, Val Acc: 0.3691
Iteration 0, Loss: 3.373858 (data: 1.108798, reg: 2.265060)
Iteration 100, Loss: 1.080068 (data: 1.063637, reg: 0.016432)
Iteration 200, Loss: 1.079688 (data: 1.063372, reg: 0.016316)
Iteration 300, Loss: 1.079461 (data: 1.063298, reg: 0.016163)
Iteration 400, Loss: 1.079290 (data: 1.063256, reg: 0.016034)
Iteration 500, Loss: 1.079161 (data: 1.063236, reg: 0.015925)
Iteration 600, Loss: 1.079064 (data: 1.063231, reg: 0.015834)
Iteration 700, Loss: 1.078992 (data: 1.063236, reg: 0.015756)
Iteration 800, Loss: 1.078937 (data: 1.063247, reg: 0.015690)
Iteration 900, Loss: 1.0

In [284]:
# Flatten the encoded validation targets into a 1-D label vector for the report.
y_valid_true = y_valid.values.ravel()

print("=== VALIDATION SET ===")
validation_report = classification_report(y_valid_true, val_pred)
print(validation_report)

print(f"Validation Accuracy: {val_acc:.4f}")

# Test-set evaluation is currently commented out:
# y_test_true = y_test.values.ravel()
# print("=== TEST SET ===")
# print(classification_report(y_test_true, test_pred, digits=4))
# print(f"Test Accuracy: {test_acc:.4f}")

=== VALIDATION SET ===
              precision    recall  f1-score   support

           0       0.82      0.96      0.88        55
           1       0.72      0.69      0.70        48
           2       0.71      0.59      0.64        46

    accuracy                           0.76       149
   macro avg       0.75      0.75      0.74       149
weighted avg       0.75      0.76      0.75       149

Validation Accuracy: 0.7584
