# Assignment 3: Predicting Mapping Penalties with ANN
**Due:** June 5, 2025, 11:59 PM

**Author:** Tony Liang

**Student Number:** 20990204

In this assignment, a feed-forward artificial neural network (ANN) is implemented from scratch to predict the penalty score of a mapping between tasks and employees.

In this notebook we will:
1. Load the dataset of 100 mappings  
2. Preprocess & encode each mapping into a 110-dim vector  
3. Define two ANN architectures (Model A & Model B)  
4. Implement the forward pass, backward pass, and parameter updates by hand  
5. Train via mini-batch SGD over a grid of hyperparameters  
6. Produce the eight required comparison plots  
7. Export results for report submission  



## Assignment Imports

In [2]:
import numpy as np
import pandas as pd
import matplotlib . pyplot as plt
import time

# For reproducibility: seeds NumPy's global RNG, which the notebook uses for
# weight initialisation (np.random.randn) and shuffling (np.random.permutation).
np.random.seed(42)

# IPython shell escape (not plain Python): clones the assignment repository
# into ./ANN_Assignment. The data-loading cell reads the CSV from
# /content/ANN_Assignment/, so this presumably runs with /content as the
# working directory (Colab) - confirm when running elsewhere.
!git clone https://github.com/tonyzrl/ANN_Assignment

# Task data: ID, Estimated Time, Difficulty, Deadline, Skill Required
# Each tuple below is one task; it is expanded into a dict keyed by column name.
_TASK_COLUMNS = ("id", "estimated_time", "difficulty", "deadline", "skill_required")
tasks = [
    dict(zip(_TASK_COLUMNS, row))
    for row in (
        ("T1", 4, 3, 8, "A"),
        ("T2", 6, 5, 12, "B"),
        ("T3", 2, 2, 6, "A"),
        ("T4", 5, 4, 10, "C"),
        ("T5", 3, 1, 7, "A"),
        ("T6", 8, 6, 15, "B"),
        ("T7", 4, 3, 9, "C"),
        ("T8", 7, 5, 14, "B"),
        ("T9", 2, 2, 5, "A"),
        ("T10", 6, 4, 11, "C"),
    )
]

# Employee data: ID, Available hours, Skill level, Skills
# "skills" stays a list because one employee can hold several skills.
_EMPLOYEE_COLUMNS = ("id", "hours_avail", "skill_level", "skills")
employees = [
    dict(zip(_EMPLOYEE_COLUMNS, row))
    for row in (
        ("E1", 10, 4, ["A", "C"]),
        ("E2", 12, 6, ["A", "B", "C"]),
        ("E3", 8, 3, ["A"]),
        ("E4", 15, 7, ["B", "C"]),
        ("E5", 9, 5, ["A", "C"]),
    )
]

Cloning into 'ANN_Assignment'...
remote: Enumerating objects: 46, done.
remote: Counting objects: 100% (46/46), done.
remote: Compressing objects: 100% (42/42), done.
remote: Total 46 (delta 11), reused 25 (delta 3), pack-reused 0 (from 0)
Receiving objects: 100% (46/46), 17.31 KiB | 17.31 MiB/s, done.
Resolving deltas: 100% (11/11), done.


## Data Loading & Preprocessing

In [4]:
# Read the mapping table from the cloned repository and keep a plain NumPy
# view of it; later cells slice columns 0-9 as assignments and column 10 as
# the penalty.
df = pd.read_csv('/content/ANN_Assignment/data/task_assignment_data.csv')
data = df.to_numpy()

def one_hot_encode(skills):
    """
    Build a 3-slot indicator vector over the skills A, B and C.

    skills: iterable of skill letters, e.g. ['A','C'] -> [1,0,1]. A one-letter
            string such as 'B' also works because it iterates as ['B'].
    returns: list of three ints (0 or 1) in the order [A, B, C].
    raises: KeyError if an element is not 'A', 'B' or 'C'.
    """
    position = {'A': 0, 'B': 1, 'C': 2}
    # Resolve every skill to its slot first, so an unknown skill raises KeyError.
    active = {position[skill] for skill in skills}
    return [int(slot in active) for slot in range(3)]

def construct_input_vector(mapping_row):
    """
    Encode one mapping row as a flat feature vector.

    mapping_row: sequence whose first 10 entries are employee IDs, where
                 entry i is the employee assigned to task T(i+1). Anything
                 after the tenth entry (the penalty) is ignored.
    returns: 1-D np.array with 11 numbers per task (110 for a full row):
             6 task features followed by 5 employee features.

    Task and employee records are looked up in the module-level `tasks` and
    `employees` lists; an unknown ID makes the `next(...)` lookup raise
    StopIteration.
    """
    features = []

    for task_number, employee_id in enumerate(mapping_row[:10], start=1):
        wanted_task = f"T{task_number}"
        task = next(t for t in tasks if t["id"] == wanted_task)
        emp = next(e for e in employees if e["id"] == employee_id)

        # Task part: [time, difficulty, deadline] + one-hot(required skill)
        features += [task["estimated_time"], task["difficulty"], task["deadline"]]
        features += one_hot_encode(task["skill_required"])

        # Employee part: [hours_avail, skill_level] + one-hot(skills)
        features += [emp["hours_avail"], emp["skill_level"]]
        features += one_hot_encode(emp["skills"])

    return np.array(features)

# ──────────────────────────────────────────────────────────────────────────────
# Processing Data
# ──────────────────────────────────────────────────────────────────────────────
# 1. Separate the employee-ID columns from the penalty column.
assignments = data[:, :10]                   # shape (m, 10) of employee IDs
# The CSV mixes string IDs with numeric penalties, so `data` may be an object
# array; cast the targets to float so the loss and gradients are numeric.
penalties   = data[:, 10].astype(float)      # shape (m,)

# 2. Vectorize each row into a 110-dim feature vector.
#    Keep the WHOLE vector: indexing the result with [0] would keep only the
#    first feature and leave X_raw one-dimensional.
X_raw = np.array(
    [construct_input_vector(list(row)) for row in data],
    dtype=float,
)                                 # shape (m, 110)
y_raw = penalties.reshape(-1, 1)  # shape (m, 1)

# 3. Transpose for Lab9 convention: columns are examples
X = X_raw.T  # now (110, m)
y = y_raw.T  # now (1,   m)

# 4. Shuffle examples (columns), using the same permutation for X and y
perm = np.random.permutation(X.shape[1])
X, y = X[:, perm], y[:, perm]

# 5. Split 70/15/15 (the test set takes whatever remains after train + val)
m       = X.shape[1]
n_train = int(0.7 * m)
n_val   = int(0.15 * m)

X_train, y_train = X[:, :n_train],          y[:, :n_train]
X_val,   y_val   = X[:, n_train:n_train+n_val], y[:, n_train:n_train+n_val]
X_test,  y_test  = X[:, n_train+n_val:],     y[:, n_train+n_val:]

In [5]:
# Sanity check: encode one hand-written mapping row (10 employee IDs followed
# by its penalty) and confirm the result is a 110-element vector.
vector = ["E2","E3","E3","E2","E2","E2","E1","E5","E1","E5",4.6000000000000005]
# Named `input_vec` rather than `input` so the builtin input() is not shadowed
# for the rest of the notebook session.
input_vec = construct_input_vector(vector)
print(input_vec)
print(input_vec.shape)

[ 4  3  8  1  0  0 12  6  1  1  1  6  5 12  0  1  0  8  3  1  0  0  2  2
  6  1  0  0  8  3  1  0  0  5  4 10  0  0  1 12  6  1  1  1  3  1  7  1
  0  0 12  6  1  1  1  8  6 15  0  1  0 12  6  1  1  1  4  3  9  0  0  1
 10  4  1  0  1  7  5 14  0  1  0  9  5  1  0  1  2  2  5  1  0  0 10  4
  1  0  1  6  4 11  0  0  1  9  5  1  0  1]
(110,)


## Activation Functions

In [6]:
def sigmoid(z):
    """
    Logistic sigmoid 1 / (1 + exp(-z)), computed without overflow.

    z: scalar or array of pre-activations.
    returns: values in [0, 1] with the same shape as z.

    exp(-|z|) always lies in (0, 1], so unlike exp(-z) it cannot overflow for
    large negative z. The two branches are algebraically the same function:
      z >= 0:  1 / (1 + exp(-z))
      z <  0:  exp(z) / (1 + exp(z))
    """
    z = np.asarray(z)
    exp_neg_abs = np.exp(-np.abs(z))
    numerator = np.where(z >= 0, 1.0, exp_neg_abs)
    return numerator / (1.0 + exp_neg_abs)

def sigmoid_deriv(a):
    """
    Sigmoid derivative expressed through the activation itself.

    a: the sigmoid OUTPUT (a = sigmoid(z)), not the pre-activation z.
    returns: a * (1 - a), element-wise.
    """
    complement = 1 - a
    return a * complement

def relu(z):
    """
    Rectified linear unit: element-wise max(0, z).

    z: scalar or array of pre-activations.
    returns: z with every negative entry replaced by 0.
    """
    floor = 0
    return np.maximum(floor, z)

def relu_deriv(z):
    """
    ReLU derivative with respect to the pre-activation.

    z: array of pre-activations (the input to relu, not its output).
    returns: float array with 1.0 where z > 0 and 0.0 elsewhere
             (the derivative at exactly 0 is taken as 0).
    """
    is_positive = z > 0
    return is_positive.astype(float)

## Model A Definitions

In [7]:
class NeuralNetworkA:
    """
    Model A: feed-forward regression network 110 -> 256 -> 1.

    One hidden layer (ReLU or sigmoid) followed by a linear output unit.
    Inputs and targets use the column convention: x is (110, m), y is (1, m).
    """

    def __init__(self, activation="relu"):
        """
        activation: 'relu' selects ReLU for the hidden layer; any other value
                    falls back to sigmoid.
        """
        # relu_deriv takes the pre-activation Z, while sigmoid_deriv takes the
        # activation A = sigmoid(Z). Record which one backward() must pass.
        if activation == 'relu':
            self.act_func, self.act_func_deriv = relu, relu_deriv
            self._deriv_takes_activation = False
        else:
            self.act_func, self.act_func_deriv = sigmoid, sigmoid_deriv
            self._deriv_takes_activation = True

        # Layer params: small random weights, zero biases
        self.W1 = np.random.randn(256, 110) * 0.01
        self.b1 = np.zeros((256, 1))
        self.W2 = np.random.randn(1, 256) * 0.01
        self.b2 = np.zeros((1, 1))

    def _act_grad(self, Z, A):
        """Hidden-activation derivative for a layer with pre-activation Z and output A."""
        return self.act_func_deriv(A if self._deriv_takes_activation else Z)

    def forward(self, x):
        """
        x: (110, m)  # columns are examples
        returns: A2 (1, m) and cache for backprop
        """
        # Layer 1
        Z1 = self.W1.dot(x) + self.b1    # (256, m)
        A1 = self.act_func(Z1)           # (256, m)
        # Layer 2 (output, linear)
        Z2 = self.W2.dot(A1) + self.b2   # (1, m)
        A2 = Z2                          # identity

        cache = (x, Z1, A1, Z2, A2)
        return A2, cache

    def backward(self, y_true, cache):
        """
        y_true: (1, m) true values
        cache: x, Z1, A1, Z2, A2 as returned by forward()
        returns: grads dict {dW1, db1, dW2, db2} of the batch-mean squared error
        """
        x, Z1, A1, Z2, A2 = cache
        m = x.shape[1]

        # output layer gradient (MSE averaged over the m examples)
        dZ2 = 2 * (A2 - y_true) / m               # (1, m)
        dW2 = dZ2.dot(A1.T)                       # (1, 256)
        db2 = np.sum(dZ2, axis=1, keepdims=True)  # (1,1)

        # hidden layer gradient
        dA1 = self.W2.T.dot(dZ2)                  # (256, m)
        dZ1 = dA1 * self._act_grad(Z1, A1)        # (256, m)
        dW1 = dZ1.dot(x.T)                        # (256, 110)
        db1 = np.sum(dZ1, axis=1, keepdims=True)  # (256,1)

        grads = {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2}
        return grads

    def update_params(self, grads, lr):
        """Apply one gradient-descent step in place with learning rate lr."""
        self.W1 -= lr * grads['dW1']
        self.b1 -= lr * grads['db1']
        self.W2 -= lr * grads['dW2']
        self.b2 -= lr * grads['db2']


## Model B Definitions


In [8]:
class NeuralNetworkB:
    """
    Model B: feed-forward regression network 110 -> 128 -> 128 -> 1.

    Two hidden layers sharing one activation (ReLU or sigmoid), followed by a
    linear output unit. Inputs and targets use the column convention:
    x is (110, m), y is (1, m).
    """

    def __init__(self, activation='relu'):
        """
        activation: 'relu' selects ReLU for both hidden layers; any other
                    value falls back to sigmoid.
        """
        # relu_deriv takes the pre-activation Z, while sigmoid_deriv takes the
        # activation A = sigmoid(Z). Record which one backward() must pass.
        if activation == 'relu':
            self.act_func, self.act_func_deriv = relu, relu_deriv
            self._deriv_takes_activation = False
        else:
            self.act_func, self.act_func_deriv = sigmoid, sigmoid_deriv
            self._deriv_takes_activation = True

        # Layer params: small random weights, zero biases
        self.W1 = np.random.randn(128, 110) * 0.01
        self.b1 = np.zeros((128, 1))
        self.W2 = np.random.randn(128, 128) * 0.01
        self.b2 = np.zeros((128, 1))
        self.W3 = np.random.randn(1, 128) * 0.01
        self.b3 = np.zeros((1, 1))

    def _act_grad(self, Z, A):
        """Hidden-activation derivative for a layer with pre-activation Z and output A."""
        return self.act_func_deriv(A if self._deriv_takes_activation else Z)

    def forward(self, x):
        """
        x: (110, m)
        returns: A3 (1, m) and cache
        """
        Z1 = self.W1.dot(x) + self.b1      # (128, m)
        A1 = self.act_func(Z1)             # (128, m)
        Z2 = self.W2.dot(A1) + self.b2     # (128, m)
        A2 = self.act_func(Z2)             # (128, m)
        Z3 = self.W3.dot(A2) + self.b3     # (1, m)
        A3 = Z3                            # identity

        cache = (x, Z1, A1, Z2, A2, Z3, A3)
        return A3, cache

    def backward(self, y_true, cache):
        """
        y_true: (1, m)
        cache: x, Z1, A1, Z2, A2, Z3, A3 as returned by forward()
        returns: grads dict {dW1, db1, dW2, db2, dW3, db3} of the batch-mean
                 squared error
        """
        x, Z1, A1, Z2, A2, Z3, A3 = cache
        m = x.shape[1]

        # output layer (MSE averaged over the m examples)
        dZ3 = 2 * (A3 - y_true) / m               # (1, m)
        dW3 = dZ3.dot(A2.T)                       # (1,128)
        db3 = np.sum(dZ3, axis=1, keepdims=True)

        # hidden layer 2
        dA2 = self.W3.T.dot(dZ3)                  # (128, m)
        dZ2 = dA2 * self._act_grad(Z2, A2)        # (128, m)
        dW2 = dZ2.dot(A1.T)                       # (128,128)
        db2 = np.sum(dZ2, axis=1, keepdims=True)

        # hidden layer 1
        dA1 = self.W2.T.dot(dZ2)                  # (128, m)
        dZ1 = dA1 * self._act_grad(Z1, A1)        # (128, m)
        dW1 = dZ1.dot(x.T)                        # (128,110)
        db1 = np.sum(dZ1, axis=1, keepdims=True)

        return {'dW1': dW1, 'db1': db1, 'dW2': dW2, 'db2': db2, 'dW3': dW3, 'db3': db3}

    def update_params(self, grads, lr):
        """Apply one gradient-descent step in place with learning rate lr."""
        self.W1 -= lr * grads['dW1']
        self.b1 -= lr * grads['db1']
        self.W2 -= lr * grads['dW2']
        self.b2 -= lr * grads['db2']
        self.W3 -= lr * grads['dW3']
        self.b3 -= lr * grads['db3']

## Training Loop

In [8]:
# Hyperparameter grid
learning_rates = [0.01, 0.001, 0.0001]
batch_sizes    = [8, 16, 32]
activations    = ['sigmoid', 'relu']
epochs         = 100  # or as required

# One entry per (learning rate, batch size, activation) combination
results = []

# Data layout used throughout: X_* are (110, m) and y_* are (1, m), i.e.
# COLUMNS are examples. Shuffling and mini-batching therefore index axis 1.
# Model API (NeuralNetworkA):
#   model = NeuralNetworkA(activation=func)
#   y_hat, cache = model.forward(xb)
#   grads = model.backward(yb, cache)
#   model.update_params(grads, lr)

for alpha in learning_rates:
    for batch_size in batch_sizes:
        for func in activations:
            # Fresh, re-initialised Model A for every configuration
            model = NeuralNetworkA(activation=func)
            train_losses = []
            val_losses   = []
            epoch_times  = []

            num_examples = X_train.shape[1]

            # Epoch loop
            for e in range(epochs):
                start_time = time.perf_counter()

                # Shuffle training examples (columns), keeping X and y aligned
                perm = np.random.permutation(num_examples)
                X_sh, y_sh = X_train[:, perm], y_train[:, perm]

                # Mini-batch training; the last batch may be smaller
                for i in range(0, num_examples, batch_size):
                    xb = X_sh[:, i:i+batch_size]
                    yb = y_sh[:, i:i+batch_size]

                    # Forward + backward + update
                    _, cache = model.forward(xb)
                    grads = model.backward(yb, cache)
                    model.update_params(grads, alpha)

                # End of epoch: measure MSE on the full train and val sets
                y_train_pred, _ = model.forward(X_train)
                y_val_pred,   _ = model.forward(X_val)
                train_loss = np.mean((y_train_pred - y_train)**2)
                val_loss   = np.mean((y_val_pred   - y_val  )**2)

                train_losses.append(train_loss)
                val_losses.append(val_loss)
                # Epoch time includes the end-of-epoch evaluation above
                epoch_times.append(time.perf_counter() - start_time)

            # Final test-set evaluation
            y_test_pred, _ = model.forward(X_test)
            test_loss = np.mean((y_test_pred - y_test)**2)

            # Store results
            results.append({
                'learning_rate': alpha,
                'batch_size': batch_size,
                'activation': func,
                'train_losses': train_losses,
                'val_losses': val_losses,
                'epoch_times': epoch_times,
                'test_loss': test_loss
            })

# Example: inspect one result entry
print("Example result:", results[0])