# 2.1 MLP Multi-Class Classifier

##  2.1.2 Model Development from scratch

In [3]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import os

In [13]:
import numpy as np
import pandas as pd
import os
import cv2
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

# Activation Functions
class ActivationFunctions:
    """Element-wise activation functions and their derivatives.

    Convention: every ``*_derivative`` receives the activation's OUTPUT
    ``a = f(z)``, not the pre-activation ``z``.  ``MLP.backward`` only keeps
    layer outputs and passes ``activations[i+1]`` to the derivative, so all
    three derivatives must be expressed in terms of ``a``.
    """

    @staticmethod
    def sigmoid(x):
        """Return the logistic function 1 / (1 + e^-x) of ``x``."""
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def sigmoid_derivative(x):
        """Return sigmoid'(z) given ``x = sigmoid(z)``."""
        return x * (1 - x)

    @staticmethod
    def tanh(x):
        """Return the hyperbolic tangent of ``x``."""
        return np.tanh(x)

    @staticmethod
    def tanh_derivative(x):
        """Return tanh'(z) given ``x = tanh(z)``.

        The previous form ``1 - np.tanh(x) ** 2`` applied tanh a second time
        to an already-activated value, which disagreed with
        ``sigmoid_derivative`` and produced wrong gradients in backprop.
        """
        return 1 - x ** 2

    @staticmethod
    def relu(x):
        """Return max(0, x) element-wise."""
        return np.maximum(0, x)

    @staticmethod
    def relu_derivative(x):
        """Return relu'(z) given ``x = relu(z)`` (1 where positive, else 0).

        relu(z) > 0 exactly when z > 0, so the test is valid on the output.
        """
        return np.where(x > 0, 1, 0)

# MLP Class
class MLP:
    """Fully connected network with a linear output layer.

    Hidden layers use the activation named by ``activation`` (looked up on
    ``ActivationFunctions``); the last layer emits raw scores.  Training is
    full-batch gradient descent on a squared-error objective.

    Args:
        input_size: Number of input features.
        hidden_layers: Sequence of hidden-layer widths (list or tuple).
        output_size: Number of output units.
        activation: Name of a function on ``ActivationFunctions`` that also
            has a matching ``<name>_derivative``.
        lr: Step size used by ``update_weights``.
        optimizer: Stored on the instance but not consulted anywhere in this
            class; ``update_weights`` always takes a plain gradient step.
    """

    def __init__(self, input_size, hidden_layers, output_size, activation='relu', lr=0.01, optimizer='sgd'):
        self.lr = lr
        self.optimizer = optimizer
        self.activation_func = getattr(ActivationFunctions, activation)
        self.activation_derivative = getattr(ActivationFunctions, f"{activation}_derivative")

        self.weights = []
        self.biases = []

        # Unpacking accepts any sequence of widths, not only a list.
        layer_sizes = [input_size, *hidden_layers, output_size]
        for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            # Small random weights, zero biases.
            self.weights.append(np.random.randn(fan_in, fan_out) * 0.01)
            self.biases.append(np.zeros((1, fan_out)))

    def forward(self, X):
        """Run a forward pass.

        Returns:
            List ``[X, A1, ..., A_out]`` holding the input followed by every
            layer's output; the final entry is the un-activated output.
        """
        activations = [X]
        last = len(self.weights) - 1
        for i, (W, b) in enumerate(zip(self.weights, self.biases)):
            Z = np.dot(activations[-1], W) + b
            # The output layer stays linear.
            activations.append(self.activation_func(Z) if i < last else Z)
        return activations

    def backward(self, activations, y):
        """Backpropagate the error ``activations[-1] - y``.

        Args:
            activations: The list returned by ``forward``.
            y: Targets with the same shape as the network output.

        Returns:
            ``(weight_grads, bias_grads)``, each a list ordered like
            ``self.weights`` / ``self.biases`` and averaged over the rows
            of ``y``.
        """
        m = y.shape[0]
        n_layers = len(self.weights)
        last = n_layers - 1
        weight_grads = [None] * n_layers
        bias_grads = [None] * n_layers

        dA = activations[-1] - y  # Derivative of loss w.r.t. the output
        for i in range(last, -1, -1):
            # Hidden layers scale by the activation derivative, evaluated on
            # the layer OUTPUT; the linear output layer passes dA through.
            dZ = dA if i == last else dA * self.activation_derivative(activations[i + 1])
            weight_grads[i] = np.dot(activations[i].T, dZ) / m
            bias_grads[i] = np.sum(dZ, axis=0, keepdims=True) / m
            if i > 0:
                # Nothing consumes the gradient w.r.t. the network input, so
                # skip that (largest) matrix product on the first layer.
                dA = np.dot(dZ, self.weights[i].T)

        return weight_grads, bias_grads

    def update_weights(self, weight_grads, bias_grads):
        """Apply one gradient-descent step in place using ``self.lr``."""
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * weight_grads[i]
            self.biases[i] -= self.lr * bias_grads[i]

    def train(self, X, y, epochs=10):
        """Run ``epochs`` full-batch updates, printing the MSE every 10th epoch.

        The printed loss is computed from the forward pass made before that
        epoch's weight update.
        """
        for epoch in range(epochs):
            activations = self.forward(X)
            weight_grads, bias_grads = self.backward(activations, y)
            self.update_weights(weight_grads, bias_grads)
            if epoch % 10 == 0:
                loss = np.mean((activations[-1] - y) ** 2)
                print(f"Epoch {epoch}: Loss = {loss}")

    def predict(self, X):
        """Return the raw output-layer scores for ``X``."""
        return self.forward(X)[-1]





In [15]:
# Target size handed to cv2.resize for every image before it is flattened.
IMAGE_SIZE = (32, 32)
# Root folder that holds the fold-<n> sub-directories with the split CSV files.
FOLD_PATH = "Symbols/classification-task"  

def load_data(csv_file, img_folder, label_mapping):
    """Read the images listed in ``csv_file`` and return ``(X, y)`` arrays.

    Each CSV row needs a ``path`` and a ``symbol_id`` column.  Only the file
    name of ``path`` is used; it is looked up inside ``img_folder``.  Images
    are read as grayscale, resized to ``IMAGE_SIZE``, flattened and divided
    by 255.  Rows whose image is missing or unreadable are reported and
    skipped.  Symbol ids absent from ``label_mapping`` get the label ``-1``.

    Raises:
        FileNotFoundError: ``csv_file`` does not exist.
        ValueError: a required column is missing from the CSV.
    """
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"CSV file not found: {csv_file}")

    table = pd.read_csv(csv_file)
    absent_columns = {'path', 'symbol_id'} - set(table.columns)
    if absent_columns:
        raise ValueError(f"CSV is missing required columns: {absent_columns}")

    features = []
    targets = []
    for _, record in table.iterrows():
        file_name = os.path.basename(record['path'])
        img_path = os.path.normpath(os.path.join(img_folder, file_name))

        if not os.path.exists(img_path):
            print(f"Warning: Image not found - {img_path}")
            continue

        pixels = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if pixels is None:
            # imread signals an undecodable file by returning None.
            print(f"Skipping corrupted image: {img_path}")
            continue

        features.append(cv2.resize(pixels, IMAGE_SIZE).flatten() / 255.0)
        targets.append(label_mapping.get(record['symbol_id'], -1))

    return np.array(features), np.array(targets)

#label mapping from symbol IDs
def create_label_mapping(symbol_csv):
    """Map every distinct ``symbol_id`` in ``symbol_csv`` to a dense index.

    Ids are ranked in ascending order, so the smallest id maps to 0.
    """
    ordered_ids = sorted(pd.read_csv(symbol_csv)['symbol_id'].unique())
    return dict(zip(ordered_ids, range(len(ordered_ids))))

symbol_csv = 'symbols.csv'
label_mapping = create_label_mapping(symbol_csv)

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

NUM_FOLDS = 10
img_folder = "Symbols/images"

# Decode every fold's CSV and images exactly once.  The previous version
# re-read all ten folds from disk inside each of the ten cross-validation
# rounds (plus once more to fit the encoder), i.e. 110 loads instead of 10.
# NOTE(review): each fold is represented by its own "train.csv".  If the fold
# directories also ship a separate held-out CSV, the train.csv files of
# different folds may share samples, which would leak evaluation data into
# training here -- confirm against the dataset layout.
fold_data = {}
for i in range(1, NUM_FOLDS + 1):
    csv_path = os.path.join(FOLD_PATH, f"fold-{i}", "train.csv")
    fold_data[i] = load_data(csv_path, img_folder, label_mapping)

# Fit the encoder on the labels of all folds so every fold is encoded with
# the same column layout.
all_train_y = np.concatenate([labels for _, labels in fold_data.values()])
encoder.fit(all_train_y.reshape(-1, 1))

# One-hot encode each fold once instead of once per cross-validation round.
encoded_folds = {
    fold_id: (features, encoder.transform(labels.reshape(-1, 1)))
    for fold_id, (features, labels) in fold_data.items()
}

fold_accuracies = []
for fold in range(1, NUM_FOLDS + 1):
    print(f"\nTraining Fold {fold}...")

    # The current fold is held out; all remaining folds are merged for training.
    test_X, test_y = encoded_folds[fold]
    train_X = np.vstack([encoded_folds[i][0] for i in encoded_folds if i != fold])
    train_y = np.vstack([encoded_folds[i][1] for i in encoded_folds if i != fold])

    # Train model.  Layer widths come from the data so they stay in sync with
    # IMAGE_SIZE and with the number of columns the encoder actually emits
    # (which can differ from len(label_mapping), e.g. when unmapped ids were
    # labelled -1 or some mapped ids never occur).
    model = MLP(
        input_size=train_X.shape[1],
        hidden_layers=[128, 64],
        output_size=train_y.shape[1],
        activation='relu',
        lr=0.01,
    )
    model.train(train_X, train_y, epochs=30)

    # Evaluate: predicted class = highest score, true class = hot column.
    y_pred = model.forward(test_X)[-1]
    accuracy = np.mean(np.argmax(y_pred, axis=1) == np.argmax(test_y, axis=1))
    fold_accuracies.append(accuracy)

    print(f"Fold {fold} Accuracy: {accuracy:.4f}")

#final average accuracy
print(f"\nFinal Average Accuracy: {np.mean(fold_accuracies):.4f}")

KeyboardInterrupt: 

In [2]:
def stochastic_grad_descent(weights, biases, grad_w, grad_b, learning_rate):
    """Take one plain gradient step on every layer, updating in place.

    ``weights``/``biases`` and ``grad_w``/``grad_b`` are parallel per-layer
    lists; each parameter is decreased by ``learning_rate`` times its gradient.
    """
    for layer, _ in enumerate(weights):
        weight_step = learning_rate * grad_w[layer]
        weights[layer] -= weight_step
        bias_step = learning_rate * grad_b[layer]
        biases[layer] -= bias_step

def batch_grad_descent(weights, biases, grad_w, grad_b, learning_rate):
    """Apply one batch gradient-descent step to every layer, in place.

    A gradient that already has its parameter's shape is treated as already
    averaged over the batch and is applied directly.  Any other gradient is
    treated as a stack of per-sample gradients and averaged over axis 0.

    The previous version always averaged over axis 0.  For a batch-averaged
    weight gradient of shape (in, out) that collapsed the input dimension and
    gave every row of the weight matrix the same update.

    Args:
        weights, biases: Per-layer parameter arrays, modified in place.
        grad_w, grad_b: Per-layer gradients, parallel to the parameters.
        learning_rate: Step size.
    """
    for i in range(len(weights)):
        step_w = np.asarray(grad_w[i])
        step_b = np.asarray(grad_b[i])
        if step_w.shape != np.shape(weights[i]):
            step_w = step_w.mean(axis=0)
        if step_b.shape != np.shape(biases[i]):
            step_b = step_b.mean(axis=0)
        weights[i] -= learning_rate * step_w
        biases[i] -= learning_rate * step_b

def mini_batch_grad_descent(weights, biases, grad_w, grad_b, learning_rate, batch_size):
    """Apply one mini-batch gradient-descent step to every layer, in place.

    A weight gradient that is a stack of per-sample gradients (leading axis =
    samples) is reduced by averaging a random subset of ``batch_size``
    samples, drawn without replacement; the same subset is used for that
    layer's bias gradient.  A gradient that already has its parameter's shape
    is applied directly.

    The previous version always sampled along axis 0.  With gradients that
    were already batch-averaged it picked random rows of the weight-gradient
    matrix, indexed the (1, out) bias gradient out of range, and raised when
    ``batch_size`` exceeded the number of rows.

    Args:
        weights, biases: Per-layer parameter arrays, modified in place.
        grad_w, grad_b: Per-layer gradients, parallel to the parameters.
        learning_rate: Step size.
        batch_size: Number of per-sample gradients to average; clamped to the
            number available.
    """
    for i in range(len(weights)):
        layer_gw = np.asarray(grad_w[i])
        layer_gb = np.asarray(grad_b[i])
        bias_is_reduced = layer_gb.shape == np.shape(biases[i])

        if layer_gw.shape == np.shape(weights[i]):
            # Already reduced over the batch: nothing to sample.
            step_w = layer_gw
            step_b = layer_gb if bias_is_reduced else layer_gb.mean(axis=0)
        else:
            n_samples = len(layer_gw)
            batch_indices = np.random.choice(
                n_samples, min(batch_size, n_samples), replace=False
            )
            step_w = layer_gw[batch_indices].mean(axis=0)
            step_b = layer_gb if bias_is_reduced else layer_gb[batch_indices].mean(axis=0)

        weights[i] -= learning_rate * step_w
        biases[i] -= learning_rate * step_b


In [3]:
class MLP:
    """Multilayer perceptron trained by backpropagation on squared error.

    The chosen activation is applied to every layer, including the output
    layer.  Weight updates are delegated to the module-level optimiser
    functions selected by ``optimizer``.

    Args:
        layer_size: Widths of all layers, input first and output last.
        activation: ``'relu'``, ``'sigmoid'`` or ``'tanh'``.
        optimizer: ``'sgd'``, ``'batch'`` or ``'mini-batch'``.
        learn_rate: Step size handed to the optimiser.
        batch_size: Sample count handed to the mini-batch optimiser.

    Raises:
        ValueError: if ``activation`` is not one of the supported names.
    """

    def __init__(self, layer_size, activation='sigmoid', optimizer='sgd', learn_rate=0.01,batch_size=32 ):
        self.layer_size = layer_size
        self.activation_fn = self.activate_func(activation)
        self.activation_der = self.activate_der(activation)
        self.optimizer=optimizer
        self.learn_rate = learn_rate
        self.batch_size = batch_size
        self.init_weights()

    def init_weights(self):
        """(Re)initialise weights with small random values and biases with zeros."""
        self.weights=[]
        self.biases=[]
        for i in range(len(self.layer_size) - 1):
            self.weights.append(np.random.randn(self.layer_size[i],self.layer_size[i+1]) *0.01)#to prevent exploding gradients
            self.biases.append(np.zeros((1,self.layer_size[i+1])))

    def activate_func(self,name):
        """Return the activation callable registered under ``name``."""
        if name == 'relu':
            return lambda x: np.maximum(0,x)
        elif name == 'sigmoid':
            return lambda x: 1/(1+np.exp(-x))
        elif name == 'tanh':
            # np.tanh saturates cleanly; the exp-based quotient evaluates to
            # inf/inf = nan once |x| is large enough for exp to overflow.
            return np.tanh
        else:
            raise ValueError("Error-Activation fn")

    def activate_der(self,name):
        """Return the derivative callable for ``name``.

        The returned callable expects the PRE-activation ``z``.
        """
        if name == 'relu':
            return lambda x:np.where(x>0,1,0)
        elif name=='sigmoid':
            sig = lambda x: 1/(1+np.exp(-x))
            return lambda x:sig(x)*(1-sig(x))
        elif name == 'tanh':
            return lambda x: 1- np.tanh(x)**2
        else:
            # Mirror activate_func instead of silently returning None.
            raise ValueError("Error-Activation fn")

    def _align_targets(self, y, output):
        """Return 1-D targets as a column when the network has one output unit.

        Without this, an (m, 1) output minus an (m,) target broadcasts to an
        (m, m) matrix instead of an element-wise difference.
        """
        y = np.asarray(y)
        if y.ndim == 1 and output.ndim == 2 and output.shape[1] == 1:
            return y.reshape(-1, 1)
        return y

    def forward_prop(self,X):
        """Run a forward pass.

        Returns:
            ``(activations, z_sum)``: ``activations`` starts with ``X`` and
            holds every layer's output; ``z_sum`` holds every layer's
            pre-activation.
        """
        activations = [X]
        z_sum = []
        for w,b in zip(self.weights,self.biases):
            z = np.dot(activations[-1],w)+b
            z_sum.append(z)
            activations.append(self.activation_fn(z))
        return activations,z_sum

    def back_prop(self,X,y,activation,z_sum):
        """Compute batch-averaged gradients of the squared error.

        Args:
            X: Input batch; only its row count is used.
            y: Targets matching the network output.
            activation, z_sum: The two lists returned by ``forward_prop``.

        Returns:
            ``(grad_w, grad_b)`` with one array per layer, shaped like the
            corresponding weight matrix / bias row.
        """
        m=X.shape[0]
        grad_w = [np.zeros(w.shape) for w in self.weights]
        grad_b = [np.zeros(b.shape) for b in self.biases]

        error = activation[-1] - self._align_targets(y, activation[-1])
        # self.activation_der is the derivative callable chosen in __init__;
        # self.activate_der is the factory that expects an activation NAME,
        # so calling it with an array (as before) could never work.
        delta = error * self.activation_der(z_sum[-1])

        for i in range(len(self.weights)-1,-1,-1):
            grad_w[i] = np.dot(activation[i].T,delta)/m
            grad_b[i] = np.sum(delta, axis=0,keepdims=True)/m
            if i>0:
                delta = np.dot(delta,self.weights[i].T)*self.activation_der(z_sum[i-1])
        return grad_w, grad_b

    def update_weights(self,grad_w,grad_b):
        """Dispatch the gradients to the optimiser selected at construction.

        Raises:
            ValueError: if ``self.optimizer`` is not a known name.
        """
        if self.optimizer == 'sgd':
            stochastic_grad_descent(self.weights,self.biases,grad_w,grad_b,self.learn_rate)
        elif self.optimizer == 'batch':
            batch_grad_descent(self.weights,self.biases,grad_w,grad_b,self.learn_rate)
        elif self.optimizer == 'mini-batch':
            mini_batch_grad_descent(self.weights, self.biases,grad_w,grad_b,self.learn_rate,self.batch_size)
        else:
            raise ValueError("Error-optimizer")

    def train(self,X,y,epochs=100):
        """Run ``epochs`` forward/backward/update rounds over the whole of ``X``.

        Prints the mean squared error every 10th epoch, computed from the
        forward pass made before that epoch's update.
        """
        for epoch in range(epochs):
            activations,z_sum = self.forward_prop(X)
            grad_w,grad_b = self.back_prop(X,y,activations,z_sum)
            self.update_weights(grad_w,grad_b)
            if epoch % 10 == 0:
                loss = np.mean((activations[-1] - self._align_targets(y, activations[-1])) ** 2)
                print(f"Epoch {epoch}, Loss: {loss}")

    def predict(self,X):
        """Return the output-layer activations for ``X``."""
        activations, _ = self.forward_prop(X)
        return activations[-1]




## 2.1.3 Hyperparameter Tuning & Evaluation with 10-Fold Validation

In [12]:
from sklearn.metrics import accuracy_score
# Locations of the per-fold CSV files and of the image files they refer to.
dataset_path = 'Symbols/classification-task'
image_path = 'Symbols/images'

# Module-level containers; train_data and test_data are rebound by the
# tuning loop each time it calls load_data.
train_data, test_data = [], []
labels = dict()

# Lookup table from symbol id to its LaTeX string, taken from symbols.csv.
symbols = pd.read_csv('symbols.csv')
symbol_map = {
    symbol_id: latex
    for symbol_id, latex in zip(symbols['symbol_id'], symbols['latex'])
}

def load_data(fold_num):
    """Build the train/test split in which fold ``fold_num`` is held out.

    Reads ``train.csv`` from each of the fold-1 .. fold-10 directories under
    ``dataset_path``.  Every listed image is looked up by file name inside
    ``image_path``, read as grayscale, resized to 32x32, flattened and divided
    by 255.  Images that cannot be read are skipped.

    Returns:
        ``(train_data, test_data)``: lists of ``(pixels, symbol_id)`` tuples.
        Samples from fold ``fold_num`` go to ``test_data``, all others to
        ``train_data``.
    """
    train_samples = []
    test_samples = []

    for fold_idx in range(1, 11):
        csv_file = os.path.join(os.path.join(dataset_path, f'fold-{fold_idx}'), 'train.csv')
        frame = pd.read_csv(csv_file)

        # The destination depends only on the fold, so pick it once per CSV.
        bucket = test_samples if fold_idx == fold_num else train_samples

        for _, record in frame.iterrows():
            file_name = record['path'].split('/')[-1]
            pixels = cv2.imread(os.path.join(image_path, file_name), cv2.IMREAD_GRAYSCALE)
            if pixels is None:
                continue
            pixels = cv2.resize(pixels, (32, 32)).flatten() / 255.0
            bucket.append((pixels, record['symbol_id']))

    return train_samples, test_samples

results = []
activations = ['sigmoid', 'relu', 'tanh']
optimizers = ['sgd', 'batch', 'mini-batch']

# The targets are class ids, so the network needs one output unit per class
# and one-hot targets.  The previous setup trained a single output unit on
# raw 1-D symbol ids and handed continuous outputs to accuracy_score, which
# only accepts discrete class predictions.
class_ids = sorted(symbols['symbol_id'].unique())
class_index = {symbol_id: idx for idx, symbol_id in enumerate(class_ids)}
num_classes = len(class_ids)
layer_sizes = [[1024, 256, 128, num_classes], [1024, 512, 256, num_classes]]

# Same order the nested activation/optimizer/layer loops used to produce.
configs = [
    (activation, optimizer, layer_size)
    for activation in activations
    for optimizer in optimizers
    for layer_size in layer_sizes
]
# One accuracy list per configuration, parallel to `configs`.
accs_per_config = [[] for _ in configs]

# Folds form the outer loop so each split is read from disk once and shared
# by every configuration, rather than being reloaded for all 18 of them.
for fold in range(1, 11):
    train_data, test_data = load_data(fold)
    X_train = np.array([image for image, _ in train_data])
    X_test = np.array([image for image, _ in test_data])
    # A symbol id missing from symbols.csv raises KeyError here rather than
    # being silently mislabelled.
    y_train_idx = np.array([class_index[symbol_id] for _, symbol_id in train_data])
    y_test = np.array([class_index[symbol_id] for _, symbol_id in test_data])
    y_train = np.eye(num_classes)[y_train_idx]  # one-hot rows

    for accs, (activation, optimizer, layer_size) in zip(accs_per_config, configs):
        model = MLP(layer_size, activation, optimizer)
        model.train(X_train, y_train, epochs=100)
        # Predicted class = output unit with the largest activation.
        y_pred = np.argmax(model.predict(X_test), axis=1)
        acc = accuracy_score(y_test, y_pred)
        accs.append(acc)

for accs, (activation, optimizer, layer_size) in zip(accs_per_config, configs):
    mean_acc = np.mean(accs)
    std_acc = np.std(accs)
    results.append((activation, optimizer, layer_size, mean_acc, std_acc))

# results
results_df = pd.DataFrame(results, columns=['Activation', 'Optimizer', 'Layer_Size', 'Mean_Accuracy', 'Std_Accuracy'])
print(results_df.sort_values(by='Mean_Accuracy', ascending=False))


KeyboardInterrupt: 