# 2.1 MLP Multi-Class Classifier

In [47]:
import pandas as pd
import os

# reducing dataset size by 50%
for fold in range(1, 11):
    train_file_path = f'Symbols/classification-task/fold-{fold}/train.csv'
    test_file_path = f'Symbols/classification-task/fold-{fold}/test.csv'
 
    if os.path.exists(train_file_path) and os.path.exists(test_file_path):
        train_df = pd.read_csv(train_file_path)
        test_df = pd.read_csv(test_file_path)
        
        #sampling 50% of the dataset
        train_df_reduced = train_df.sample(frac=0.1, random_state=42)  
        test_df_reduced = test_df.sample(frac=0.5, random_state=42)  

        train_reduced_file_path = f'Symbols/classification-task/fold-{fold}/train_reduced.csv'
        test_reduced_file_path = f'Symbols/classification-task/fold-{fold}/test_reduced.csv'
        
        train_df_reduced.to_csv(train_reduced_file_path, index=False)
        test_df_reduced.to_csv(test_reduced_file_path, index=False)

        print(f"Fold-{fold} - Original Train Size: {len(train_df)}, Reduced Train Size: {len(train_df_reduced)}")
        print(f"Fold-{fold} - Original Test Size: {len(test_df)}, Reduced Test Size: {len(test_df_reduced)}")
    else:
        print(f"Files for fold-{fold} not found.")


Fold-1 - Original Train Size: 151241, Reduced Train Size: 15124
Fold-1 - Original Test Size: 16992, Reduced Test Size: 8496
Fold-2 - Original Train Size: 151288, Reduced Train Size: 15129
Fold-2 - Original Test Size: 16945, Reduced Test Size: 8472
Fold-3 - Original Train Size: 151320, Reduced Train Size: 15132
Fold-3 - Original Test Size: 16913, Reduced Test Size: 8456
Fold-4 - Original Train Size: 151358, Reduced Train Size: 15136
Fold-4 - Original Test Size: 16875, Reduced Test Size: 8438
Fold-5 - Original Train Size: 151395, Reduced Train Size: 15140
Fold-5 - Original Test Size: 16838, Reduced Test Size: 8419
Fold-6 - Original Train Size: 151423, Reduced Train Size: 15142
Fold-6 - Original Test Size: 16810, Reduced Test Size: 8405
Fold-7 - Original Train Size: 151464, Reduced Train Size: 15146
Fold-7 - Original Test Size: 16769, Reduced Test Size: 8384
Fold-8 - Original Train Size: 151501, Reduced Train Size: 15150
Fold-8 - Original Test Size: 16732, Reduced Test Size: 8366
Fold-9 -

In [41]:
import pandas as pd
from PIL import Image
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder

def load_data(csv_file_path, image_folder):
    data_df = pd.read_csv(csv_file_path)
    images = []
    labels = []
    base_image_path = os.path.abspath(image_folder)

    for _, row in data_df.iterrows():
        image_path = row['path']
        symbol_id = row['symbol_id'] 
        if image_path.startswith("../../images/"):
            image_path = os.path.join(base_image_path, image_path[6:]) 
  
        full_image_path = os.path.normpath(image_path)
        
        try:
            image = Image.open(full_image_path).convert('L')  # 'L' for grayscale
        except Exception as e:
            print(f"Error opening image {full_image_path}: {e}")
            continue
       
        image = image.resize((32, 32))
        image_array = np.array(image) / 255.0  # Normalizing the pixel values to [0, 1]
        flattened_image = image_array.flatten()
        
        images.append(flattened_image)
        labels.append(symbol_id)

    images = np.array(images)
    labels = np.array(labels)
    
    return images, labels

def encode_labels(symbol_ids):
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(symbol_ids) 
    return encoded_labels, label_encoder

In [36]:
test_images, test_labels = load_data('Symbols/classification-task/fold-1/test_reduced.csv')

encoded_labels, label_encoder = encode_labels(test_labels)
num_classes = len(label_encoder.classes_)
print("Number of unique classes (symbols):", num_classes)
print("Class names:", label_encoder.classes_)

Number of unique classes (symbols): 369
Class names: [  31   32   33   34   35   36   37   38   39   40   41   42   43   44
   45   46   47   48   49   50   51   52   53   54   55   56   59   70
   71   72   73   74   75   76   77   78   79   81   82   87   88   89
   90   91   92   93   94   95   96   97   98   99  100  101  102  103
  104  105  106  107  108  110  111  112  113  114  115  116  117  150
  151  152  153  154  155  156  157  158  159  160  161  162  163  164
  165  166  167  168  169  170  171  174  175  176  177  178  179  180
  181  182  183  184  185  186  187  188  189  190  191  192  193  194
  195  196  197  254  257  259  260  261  262  263  264  265  266  267
  268  269  508  510  511  512  513  514  517  520  521  523  524  526
  527  528  529  530  531  532  533  534  535  536  537  538  539  540
  541  542  544  549  550  553  555  562  564  574  577  582  583  584
  591  595  600  601  603  604  605  607  608  609  610  611  612  613
  614  615  616  617  61

In [38]:
import torch
import numpy as np
import torch.nn.functional as F

class MLP:
    def __init__(self, layers, activation_function='relu', learning_rate=0.01, optimizer='sgd', batch_size=None, device='cpu'):
        self.layers = layers  
        self.learning_rate = learning_rate
        self.activation_function = activation_function
        self.optimizer = optimizer
        self.device = device
        self.batch_size = batch_size 
        self.num_classes = layers[-1] 
        
        self.weights = []
        self.biases = []
        for i in range(1, len(layers)):
            W = torch.randn(layers[i], layers[i-1], device=self.device) * 0.1
            b = torch.zeros(layers[i], 1, device=self.device)
            self.weights.append(W)
            self.biases.append(b)

    def activation(self, z, derivative=False):
        if self.activation_function == 'sigmoid':
            if derivative:
                return self.sigmoid(z) * (1 - self.sigmoid(z))
            return self.sigmoid(z)
        elif self.activation_function == 'tanh':
            if derivative:
                return 1 - torch.tanh(z)**2
            return torch.tanh(z)
        elif self.activation_function == 'relu':
            if derivative:
                return (z > 0).float()
            return torch.maximum(torch.tensor(0.0, device=self.device), z)

    def sigmoid(self, x):
        return 1 / (1 + torch.exp(-x))

    def forward(self, X):
        activations = [X]
        for i in range(len(self.layers) - 2):
            z = torch.mm(self.weights[i], activations[-1]) + self.biases[i]
            a = self.activation(z)
            activations.append(a)

        # Final layerwith softmax activation 
        z_output = torch.mm(self.weights[-1], activations[-1]) + self.biases[-1]
        output = F.softmax(z_output, dim=0) 
        activations.append(output)

        return activations

    def backward(self, X, y, activations):
        m = X.size(1)
        dz = activations[-1] - y
        dW = torch.mm(dz, activations[-2].T) / m
        db = torch.sum(dz, dim=1, keepdim=True) / m
        gradients = [(dW, db)]

        for i in range(len(self.layers) - 2, 0, -1):
            dz = torch.mm(self.weights[i].T, dz) * self.activation(torch.mm(self.weights[i-1], activations[i-1]) + self.biases[i-1], derivative=True)
            dW = torch.mm(dz, activations[i-1].T) / m
            db = torch.sum(dz, dim=1, keepdim=True) / m
            gradients.append((dW, db))

        gradients.reverse()
        return gradients

    def update_parameters(self, gradients, batch_size=None):
        for i in range(len(self.weights)):
            dW, db = gradients[i]
            if self.optimizer == 'sgd':
                self.weights[i] -= self.learning_rate * dW
                self.biases[i] -= self.learning_rate * db
            elif self.optimizer == 'batch':
                self.weights[i] -= self.learning_rate * dW
                self.biases[i] -= self.learning_rate * db
            elif self.optimizer == 'mini-batch':
                self.weights[i] -= (self.learning_rate / batch_size) * dW
                self.biases[i] -= (self.learning_rate / batch_size) * db
    
    def to_one_hot(self,y,num_classes):
        y_one_hot = torch.zeros(num_classes,y.size(0),device=self.device)
        for i in range(y.size(0)):
            y_one_hot[y[i], i] = 1
        return y_one_hot
    
    def train(self, X, y, epochs=100):
        X = torch.tensor(X, dtype=torch.float32, device=self.device).T
        y = torch.tensor(y, dtype=torch.float32, device=self.device)
        
        unique_classes = torch.unique(y)
        class_mapping = {cls.item(): i for i, cls in enumerate(unique_classes)}

        y_mapped = torch.tensor([class_mapping[label.item()] for label in y], 
                               dtype=torch.long, device=self.device)
        
        # Convert to one-hot encoding
        y_one_hot = self.to_one_hot(y_mapped, len(unique_classes))
        self.class_mapping = class_mapping
        self.inverse_mapping = {v: k for k, v in class_mapping.items()}

        m = X.size(1)
        
        if self.optimizer == 'mini-batch' and self.batch_size is None:
            raise ValueError("Batch size must be provided for mini-batch gradient descent.")
        
        batch_size = self.batch_size if self.optimizer == 'mini-batch' else m
        
        for epoch in range(epochs):
            total_loss = 0
            
            if self.optimizer == 'mini-batch':
                indices = torch.randperm(m)
                X = X[:, indices]
                y_one_hot = y_one_hot[:, indices]
            
            for i in range(0, m, batch_size):
                end = min(i + batch_size, m)
                batch_X = X[:, i:end]
                batch_y = y_one_hot[:, i:end]
                
                # Forward pass
                activations = self.forward(batch_X)
                
                # Backward pass
                gradients = self.backward(batch_X, batch_y, activations)
                
                #parameters
                self.update_parameters(gradients)
                
                #loss (cross-entropy)
                epsilon = 1e-15  # Small constant to avoid log(0)
                predictions = torch.clamp(activations[-1], epsilon, 1.0 - epsilon)
                loss = -torch.sum(batch_y * torch.log(predictions)) / batch_X.size(1)
                total_loss += loss.item()
            
            if (epoch + 1) % 10 == 0 or epoch == 0:
                avg_loss = total_loss / (m // batch_size + (1 if m % batch_size != 0 else 0))
                print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    def predict(self, X):
        X = torch.tensor(X, dtype=torch.float32, device=self.device).T
        activations = self.forward(X)
        predictions = activations[-1]
        return predictions.cpu().detach().numpy()
    
    def predict_classes(self,X):
        probs = self.predict(X)
        pred_indices = np.argmax(probs, axis=0)
        if hasattr(self, 'inverse_mapping'):
            return np.array([self.inverse_mapping[idx] for idx in pred_indices])
        else:
            return pred_indices

In [39]:
base_path = 'Symbols'  
symbols_df = pd.read_csv('symbols.csv')
num_classes = len(symbols_df)
print(num_classes)

369


In [50]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

base_path = 'Symbols/classification-task'
image_folder = 'Symbols'

folds = [f'fold-{i}' for i in range(1, 11)]

symbols_df = pd.read_csv('symbols.csv')
num_classes = len(symbols_df)

input_size = 32 * 32
layers = [input_size, 768, 512, num_classes]
activation_function = 'relu'
learning_rate = 0.01
epochs = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'

results = []

# 10-fold cross-validation
for fold in folds:
    print(f"Processing fold: {fold}")
    
    #current fold's test set
    test_path = os.path.join(base_path, fold, 'test_reduced.csv')
    X_test, y_test = load_data(test_path, image_folder)
    
    # other folds' training data
    X_train_list = []
    y_train_list = []
    
    for other_fold in folds:
        if other_fold != fold:  
            train_path = os.path.join(base_path, other_fold, 'train_reduced.csv')
            X_fold, y_fold = load_data(train_path, image_folder)
            X_train_list.append(X_fold)
            y_train_list.append(y_fold)
    
    # Combine all training data
    X_train = np.vstack(X_train_list)
    y_train = np.concatenate(y_train_list)

    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_flat)
    X_test_scaled = scaler.transform(X_test_flat)
   
    model = MLP(
        layers=layers,
        activation_function=activation_function,
        learning_rate=learning_rate,
        optimizer='batch',
        device=device
    )
    
    model.train(X_train_scaled, y_train, epochs=epochs)
    
    y_pred_classes = model.predict_classes(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred_classes)
    results.append(accuracy)
    
    print(f'Fold {fold}: Accuracy = {accuracy:.4f}')

# average accuracy
average_accuracy = sum(results) / len(results)
print(f'Average Accuracy across all 10 folds: {average_accuracy:.4f}')

Processing fold: fold-1
Epoch 1/100, Loss: 26.1099
Epoch 10/100, Loss: 19.3489
Epoch 20/100, Loss: 16.2964
Epoch 30/100, Loss: 14.4973
Epoch 40/100, Loss: 13.1707
Epoch 50/100, Loss: 12.1094
Epoch 60/100, Loss: 11.2335
Epoch 70/100, Loss: 10.4967
Epoch 80/100, Loss: 9.8662
Epoch 90/100, Loss: 9.3206
Epoch 100/100, Loss: 8.8438
Fold fold-1: Accuracy = 0.1059
Processing fold: fold-2
Epoch 1/100, Loss: 27.6660
Epoch 10/100, Loss: 19.6984
Epoch 20/100, Loss: 16.4604
Epoch 30/100, Loss: 14.5755
Epoch 40/100, Loss: 13.2443
Epoch 50/100, Loss: 12.2081
Epoch 60/100, Loss: 11.3593
Epoch 70/100, Loss: 10.6428
Epoch 80/100, Loss: 10.0256
Epoch 90/100, Loss: 9.4886
Epoch 100/100, Loss: 9.0161
Fold fold-2: Accuracy = 0.1010
Processing fold: fold-3
Epoch 1/100, Loss: 26.9533
Epoch 10/100, Loss: 19.2315
Epoch 20/100, Loss: 16.0515
Epoch 30/100, Loss: 14.2612
Epoch 40/100, Loss: 12.9719
Epoch 50/100, Loss: 11.9493
Epoch 60/100, Loss: 11.1039
Epoch 70/100, Loss: 10.3880
Epoch 80/100, Loss: 9.7727
Epoch

In [51]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

base_path = 'Symbols/classification-task'
image_folder = 'Symbols'

folds = [f'fold-{i}' for i in range(1, 11)]

symbols_df = pd.read_csv('symbols.csv')
num_classes = len(symbols_df)

input_size = 32 * 32
layers = [input_size, 768, 512, num_classes]
activation_function = 'tanh'
learning_rate = 0.01
epochs = 500
device = 'cuda' if torch.cuda.is_available() else 'cpu'

results = []

# 10-fold cross-validation
for fold in folds:
    print(f"Processing fold: {fold}")
    
    #current fold's test set
    test_path = os.path.join(base_path, fold, 'test_reduced.csv')
    X_test, y_test = load_data(test_path, image_folder)
    
    # other folds' training data
    X_train_list = []
    y_train_list = []
    
    for other_fold in folds:
        if other_fold != fold:  
            train_path = os.path.join(base_path, other_fold, 'train_reduced.csv')
            X_fold, y_fold = load_data(train_path, image_folder)
            X_train_list.append(X_fold)
            y_train_list.append(y_fold)
    
    # Combine all training data
    X_train = np.vstack(X_train_list)
    y_train = np.concatenate(y_train_list)

    X_train_flat = X_train.reshape(X_train.shape[0], -1)
    X_test_flat = X_test.reshape(X_test.shape[0], -1)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_flat)
    X_test_scaled = scaler.transform(X_test_flat)
   
    model = MLP(
        layers=layers,
        activation_function=activation_function,
        learning_rate=learning_rate,
        optimizer='batch',
        device=device
    )
    
    model.train(X_train_scaled, y_train, epochs=epochs)
    
    y_pred_classes = model.predict_classes(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred_classes)
    results.append(accuracy)
    
    print(f'Fold {fold}: Accuracy = {accuracy:.4f}')

# average accuracy
average_accuracy = sum(results) / len(results)
print(f'Average Accuracy across all 10 folds: {average_accuracy:.4f}')

Processing fold: fold-1
Epoch 1/500, Loss: 7.5556
Epoch 10/500, Loss: 7.4694
Epoch 20/500, Loss: 7.3746
Epoch 30/500, Loss: 7.2810
Epoch 40/500, Loss: 7.1886
Epoch 50/500, Loss: 7.0977
Epoch 60/500, Loss: 7.0084
Epoch 70/500, Loss: 6.9210
Epoch 80/500, Loss: 6.8359
Epoch 90/500, Loss: 6.7532
Epoch 100/500, Loss: 6.6731
Epoch 110/500, Loss: 6.5955
Epoch 120/500, Loss: 6.5203
Epoch 130/500, Loss: 6.4473
Epoch 140/500, Loss: 6.3764
Epoch 150/500, Loss: 6.3076
Epoch 160/500, Loss: 6.2407
Epoch 170/500, Loss: 6.1757
Epoch 180/500, Loss: 6.1125
Epoch 190/500, Loss: 6.0510
Epoch 200/500, Loss: 5.9912
Epoch 210/500, Loss: 5.9331
Epoch 220/500, Loss: 5.8765
Epoch 230/500, Loss: 5.8214
Epoch 240/500, Loss: 5.7678
Epoch 250/500, Loss: 5.7156
Epoch 260/500, Loss: 5.6648
Epoch 270/500, Loss: 5.6152
Epoch 280/500, Loss: 5.5670
Epoch 290/500, Loss: 5.5199
Epoch 300/500, Loss: 5.4740
Epoch 310/500, Loss: 5.4293
Epoch 320/500, Loss: 5.3855
Epoch 330/500, Loss: 5.3428
Epoch 340/500, Loss: 5.3011
Epoch 3

### What do the mean and standard deviation tell you about model performance and consistency?
- Mean (average accuracy, loss, etc.) indicates the overall performance of the model across multiple runs or dataset. A higher mean typically suggests a better-performing model.
- Standard Deviation (SD) measures the variability in performance across different runs. A low SD means the model performs consistently, while a high SD means performance varies significantly across different test cases.
- Together, they provide insights into both accuracy and reliability—a high mean with a low SD is ideal because it suggests the model performs well and is stable.

### How does a high vs. low standard deviation impact confidence in the model’s generalization?
- A low standard deviation suggests the model’s performance is stable across different test sets, indicating good generalization to new, unseen data.
- A high standard deviation means the model's performance varies a lot, implying it might be overfitting to some subsets of data and underperforming on others. This reduces confidence in its ability to generalize well.
- In short, lower SD increases confidence that the model will perform similarly on new data, whereas higher SD raises concerns about inconsistency and possible overfitting.

### If one configuration has a slightly higher mean accuracy but a significantly higher standard deviation compared to another with marginally lower mean accuracy, which would you choose and why?
- If one configuration has a slightly higher mean accuracy but a significantly higher standard deviation compared to another with a marginally lower mean accuracy, the better choice depends on the use case and priorities:
- -  If Stability and Reliability Are Critical (like healthcare, finance, autonomous systems), the model with the lower standard deviation, even if its mean accuracy is slightly lower is chosen. A stable model ensures consistent performance across different scenarios, reducing the risk of unpredictable errors.
- - If Peak Performance Is the Priority (like recommendation systems, gaming AI), the model with the higher mean accuracy might be preferable, but this choice comes with higher risk—performance could vary significantly across different inputs.
- - In general a model with a lower standard deviation is typically preferred.
A more consistent model generalizes better to unseen data, making it more reliable in real-world applications.