In [None]:
# Mount Google Drive storage
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Folder on the mounted drive that holds the CSV files
DATA_DIR = "/content/drive/MyDrive/google_colaboratory/data"

# Seed for the train/validate shuffle, so that re-running the notebook
# reproduces the same split
SPLIT_SEED = 0

# Import training data
# First row = labels as strings, every other row = (answer,pixel data array)
data_train_total = pd.read_csv(
    f"{DATA_DIR}/train.csv", header=0, dtype=float).values

# Define total number of data samples
num_samples_total = data_train_total.shape[0]

# Randomly permute indices
# (NumPy is used because torch is only imported in a later cell; calling
# torch.randperm here fails with NameError on a fresh kernel)
indices = np.random.default_rng(SPLIT_SEED).permutation(num_samples_total)

# Split data into "train" (first 80% of the shuffled rows) and
# "validate" (remaining 20%) datasets
split = int(0.8 * num_samples_total)
indices_train, indices_validate = indices[:split], indices[split:]
data_train = data_train_total[indices_train]
data_validate = data_train_total[indices_validate]

# Import test data
data_test = pd.read_csv(
    f"{DATA_DIR}/test.csv", header=0, dtype=float).values


In [None]:
# Import libraries
import torch
from typing import List, Tuple

# Use GPU if available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # The device name can only be queried when CUDA is actually usable;
    # asking for it on a CPU-only runtime raises an error.
    print("Using device:", device, " - ", torch.cuda.get_device_name(0))
else:
    print("Using device:", device)

# Numerical precision
float_type = torch.float32

# PyTorch parameters (passed as **kwargs to tensor constructors)
torch_params = {"dtype": float_type, "device": device}


# DEFINE MODEL'S PARAMETERS, INPUTS AND OUTPUTS

# Inputs:
# X = array(num_samples,num_features)
# (one row per sample: flattened 28x28-pixel image, pixel values scaled to [0,1])
X = torch.tensor(data_train[:, 1:] / 255.0, **torch_params)
# Standardization: for every feature subtract its mean over the dataset and
# divide by its standard deviation; the 10**(-9) term keeps the division
# finite for features whose standard deviation is zero
X = X.sub(X.mean(dim=0)).div(X.std(dim=0) + 10**(-9))

# Outputs:
# Y = array(num_samples)
# (one entry per sample: the correct class = digit, taken from column 0)
Y = torch.tensor(data_train[:, 0], dtype=torch.long, device=device)

# Number of samples (data points) and number of features (pixels per sample)
num_samples, num_features = X.shape

# Number of classes for classification (digits -> 10 possible classes)
# TODO: determine number of classes from data
num_classes = 10

# Number of weight layers: (num_layers - 1) hidden layers plus the output layer
num_layers = 5

# Dimension of a hidden layer
num_hidden = 512

# Initial parameters of one layer (Kaiming initialization)
def generate_parameter(dim_in: int, dim_out: int) -> Tuple[torch.Tensor,torch.Tensor]:
    """Create the initial weight matrix and bias row for a single layer.

    Args:
        dim_in: number of inputs to the layer.
        dim_out: number of outputs of the layer.

    Returns:
        (W, b): W has shape (dim_in, dim_out) and holds standard-normal
        samples scaled by sqrt(2 / dim_in); b has shape (1, dim_out) and is
        filled with 0.01. Both use the dtype/device from `torch_params`.
    """
    kaiming_scale = torch.sqrt(torch.tensor(2.0 / dim_in))
    weights = kaiming_scale * torch.randn(dim_in, dim_out, **torch_params)
    biases = torch.full((1, dim_out), 0.01, **torch_params)
    return (weights, biases)
# Layer widths from input to output; each pair of consecutive entries is the
# (dim_in, dim_out) of one layer. With num_layers == 1 the list is just
# [num_features, num_classes], i.e. a single layer from features to classes.
layer_dims = [num_features] + [num_hidden]*(num_layers-1) + [num_classes]

# One (W, b) pair per layer. The annotation is variadic ("...") because the
# tuple holds num_layers pairs, not exactly one.
parameters_init: Tuple[Tuple[torch.Tensor, torch.Tensor], ...] = tuple(
    generate_parameter(dim_in, dim_out)
    for dim_in, dim_out in zip(layer_dims[:-1], layer_dims[1:]))

#-------------------------------------------------------------------------------

# DEFINE FUNCTIONS

# ReLU activation function (used for hidden layers): removes negative scores
def relu(Z: torch.Tensor) -> torch.Tensor:
    """Element-wise max(Z, 0).

    The result keeps the dtype and device of Z. No helper tensor is built
    from the notebook-wide `torch_params`, so the function works for any
    input tensor regardless of the global precision/device settings.

    Args:
        Z: tensor of arbitrary shape.

    Returns:
        Tensor of the same shape as Z with negative entries replaced by 0.
    """
    return torch.clamp(Z, min=0)
def relu_derivative(Z: torch.Tensor) -> torch.Tensor:
    """Derivative of ReLU: 1.0 where Z is strictly positive, 0.0 elsewhere.

    The result is always a float32 tensor with the shape of Z.
    """
    positive_mask = torch.gt(Z, 0)
    return positive_mask.to(torch.float32)

# SOFTMAX activation function (used for output layer):
# turns arbitrary-valued scores (logits) into probabilities
# Z = array(num_samples,num_classes)
# (one row per sample: arbitrary-valued score for each class)
# softmax(Z) = array(num_samples,num_classes)
# (one row per sample: probability of belonging to each class)
def softmax(Z: torch.Tensor) -> torch.Tensor:
    """Row-wise softmax of a 2-D score tensor.

    The row maximum is subtracted before exponentiating, so the largest
    exponent in every row is exp(0) = 1 and large scores cannot overflow.
    """
    row_max, _ = Z.max(dim=1, keepdim=True)
    weights = torch.exp(Z - row_max)
    normalizer = weights.sum(dim=1, keepdim=True)
    return weights / normalizer

# Converts outputs (labels) into the corresponding probability vectors
# Y = array(num_samples)
# one_hot(Y) = array(num_samples,num_classes)
# (one row per sample: 1 in the column of the sample's class, 0 elsewhere)
def one_hot(Y: torch.Tensor) -> torch.Tensor:
    """Look up, for every label in Y, the matching row of an identity matrix.

    The identity matrix has `num_classes` rows and uses the dtype/device
    from `torch_params`.
    """
    identity = torch.eye(num_classes, **torch_params)
    return identity[Y]

# LOSS FUNCTION - cross-entropy: calculates the punishment for a bad prediction
# = -LOG(cumulative probability of predicting all outputs), averaged per sample
def cross_entropy(probs_pred: torch.Tensor, probs_true: torch.Tensor) -> torch.Tensor:
    """Mean cross-entropy between predicted and true class probabilities.

    The sum is divided by the number of rows of `probs_pred` (the size of the
    batch actually passed in), not by the size of the training set, so the
    value is comparable between training, validation and any other subset.

    Args:
        probs_pred: array(batch,num_classes) of predicted probabilities.
        probs_true: array(batch,num_classes) of target probabilities.

    Returns:
        0-dim tensor with the average loss per sample.
    """
    batch_size = probs_pred.shape[0]
    # 10**(-9) is needed to avoid LOG(0) error
    return -(probs_true * torch.log(probs_pred + 10**(-9))).sum() / batch_size

def cross_entropy_derivative(probs_pred: torch.Tensor, probs_true: torch.Tensor) -> torch.Tensor:
    """Gradient of the mean cross-entropy with respect to the output-layer scores.

    `probs_pred` is expected to be the softmax of those scores; the combined
    softmax + cross-entropy gradient is then (probs_pred - probs_true),
    divided by the number of rows of the batch passed in (not by the size of
    the training set).

    Args:
        probs_pred: array(batch,num_classes) of predicted probabilities.
        probs_true: array(batch,num_classes) of target probabilities.

    Returns:
        array(batch,num_classes) gradient.
    """
    batch_size = probs_pred.shape[0]
    return (probs_pred - probs_true) / batch_size

# FORWARD PROPAGATION

# Single-layer forward propagation
def forward_SL(X, W, b, activation_function):
    """Push one batch through one layer.

    Args:
        X: layer input.
        W: weight matrix, multiplied from the right (X @ W).
        b: bias added to every row of X @ W.
        activation_function: callable applied to the scores.

    Returns:
        [Z, Y]: the scores Z = X @ W + b and the activated output Y.
    """
    scores = torch.matmul(X, W) + b
    activated = activation_function(scores)
    return [scores, activated]

# Multi-layer forward propagation
# Examples:
# parameters = [[W1,b1],[W2,b2],...]
def forward_ML(X_init, parameters):
    """Run a batch through every layer and keep all intermediate results.

    ReLU is applied after every layer except the last one, which uses
    softmax. The last layer is identified from `len(parameters)`, so the
    function works for any number of (W, b) pairs, not only for the
    notebook-wide `num_layers`.

    Args:
        X_init: input batch, array(num_samples,num_features).
        parameters: sequence of (W, b) pairs, one per layer.

    Returns:
        List of [Z, Y] per layer with len(parameters)+1 entries. Entry 0 is
        [0, X_init] (the raw input; its Z slot is a placeholder), entry k is
        the scores and activated output of layer k.
    """
    forward_parameters, X = [[0, X_init]], X_init
    last_index = len(parameters) - 1
    # For every layer: calculate forward propagation and feed its output
    # into the next layer
    for index, parameter in enumerate(parameters):
        # softmax is only applied at the output layer
        activation_function = softmax if index == last_index else relu
        forward_parameter = forward_SL(X, *parameter, activation_function)
        X = forward_parameter[1]
        forward_parameters.append(forward_parameter)
    return forward_parameters

# BACKWARD PROPAGATION

# Single-layer backward propagation
# Examples:
# derivative_parameters = [cross_entropy_derivative, probs_true]
# derivative_parameters = [relu_derivative, W_next, dL_dZ_next]
def backward_SL(X, Z, derivative_parameters):
    """Gradients of the loss for one layer computing Z = X @ W + b.

    The kind of layer is selected by the NAME of the function stored in
    derivative_parameters[0]:
      * 'cross_entropy_derivative' - output layer; probs_pred is softmax(Z)
        and derivative_parameters[1] holds probs_true.
      * 'relu_derivative' - inner layer; derivative_parameters[1:] holds
        W_next and dL_dZ_next of the layer that follows in the forward pass.

    Args:
        X: input of this layer.
        Z: scores of this layer (before the activation function).
        derivative_parameters: list as described above.

    Returns:
        [dL_dZ, dL_dW, dL_db] for this layer.

    Raises:
        ValueError: if the function name is neither of the two above.
    """
    derivative_name = derivative_parameters[0].__name__
    if derivative_name == 'cross_entropy_derivative':
        # Output layer: combined softmax + cross-entropy gradient
        probs_pred = softmax(Z)
        probs_true = derivative_parameters[1]
        dL_dZ = cross_entropy_derivative(probs_pred, probs_true)
    elif derivative_name == 'relu_derivative':
        W_next, dL_dZ_next = derivative_parameters[1:]
        # This layer's output Y is the next layer's input X, so the gradient
        # flows back through W_next before passing through the ReLU
        upstream = dL_dZ_next @ W_next.T
        dL_dZ = upstream * relu_derivative(Z)
    else:
        raise ValueError("Wrong derivative function!")
    # Valid for every layer because Z = X @ W + b
    weight_grad = X.T @ dL_dZ
    bias_grad = dL_dZ.sum(0, keepdim=True)
    return [dL_dZ, weight_grad, bias_grad]

# Multi-layer backward propagation
def backward_ML(X_init, probs_true, parameters, forward_parameters):
    """Back-propagate through all layers, from the output layer to the first.

    The number of layers is taken from `len(parameters)`, so it always
    matches the parameters actually passed in.

    Args:
        X_init: input batch (not read here: the same tensor is stored in
            forward_parameters[0][1]); kept for interface compatibility.
        probs_true: target probabilities for the output layer.
        parameters: sequence of (W, b) pairs, one per layer.
        forward_parameters: list of [Z, Y] as returned by forward_ML
            (entry 0 is the raw input).

    Returns:
        List with one [dL_dZ, dL_dW, dL_db] entry per layer, ordered like
        `parameters`.
    """
    derivatives = []
    last_layer = len(parameters) - 1
    # Layers are indexed in the direction of forward propagation and visited
    # here in reverse order
    for layer in range(last_layer, -1, -1):
        if layer == last_layer:
            # Output layer: apply cross_entropy_derivative
            backward_parameters = [cross_entropy_derivative, probs_true]
        else:
            # Inner layer: apply relu_derivative with
            # W_next      = parameters[layer+1][0]
            # dL_dZ_next  = derivatives[0][0] (the entry inserted last, which
            #               belongs to layer+1)
            backward_parameters = \
                [relu_derivative, parameters[layer+1][0], derivatives[0][0]]
        # Input of this layer is the output of the previous forward entry;
        # forward_parameters is shifted by one because entry 0 is the raw input
        X, Z = forward_parameters[layer][1], forward_parameters[layer+1][0]
        derivative = backward_SL(X, Z, backward_parameters)
        derivatives.insert(0, derivative)
    return derivatives

# Training function
def train(X, Y, parameters = None,
          rate_init=0.05, num_epochs=300, lambda_reg=0.0001, momentum=0.9):
    """Full-batch gradient descent with momentum and L2 regularization.

    Args:
        X: input batch, array(num_samples,num_features).
        Y: class labels, array(num_samples).
        parameters: optional sequence of (W, b) pairs to start from; when
            None, `parameters_init` is used. The tensors passed in are
            cloned and never modified.
        rate_init: learning rate.
        num_epochs: number of full passes over (X, Y).
        lambda_reg: L2 coefficient, applied to the weight gradients only.
        momentum: factor applied to the previous velocity.

    Returns:
        List of [W, b] pairs (one per layer) holding the trained parameters.
    """
    # Always work on private copies stored as mutable [W, b] lists: the
    # in-place updates below must neither change the caller's tensors nor
    # fail when the pairs are given as tuples.
    source_parameters = parameters_init if parameters is None else parameters
    parameters = [[W.clone(), b.clone()] for (W, b) in source_parameters]

    # Probabilities of outputs
    probs_true = one_hot(Y)

    # Initialize velocities
    # velocities = [[v_W1,v_b1],[v_W2,v_b2],...]
    velocities = [[torch.zeros_like(W), torch.zeros_like(b)] for (W, b) in parameters]

    # Iterate over epochs
    for epoch in range(num_epochs):

        # Forward propagation
        forward_parameters = forward_ML(X, parameters)

        # Backward propagation
        derivatives = backward_ML(X, probs_true, parameters, forward_parameters)

        # Iterate over the layers actually present in `parameters`
        for layer in range(len(parameters)):
            # L2-regularization (weights only, biases are not penalized)
            derivatives[layer][1] += lambda_reg*parameters[layer][0]
            # Re-calculate velocities
            velocities[layer][0] = momentum*velocities[layer][0] - rate_init*derivatives[layer][1]
            velocities[layer][1] = momentum*velocities[layer][1] - rate_init*derivatives[layer][2]
            # Perform gradient descent
            parameters[layer][0] += velocities[layer][0]
            parameters[layer][1] += velocities[layer][1]

        # Report progress every 10 epochs; the numbers come from the forward
        # pass made BEFORE this epoch's parameter update
        if epoch % 10 == 0:
            # Calculate loss
            loss = cross_entropy(forward_parameters[-1][1],probs_true)
            # Calculate accuracy
            accuracy = (forward_parameters[-1][1].argmax(1) == Y).float().mean().item()
            print(f"Epoch {epoch}, loss {loss:.4f}, accuracy {accuracy:.4f}")

    return parameters

In [None]:
# Fit the network on the training data; all remaining hyperparameters keep
# the defaults declared in train()
parameters_train = train(
    X=X,
    Y=Y,
    num_epochs=500,
)

In [None]:
# Inputs:
# X_validate = array(num_validate_samples,num_features)
# (for each sample, pixel values in range [0,1], flattened 28x28-pixel image)
X_validate = torch.tensor(data_validate[:,1:]/255.0, **torch_params)
# Standardize with the TRAINING statistics: the network was trained on
# features standardized with the training mean/std, so held-out data must go
# through exactly the same transformation (using validation statistics, or
# skipping the step, feeds the model differently scaled inputs).
X_train_unscaled = torch.tensor(data_train[:,1:]/255.0, **torch_params)
X_validate = (X_validate - X_train_unscaled.mean(0)) / (X_train_unscaled.std(0)+10**(-9))
# The unscaled training copy was only needed for its statistics
del X_train_unscaled

# Outputs:
# Y_validate = array(num_validate_samples)
# (for each sample, correct class = digit)
Y_validate = torch.tensor(data_validate[:,0], dtype=torch.long, device=device)

# Check the validate accuracy
#probs_validate = forward_ML(X_validate,parameters_train)[-1][0]

def validate(X_validate: torch.Tensor, Y_validate: torch.Tensor, parameters: list):
    """Evaluate a set of parameters on held-out data.

    Runs one forward pass, prints the cross-entropy loss and the share of
    correctly classified samples, and returns both as Python floats.

    Args:
        X_validate: input batch.
        Y_validate: class labels for the batch.
        parameters: list of (W, b) pairs, one per layer.

    Returns:
        (loss, accuracy)
    """
    # Targets as one-hot rows
    targets = one_hot(Y_validate)

    # Output of the last layer = predicted class probabilities
    predicted = forward_ML(X_validate, parameters)[-1][1]

    loss = cross_entropy(predicted, targets)

    # A sample counts as correct when its most probable class is its label
    hits = predicted.argmax(1) == Y_validate
    accuracy = hits.float().mean().item()

    print(f"Validation loss {loss:.4f}, accuracy {accuracy:.4f}")
    return loss.item(), accuracy

# Score the trained parameters on the validation split
val_loss, val_acc = validate(
    X_validate=X_validate,
    Y_validate=Y_validate,
    parameters=parameters_train,
)

In [None]:
# Predicted class probabilities for the validation set: the activated
# (softmax) output of the last layer. The name has to be defined here;
# otherwise this cell raises NameError on a fresh kernel.
probs_validate = forward_ML(X_validate, parameters_train)[-1][1]

# Probabilities predicted for the first validation sample
probs_validate[0]

In [None]:
# Calculate predicted values.
# Outputs are classes with largest probabilities.
def predict(X, W1, b1, W2, b2):
    """Predict classes with a two-layer network (ReLU hidden layer + softmax).

    Works on torch tensors, like the rest of the notebook: matrix products
    use `@` (Tensor.dot only accepts 1-D tensors) and the arg-max is taken
    with the tensor's own method, so it also works for tensors on the GPU.

    Args:
        X: input batch, array(num_samples,num_features).
        W1, b1: weights and bias of the hidden layer.
        W2, b2: weights and bias of the output layer.

    Returns:
        Tensor array(num_samples) with the most probable class per sample.
    """
    Z1 = X @ W1 + b1
    H = relu(Z1)
    Z2 = H @ W2 + b2
    probs_pred = softmax(Z2)
    return probs_pred.argmax(1)

# Training accuracy of the trained network.
# The predictions come from the multi-layer forward pass with
# `parameters_train`; the former two-layer variables (W1_0, b1_0, W2_0, b2_0)
# are not defined anywhere in this notebook.
Y_pred = forward_ML(X, parameters_train)[-1][1].argmax(1)

# Share of samples whose predicted class equals the label (computed with
# torch so it also works for tensors on the GPU)
accuracy = (Y_pred == Y).float().mean().item()
print("Training accuracy:", accuracy)

In [None]:
# Tiny worked example: three ways to evaluate the affine map W x + b
W = torch.tensor([
    [1.0, 2.0, 3.0],
    [4.0, 5.0, 6.0],
])                              # shape (2, 3)
x = torch.tensor([1.0] * 3)     # shape (3,)
b = torch.tensor([0.5, -0.5])   # shape (2,)

# Show the inputs
for _label, _tensor in (("W:\n", W), ("x:\n", x), ("b:\n", b)):
    print(_label, _tensor)

# --- Method 1: the @ operator
# (W is (2,3) and x is (3,), so the product is defined without a transpose)
y1 = (W @ x) + b

# --- Method 2: torch.matmul (same operation as @)
y2 = torch.matmul(W, x) + b

# --- Method 3: torch.addmm (needs 2-D matrices, so x becomes a (1,3) row
# and W is transposed to (3,2); the result therefore has shape (1,2))
y3 = torch.addmm(b, x[None, :], W.t())

# Show the results
for _label, _tensor in (
    ("\nMethod 1 ( @ ):\n", y1),
    ("\nMethod 2 (matmul):\n", y2),
    ("\nMethod 3 (addmm):\n", y3),
):
    print(_label, _tensor)