## 1. Downloading the dataset

In [1]:
import pip
def install(package):
    pip.main(['install', package])

In [2]:
try:
    from torchvision.datasets import MNIST
    print("module 'mutagen' is installed")
except ModuleNotFoundError:
    print("module 'torchvsion' is not installed")
    # or
    install("torchvision") # the install function from the question

module 'mutagen' is installed


In [3]:
import numpy as np
from torchvision.datasets import MNIST
train_dataset = None
test_dataset = None
def download_mnist(is_train: bool):
    if is_train == True:
        return MNIST(root='./data', transform=lambda x: np.array(x).flatten(), download=True, train=is_train)
    else: 
        return MNIST(root='./data', transform=lambda x: np.array(x).flatten(), download=True, train=is_train)

train_dataset = download_mnist(True)
test_dataset = download_mnist(False)

## 2. Processing the dataset

In [4]:
def processData(dataset):
    np_dataset_images = np.array([object[0] for object in dataset], dtype = np.bool_).reshape(len(dataset), 28 * 28)
    np_dataset_labels = np.array([object[1] for object in dataset], dtype = np.uint8)
    
    return np_dataset_images, np_dataset_labels

np_train_dataset = processData(train_dataset)
np_test_dataset = processData(test_dataset)

## 3. Implementing a perceptron handler
We need to have the follwing:

    - an array of the weights of the system
    - function to propagate forward and find the results
    - function to recalculate the weights based on the results (backwards propagation)

In [5]:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    return sigmoid(z) * (1 - sigmoid(z))

def softmax(z):
    z_stable = z - np.max(z)  # for numerical stability
    exp_values = np.exp(z_stable)
    return exp_values / np.sum(exp_values)

def cross_entropy_loss(predicted, target):
    # Use small epsilon to prevent log(0)
    epsilon = 1e-12
    predicted = np.clip(predicted, epsilon, 1 - epsilon)
    return -np.sum(target * np.log(predicted))

class Layer:
    def __init__(self, previousLayerNumberOfNeurons, numberOfNeurons, is_output=False):
        self.is_output = is_output
        # Xavier initialization
        limit = np.sqrt(6 / (previousLayerNumberOfNeurons + numberOfNeurons))
        self.weights = np.random.uniform(-limit, limit, (numberOfNeurons, previousLayerNumberOfNeurons))
        self.bias = np.zeros(numberOfNeurons)
        self.activations = None
        self.z_values = None

    def forwardPropagation(self, inputs):
        self.z_values = (self.weights @ inputs) + self.bias
        
        # Apply the activation function
        if self.is_output:
            self.activations = softmax(self.z_values)
        else:
            self.activations = sigmoid(self.z_values)
        return self.activations

    def backwardPropagation(self, error):
        if self.is_output:
            dz = error
        else:
            dz = error * sigmoid_derivative(self.z_values)

        return dz

# Define network architecture
input_size = 784  # Number of input features
layer_sizes = [100]  # Sizes of each layer
output_size = 10  # Number of output classes

# Initialize layers as a list of Layer objects
layers = []

# Input layer to first hidden layer
layers.append(Layer(input_size, layer_sizes[0]))

# Hidden layers
for i in range(1, len(layer_sizes)):
    layers.append(Layer(layer_sizes[i - 1], layer_sizes[i]))

# Output layer
layers.append(Layer(layer_sizes[-1], output_size, is_output=True))


## 4. Defining the hyper parameters
1. The learning rate -> the rate at which we want to apply the gradient to each feature
2. The epoch number -> the number of times the algorithm runs
3. Additionally we will include here the e constant

In [6]:
LEARNING_RATE = 20
EPOCH_NUMBER = 100

## 5. Running the algorithm
- we will have a function for running epoch
- we will have a function for running a batch


In [7]:
from concurrent.futures import ThreadPoolExecutor
import time
def runBatch(np_batch_train_dataset):
    batch_size = np_batch_train_dataset[0].shape[0]
    num_classes = output_size

    # Initialize accumulators for gradients and biases
    gradients_accumulated = [np.zeros_like(layer.weights) for layer in layers]
    bias_accumulated = [np.zeros_like(layer.bias) for layer in layers]
    batch_loss = 0
    batch_correct = 0

    # Function to compute forward pass and error for a single sample
    def process_sample(testIndex):
        activations = np_batch_train_dataset[0][testIndex]
        for perceptron in layers:
            activations = perceptron.forwardPropagation(activations)
        
        softMaxArray = activations
        correctPredictionValue = np_batch_train_dataset[1][testIndex]
        targetArray = np.zeros(num_classes)
        targetArray[correctPredictionValue] = 1
        sample_loss = cross_entropy_loss(softMaxArray, targetArray)
        is_correct = int(correctPredictionValue == np.argmax(softMaxArray))
        
        # Return error for backpropagation
        errorArray = softMaxArray - targetArray
        return errorArray, sample_loss, is_correct

    # Run forward pass in parallel and gather errors and results
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_sample, range(batch_size)))

    # Accumulate errors for each sample and update batch loss and correct count
    for errorArray, sample_loss, is_correct in results:
        batch_loss += sample_loss
        batch_correct += is_correct

        # Backpropagation: accumulate gradients and biases for each layer
        for i in reversed(range(len(layers))):
            layer = layers[i]
            dz = layer.backwardPropagation(errorArray)
            gradients_accumulated[i] += dz @ layer.activations.T
            bias_accumulated[i] += dz
            errorArray = layer.weights.T @ dz

    # Average the accumulated gradients and biases over the batch size
    gradients_accumulated = [grad / batch_size for grad in gradients_accumulated]
    bias_accumulated = [bias / batch_size for bias in bias_accumulated]

    return gradients_accumulated, bias_accumulated, batch_loss / batch_size, batch_correct






def runEpoch(batch_size=100):
    total_samples = np_train_dataset[0].shape[0]
    batch_count = total_samples // batch_size
    total_loss = 0
    total_correct = 0

    for batchIndex in range(batch_count):
        batch_start = batchIndex * batch_size
        batch_end = (batchIndex + 1) * batch_size
        batch_train_dataset = [np_train_dataset[0][batch_start:batch_end], 
                               np_train_dataset[1][batch_start:batch_end]]

        # Run forward and backward pass for batch
        gradients_accumulated, bias_accumulated, batch_loss, batch_correct = runBatch(batch_train_dataset)

        # Update weights and biases for each perceptron
        for i, layer in enumerate(layers):
            print ("layer " + str(i))
            print(" gradients: ")
            print(gradients_accumulated[i])
            print("weghts: ")
            print(layers[i].weights)
            time.sleep(2)
            layer.weights -= LEARNING_RATE * gradients_accumulated[i]
            layer.bias -= LEARNING_RATE * bias_accumulated[i]

        total_loss += batch_loss
        total_correct += batch_correct

    # avg_loss = total_loss / batch_count
    # accuracy = total_correct / total_samples

    # return avg_loss, accuracy
    return total_correct

def runTest(inputs):
    activations = inputs
    for i, perceptron in enumerate(layers):
        activations = perceptron.forwardPropagation(activations)
    max_index = np.argmax(activations)
    return max_index

## 6. Running the main function and initializing the layers

In [8]:
def main ():
    for epochIndex in range(EPOCH_NUMBER):
        totalCorrect = runEpoch()
        print('trainingAccuracy = ', totalCorrect, (np_train_dataset[0].size // (28 * 28)), totalCorrect / (np_train_dataset[0].size // (28 * 28)))
        correctlyPredicted = 0
        tests = np_test_dataset[0]
        correctPredictions = np_test_dataset[1]
        for index in range(tests.size // (28 * 28)):
            prediction = runTest(tests[index])
            if prediction == correctPredictions[index]:
                correctlyPredicted += 1
        print('Accuracy on tests at epoch ' + str(epochIndex) + " : " + str(correctlyPredicted / (tests.size // (28 * 28))))
main()

layer 0
 gradients: 
[[0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]
 [0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]
 [0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]
 ...
 [0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]
 [0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]
 [0.14032966 0.14032966 0.14032966 ... 0.14032966 0.14032966 0.14032966]]
weghts: 
[[-0.01485566 -0.06659684  0.06080602 ...  0.03436031  0.01562064
   0.07453964]
 [-0.03790095  0.06581477 -0.04702359 ...  0.0545069  -0.01320352
   0.04000259]
 [ 0.06382128  0.07752295 -0.02371626 ...  0.07868392  0.05194694
  -0.02395062]
 ...
 [-0.02724433  0.06406729 -0.01728321 ... -0.05355829  0.0040862
  -0.06088672]
 [ 0.00855533 -0.0299531   0.04440884 ...  0.03782656  0.05664516
  -0.0259915 ]
 [-0.0452254  -0.00293314  0.04405879 ... -0.06382516  0.00295714
  -0.05481025]]
layer 1
 gradients: 
[[0.04585406 0

  return 1 / (1 + np.exp(-z))


layer 0
 gradients: 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
weghts: 
[[-2.82144889 -2.87319007 -2.74578721 ... -2.77223292 -2.79097259
  -2.73205358]
 [-2.84449418 -2.74077846 -2.85361681 ... -2.75208633 -2.81979675
  -2.76659064]
 [-2.74277195 -2.72907027 -2.83030949 ... -2.72790931 -2.75464628
  -2.83054385]
 ...
 [-2.83383756 -2.74252594 -2.82387644 ... -2.86015151 -2.80250702
  -2.86747995]
 [-2.79803789 -2.83654633 -2.76218439 ... -2.76876667 -2.74994807
  -2.83258473]
 [-2.85181863 -2.80952637 -2.76253443 ... -2.87041839 -2.80363609
  -2.86140348]]
layer 1
 gradients: 
[[0.03848069 0.03848069 0.03848069 0.03848069 0.03848069 0.03848069
  0.03848069 0.03848069 0.03848069 0.03848069 0.03848069 0.03848069
  0.03848069 0.03848069 0.03848069 0.03848069 0.03848069 0.03848069
  0.03848069 0.03848069 0.03848069 0.03848069 0.03848069 0.03848069
  0.03848069 0.03848069 0.038

KeyboardInterrupt: 