# Multi-class Classification MLP

In [2]:
import numpy as np

### References

https://deepnotes.io/softmax-crossentropy

# Data Set Information

Data downloaded from: https://archive.ics.uci.edu/ml/datasets/seeds

The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. 

The data set can be used for the tasks of classification and cluster analysis.

In [3]:
class EmbeddingDataSet:
    """
       Isomorphi DataSet class
       Takes a dataset, loads, scales, shuffles, splits & batches data
       !!! Assumes all features are already Continuous Numeric Features !!! 
    """

    def __init__(self, filename, **kwargs):
        """

        """
        self.filename = filename
        self.splitsize = kwargs.get('splitsize')
        self.valsize = kwargs.get('valsize')
        self.batchsize = kwargs.get('batchsize')


    def load(self, **kwargs):
        """ 
            loads the dataset from the passed filename.
            Assumes the dataset is in a .txt file
            shuffles the dataset
            returns X and y.
        """
        with open(self.filename) as inF:
            rawdata = inF.readlines()

        data = []
        for line in rawdata:
            line = line.strip()
            line = line.split('\t')
            line = [x for x in line if x != '']
            data.append(line)

        self.X = np.array(data).astype(float)
        np.random.seed(kwargs.get('seed'))
        np.random.shuffle(self.X)

        self.splits = self.split(validation=kwargs.get('validation'), valsize=kwargs.get('valsize'))
        self.datasets = [self.scale(dataset[:, :-1]) for dataset in self.splits]
        self.labels = [dataset[:, -1].astype(int) - 1 for dataset in self.splits]
        self.name()
        return self.datasets, self.labels
    

    def scale(self, X, **kwargs):
        """
            Normalizes a numpy array via Standard Normal Distribution
            This can be useful in algorithms that do not assume any distribution,
            and is a required steps for Networks that learn via Gradient Descent
        """
        min = X.min()
        max = X.max()
        range = max - min
        return (X - min) / range
    

    def split(self, testsize:float=.10, **kwargs):
        """
            segregates the dataset into distinct splits
        """
        validation_set = kwargs.get('validation')
        if validation_set:
            valsize = kwargs.get('valsize')
            trainsize =  1 - testsize - valsize
            train_n = round(self.X.shape[0] * trainsize)
            val_n = round(self.X.shape[0] * valsize)
            train = self.X[:train_n]
            val = self.X[train_n:train_n+val_n]
            test = self.X[train_n+val_n:]
            return train, val, test
        
        else:
            trainsize =  1 - testsize
            train_n = round(self.X.shape[0] * trainsize)
            train = self.X[:train_n]
            test = self.X[train_n:]
            return train, test
    

    def name(self):
        """
            Assigns named attributes to specific datasets
        """
        self.X_train = self.datasets[0]
        self.y_train = self.labels[0]
        if len(self.datasets) > 2:
            self.X_val = self.datasets[1]
            self.y_val = self.labels[1]
            self.X_test = self.datasets[2]
            self.y_test = self.labels[2]

        else:
            self.X_test = self.datasets[1]
            self.y_test = self.labels[1]


    def batch(self, dataset, labels, batch_size:int=1):
        """
        """
        num_batches = len(dataset) // batch_size
        if len(dataset) % batch_size != 0:
            num_batches += 1

        batches = []
        for i in range(num_batches):
            start_index = i * batch_size
            end_index = (i + 1) * batch_size
            batch = (dataset[start_index:end_index], labels[start_index:end_index])
            batches.append(batch)
        
        return batches


seeds = EmbeddingDataSet('data/seeds.data')
seeds.load(seed=1, validation=True, valsize=.10)

([array([[7.02039093e-01, 6.74059239e-01, 4.43750675e-03, ...,
          1.30268312e-01, 1.13873099e-01, 2.14600575e-01],
         [5.11579733e-01, 5.89628801e-01, 2.51327816e-03, ...,
          9.88032476e-02, 3.29661591e-01, 2.03654071e-01],
         [7.34927694e-01, 6.92221600e-01, 4.12825573e-03, ...,
          1.28893863e-01, 2.34873698e-01, 2.12440727e-01],
         ...,
         [5.88156177e-01, 6.24480900e-01, 3.45575747e-03, ...,
          1.10240627e-01, 2.29474077e-01, 2.03703158e-01],
         [6.89767227e-01, 6.80440609e-01, 2.91579536e-03, ...,
          1.20254469e-01, 6.48347225e-02, 2.22994532e-01],
         [4.89981249e-01, 5.95028422e-01, 1.22718660e-04, ...,
          9.03111163e-02, 2.28443240e-01, 2.15287800e-01]]),
  array([[0.93709687, 0.80200981, 0.00665639, 0.27733579, 0.15776828,
          0.11048781, 0.2672816 ],
         [0.6184564 , 0.65042356, 0.00676982, 0.22211509, 0.12322312,
          0.10636301, 0.20958603],
         [0.63908038, 0.67310994, 0.005212

In [21]:
np.random.seed(45)

class FFMLP(object):
    
    """
        Feed-Forward Multi Layer Perceptron for Classification
    """

    def __init__(
        self, 
        epochs:int,
        batch_size:int,
        features:int,
        output_nodes:int=1,
        hidden_layers:int=1, 
        hidden_nodes:int=16, 
        init_weight:float=1,
        alpha:float=.01):
        """
            initializes a Feed Forward Multilayer Perceptron
            for classification tasks
            
            x is an numpy array of training data
            output_nodes is the number of classes
        """
        self.epochs = epochs
        self.alpha = alpha

        self.params = {
            "epochs" : epochs,
            "hidden layers": hidden_layers,
            "hidden nodes": hidden_nodes,
            "initialization weight": init_weight,
            "learning rate": alpha,
        }
        
        self.hidden_layer = {
            "weights": np.random.uniform(low=0, high=init_weight, size=(features, hidden_nodes)),
            "bias": np.random.uniform(low=0, high=init_weight, size=hidden_nodes)
        }

        if hidden_layers > 1:
            self.hidden_layers = [
                {
                    "weights": np.random.uniform(low=0, high=init_weight, size=(hidden_nodes, hidden_nodes)),
                    "bias": np.random.uniform(low=0, high=init_weight, size=hidden_nodes)
                } for layer in range(hidden_layers-1)
            ]
        else:
            self.hidden_layers = []

        self.output_layer = {
            "weights": np.random.uniform(low=0, high=init_weight, size=(hidden_nodes, output_nodes)),
            "bias": np.random.uniform(low=0, high=init_weight, size=output_nodes)
        }

        self.network = dict() # a data structure to hold our layers
        self.network['hidden layer 1'] = self.hidden_layer
        for idx, layer in enumerate(self.hidden_layers, start=2):
            self.network[f'hidden layer {idx}'] = layer        
        self.network['output layer'] = self.output_layer # hidden to output
        print('Initialized Network')


    def __str__(self):
        return f'Parameters: {self.params}\n, Network: {self.network}'
        
    def ReLU(self, x):
        return np.maximum(x, 0)
    

    def sigmoid(self, x):
        """
        """
        return 1/(1 + np.exp(-x))
    
    
    def softmax_stable(self, X):
        """
            stable softmax
        """
        exps = np.exp(X - np.max(X))
        return exps / np.sum(exps)
    

    def cross_entropy(self, X, y):
        """
        X is the output from fully connected layer (num_examples x num_classes)
          after softmax()
        y is labels (num_examples x 1)
            Note that y is not one-hot encoded vector. 
            It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
        """
        p = X
        m = y.shape[0]
        # takes the log of the highest values in p (which is the probability of the class)
        log_likelihood = -np.log(p[range(m), y]) 
        loss = np.sum(log_likelihood) / m
        return loss

    def delta_cross_entropy(self, X, y):
        """
        X is the output from fully connected layer (num_examples x num_classes)
        y is labels (num_examples x 1)
            Note that y is not one-hot encoded vector. 
            It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
        """
        m = y.shape[0]
        X[range(m), y] -= 1
        X = X / m
        return X

    def forward(self, batch):
        """
        """
        self.z = batch[0]
        for idx, (key, value) in enumerate(self.network.items(), start=1):
            print(f"forward {key}")
            self.network[key][f'z{idx}'] = self.z
            self.z = np.dot(self.z, value['weights']) + value['bias']
            a = self.ReLU(self.z) # activations
            self.network[key][f'a{idx}'] = a
            
        self.outputs = np.apply_along_axis(self.softmax_stable, 1, a)
        self.loss = self.cross_entropy(self.outputs, batch[1])
        return self.loss


    def backward(self, batch):
        """
        """


        delta = self.delta_cross_entropy(self.outputs, batch[1])
        for key in reversed(list(self.network.keys())):

            layer = self.network[key]
            input_data = layer['input']
            output_data = layer['output']
            weights = layer['weights']
            bias = layer['bias']
            
            d_weights = np.dot(input_data.T, delta)
            d_bias = np.sum(delta, axis=0)
            
            self.network[key]['weights'] = d_weights
            self.network[key]['bias'] = d_bias
            
            delta = np.dot(delta, weights.T) * output_data * (1 - output_data)
            
        self.output_layer['weights'] = np.dot(self.network['hidden layer 1']['output'].T, delta)
        self.output_layer['bias'] = np.sum(delta, axis=0)


    def fit(self, batches):

        for epoch in range(self.epochs):
            for batch in batches:
                pass_loss = self.forward(batch)
                self.backward()
                # Print loss every 10 epochs
                if epoch % 1 == 0:
                    print(f"Epoch {epoch}, loss: {pass_loss:.4f}")


# Multi Layer Perceptron
EPOCHS = 10
CLASSES = 3
HIDDEN_NODES = 4
HIDDEN_LAYERS = 1
INITWEIGHT = .1
ALPHA = .10

network = FFMLP(
    epochs=EPOCHS,
    batch_size=1,
    features=2,
    hidden_layers = HIDDEN_LAYERS,
    hidden_nodes = HIDDEN_NODES,
    output_nodes = CLASSES,
    init_weight=INITWEIGHT,
    alpha=ALPHA
)

train_batches = seeds.batch(seeds.X_train, seeds.y_train, 1)
print(f"{len(train_batches)} training batches")
#print(f"pass loss: {network.forward(train_batches[0][0][:)}")

Initialized Network
168 training batches


In [22]:
network.network

{'hidden layer 1': {'weights': array([[0.09890115, 0.05495447, 0.02814473, 0.00772896],
         [0.04444695, 0.0472808 , 0.0048522 , 0.01633244]]),
  'bias': array([0.01159507, 0.06273917, 0.0856182 , 0.06501024])},
 'output layer': {'weights': array([[0.09907217, 0.04703507, 0.06182945],
         [0.02826672, 0.09760033, 0.0673068 ],
         [0.04405309, 0.02896873, 0.05096997],
         [0.01124609, 0.02269548, 0.04785523]]),
  'bias': array([0.02427582, 0.03879825, 0.08188734])}}

In [27]:
x = train_batches[0][0][:, :2].round(2)
W = network.network['hidden layer 1']['weights'].round(2)
b = network.network['hidden layer 1']['bias'].round(2)

In [28]:
W

array([[0.1 , 0.05, 0.03, 0.01],
       [0.04, 0.05, 0.  , 0.02]])

In [32]:
np.dot(x, W)

array([[0.0968, 0.0685, 0.021 , 0.0204]])

In [127]:
# Define the input data
X = np.array([[1, 2]])

# Define the weights and biases
# Hidden Layer
W1 = np.array([[0.1, 0.2], [.30, .40]])
b1 = np.array([0.5, 0.6])

# Output layer
W2 = np.array([[0.7], [0.8]])
b2 = np.array([0.9])

# Perform the forward pass
z1 = np.dot(X, W1) + b1
a1 = np.maximum(z1, 0)  # ReLU activation
z2 = np.dot(a1, W2) + b2

y_pred = 1 / (1 + np.exp(-z2))  # Sigmoid activation
y_pred

# exps = np.exp(z2 - np.max(z2))
# y_pred = exps / np.sum(exps)

array([[0.95346953]])

In [128]:
# Define the target output
y_true = np.array([[0]])

# Compute the loss using binary cross-entropy
loss = - y_true * np.log(y_pred) - (1 - y_true) * np.log(1 - y_pred)
#loss = - sum(y_true * np.log(y_pred))
print(loss)

[[3.06764782]]


In [115]:
def delta_cross_entropy(X, y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
        Note that y is not one-hot encoded vector. 
        It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
    """
    m = y.shape[0]
    X[range(m), y] -= 1
    X = X / m
    return X

In [129]:
# Compute the gradient of the loss with respect to the output
dL_dy_pred = -(y_true / y_pred) + (1 - y_true) / (1 - y_pred)


# Compute the gradient of the loss with respect to the parameters of the second layer
dy_pred_dz2 = y_pred * (1 - y_pred)
dz2_da1 = W2
dz2_db2 = np.ones((1, 1))
dL_dz2 = dL_dy_pred * dy_pred_dz2
dL_dW2 = np.dot(a1.T, dL_dz2)
dL_db2 = np.sum(dL_dz2, axis=0)

# Compute the gradient of the loss with respect to the parameters of the first layer
dz2_da1 = W2
da1_dz1 = np.where(z1 > 0, 1, 0)
dz1_dW1 = X
dz1_db1 = np.ones((1, 2))
dL_dz1 = np.dot(dL_dz2, dz2_da1.T) * da1_dz1
dL_dW1 = np.dot(dz1_dW1.T, dL_dz1)
dL_db1 = np.sum(dL_dz1, axis=0)

# Update the parameters using gradient descent
learning_rate = 0.1
W2 -= learning_rate * dL_dW2
b2 -= learning_rate * dL_db2

W1 -= learning_rate * dL_dW1
b1 -= learning_rate * dL_db1

In [130]:
W1

array([[0.03325713, 0.12372244],
       [0.16651427, 0.24744488]])

In [136]:
# Define the input data
X = train_batches[0][0]

# Define the weights and biases
# Hidden Layer
W1 = network.network['hidden layer 1']['weights']
b1 = network.network['hidden layer 1']['bias']

# Hidden Layer
W2 = network.network['hidden layer 2']['weights']
b2 = network.network['hidden layer 2']['bias']

# Hidden Layer
W3 = network.network['hidden layer 3']['weights']
b3 = network.network['hidden layer 3']['bias']

# Output Layer
W4 = network.network['output layer']['weights']
b4 = network.network['output layer']['bias']


# Perform the forward pass
z1 = np.dot(X, W1) + b1
a1 = np.maximum(z1, 0)  # ReLU activation

z2 = np.dot(a1, W2) + b2
a2 = np.maximum(z2, 0)

z3 = np.dot(a2, W3) + b3
a3 = np.maximum(z3, 0)

z4 = np.dot(a3, W4) + b4
a4 = np.maximum(z4, 0)


def softmax_stable(X):
    """
        stable softmax
    """
    exps = np.exp(X - np.max(X))
    return exps / np.sum(exps)


y_preds = softmax_stable(a4)
y_preds

array([[0.34478128, 0.32202362, 0.3331951 ]])

In [149]:
y_true = np.array([[0]])# train_batches[0][1] 
y_true

array([[0]])

In [150]:
def cross_entropy(X, y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
      after softmax()
    y is labels (num_examples x 1)
        Note that y is not one-hot encoded vector. 
        It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
    """
    p = X
    m = y.shape[0]
    log_likelihood = -np.log(p[range(m), y]) 
    loss = np.sum(log_likelihood) / m
    return loss

loss = cross_entropy(y_preds, y_true)
loss

1.064845023445294

In [157]:
# Compute the gradient of the loss with respect to the output
dL_dy_pred = delta_cross_entropy(y_preds, y_true)

# Compute the gradient of the loss with respect to the parameters of the third layer
dy_pred_dz3 = y_pred * (1 - y_pred)
dz2_da2 = W3
dz2_db3 = np.ones((1, 1))
dL_dz3 = dL_dy_pred * dy_pred_dz3
dL_dW3 = np.dot(a2.T, dL_dz3)
dL_db3 = np.sum(dL_dz3, axis=0)


# Compute the gradient of the loss with respect to the parameters of the second layer
dy_pred_dz2 = y_pred * (1 - y_pred)
dz2_da1 = W2
dz2_db2 = np.ones((1, 1))
dL_dz2 = dL_dy_pred * dy_pred_dz2
dL_dW2 = np.dot(a1.T, dL_dz2)
dL_db2 = np.sum(dL_dz2, axis=0)

# Compute the gradient of the loss with respect to the parameters of the first layer
dz2_da1 = W2
da1_dz1 = np.where(z1 > 0, 1, 0)
dz1_dW1 = X
#dz1_db1 = np.ones((1, 2))
dL_dz1 = np.dot(dL_dz2, dz2_da1.T)# * da1_dz1
dL_dW1 = np.dot(dz1_dW1.T, dL_dz1)
dL_db1 = np.sum(dL_dz1, axis=0)

# # Update the parameters using gradient descent
# learning_rate = 0.1
# W2 -= learning_rate * dL_dW2
# b2 -= learning_rate * dL_db2

# W1 -= learning_rate * dL_dW1
# b1 -= learning_rate * dL_db1

ValueError: shapes (1,3) and (16,16) not aligned: 3 (dim 1) != 16 (dim 0)

In [None]:
# Define the neural network architecture
input_size = 7
hidden_size1 = 16
hidden_size2 = 16
output_size = 3

# Initialize the weights and biases
W1 = np.random.randn(input_size, hidden_size1)
b1 = np.zeros((1, hidden_size1))
W2 = np.random.randn(hidden_size1, hidden_size2)
b2 = np.zeros((1, hidden_size2))
W3 = np.random.randn(hidden_size2, output_size)
b3 = np.zeros((1, output_size))

# Define the activation function (sigmoid in this case)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Define the derivative of the activation function
def sigmoid_derivative(x):
    return sigmoid(x) * (1 - sigmoid(x))

# Define the forward pass function
def forward(X):
    # Calculate the weighted sum of the inputs for the first hidden layer
    z1 = np.dot(X, W1) + b1
    # Apply the activation function to the output of the first hidden layer
    a1 = sigmoid(z1)
    
    # Calculate the weighted sum of the inputs for the second hidden layer
    z2 = np.dot(a1, W2) + b2
    # Apply the activation function to the output of the second hidden layer
    a2 = sigmoid(z2)
    # Calculate the weighted sum of the inputs for the output layer
    z3 = np.dot(a2, W3) + b3
    # Apply the softmax activation function to the output of the output layer
    exp_scores = np.exp(z3)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    return a1, a2, probs

# Define the loss function (cross-entropy in this case)
def cross_entropy_loss(probs, y):
    num_examples = len(y)
    corect_logprobs = -np.log(probs[range(num_examples),y])
    data_loss = np.sum(corect_logprobs)
    return 1./num_examples * data_loss

# Define the derivative of the loss function with respect to the output layer
def delta_cross_entropy_loss(probs, y):
    num_examples = len(y)
    delta = probs
    delta[range(num_examples),y] -= 1
    return delta


def cross_entropy(self, X, y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
        after softmax()
    y is labels (num_examples x 1)
        Note that y is not one-hot encoded vector. 
        It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
    """
    m = y.shape[0]
    p = X
    
    # takes the log of the highest values in p (which is the probability of the class)
    log_likelihood = -np.log(p[range(m), y]) 
    loss = np.sum(log_likelihood) / m
    return loss

def delta_cross_entropy(self, X, y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
        Note that y is not one-hot encoded vector. 
        It can be computed as y.argmax(axis=1) from one-hot encoded vectors of labels if required.
    """
    m = y.shape[0]
    X[range(m), y] -= 1
    X = X / m
    return X


# Define the backward pass function
def backward(X, y, a1, a2, probs):
    # Calculate the derivative of the loss function with respect to the output layer
    delta3 = delta_cross_entropy_loss(probs, y)
    # Calculate the derivative of the loss function with respect to the weights and biases in the output layer
    dW3 = np.dot(a2.T, delta3)
    db3 = np.sum(delta3, axis=0)
    # Calculate the derivative of the loss function with respect to the second hidden layer
    delta2 = np.dot(delta3, W3.T) * sigmoid_derivative(a2)
    # Calculate the derivative of the loss function with respect to the weights and biases in the second hidden layer
    dW2 = np.dot(a1.T, delta2)
    db2 = np.sum(delta2, axis=0)
    # Calculate the derivative of the loss function with respect to the first hidden layer
    delta1 = np.dot(delta2, W2.T) * sigmoid_derivative(a1)
    # Calculate the derivative of the loss function with respect to the
    # weights and biases in the first hidden layer
    dW1 = np.dot(X.T, delta1)
    db1 = np.sum(delta1, axis=0)
    return dW1, db1, dW2, db2, dW3, db3

In [None]:
def train(X, y, learning_rate, num_epochs):
    for epoch in range(num_epochs):
        # Perform the forward pass to get the predicted probabilities
        a1, a2, probs = forward(X)
        # Calculate the loss
        loss = cross_entropy_loss(probs, y)
        # Perform the backward pass to get the gradients
        dW1, db1, dW2, db2, dW3, db3 = backward(X, y, a1, a2, probs)
        # Update the weights and biases using gradient descent
        W1 -= learning_rate * dW1
        b1 -= learning_rate * db1
        W2 -= learning_rate * dW2
        b2 -= learning_rate * db2
        W3 -= learning_rate * dW3
        b3 -= learning_rate * db3
        # Print the loss every 100 epochs
        if epoch % 100 == 0:
        print(f"Epoch {epoch}, loss: {loss}")
        return W1, b1, W2, b2, W3, b3

Generate some random data for training and testing

X_train = np.random.randn(100, input_size)
y_train = np.random.randint(output_size, size=100)
X_test = np.random.randn(20, input_size)
y_test = np.random.randint(output_size, size=20)

Train the neural network using the training data

learning_rate = 0.1
num_epochs = 1000
W1, b1, W2, b2, W3, b3 = train(X_train, y_train, learning_rate, num_epochs)

Evaluate the neural network using the testing data

a1, a2, probs = forward(X_test)
predictions = np.argmax(probs, axis=1)
accuracy = np.mean(predictions == y_test)
print(f"Test accuracy: {accuracy}")

csharp
Copy code

Note that this code is just an example and may not work out-of-the-box for your specific use case. You may need to tweak the architecture, hyperparameters, and other aspects of the code to get the best results for your problem.