In [1]:
from data.mnist_loader import load_data_wrapper
import numpy as np

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

### Getting data

In [2]:
train_data, validation_data, test_data = load_data_wrapper()
train_data = train_data + test_data

In [3]:
X_train, y_train = list(zip(*train_data))
X_val, y_val = list(zip(*validation_data))

In [4]:
X_train = [x.reshape(784) for x in X_train]
y_train = [np.argmax(yi) for yi in y_train]

X_val = [x.reshape(784) for x in X_val]

### Model selection

In [5]:
mlp = MLPClassifier(solver='sgd', activation='relu', random_state=1, alpha=0.01, batch_size=128, hidden_layer_sizes=(64,), max_iter=100, verbose=10, learning_rate_init=1, tol=1e-4)

# mlp = MLPClassifier(solver='sgd', activation='relu', random_state=1, alpha=0.46, batch_size=128, hidden_layer_sizes=(64,), max_iter=100, learning_rate_init=1)

### Training

In [6]:
train_losses = []
val_losses = []
train_accs = []
val_accs = []
grad_norms = []


while not converged:
    mlp.partial_fit(X_train, y_train, classes=np.unique(y_train))
    
    train_loss = mlp.loss_
    n_iter += 1

    grad_norm = np.linalg.norm(mlp.coefs_[0])
    grad_norms.append(grad_norm) # gradient at each step

    train_loss = mlp.loss_
    train_losses.append(train_loss) # train loss
    
    test_pred = mlp.predict(X_val)
    mean_squared_error = np.mean((test_pred - y_val)**2)
    val_losses.append(mean_squared_error) # test loss

    train_acc = mlp.score(X_train, y_train) 
    train_accs.append(train_acc) # train accuracy

    val_acc = mlp.score(X_val, y_val)
    val_accs.append(val_acc) # test accuracy

    if train_loss < best_loss - tolerance:
        best_loss = train_loss
        n_iter_no_change = 0
    else:
        n_iter_no_change += 1

    if n_iter_no_change >= 10:
        converged = True
        print("Training converged after {} iterations".format(n_iter))

    if n_iter >= mlp.max_iter:
        converged = True
        print("Training stopped after reaching max_iter")

Iteration 1, loss = 2.50554461
Iteration 2, loss = 2.36619704
Iteration 3, loss = 2.29518612
Iteration 4, loss = 2.25674135
Iteration 5, loss = 2.23836824
Iteration 6, loss = 2.22958757
Iteration 7, loss = 2.22539120
Iteration 8, loss = 2.22338572
Iteration 9, loss = 2.22242728
Iteration 10, loss = 2.22196924
Iteration 11, loss = 2.22175033
Iteration 12, loss = 2.22164572
Iteration 13, loss = 2.22159572
Iteration 14, loss = 2.22157183
Iteration 15, loss = 2.22202252
Iteration 16, loss = 2.22299915
Iteration 17, loss = 2.22224254
Iteration 18, loss = 2.22188095
Iteration 19, loss = 2.22170814
Iteration 20, loss = 2.22162555
Iteration 21, loss = 2.22158608
Iteration 22, loss = 2.22156722
Iteration 23, loss = 2.22155821
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 24, loss = 2.22155390
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 25, loss = 2.22155184
Training loss did not improve



Iteration 36, loss = 2.22141119
Iteration 37, loss = 2.22154995
Iteration 38, loss = 2.22154995
Iteration 39, loss = 2.22154995
Iteration 40, loss = 2.22154995
Iteration 41, loss = 2.22154995
Iteration 42, loss = 2.22154995
Iteration 43, loss = 2.22154995
Iteration 44, loss = 2.22154995




Iteration 45, loss = 2.22146122
Iteration 46, loss = 2.22154995


### Model analysis

#### Filters: Visualize the filters learned by the model to see what kind of patterns the model is looking for in the input.

In [None]:
weights = mlp.coefs_[0]

# Create figure and axis objects
fig, axes = plt.subplots(nrows=8, ncols=8, figsize=(8, 8))
fig.subplots_adjust(hspace=0.3, wspace=0.3)

# Plot filters
for i, ax in enumerate(axes.flat):
    filter = weights[:, i]
    image = filter.reshape(28, 28)
    ax.imshow(image, cmap='gray')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_title(f'Filter {i+1}')

#### Gradient Norm: Plot the gradient norm over time to see if the gradients are exploding or vanishing.

In [None]:
plt.plot(grad_norms, label='Gradient Norm')
plt.xlabel('Epoch')
plt.ylabel('Gradient Norm')
plt.show()

#### Training and Validation Loss: Plot the training and validation loss over time to see how the model is performing. This can help you determine if the model is overfitting or underfitting.

In [None]:
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

#### Training and Validation Accuracy: Plot the training and validation accuracy over time to see how well the model is classifying the MNIST digits.

In [None]:
plt.plot(train_accs, label='Training Accuracy')
plt.plot(val_accs, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

#### Confusion Matrix: A confusion matrix can help you understand how the model is misclassifying digits. It shows the number of true positive, false positive, true negative, and false negative predictions.

In [None]:
y_pred = mlp.predict(X_val)

conf_matrix = confusion_matrix( y_val,y_pred)

fig, ax = plt.subplots(figsize=(5, 5))
ax.imshow(conf_matrix, cmap='Blues')
ax.set_xticks(np.arange(10))
ax.set_yticks(np.arange(10))
ax.set_xticklabels(np.arange(10))
ax.set_yticklabels(np.arange(10))
for i in range(10):
    for j in range(10):
        ax.text(j, i, conf_matrix[i, j], ha='center', va='center', color='white')

plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()