In [1]:
#!/usr/bin/env python3
import argparse

import numpy as np
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection

parser = argparse.ArgumentParser()
# These arguments will be set appropriately by ReCodEx, even if you change them.
parser.add_argument("--batch_size", default=10, type=int, help="Batch size")
parser.add_argument("--classes", default=10, type=int, help="Number of classes to use")
parser.add_argument("--epochs", default=10, type=int, help="Number of SGD training epochs")
parser.add_argument("--hidden_layer", default=50, type=int, help="Hidden layer size")
parser.add_argument("--learning_rate", default=0.01, type=float, help="Learning rate")
parser.add_argument("--recodex", default=False, action="store_true", help="Running in ReCodEx")
parser.add_argument("--seed", default=42, type=int, help="Random seed")
parser.add_argument("--test_size", default=797, type=lambda x: int(x) if x.isdigit() else float(x), help="Test size")
# If you add more arguments, ReCodEx will keep them with your default values.
args = parser.parse_args([] if "__file__" not in globals() else None)   

In [2]:
# Load the digits dataset, restricted to `args.classes` classes.
data, target = sklearn.datasets.load_digits(return_X_y=True, n_class=args.classes)

# Extend every example with one trailing feature that is always 1, so the
# bias does not have to be stored separately - it is the last weight.
data = np.pad(data, ((0, 0), (0, 1)), mode="constant", constant_values=1)

# Hold out a test set; the split is seeded with `args.seed`.
train_data, test_data, train_target, test_target = sklearn.model_selection.train_test_split(
    data,
    target,
    test_size=args.test_size,
    random_state=args.seed,
)

# Seeded random source; its first draw is the initial weight matrix with one
# row per input feature (bias included) and one column per class, sampled
# uniformly from [-0.1, 0.1).
generator = np.random.RandomState(args.seed)
weights = generator.uniform(-0.1, 0.1, size=(train_data.shape[1], args.classes))

In [3]:
def softMax(x):
    """Return the softmax of `x`, computed in a numerically stable way.

    A 1-D array is treated as a single vector and normalised over all of its
    elements; any other array is normalised along axis 1 (row by row for a
    2-D matrix). The maximum is subtracted before exponentiating, so only
    non-positive values are exponentiated.
    """
    # `None` reduces over the whole (1-D) array, `1` reduces each row.
    axis = None if x.ndim == 1 else 1
    shifted = x - np.max(x, axis=axis, keepdims=True)
    numerator = np.exp(shifted)
    return numerator / np.sum(numerator, axis=axis, keepdims=True)

In [20]:
for epoch in range(args.epochs):
    # Visit the training examples in a fresh random order every epoch.
    permutation = generator.permutation(train_data.shape[0])

    # Mini-batch SGD: process the data in the order of `permutation`; for every
    # `args.batch_size` examples, average their gradient and update the weights.
    b_size = args.batch_size
    for i in range(0, train_data.shape[0], b_size):
        indices = permutation[i : i + b_size]
        X_batch = train_data[indices]
        t_batch = train_target[indices]

        # `softMax` subtracts the row maximum before exponentiating
        # (softmax(z) = softmax(z - max(z))), so the exponentiation cannot overflow.
        probs = softMax(X_batch @ weights)

        # The gradient of the cross-entropy loss with respect to the logits is
        # `probs - one_hot(target)`. The actual batch length is used instead of
        # `b_size`, so a shorter final batch (when `b_size` does not divide the
        # number of training examples) is handled instead of raising an IndexError.
        n_batch = len(indices)
        g = probs.copy()
        g[np.arange(n_batch), t_batch] -= 1
        g = X_batch.T @ g / n_batch

        weights -= args.learning_rate * g

    # After the SGD epoch, measure the average loss and accuracy for both the
    # train set and the test set. The loss is the average MLE loss (i.e., the
    # negative log-likelihood, or cross-entropy loss, or KL loss) per example.
    train_probs = softMax(train_data @ weights)
    train_pred = np.argmax(train_probs, axis=1)
    train_accuracy = np.mean(train_pred == train_target)
    # Pick the probability assigned to the gold class of every example.
    train_loss = -np.mean(np.log(train_probs[np.arange(len(train_target)), train_target]))

    test_probs = softMax(test_data @ weights)
    test_pred = np.argmax(test_probs, axis=1)
    test_accuracy = np.mean(test_pred == test_target)
    test_loss = -np.mean(np.log(test_probs[np.arange(len(test_target)), test_target]))

    print("After epoch {}: train loss {:.4f} acc {:.1f}%, test loss {:.4f} acc {:.1f}%".format(
        epoch + 1, train_loss, 100 * train_accuracy, test_loss, 100 * test_accuracy))

After epoch 1: train loss 0.2536 acc 91.7%, test loss 0.3084 acc 90.6%
After epoch 2: train loss 0.1520 acc 94.9%, test loss 0.2402 acc 92.2%
After epoch 3: train loss 0.0845 acc 97.3%, test loss 0.1391 acc 96.0%
After epoch 4: train loss 0.1290 acc 96.0%, test loss 0.1716 acc 94.5%
After epoch 5: train loss 0.0810 acc 97.4%, test loss 0.1619 acc 94.9%
After epoch 6: train loss 0.0634 acc 98.3%, test loss 0.1259 acc 96.1%
After epoch 7: train loss 0.0621 acc 98.3%, test loss 0.1243 acc 96.5%
After epoch 8: train loss 0.0441 acc 99.0%, test loss 0.1142 acc 97.0%
After epoch 9: train loss 0.0651 acc 97.9%, test loss 0.1302 acc 96.0%
After epoch 10: train loss 0.0425 acc 98.8%, test loss 0.1157 acc 96.5%
