In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import numpy.random as rng

# let's try something using a small dataset of small images

sklearn has [this](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits) nice set of 1797 items, each of which is an 8-by-8 grayscale image of a digit. It's a classification task: one tries to predict the target class, 0-9.


In [None]:
# Load scikit-learn's digits dataset: 8x8 grayscale digit images with class labels 0-9.
from sklearn.datasets import load_digits
digits = load_digits()
digits.keys()  # last expression in the cell, so the notebook displays the available fields

In [None]:
# Shape of the image array, then shape of the label array, one per line.
print(digits.data.shape, digits.target.shape, sep="\n")

In [None]:
# Pick one image at random to check.  numpy's randint excludes its upper
# bound, so the draw is sized from the dataset itself: every index,
# including the last one, can be chosen and no dataset size is hard-coded.
i = rng.randint(len(digits.data))
plt.imshow(digits.data[i,:].reshape(8,8), cmap='gray')
print(i, " is a ", digits.target[i])

### Convert the input data into `torch` tensors.

*Note: I found I had to make sure the torch data tensor was `Float`, by calling `.float()` on it. Apparently the numpy default is Float64 ("Double") while the torch default is Float32 ("Float").*

In [None]:
# Show the dtype before and after the conversion to torch tensors.
print(digits.data.dtype)
data = torch.from_numpy(digits.data).to(torch.float32)   # equivalent to .float()
targ = torch.from_numpy(digits.target).to(torch.int64)   # equivalent to .long()
print(data.dtype)

This is the same as the above, but with the tensor. Note use of `view`


In [None]:
plt.gray()                      # make grayscale the default colormap
i = rng.randint(len(data))      # random index into the dataset
image = data[i].view(8,8)       # view the 64-long row as an 8x8 image
plt.matshow(image)
plt.show()
label = targ[i].numpy()         # the .numpy() is just to stop it saying "tensor(8)..."
print(i, " is a ", label)

# Let's construct a neural net, with pure torch
(My version isn't quite "pure" torch in fact - when it comes to defining a **loss**, I'm going to use the `torch.nn` loss function, just because it's handy...)


A good structure to adopt is to make the network a `class`, with two methods

 *   `__init__()` to set up the tensors
 *   `forward()` to define the computational graph

Let's try a network with one hidden layer, so in terms of parameters we will need:
 * two weight matrices, of different shapes
 * two vectors of "bias weights" (not connected to inputs)
We give these random starting values, and don't forget to tell torch to track gradients through them.

I'm going to use $z$ to refer to the input activation level for a neuron. (The usual notation for this is to write this as $\mathbf{x}\cdot\mathbf{w}$ for a single row vector $\mathbf{x}$, or $\mathbf{X}\cdot\mathbf{w}$ for a whole batch, $\mathbf{X}$. Oh, and we must add in the bias as well, $+ \mathbf{b}$.) Perhaps we'll use the `relu` (rectified linear) function as our non-linearity.

In [None]:
class Net():
    """A one-hidden-layer classifier built directly from torch tensors.

    The parameters are plain tensors (two weight matrices and two bias
    vectors), initialised with small random values and flagged with
    ``requires_grad`` so that autograd tracks gradients through them.

    Args:
        nInputs: number of input features per example (64 for 8x8 images).
        nHids: number of hidden units.
        nOutputs: number of output classes; ``forward`` returns one logit each.
    """

    def __init__(self, nInputs=64, nHids=12, nOutputs=10):
        # Random draws happen in the order w1, b1, w2, b2.
        self.w1 = 0.1 * torch.randn(nInputs, nHids)    # inputs -> hidden units
        self.b1 = 0.1 * torch.randn(nHids)             # one bias for each hidden unit
        self.w2 = 0.1 * torch.randn(nHids, nOutputs)   # hidden units -> one logit per class
        self.b2 = 0.1 * torch.randn(nOutputs)          # one bias for each output class

        # Tell torch to track gradients through every parameter.
        for param in (self.w1, self.b1, self.w2, self.b2):
            param.requires_grad = True

    def forward(self, x):
        """Return the raw class scores (logits) for the batch ``x``.

        ``x`` must have ``nInputs`` as its last dimension; the result has
        ``nOutputs`` as its last dimension.  No softmax is applied here.
        """
        z1 = torch.matmul(x, self.w1) + self.b1   # weighted input to the hidden units
        # torch.relu maps every non-positive value (including -inf) to 0;
        # multiplying by a 0/1 mask would turn -inf into NaN instead.
        h = torch.relu(z1)
        z2 = torch.matmul(h, self.w2) + self.b2   # z2 is the weighted output (the logits)
        return z2

In [None]:
# Quick sanity check: push the data through an untrained net and look at the first three rows.
net = Net()
net.forward(data)[:3]

### NB: the output looks like it hasn't done a classification yet
You might think we should return the output of doing a `softmax` operation at the end of `forward` (to get predicted probabilities of the classes), and then use the negative log-likelihood loss for classification (e.g. `torch.nn.NLLLoss`, which strictly speaking expects log-probabilities).

INSTEAD, here we simply return z2 just as it is, and use [`CrossEntropyLoss`](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html) from the `torch.nn` library instead.

_This does the softmax part itself internally_.

Doing it this way means we avoid having to include a numerical trick that is needed for doing softmax on large numbers (ask me why!).

Incidentally, the technical term for those "naked" z2 outputs is "logits".

In [None]:
lossFn = torch.nn.CrossEntropyLoss()

# The optimiser is handed the four parameter tensors explicitly.
optimizer = torch.optim.SGD(
    [net.w1, net.b1, net.w2, net.b2],
    lr=0.01,
)
optimizer

Now...

In [None]:
def train_show(network, data, targ, lossFunc, optimiser, epochs):
    """Train ``network`` on the whole of ``data`` and plot loss and accuracy.

    Args:
        network: model exposing ``forward(data)``, returning one row of
            class scores per example.
        data: the inputs; passed to ``network.forward`` as a single batch
            on every epoch.
        targ: the target class of each example, compared against the
            argmax of the scores along dim 1.
        lossFunc: callable ``lossFunc(scores, targ)`` returning a scalar
            loss tensor.
        optimiser: object providing ``zero_grad()`` and ``step()``; it is
            never given the parameters here, so it must already hold them.
        epochs: number of update steps to take.

    Returns:
        None.  A figure with the per-epoch loss and training accuracy is drawn.
    """
    lossHistory = []  # plain Python floats, just to show a plot later...
    accuHistory = []

    for _ in range(epochs):
        optimiser.zero_grad()      # Gradients accumulate by default, so don't forget to do this.

        y = network.forward(data)  # the forward pass

        loss = lossFunc(y, targ)   # compute the loss for this pass
        loss.backward()            # runs autograd, to get the gradients needed by optimiser
        optimiser.step()           # take a step

        # Just housekeeping and reporting.  Note that y was computed before
        # the step, so the accuracy describes the pre-update parameters.
        accuracy = torch.mean((torch.argmax(y, dim=1) == targ).float())
        # .item() stores plain floats rather than tensors in both histories.
        lossHistory.append(loss.item())
        accuHistory.append(accuracy.item())

    plt.figure(figsize=(10,5))
    plt.subplot(1,2,1)
    plt.plot(lossHistory,'r'); plt.title("loss"); plt.xlabel("epochs")
    plt.subplot(1,2,2)
    plt.plot(accuHistory,'b'); plt.title("accuracy"); plt.xlabel("epochs")

In [None]:
# Take it from the top: a fresh network, a fresh loss and a fresh optimiser.
net = Net()
lossFn = torch.nn.CrossEntropyLoss()  # works on the raw logits; see note above

optimiser = torch.optim.SGD(
    [net.w1, net.b1, net.w2, net.b2],
    lr=0.01,
)

train_show(net, data, targ, lossFunc=lossFn, optimiser=optimiser, epochs=200)

# Let's try the same using `torch.nn` library

In particular [torch.nn.functional](https://pytorch.org/docs/stable/nn.functional.html), which has all sorts of "layers" prebuilt and optimised.

The simplest of these is [linear](https://pytorch.org/docs/stable/generated/torch.nn.functional.linear.html#torch.nn.functional.linear), which does our matrix multiplication (`matmul`) plus bias.

In [None]:
import torch.nn.functional as F

We'll proceed as before, but make our Net a subclass of `torch.nn.Module`.

*   one consequence is that, by default, parameters  in `__init__()` get initialised and have `requires_grad` set true
*   a `Linear` layer takes care of any biases, as well as the weights
*   Notice the reuse of `x` in `forward()` seems gratuitous, but makes adding more layers trivial


In [None]:
class OtherNet(torch.nn.Module):
    """Fully-connected classifier, 64 -> 12 -> 25 -> 10, returning raw logits."""

    def __init__(self):
        super().__init__()
        # Set up the tensors: each Linear layer holds a weight matrix and a bias.
        self.layer1 = torch.nn.Linear(in_features=64, out_features=12)
        self.layer2 = torch.nn.Linear(in_features=12, out_features=25)
        self.layer3 = torch.nn.Linear(in_features=25, out_features=10)

    def forward(self, x):
        """Define the forward computational graph and return the logits."""
        # Hidden layers: linear map followed by an element-wise ReLU,
        # re-binding x each time so further layers slot in easily.
        for hidden in (self.layer1, self.layer2):
            x = F.relu(hidden(x))
        # Final layer: linear only, no non-linearity.
        return self.layer3(x)


Isn't that prettier than our first version?

In [None]:
# Sanity check: the loss of a freshly initialised (untrained) network.
lossFn = torch.nn.CrossEntropyLoss()
othernet = OtherNet()
y = othernet.forward(data)
loss = lossFn(y, targ)
print(loss)

In [None]:
othernet = OtherNet()
lossFunction = torch.nn.CrossEntropyLoss()

# A torch.nn.Module gathers its own parameters, so the handy
# `othernet.parameters()` replaces the hand-written list
# [net.w1, net.b1, net.w2, net.b2] that the first network needed.
optimiser = torch.optim.SGD(othernet.parameters(), lr=0.01)

train_show(othernet, data, targ, lossFunction, optimiser, epochs=2000)

## some representative results?

That accuracy looks pretty good (well, on training set!...).


In [None]:
# Raw ("naked") network outputs for every example, with no autograd graph attached.
with torch.no_grad():
    naked = othernet.forward(data)
naked[0]

In [None]:
# Softmax is itself just another torch neural-net layer:
# build it, then push the "input" through it.
sm = torch.nn.Softmax(dim=1)   # normalise along dim 1
output = sm(naked)
print(output.size())

In [None]:
# Softmax output for the first example.
output[0, :]

In [None]:
i = rng.randint(len(data))                 # pick a random example to try
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

# Left panel: the image itself, without axes.
ax1.matshow(data[i].view(8,8))
ax1.axis('off')

# Right panel: bar chart of the classifier probabilities,
# with a red marker for the true class.
ax2.bar(range(10), output[i])
ax2.plot(targ[i], 0, '^r', markersize=20)
ax2.set_xticks(range(10))
plt.show()