In [1]:
import torch
from torch import tensor, nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [2]:
X = tensor([[0, 0],
            [0, 1],
            [1, 0],
            [1, 1]], dtype=torch.float32)

y = tensor([0, 1, 1, 0], dtype=torch.int64)

gen = torch.Generator()
gen.manual_seed(42)
# W1 = torch.randn((2, 3), generator=gen, requires_grad=True)
# W2 = torch.randn((3, 2), generator=gen, requires_grad=True)
W1 = tensor([[0.1, -0.1, 0.2],
             [0.3, -0.3, 0.1]], dtype=torch.float32, requires_grad=True)
W2 = tensor([[0.1, -1],
             [-0.2, 2],
             [0.3, -3]], dtype=torch.float32, requires_grad=True)

b1 = torch.zeros(W1.shape[1], dtype=torch.float32, requires_grad=True)
b2 = torch.zeros(W2.shape[1], dtype=torch.float32, requires_grad=True)

lr = 0.1

params = [W1, b1, W2, b2]

In [3]:
for epoch in range(1000):
    print("----------------------------")
    hidden = F.relu(X @ W1 + b1)
    logits = hidden @ W2 + b2
    loss = F.cross_entropy(logits, y)

    loss.backward()

    # print(f"{W1=}")
    # print(f"{W1.grad=}")
    # print(f"{W2=}")
    # print(f"{W2.grad=}")
    # print(f"{b1=}")
    # print(f"{b1.grad=}")
    # print(f"{b2=}")
    # print(f"{b2.grad=}")
    # print(f"{hidden=}")
    # print(f"{logits=}")

    with torch.no_grad():
        for param in params:
            param -= lr * param.grad

    for param in params:
        param.grad = None

    print(f"loss: {loss.item()}")

----------------------------
loss: 0.783708930015564
----------------------------
loss: 0.6846922636032104
----------------------------
loss: 0.6644883155822754
----------------------------
loss: 0.649567186832428
----------------------------
loss: 0.6675959229469299
----------------------------
loss: 0.6525565385818481
----------------------------
loss: 0.6456142067909241
----------------------------
loss: 0.6496949195861816
----------------------------
loss: 0.6302125453948975
----------------------------
loss: 0.6446225643157959
----------------------------
loss: 0.6211984157562256
----------------------------
loss: 0.6177825927734375
----------------------------
loss: 0.6151044368743896
----------------------------
loss: 0.6253160238265991
----------------------------
loss: 0.6181395649909973
----------------------------
loss: 0.6054006814956665
----------------------------
loss: 0.6167093515396118
----------------------------
loss: 0.5960202217102051
----------------------------
l

In [4]:
import pandas as pd

In [13]:
Xtrain = pd.read_csv('./data/fashion_mnist_train_vectors.csv', header=None).values
ytrain = pd.read_csv('./data/fashion_mnist_train_labels.csv', header=None).values
Xtest = pd.read_csv('./data/fashion_mnist_test_vectors.csv', header=None).values
ytest = pd.read_csv('./data/fashion_mnist_test_labels.csv', header=None).values

TRAIN_SIZE = 50000

Xval = Xtrain[TRAIN_SIZE:]
yval = ytrain[TRAIN_SIZE:]
Xtrain = Xtrain[:TRAIN_SIZE]
ytrain = ytrain[:TRAIN_SIZE]

Xtrain = tensor(Xtrain, dtype=torch.float32) / 255.
ytrain = tensor(ytrain, dtype=torch.int64).squeeze()
Xval = tensor(Xval, dtype=torch.float32) / 255.
yval = tensor(yval, dtype=torch.int64).squeeze()
Xtest = tensor(Xtest, dtype=torch.float32) / 255.
ytest = tensor(ytest, dtype=torch.int64).squeeze()

Xtrain = Xtrain.reshape(-1, 1, 28, 28)
Xval = Xval.reshape(-1, 1, 28, 28)
Xtest = Xtest.reshape(-1, 1, 28, 28)

mean = Xtrain.mean()
std = Xtrain.std()

Xtrain = (Xtrain - mean) / std
Xval = (Xval - mean) / std
Xtest = (Xtest - mean) / std

In [14]:
model = nn.Sequential(
    nn.Conv2d(1, 64, 3),
    nn.MaxPool2d(2),
    nn.Conv2d(64, 32, 3),
    nn.MaxPool2d(2),
    nn.Flatten(),
    nn.Linear(800, 10)
)

lr = 1e-3

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [15]:
from torchsummary import summary

summary(model, (1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 26, 26]             640
         MaxPool2d-2           [-1, 64, 13, 13]               0
            Conv2d-3           [-1, 32, 11, 11]          18,464
         MaxPool2d-4             [-1, 32, 5, 5]               0
           Flatten-5                  [-1, 800]               0
            Linear-6                   [-1, 10]           8,010
Total params: 27,114
Trainable params: 27,114
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.45
Params size (MB): 0.10
Estimated Total Size (MB): 0.56
----------------------------------------------------------------


In [16]:
BATCH_SIZE = 100
prev_val_acc = -np.inf

for epoch in range(1, 11):
    for b in range(len(Xtrain) // BATCH_SIZE):
        # print(f"{b=}")
        si, ei = b * BATCH_SIZE, (b + 1) * BATCH_SIZE
        Xb = Xtrain[si:ei]
        yb = ytrain[si:ei]

        logits = model(Xb)
        loss = F.cross_entropy(logits, yb)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

    with torch.no_grad():
        logits = model.forward(Xval)
        probs = torch.softmax(logits, dim=1)
        preds = torch.argmax(probs, dim=1)
        val_acc = (sum(preds == yval) / len(yval)).item()

    print(f"{epoch=}")
    print(f"{loss.item()=}")
    print(f"{val_acc=}")
    if prev_val_acc > val_acc:
        for g in optimizer.param_groups:
            lr = lr / 2
            g['lr'] = lr
        print(lr)
    prev_val_acc = val_acc

epoch=1
loss.item()=0.5605951547622681
val_acc=0.8673999905586243
epoch=2
loss.item()=0.5165475010871887
val_acc=0.8801000118255615
epoch=3
loss.item()=0.48235443234443665
val_acc=0.8853999972343445
epoch=4
loss.item()=0.45466148853302
val_acc=0.8914999961853027
epoch=5
loss.item()=0.4255399703979492
val_acc=0.8931999802589417
epoch=6
loss.item()=0.40699777007102966
val_acc=0.895799994468689
epoch=7
loss.item()=0.39012324810028076
val_acc=0.8966000080108643
epoch=8
loss.item()=0.37240445613861084
val_acc=0.8963000178337097
0.0005
epoch=9
loss.item()=0.3542041480541229
val_acc=0.8978999853134155
epoch=10
loss.item()=0.3482556939125061
val_acc=0.8985999822616577


In [17]:
logits = model(Xtest)
probs = torch.softmax(logits, axis=1)
preds = torch.argmax(probs, axis=1)
(sum(preds == ytest) / len(ytest)).item()

0.8986999988555908