In [10]:
import os
import torch
import torch.optim as optim
import torch.nn.functional as F
from sklearn.datasets import fetch_openml
from torch.utils import data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from tensorboardX import SummaryWriter
from torch.utils.data import TensorDataset, DataLoader

from MnistNet import MnistNet

In [11]:
# Hyper-parameters and run configuration - everything a reader might tune.
NUM_EPOCHS = 100  # original paper
BATCH_SIZE = 128
MOMENTUM = 0.9
# NOTE: despite the name this is the weight-decay coefficient; it is only
# referenced by the (commented-out) SGD optimizer setup further down.
LR_DECAY = 0.0005
LR_INIT = 0.01
IMAGE_DIM = 28  # pixels
NUM_CLASSES = 10  # 10 classes for mnist dataset

# GPUs to use: at most MAX_GPUS, but never more than the machine really has.
# A fixed [0, 1, 2, 3] makes DataParallel fail on hosts with fewer than four
# GPUs; on a CPU-only host this evaluates to an empty list.
MAX_GPUS = 4
DEVICE_IDS = list(range(min(MAX_GPUS, torch.cuda.device_count())))

# define pytorch device - useful for device-agnostic execution
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# print the seed value
# seed = torch.initial_seed()
# print('Used seed : {}'.format(seed))

In [13]:
# Build the classifier and place its parameters on the selected device.
base_model = MnistNet(num_classes=NUM_CLASSES)
base_model = base_model.to(device)

# Wrap the model so it can be trained on the GPUs listed in DEVICE_IDS.
mnistnet = torch.nn.parallel.DataParallel(module=base_model, device_ids=DEVICE_IDS)

print(mnistnet)
print('MnistNet created')

DataParallel(
  (module): MnistNet(
    (net): Sequential(
      (0): Conv2d(1, 96, kernel_size=(11, 11), stride=(2, 2))
      (1): ReLU()
      (2): LocalResponseNorm(5, alpha=0.0001, beta=0.75, k=2)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (classifier): Sequential(
      (0): Dropout(p=0.5, inplace=False)
      (1): Linear(in_features=1536, out_features=4096, bias=True)
      (2): ReLU()
      (3): Dropout(p=0.5, inplace=False)
      (4): Linear(in_features=4096, out_features=4096, bias=True)
      (5): ReLU()
      (6): Linear(in_features=4096, out_features=10, bias=True)
    )
  )
)
MnistNet created


In [14]:
# Download (or load from the local scikit-learn cache) the MNIST data as
# plain arrays rather than a DataFrame.
X, labels_raw = fetch_openml(name='mnist_784',
                             version=1,
                             return_X_y=True,
                             as_frame=False)
# Convert the labels to integers so they can be used as class indices.
y = labels_raw.astype(int)

  warn(


In [15]:
# Turn every flat row of pixels into a single-channel IMAGE_DIM x IMAGE_DIM
# image, i.e. the (N, 1, H, W) layout consumed by the network's Conv2d layer.
# The side length comes from IMAGE_DIM instead of a repeated literal 28 so the
# configuration cell stays the single source of truth.
print(X.shape)
X = X.reshape(-1, 1, IMAGE_DIM, IMAGE_DIM)
print(X.shape)

(70000, 784)
(70000, 1, 28, 28)


In [16]:
# The first 60000 rows are used for training, the remaining rows for testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Training data: reshuffled every epoch; the last incomplete batch is dropped
# so every optimisation step sees exactly BATCH_SIZE samples.
train_dataset = TensorDataset(torch.from_numpy(X_train).float(),
                              torch.from_numpy(y_train).long())
train_loader = DataLoader(train_dataset,
                          shuffle=True,
                          pin_memory=True,
                          num_workers=8,
                          drop_last=True,
                          batch_size=BATCH_SIZE)

# Test data: every sample must be scored exactly once, so neither shuffle nor
# drop_last is used. With drop_last=True the final partial batch
# (len(test_dataset) % BATCH_SIZE samples) was silently excluded from the
# reported metrics.
test_dataset = TensorDataset(torch.from_numpy(X_test).float(),
                             torch.from_numpy(y_test).long())
test_loader = DataLoader(test_dataset,
                         shuffle=False,
                         pin_memory=True,
                         num_workers=8,
                         drop_last=False,
                         batch_size=BATCH_SIZE)

In [17]:
# Optimizer setup.
# Adam with a small fixed learning rate is the configuration that WORKS here.
optimizer = optim.Adam(mnistnet.parameters(), lr=1e-4)
### BELOW is the setting proposed by the original paper - which doesn't train....
# optimizer = optim.SGD(
#     params=mnistnet.parameters(),
#     lr=LR_INIT,
#     momentum=MOMENTUM,
#     weight_decay=LR_DECAY)
print('Optimizer created')

Optimizer created


In [None]:
# multiply LR by 1 / 10 after every 30 epochs
lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
print('LR Scheduler created')

# start training!!
print('Starting training...')
total_steps = 1  # global mini-batch counter, used only to pace the logging
for epoch in range(NUM_EPOCHS):
    # Make sure dropout is active: a previously executed evaluation cell may
    # have left the model in eval mode.
    mnistnet.train()

    for imgs, classes in train_loader:
        imgs, classes = imgs.to(device), classes.to(device)

        # calculate the loss
        output = mnistnet(imgs)
        loss = F.cross_entropy(output, classes)

        # update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # log the information and add to tensorboard
        if total_steps % 10 == 0:
            with torch.no_grad():
                # accuracy on the current training mini-batch only
                _, preds = torch.max(output, 1)
                accuracy = torch.sum(preds == classes)/len(classes)

                print('Epoch: {} \tStep: {} \tLoss: {:.4f} \tAcc: {}'
                      .format(epoch + 1, total_steps, loss.item(), accuracy.item()))

        # print out gradient values and parameter average values
        if total_steps % 100 == 0:
            with torch.no_grad():
                # print and save the grad of the parameters
                # also print and save parameter values
                print('*' * 10)
                for name, parameter in mnistnet.named_parameters():
                    if parameter.grad is not None:
                        avg_grad = torch.mean(parameter.grad)
                        print('\t{} - grad_avg: {}'.format(name, avg_grad))
                        # tbwriter.add_scalar('grad_avg/{}'.format(name), avg_grad.item(), total_steps)
                        # tbwriter.add_histogram('grad/{}'.format(name),
                        #                       parameter.grad.cpu().numpy(), total_steps)
                    # A parameter always carries data, so no None check is needed here.
                    avg_weight = torch.mean(parameter.data)
                    print('\t{} - param_avg: {}'.format(name, avg_weight))
                    # tbwriter.add_histogram('weight/{}'.format(name),
                    #                       parameter.data.cpu().numpy(), total_steps)
                    # tbwriter.add_scalar('weight_avg/{}'.format(name), avg_weight.item(), total_steps)

        total_steps += 1

    # Advance the LR schedule only after this epoch's optimizer steps.
    # Stepping the scheduler before optimizer.step() shifts the whole schedule
    # one epoch early and triggers a PyTorch ordering warning.
    lr_scheduler.step()

LR Scheduler created
Starting training...




Epoch: 1 	Step: 10 	Loss: 2.3370 	Acc: 0.296875
Epoch: 1 	Step: 20 	Loss: 1.3040 	Acc: 0.4765625
Epoch: 1 	Step: 30 	Loss: 0.7916 	Acc: 0.796875
Epoch: 1 	Step: 40 	Loss: 0.7768 	Acc: 0.7890625
Epoch: 1 	Step: 50 	Loss: 0.5921 	Acc: 0.7734375
Epoch: 1 	Step: 60 	Loss: 0.4054 	Acc: 0.875
Epoch: 1 	Step: 70 	Loss: 0.4375 	Acc: 0.8515625
Epoch: 1 	Step: 80 	Loss: 0.2400 	Acc: 0.9609375
Epoch: 1 	Step: 90 	Loss: 0.3686 	Acc: 0.875
Epoch: 1 	Step: 100 	Loss: 0.3923 	Acc: 0.90625
**********
	module.net.0.weight - grad_avg: 0.009873581118881702
	module.net.0.weight - param_avg: -0.0011626502964645624
	module.net.0.bias - grad_avg: 0.0002546436444390565
	module.net.0.bias - param_avg: -0.001768997055478394
	module.classifier.1.weight - grad_avg: 1.9491248167469166e-05
	module.classifier.1.weight - param_avg: -0.00015869141498114914
	module.classifier.1.bias - grad_avg: 5.179708296054741e-06
	module.classifier.1.bias - param_avg: -0.0003106079821009189
	module.classifier.4.weight - grad_avg: 8.

In [None]:
from torch import nn

# Loss object used by the evaluation cell; default settings, i.e. the
# cross-entropy averaged over the samples of a batch.
criterion = nn.CrossEntropyLoss()

In [None]:
# Evaluate the trained model on the test loader and report accuracy plus the
# average cross-entropy over every evaluated sample.
mnistnet.eval()  # switch dropout off for inference

correct_out = 0
total_out = 0
loss_sum = 0.0
with torch.no_grad():
    for pics, lbls in test_loader:
        # Inputs and labels must live on the same device as the model output,
        # otherwise the comparison with `pred` below fails on GPU runs.
        pics, lbls = pics.to(device), lbls.to(device)

        out = mnistnet(pics)
        pred = torch.argmax(out, dim=1)

        batch_count = lbls.shape[0]
        total_out += batch_count
        correct_out += (pred == lbls).sum().item()
        # criterion yields the mean loss of the batch; weight it by the batch
        # size so the final figure is a true per-sample average instead of
        # the loss of whichever batch happened to come last.
        loss_sum += criterion(out, lbls).item() * batch_count

loss_current = loss_sum / total_out
print(correct_out / total_out, loss_current)