Import the necessary dependencies

In [1]:
# Optional one-time setup: uncomment to install the extra packages this notebook imports.
# In a notebook, `%pip install ...` is preferable to `!pip install ...` because it
# installs into the environment of the running kernel.
# !pip install torchmetrics
# !pip install lightning

In [2]:
import torch
import torchvision
import torchmetrics
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader, random_split
from utils import Fashion_MNIST_MobileNet
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score
from pytorch_lightning.callbacks import ModelCheckpoint

Check that a CUDA-enabled build of PyTorch is installed and that CUDA is available

In [3]:
# Report the installed PyTorch build; a CUDA build shows a suffix such as 2.1.0+cu118.
print("Torch version is: ", torch.__version__)

# Query CUDA support once and reuse the answer for both the report and the device choice.
cuda_ready = torch.cuda.is_available()
print("Is CUDA available - ", cuda_ready)

# Fail-safe: fall back to the CPU when CUDA is unavailable (training will be very slow).
device = torch.device("cuda") if cuda_ready else torch.device("cpu")

# Fix the global seed for reproducibility. Kept as the last expression so the
# returned seed value remains the cell's displayed output.
pl.seed_everything(42)

Seed set to 42


Torch version is:  2.1.0
Is CUDA available -  True


42

In [4]:
# Mean and standard deviation of the raw training pixels (72.9404 and 90.0212 on the
# 0-255 scale, as printed by the dataset-loading cell), divided by 255 so they match
# the [0, 1] range of the tensors produced by ToTensor().
PIXEL_MEAN = 72.9404 / 255
PIXEL_STD = 90.0212 / 255

# Training pipeline: random horizontal and vertical flips as augmentation,
# followed by tensor conversion and normalization.
train_transform = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((PIXEL_MEAN,), (PIXEL_STD,)),
    ]
)

# Evaluation pipeline: no augmentation, only tensor conversion and the same normalization.
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((PIXEL_MEAN,), (PIXEL_STD,)),
    ]
)

In [5]:
# Download (if needed) and load the training split with the augmenting transform.
train_set = FashionMNIST('./data', download=True, train=True, transform=train_transform)

# Print the mean and standard deviation of the raw (untransformed) training pixels;
# these are the statistics the Normalize() transforms are built from.
raw_pixels = train_set.data.float()
print(raw_pixels.mean())
print(raw_pixels.std())
del raw_pixels  # the float copy is only needed for the two statistics above

# Download (if needed) and load the test split with the non-augmenting transform.
test_set = FashionMNIST('./data', download=True, train=False, transform=test_transform)

# Map each integer label to its human-readable class name.
classes_dict = {label: name for label, name in enumerate(train_set.classes)}
print(classes_dict)

tensor(72.9404)
tensor(90.0212)
{0: 'T-shirt/top', 1: 'Trouser', 2: 'Pullover', 3: 'Dress', 4: 'Coat', 5: 'Sandal', 6: 'Shirt', 7: 'Sneaker', 8: 'Bag', 9: 'Ankle boot'}


Using Resnet without pre-trained weights, we do not have to rescale to 224x224 <br>
But if we need to use pre-trained weights, we might have to change it
<br>
Here we set the initial transformations; I need to see the accuracy first before deciding whether to add more data augmentation (e.g. rotations, flips, crops)

**Note:** for model evaluation, we randomly split the train dataset into train/val sets of 50,000 and 10,000 images respectively. (The split cell below is currently commented out, so the trainer validates on the test set instead.)

In order to do this, we set a generator object with manual seed for reproducibility, then use the *random_split()* function provided by PyTorch to do the split.

In [6]:
# Disabled train/validation split (50,000 / 10,000), kept for reference.
# NOTE(review): `full_trainset` is not defined anywhere in the visible notebook (the
# loaded dataset is named `train_set`), so this cell must be updated before it is re-enabled.
# g1 = torch.Generator().manual_seed(42) # For reproducibility
# trainset, valset = random_split(full_trainset, [50000, 10000], g1)
# print(len(trainset))
# print(len(valset))

Load the datasets as well as the loaders for processing batches

In [7]:
# Options shared by both loaders.
loader_options = {"batch_size": 64, "num_workers": 8}

# Training batches are shuffled; evaluation batches keep the dataset order.
train_loader = DataLoader(train_set, shuffle=True, **loader_options)
test_loader = DataLoader(test_set, shuffle=False, **loader_options)

In [9]:
# Model under training; the class comes from the project's `utils` module.
model = Fashion_MNIST_MobileNet()

# Keep the checkpoint with the highest monitored `val_acc`; the metric values are
# embedded in the checkpoint file name.
checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    mode='max',
    filename='{epoch}-{val_loss:.3f}-{val_acc:.3f}',
    auto_insert_metric_name=True,
)

# Honor the CPU fail-safe from the environment-check cell: a hard-coded
# accelerator='gpu' makes the Trainer fail on machines without CUDA.
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
trainer = pl.Trainer(accelerator=accelerator, max_epochs=30, callbacks=[checkpoint_callback])

# NOTE(review): validation runs on `test_loader`, so the checkpoint is selected on
# the test set and the reported val_acc is not an unbiased estimate. Consider
# validating on a held-out part of the training data instead.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/youxiang/anaconda3/envs/dl_proj/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.se

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 10.34it/s]

Epoch: 0 - Metrics: 
Training loss: None, Validation loss:2.3026, Validation accuracy: 0.1067, Validation F1: 0.0326

Epoch 0: 100%|██████████| 938/938 [00:13<00:00, 71.50it/s, v_num=0]        

Epoch: 0 - Metrics: 
Training loss: 0.5758934617042542, Validation loss:0.5823, Validation accuracy: 0.7948, Validation F1: 0.7709

Epoch 1: 100%|██████████| 938/938 [00:13<00:00, 69.56it/s, v_num=0, val_acc=0.795, val_f1=0.771]

Epoch: 1 - Metrics: 
Training loss: 1.441855788230896, Validation loss:0.5080, Validation accuracy: 0.8168, Validation F1: 0.8160

Epoch 2: 100%|██████████| 938/938 [00:13<00:00, 67.51it/s, v_num=0, val_acc=0.817, val_f1=0.816]

Epoch: 2 - Metrics: 
Training loss: 0.24897129833698273, Validation loss:0.4325, Validation accuracy: 0.8449, Validation F1: 0.8440

Epoch 3: 100%|██████████| 938/938 [00:14<00:00, 64.67it/s, v_num=0, val_acc=0.845, val_f1=0.844]

Epoch: 3 - Metrics: 
Training loss: 0.

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 938/938 [00:15<00:00, 60.71it/s, v_num=0, val_acc=0.896, val_f1=0.896]


Load our model with no pre-trained weights

In [10]:
# Disabled earlier experiment: a ResNet-50 without pre-trained weights, with the first
# convolution replaced so that it accepts 1-channel input and 10 output classes.
# NOTE(review): `resnet50` is not imported in this notebook's import cell; add the
# import before re-enabling this cell.
# model = resnet50(weights=None, num_classes=10)
# model.conv1 = nn.Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
# model.to(device)

Define the optimizer, loss function and number of epochs to train for

In [11]:
# Disabled: optimizer, loss function and epoch count for the manual
# (non-Lightning) training loop, which is also commented out.
# optimizer = optim.Adam(model.parameters(), lr=1e-3) # Use default lr, betas and epsilon
# criterion = nn.CrossEntropyLoss()
# num_epochs = 100

Add our metrics for determining model performance

In [12]:
# Disabled: metric objects for the manual training loop.
# NOTE(review): `F1Score` and `Accuracy` are not imported in this notebook (the import
# cell brings in `MulticlassF1Score` and `MulticlassAccuracy`); fix the names or the
# imports before re-enabling this cell.
# f1 = F1Score(task="multiclass", num_classes=10).to(device)
# acc = Accuracy(task="multiclass", num_classes=10).to(device)

Training loop

In [13]:
# Disabled manual training loop, kept for reference. It iterates over `train_loader`,
# runs a forward/backward/optimizer step per batch, prints the mean loss every
# 100 mini-batches, and prints accuracy and F1 at the end of each epoch.
# NOTE(review): the metrics are updated on training batches and never reset, so the
# per-epoch values accumulate over all previous epochs. If this loop is re-enabled,
# call f1.reset() and acc.reset() after each compute().
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for idx, (inputs, labels) in enumerate(train_loader, 0):
#         inputs, labels = inputs.to(device), labels.to(device)

#         # Zero the parameter gradients
#         optimizer.zero_grad()

#         # Forward + backward + optimize
#         outputs = model(inputs)
#         loss = criterion(outputs, labels)
#         f1.update(outputs, labels)
#         acc.update(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()
#         if idx % 100 == 99:    # Print every 100 mini-batches
#             print(f'Epoch {epoch + 1}, Batch {idx + 1}, Loss: {running_loss / 100:.4f}')
#             running_loss = 0.0
#     epoch_f1 = f1.compute()
#     epoch_acc = acc.compute()
#     print(f'Epoch {epoch + 1}, Acc: {epoch_acc}, F1-score: {epoch_f1}')

# print('Finished Training')