Import the necessary dependencies

In [1]:
# Optional one-time setup: uncomment the lines below if the packages are missing.
# In a notebook, `%pip install ...` is preferable to `!pip install ...` because it
# installs into the environment of the running kernel.
# !pip install torchmetrics
# !pip install lightning

In [2]:
import torch
import torchvision
import torchmetrics
import torchvision.transforms as transforms
from torchvision.datasets import FashionMNIST
from torch.utils.data import DataLoader, random_split
from utils import Fashion_MNIST_ResNet
import torch.nn as nn
import torch.optim as optim
import pytorch_lightning as pl
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score
from pytorch_lightning.callbacks import ModelCheckpoint

Check that a CUDA-enabled build of PyTorch is installed and that CUDA is available

In [3]:
# Report the installed PyTorch build; a CUDA build looks something like 2.1.0+cu118.
print("Torch version is: ", torch.__version__)
cuda_ready = torch.cuda.is_available()
print("Is CUDA available - ", cuda_ready)
# Fail-safe: fall back to the CPU when CUDA is missing (training will be very slow).
if cuda_ready:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Fix the global seed to 42; the call is left as the last expression so the
# seed value is echoed as the cell output.
pl.seed_everything(42)

Seed set to 42


Torch version is:  2.1.0
Is CUDA available -  True


42

In [4]:
# Mean / standard deviation of the raw training pixels (0-255 scale, as printed
# by the dataset-loading cell), rescaled to the 0-1 range produced by ToTensor().
FMNIST_MEAN = (72.9404/255,)
FMNIST_STD = (90.0212/255,)

# Training pipeline: random horizontal and vertical flips, then tensor
# conversion and normalisation.
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(FMNIST_MEAN, FMNIST_STD),
])

# Evaluation pipeline: no augmentation, same normalisation as the training pipeline.
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(FMNIST_MEAN, FMNIST_STD),
])

In [5]:
# Download (if not already present) and open the training split with the
# augmenting transform attached.
train_set = FashionMNIST(root='./data', train=True, transform=train_transform, download=True)
# Mean and standard deviation of the raw (untransformed) training pixels.
raw_pixels = train_set.data.float()
print(raw_pixels.mean())
print(raw_pixels.std())
# Download (if not already present) and open the test split with the plain transform.
test_set = FashionMNIST(root='./data', train=False, transform=test_transform, download=True)
# Map each integer label to its class name.
classes_dict = {index: label for index, label in enumerate(train_set.classes)}
print(classes_dict)

tensor(72.9404)
tensor(90.0212)
{0: 'T-shirt/top', 1: 'Trouser', 2: 'Pullover', 3: 'Dress', 4: 'Coat', 5: 'Sandal', 6: 'Shirt', 7: 'Sneaker', 8: 'Bag', 9: 'Ankle boot'}


Create the data loaders that serve the training and test datasets in batches

In [7]:
# Settings shared by both loaders: 64 samples per batch, 8 worker processes.
loader_kwargs = dict(batch_size=64, num_workers=8)
# Training batches are reshuffled every epoch; test batches keep the dataset order.
train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

In [8]:
# Network to train; Fashion_MNIST_ResNet comes from the local `utils` module.
model = Fashion_MNIST_ResNet()

# Checkpointing monitors `val_acc` (higher is better); the checkpoint file name
# records the epoch, validation loss and validation accuracy.
checkpoint_callback = ModelCheckpoint(
    monitor="val_acc",
    mode='max',
    filename='{epoch}-{val_loss:.3f}-{val_acc:.3f}',
    auto_insert_metric_name=True,
)

# Pick the accelerator from the actual hardware instead of hard-coding 'gpu':
# with a hard-coded 'gpu' the Trainer cannot be constructed on a machine without
# CUDA, which defeats the CPU fail-safe set up in the environment-check cell.
trainer = pl.Trainer(
    accelerator='gpu' if torch.cuda.is_available() else 'cpu',
    max_epochs=30,
    callbacks=[checkpoint_callback],
)

# NOTE(review): validation - and therefore checkpoint selection on `val_acc` -
# runs on the test split (`test_loader`), so the reported validation metrics are
# not an unbiased estimate of test performance. Consider holding out part of the
# training set for validation instead.
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=test_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/home/youxiang/anaconda3/envs/dl_proj/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:67: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 3080') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.se

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 10.47it/s]

Epoch: 0 - Metrics: 
Training loss: None, Validation loss:5.6418, Validation accuracy: 0.1176, Validation F1: 0.0375

Epoch 0: 100%|██████████| 938/938 [00:15<00:00, 59.44it/s, v_num=0]        

Epoch: 0 - Metrics: 
Training loss: 0.6239408850669861, Validation loss:0.5105, Validation accuracy: 0.8127, Validation F1: 0.8138

Epoch 1: 100%|██████████| 938/938 [00:15<00:00, 59.34it/s, v_num=0, val_acc=0.813, val_f1=0.814, train_loss=0.624]

Epoch: 1 - Metrics: 
Training loss: 0.4091157019138336, Validation loss:0.4356, Validation accuracy: 0.8438, Validation F1: 0.8429

Epoch 2: 100%|██████████| 938/938 [00:16<00:00, 58.35it/s, v_num=0, val_acc=0.844, val_f1=0.843, train_loss=0.409]

Epoch: 2 - Metrics: 
Training loss: 0.8715630173683167, Validation loss:2.0866, Validation accuracy: 0.7106, Validation F1: 0.6995

Epoch 3: 100%|██████████| 938/938 [00:16<00:00, 57.43it/s, v_num=0, val_acc=0.711, val_f1=0.700, tra

`Trainer.fit` stopped: `max_epochs=30` reached.


Epoch 29: 100%|██████████| 938/938 [00:16<00:00, 55.42it/s, v_num=0, val_acc=0.905, val_f1=0.905, train_loss=0.131]
