Going through the PyTorch tutorial, commenting (almost) everything

In [5]:
import torch
from torch import nn #neural network
from torch.utils.data import DataLoader #data loader for dataset
from torchvision import datasets # built in datasets
from torchvision.transforms import ToTensor # transforms the image to tensor (tensor is a multi-dimensional matrix)

In [6]:
# FashionMNIST is a dataset of Zalando's article images: 60,000 training examples and 10,000 test examples

# Arguments that are the same for the training split and the test split
shared_kwargs = dict(
    root="data",  # directory where the data is stored
    download=True,  # fetch the files if they are not already present under root
    transform=ToTensor(),  # convert every image to a tensor
)

training_data = datasets.FashionMNIST(train=True, **shared_kwargs)  # train=True selects the training split
test_data = datasets.FashionMNIST(train=False, **shared_kwargs)  # train=False selects the test split

# To load data from a CSV file instead, pandas can read it: df = pd.read_csv('data.csv')
# The result is a DataFrame, not a tensor, so it still has to be converted, e.g. torch.tensor(df.values)



Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz to data\FashionMNIST\raw\train-images-idx3-ubyte.gz


100%|██████████| 26421880/26421880 [00:01<00:00, 17916734.15it/s]


Extracting data\FashionMNIST\raw\train-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz to data\FashionMNIST\raw\train-labels-idx1-ubyte.gz


100%|██████████| 29515/29515 [00:00<00:00, 1867249.13it/s]

Extracting data\FashionMNIST\raw\train-labels-idx1-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz





Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz


100%|██████████| 4422102/4422102 [00:00<00:00, 14933567.45it/s]


Extracting data\FashionMNIST\raw\t10k-images-idx3-ubyte.gz to data\FashionMNIST\raw

Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz
Downloading http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz


100%|██████████| 5148/5148 [00:00<?, ?it/s]

Extracting data\FashionMNIST\raw\t10k-labels-idx1-ubyte.gz to data\FashionMNIST\raw






In [14]:
batch_size = 64
# DataLoader groups the dataset into batches; shuffle=True reshuffles the training samples
# at the start of every epoch so that SGD does not see the same batch order each time
train_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
# evaluation results do not depend on sample order, so the test data is left unshuffled
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [28]:
# Look at the first batch only, just to inspect the tensor shapes
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]:  {X.shape}")  # N samples, C channels (grayscale), H height, W width
    print(f"Shape of y:  {y.shape} {y.dtype}")  # y holds one label per sample, so its length equals the batch size
    break  # one iteration is enough

test_data.targets.unique()  # distinct labels present in the test data


Shape of X [N, C, H, W]:  torch.Size([64, 1, 28, 28])
Shape of y:  torch.Size([64]) torch.int64


tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [26]:
# Pick the compute backend in order of preference: CUDA, then MPS, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f'Using {device} device')

Using cpu device


In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: a 28x28 image goes in, 10 class logits come out."""

    def __init__(self):
        """Create the flatten step and the linear/ReLU stack."""
        super().__init__()  # run the nn.Module constructor first
        self.flatten = nn.Flatten()  # keeps dim 0 (the batch) and merges the remaining dims into one
        layers = [
            nn.Linear(28 * 28, 512),  # input layer: 784 pixel values in, 512 out
            nn.ReLU(),  # activation function
            nn.Linear(512, 512),  # hidden layer: 512 in, 512 out
            nn.ReLU(),
            nn.Linear(512, 10),  # output layer: 512 in, 10 class scores out; left linear because cross entropy loss is used
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Forward pass: flatten the input x and return the logits (log-odds) per class."""
        return self.linear_relu_stack(self.flatten(x))

In [33]:
model = NeuralNetwork()
model = model.to(device)  # move the model's parameters to the selected device (cpu or gpu)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [34]:
learning_rate = 1e-3  # step size of each parameter update
loss_fn = nn.CrossEntropyLoss()  # negative log likelihood with softmax built in
optimizer = torch.optim.SGD(params=model.parameters(), lr=learning_rate)  # stochastic gradient descent



In [35]:
def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        dataloader: iterable of ``(X, y)`` batches; must expose ``.dataset``,
            whose length is used as the total in the progress printout.
        model: the ``nn.Module`` to train; it is switched to training mode.
        loss_fn: callable ``loss_fn(pred, y)`` returning a scalar loss tensor.
        optimizer: optimizer that updates ``model``'s parameters.

    Prints the loss of every 100th batch and returns nothing.
    """
    size = len(dataloader.dataset)  # number of samples in the dataset
    # Read the device from the model instead of relying on a notebook-level
    # `device` variable, so each batch always lands where the weights are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    model.train()  # set the model to training mode
    for batch, (X, y) in enumerate(dataloader):  # iterate over the batches
        X, y = X.to(target_device), y.to(target_device)

        pred = model(X)  # make predictions
        loss = loss_fn(pred, y)  # calculate the loss that drives backpropagation

        loss.backward()  # backpropagate the loss
        optimizer.step()  # update the parameters
        optimizer.zero_grad()  # clear the gradients so they do not accumulate into the next batch

        if batch % 100 == 0:
            # loss.item() extracts the Python scalar held by the loss tensor
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")

In [37]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset) #size of the dataset
    num_batches = len(dataloader) #number of batches
    model.eval()
    test_loss, correct = 0, 0 
    with torch.no_grad(): #no need to calculate gradients during testing, 'with' means that the block of code is executed without calculating the gradients
    
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            
            pred = model(X)
            
            loss = loss_fn(pred, y) #current loss
            test_loss += loss.item() #overall loss to report
            
            correctly_predicted = (pred.argmax(1) == y).type(torch.float).sum().item() #number of correctly predicted samples true/false => 1.0/0.0 => sum of 1s => extract the scalar value from the tensor
            correct += correctly_predicted #overall number of correctly predicted samples
            
    test_loss /= num_batches #average loss across all batches
    correct /= size #accuracy
        
    print (f"Test accuracy: {correct}, Average loss: {test_loss}")

In [38]:
epochs = 10  # number of full passes over the training data
for epoch in range(epochs):
    print(f"Epoch {epoch}")
    train(train_dataloader, model, loss_fn, optimizer)  # one pass over the training data
    test(test_dataloader, model, loss_fn)  # report accuracy and loss on the test data
    print("Done")

Epoch 0
loss: 2.293887  [   64/60000]
loss: 2.284237  [ 6464/60000]
loss: 2.265129  [12864/60000]
loss: 2.268166  [19264/60000]
loss: 2.242955  [25664/60000]
loss: 2.216816  [32064/60000]
loss: 2.222272  [38464/60000]
loss: 2.183035  [44864/60000]
loss: 2.177886  [51264/60000]
loss: 2.155577  [57664/60000]
Test accuracy: 0.447, Average loss: 2.1467758849927576
Done
Epoch 1
loss: 2.157333  [   64/60000]
loss: 2.148030  [ 6464/60000]
loss: 2.085177  [12864/60000]
loss: 2.108140  [19264/60000]
loss: 2.060771  [25664/60000]
loss: 1.999161  [32064/60000]
loss: 2.027446  [38464/60000]
loss: 1.940122  [44864/60000]
loss: 1.941845  [51264/60000]
loss: 1.885275  [57664/60000]
Test accuracy: 0.5939, Average loss: 1.8767663825089764
Done
Epoch 2
loss: 1.913813  [   64/60000]
loss: 1.882994  [ 6464/60000]
loss: 1.756091  [12864/60000]
loss: 1.802019  [19264/60000]
loss: 1.699396  [25664/60000]
loss: 1.654137  [32064/60000]
loss: 1.672935  [38464/60000]
loss: 1.564328  [44864/60000]
loss: 1.590086 

In [None]:
import random

model.eval()  # evaluation mode; set once because it does not change inside the loop
model_device = next(model.parameters()).device  # the input has to be on the same device as the weights
with torch.no_grad():  # gradients are not needed for prediction
    for i in range(20):
        # randrange excludes the upper bound; randint(0, len(test_data)) can return
        # len(test_data) itself, which is one past the last valid index
        sample = random.randrange(len(test_data))
        X, y = test_data[sample]  # index the dataset once: (image, label)
        # X is a single image without a batch dimension; pred[0] below takes the
        # first row of the model output as the scores for this image
        pred = model(X.to(model_device))
        predicted, actual = pred[0].argmax(0).item(), y  # .item() turns the 0-dim tensor into a Python int
        print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "7", Actual: "9"
Predicted: "4", Actual: "4"
Predicted: "4", Actual: "6"
Predicted: "2", Actual: "6"
Predicted: "0", Actual: "0"
Predicted: "9", Actual: "9"
Predicted: "3", Actual: "3"
Predicted: "1", Actual: "1"
Predicted: "8", Actual: "8"
Predicted: "0", Actual: "0"
Predicted: "3", Actual: "3"
Predicted: "4", Actual: "4"
Predicted: "2", Actual: "2"
Predicted: "0", Actual: "0"
Predicted: "1", Actual: "1"
Predicted: "3", Actual: "3"
Predicted: "7", Actual: "7"
Predicted: "7", Actual: "9"
Predicted: "7", Actual: "7"
Predicted: "2", Actual: "2"


In [53]:
checkpoint_path = "model.pth"
torch.save(model.state_dict(), checkpoint_path)  # to save: only the learned parameters are written

# to load: build a fresh model, then fill in the saved parameters
model = NeuralNetwork().to(device)
saved_state = torch.load(checkpoint_path, weights_only=True)
model.load_state_dict(saved_state)

  return self.fget.__get__(instance, owner)()


<All keys matched successfully>

2. Tensors

In [62]:
#reimport so can run separately
import torch
import numpy as np

# A plain nested Python list (list of lists) used as a 2x2 matrix
data = [
    [1, 2],
    [3, 4],
]
x_data = torch.tensor(data)  # build a tensor from the list
x_data



tensor([[1, 2],
        [3, 4]])

In [64]:
np_array = np.array(data) # build a NumPy array from the nested list
x_np = torch.from_numpy(np_array) # torch.tensor() also works but copies the data; torch.from_numpy() is more efficient because the tensor shares memory with the NumPy array (a change to one is visible in the other)
x_np

tensor([[1, 2],
        [3, 4]], dtype=torch.int32)