In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

**Quickstart**

In [2]:
def _fashion_mnist(train):
    """Return one FashionMNIST split, downloading it into ./data when missing."""
    return datasets.FashionMNIST(
        root="data",           # storage directory; created if it does not exist yet
        train=train,           # True -> training split, False -> test split
        download=True,         # fetch from the internet only if not already on disk
        transform=ToTensor(),  # PIL image -> float tensor with pixel values scaled to [0.0, 1.0]
    )


# Training split (60,000 images) and held-out test split, both from the open FashionMNIST dataset.
training_data = _fashion_mnist(train=True)
test_data = _fashion_mnist(train=False)

100%|██████████| 26.4M/26.4M [00:02<00:00, 11.7MB/s]
100%|██████████| 29.5k/29.5k [00:00<00:00, 210kB/s]
100%|██████████| 4.42M/4.42M [00:01<00:00, 3.91MB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 24.5MB/s]


In [3]:
# Number of samples delivered per batch.
batch_size = 64

# Wrap each dataset in a DataLoader. A DataLoader:
#   - groups samples into batches of `batch_size`
#   - can be iterated with a plain for loop
#   - can optionally shuffle or load data in parallel (not enabled here)
train_dataloader, test_dataloader = (
    DataLoader(split, batch_size=batch_size) for split in (training_data, test_data)
)

# Inspect only the first test batch, then stop iterating.
# Expected X shape is [64, 1, 28, 28]: 64 grayscale images (1 channel, 28x28 pixels),
# with one integer label per image in y.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [5]:
# Pick the device used for the model and every batch.
#   - Prefer whatever hardware accelerator PyTorch reports (e.g. "cuda" for NVIDIA GPUs,
#     "mps" for Apple GPUs).
#   - `torch.accelerator` only exists in recent PyTorch releases, so guard the lookup and
#     fall back to the classic CUDA check on older installs instead of raising AttributeError.
#   - Otherwise run on the CPU.
if hasattr(torch, "accelerator") and torch.accelerator.is_available():
    device = torch.accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten -> Linear/ReLU -> Linear/ReLU -> Linear.

    Inherits from nn.Module, the base class for all PyTorch models.

    Args:
        in_features: Number of values per sample after flattening
            (default 28*28, i.e. one 28x28 single-channel image).
        hidden_features: Width of both hidden layers (default 512).
        num_classes: Number of output logits (default 10).

    The defaults reproduce the original fixed 784 -> 512 -> 512 -> 10 network.
    """

    def __init__(self, in_features=28 * 28, hidden_features=512, num_classes=10):
        super().__init__()
        # Flattens every dimension except the batch dimension:
        # [batch_size, 1, 28, 28] -> [batch_size, 784] with the default sizes.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),      # input -> first hidden layer
            nn.ReLU(),                                    # activation function
            nn.Linear(hidden_features, hidden_features),  # second hidden layer
            nn.ReLU(),                                    # activation function
            nn.Linear(hidden_features, num_classes),      # output layer (raw logits)
        )

    def forward(self, x):
        """Return unnormalized class scores (logits), one row per sample in `x`."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Build the network, move its parameters to the selected device, and show its layers.
model = NeuralNetwork()
model = model.to(device)
print(model)

Using cuda device
NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [6]:
# Loss function: measures how far the model's predictions are from the true labels;
# this is the quantity training tries to minimize.
loss_fn = nn.CrossEntropyLoss()

# Optimizer: updates the model's weights from the gradients computed during backpropagation.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

In [8]:
def train(dataloader, model, loss_fn, optimizer, device=None):
    """Run one training epoch over `dataloader`, updating `model` in place.

    Args:
        dataloader: Iterable of (inputs, targets) batches that also exposes `.dataset`
            (e.g. a torch DataLoader).
        model: The nn.Module to train.
        loss_fn: Callable taking (predictions, targets) and returning a scalar loss tensor.
        optimizer: Optimizer constructed over `model`'s parameters.
        device: Device the batches are moved to. When omitted, the device of the model's
            first parameter is used, so the function no longer depends on a notebook-level
            `device` variable having been defined by another cell.

    Prints the loss of every 100th batch together with the number of samples seen so far.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # Total number of samples in the dataset (not just in one batch).
    size = len(dataloader.dataset)

    # Training mode: enables training-time behavior of layers such as dropout and
    # batch norm, if the model uses any (model.eval() is the counterpart for evaluation).
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # batch: index of the batch (0, 1, 2, ...); (X, y): inputs and targets.
        # Inputs must live on the same device as the model or the forward pass fails.
        X, y = X.to(device), y.to(device)

        # Forward pass, then compare predictions with the targets.
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation: compute d(loss)/d(parameter), stored in each parameter's .grad.
        loss.backward()

        # Apply the gradients to the parameters.
        optimizer.step()

        # Gradients accumulate by default in PyTorch; clear them before the next batch.
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [9]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # size: Total number of samples in the test dataset (e.g., 10,000 for FashionMNIST)
    # num_batches: Total number of batches in the test set (e.g., 10,000 / 64 = 157)

    model.eval()
    # This puts the model in evaluation mode:
    #   Disables features like dropout or batch norm updating.
    #   Ensures consistent and correct behavior during testing.

    test_loss, correct = 0, 0
    with torch.no_grad():
      # This tells PyTorch not to compute gradients, which:
      #   Saves memory
      #   Speeds up computations
      # Gradients are only needed for training, not testing.

        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            # Moves data to the correct device (CPU/GPU)
            # Performs a forward pass with model(X) to get predictions

            test_loss += loss_fn(pred, y).item()
            # Computes the loss for this batch

            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            # pred.argmax(1)	Finds the index (class) with the highest score for each image
            # == y	Compares predicted labels to true labels
            # .type(torch.float)	Converts the Boolean results (True/False) to floats (1.0 or 0.0)
            # .sum().item()	Sums how many were correct in this batch and converts it to float

    test_loss /= num_batches
    correct /= size
    # test_loss: Average loss across all test batches
    # correct: Fraction of correct predictions (i.e., accuracy) Accuracy = Number of correct predictions / Total number of predictions

    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [10]:
# Number of full passes over the training set.
epochs = 5

# Each epoch: one pass of training followed by an evaluation on the test split.
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=optimizer)
    test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)

print("Done!")

Epoch 1
-------------------------------
loss: 2.292456  [   64/60000]
loss: 2.284010  [ 6464/60000]
loss: 2.265569  [12864/60000]
loss: 2.268365  [19264/60000]
loss: 2.243257  [25664/60000]
loss: 2.209792  [32064/60000]
loss: 2.225791  [38464/60000]
loss: 2.183168  [44864/60000]
loss: 2.181906  [51264/60000]
loss: 2.154438  [57664/60000]
Test Error: 
 Accuracy: 38.5%, Avg loss: 2.147512 

Epoch 2
-------------------------------
loss: 2.153708  [   64/60000]
loss: 2.145299  [ 6464/60000]
loss: 2.083499  [12864/60000]
loss: 2.111970  [19264/60000]
loss: 2.041953  [25664/60000]
loss: 1.982592  [32064/60000]
loss: 2.013886  [38464/60000]
loss: 1.923629  [44864/60000]
loss: 1.937470  [51264/60000]
loss: 1.868976  [57664/60000]
Test Error: 
 Accuracy: 57.5%, Avg loss: 1.862598 

Epoch 3
-------------------------------
loss: 1.890956  [   64/60000]
loss: 1.861298  [ 6464/60000]
loss: 1.735366  [12864/60000]
loss: 1.796638  [19264/60000]
loss: 1.664651  [25664/60000]
loss: 1.621276  [32064/600

In [16]:
# Human-readable names for the ten FashionMNIST labels; the list index is the label id.
classes = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]

# Switch to evaluation mode before predicting.
model.eval()

# First test sample: x is the image (a 1 x 28 x 28 tensor), y its integer label
# (e.g. 9 = "Ankle boot").
x, y = test_data[0]

# No gradients are needed for a single prediction.
with torch.no_grad():
    x = x.to(device)
    pred = model(x)
    # pred[0] holds the scores for this image; argmax picks the highest-scoring class.
    predicted = classes[pred[0].argmax(0)]
    actual = classes[y]
    print(f'Predicted: "{predicted}", Actual: "{actual}"')

Predicted: "Ankle boot", Actual: "Ankle boot"


**Tensors**

Tensors are a specialized data structure, very similar to arrays and matrices. In PyTorch, we use tensors to encode the inputs and outputs of a model, as well as the model’s parameters.

Tensors can be initialized in various ways. Take a look at the following examples:

In [29]:
import numpy as np
# Directly from data
# Tensors can be created directly from data. The data type is automatically inferred.

data = [[1, 2],[3, 4]]
x_data = torch.tensor(data)

print(x_data)
print(x_data.shape)
print(x_data.ndim)
print(x_data.dtype)
print(x_data.itemsize)
print(x_data.device)
# shape: Returns a tuple representing the shape (dimensions) of the array.
# ndim: Returns the number of dimensions (axes) of the array.
# dtype: Provides the data type of the array elements.
# itemsize: Returns the size in bytes of each element
# device: Device tensor is stored on.

# By default, tensors are created on the CPU. We need to explicitly move tensors to the accelerator using .to method (after checking for accelerator availability). Keep in mind that copying large tensors across devices can be expensive in terms of time and memory!
# We move our tensor to the current accelerator if available
# if torch.accelerator.is_available():
#     tensor = tensor.to(torch.accelerator.current_accelerator())
print("------------------------------------------")
# From a NumPy array
# Tensors can be created from NumPy arrays.

np_array = np.array(data)
x_np = torch.from_numpy(np_array)
print(x_np)

print("------------------------------------------")
# From another tensor:
# The new tensor retains the properties (shape, datatype) of the argument tensor, unless explicitly overridden.

x_ones = torch.ones_like(x_data) # retains the properties of x_data
print(f"Ones Tensor: \n {x_ones} \n")

x_rand = torch.rand_like(x_data, dtype=torch.float) # overrides the datatype of x_data
print(f"Random Tensor: \n {x_rand} \n")

print("------------------------------------------")
# With random or constant values:
# shape is a tuple of tensor dimensions. In the functions below, it determines the dimensionality of the output tensor.

shape = (2,3,)
rand_tensor = torch.rand(shape)
ones_tensor = torch.ones(shape)
zeros_tensor = torch.zeros(shape)

print(f"Random Tensor: \n {rand_tensor} \n")
print(f"Ones Tensor: \n {ones_tensor} \n")
print(f"Zeros Tensor: \n {zeros_tensor}")

tensor([[1, 2],
        [3, 4]])
torch.Size([2, 2])
2
torch.int64
8
cpu
------------------------------------------
tensor([[1, 2],
        [3, 4]])
------------------------------------------
Ones Tensor: 
 tensor([[1, 1],
        [1, 1]]) 

Random Tensor: 
 tensor([[0.6909, 0.3353],
        [0.6910, 0.3305]]) 

------------------------------------------
Random Tensor: 
 tensor([[0.3844, 0.4920, 0.2729],
        [0.8826, 0.5658, 0.7343]]) 

Ones Tensor: 
 tensor([[1., 1., 1.],
        [1., 1., 1.]]) 

Zeros Tensor: 
 tensor([[0., 0., 0.],
        [0., 0., 0.]])


**Operations on Tensors**

Different operations on tensors: https://docs.pytorch.org/docs/stable/torch.html

In [32]:
# Indexing and slicing follow the NumPy conventions.
tensor = torch.ones((4, 4))
print(f"First row: {tensor[0]}")
print(f"First column: {tensor[:, 0]}")
# For this 2-D tensor, tensor[..., -1] selects the same column as tensor[:, -1].
print(f"Last column: {tensor[..., -1]}")

# Overwrite the second column (index 1) with zeros.
tensor[:, 1] = 0
print(tensor)
print("------------------------------------------")

# Joining tensors: torch.cat concatenates a sequence of tensors along an existing
# dimension; here three copies are placed side by side along dim=1.
t1 = torch.cat((tensor,) * 3, dim=1)
print(t1)

First row: tensor([1., 1., 1., 1.])
First column: tensor([1., 1., 1., 1.])
Last column: tensor([1., 1., 1., 1.])
tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])
------------------------------------------
tensor([[1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.],
        [1., 0., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.]])


**Arithmetic operations**

In [37]:
# Matrix multiplication of `tensor` with its transpose, written three equivalent ways;
# y1, y2 and y3 end up holding the same values.
# `tensor.T` is the transpose of the tensor (rows and columns swapped).
y1 = tensor @ tensor.T
y2 = tensor.matmul(tensor.T)
# The two spellings above:
#   `@`            - operator form
#   `.matmul(...)` - method form

# Third spelling: the torch.matmul function writing into a pre-allocated tensor.
# y3 is first filled with random values only to provide storage of the right shape;
# `out=y3` then overwrites it with the product.
y3 = torch.rand_like(y1)
torch.matmul(tensor, tensor.T, out=y3)

# Element-wise product, again three equivalent ways; z1, z2 and z3 hold the same values.
z1 = tensor * tensor
z2 = tensor.mul(tensor)

# Same `out=` pattern as above. This call is the last expression of the cell, so its
# return value is what the notebook displays as the cell output.
z3 = torch.rand_like(tensor)
torch.mul(tensor, tensor, out=z3)

tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]])

**Single-element tensors**

If you have a one-element tensor, for example by aggregating all values of a tensor into one value, you can convert it to a Python numerical value using item():

In [40]:
# Aggregate every element of `tensor` into a single value.
agg = tensor.sum()
# The result is still a tensor: a 0-dimensional (scalar) one, e.g. tensor(12.)

agg_item = agg.item()
# .item() extracts the value of a one-element tensor as a plain Python number,
# e.g. tensor(12.) -> 12.0 (a Python float).

# Print the extracted value together with its Python type.
print(agg_item, type(agg_item))

12.0 <class 'float'>


**In-place operations**

Operations that store the result into the operand are called in-place. They are denoted by a `_` suffix. For example, `x.copy_(y)` and `x.t_()` both change `x`.

In-place operations save some memory, but can be problematic when computing derivatives because of an immediate loss of history. Hence, their use is discouraged.

In [41]:
# Show the tensor, add 5 to every element in place, then show it again.
# NOTE: add_ modifies `tensor` itself rather than returning a new tensor, so this cell
# is not idempotent: every re-run adds another 5 to the same tensor.
print(f"{tensor} \n")
tensor.add_(5)
print(tensor)

tensor([[1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.],
        [1., 0., 1., 1.]]) 

tensor([[6., 5., 6., 6.],
        [6., 5., 6., 6.],
        [6., 5., 6., 6.],
        [6., 5., 6., 6.]])


**Bridge with NumPy**

Tensors on the CPU and NumPy arrays can share their underlying memory locations, and changing one will change the other.

In [46]:
def _show(t, n):
    """Print the tensor and the NumPy array one after the other."""
    print(f"t: {t}")
    print(f"n: {n}")


# Tensor to NumPy array: the array is a view on the tensor's memory.
t = torch.ones(5)
n = t.numpy()
_show(t, n)

# An in-place change to the tensor is reflected in the NumPy array.
t.add_(1)
_show(t, n)

print("------------------------------------------")

# NumPy array to tensor: the tensor is a view on the array's memory.
n = np.ones(5)
t = torch.from_numpy(n)
_show(t, n)

# An in-place change to the NumPy array is reflected in the tensor.
np.add(n, 1, out=n)
_show(t, n)

t: tensor([1., 1., 1., 1., 1.])
n: [1. 1. 1. 1. 1.]
t: tensor([2., 2., 2., 2., 2.])
n: [2. 2. 2. 2. 2.]
------------------------------------------
t: tensor([1., 1., 1., 1., 1.], dtype=torch.float64)
n: [1. 1. 1. 1. 1.]
t: tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
n: [2. 2. 2. 2. 2.]
