# **Lottery Ticket Hypothesis**

Within this notebook, I would like to demonstrate how I can catch the winning lottery in order to find subnetwork inside a large network.

## **1. Preparation**

Load packages needed and a dataset

In [2]:
# Load packages
import torch
import torch.nn as nn
from tqdm import tqdm
from torchvision import datasets
from torch.utils.data import Dataset
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

In [16]:
# Load a dataset (MNIST dataset)
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor()
)

testing_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor()
)

# Save training and testing data
torch.save(training_data, "./data/mnist_training_data.pt")
torch.save(testing_data, "./data/mnist_testing_data.pt")

100%|██████████| 26.4M/26.4M [01:21<00:00, 324kB/s] 
100%|██████████| 29.5k/29.5k [00:00<00:00, 169kB/s]
100%|██████████| 4.42M/4.42M [00:12<00:00, 350kB/s]
100%|██████████| 5.15k/5.15k [00:00<00:00, 4.27MB/s]


In [10]:
# Load the training and testing dataset
training_data = torch.load("./data/mnist_training_data.pt", weights_only=False)
testing_data = torch.load("./data/mnist_testing_data.pt", weights_only=False)

In [11]:
# Wrap training and testing data into data loader
training_dataloader = DataLoader(training_data, batch_size=128, shuffle=True)
testing_dataloader = DataLoader(testing_data, batch_size=128, shuffle=True)

## **Define MLP Model**

In [49]:
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear_1 = nn.Linear(784, 256)
        self.linear_2 = nn.Linear(256, 128)
        self.linear_3 = nn.Linear(128, 32)
        self.linear_4 = nn.Linear(32, 10)
        self.relu = nn.ReLU()

    def forward(self, x):
        y = self.linear_1(x)
        y = self.relu(y)
        y = self.linear_2(y)
        y = self.relu(y)
        y = self.linear_3(y)
        y = self.relu(y)
        output = self.linear_4(y)

        return output

In [50]:
model = MLP()
model

MLP(
  (linear_1): Linear(in_features=784, out_features=256, bias=True)
  (linear_2): Linear(in_features=256, out_features=128, bias=True)
  (linear_3): Linear(in_features=128, out_features=32, bias=True)
  (linear_4): Linear(in_features=32, out_features=10, bias=True)
  (relu): ReLU()
)

# **Find Winning Ticket**

In order to find winning ticket which is the subnetwork that maximally contribute to the result, we need to copy the initial parameters of the model (weights and biases) which then be used to train pruned model.

In [51]:
import copy

# Copy initial parameters
model_parameter_copy = copy.deepcopy(model.state_dict())

In [52]:
# Define optimizer and loss function
optimizer = torch.optim.Adam(lr=0.0001, params=model.parameters())
loss_fn = nn.CrossEntropyLoss()

In [53]:
def train(model,
          training_data_loader,
          testing_data_loader,
          optimizer,
          loss_fn,
          epoch):
    for ep in range(epoch):
        # Training
        model.train()
        # Batch processing
        for x_batch, y_batch in training_data_loader:
            optimizer.zero_grad()
            x_batch = x_batch.view(x_batch.size(0), -1)
            prediction = model.forward(x_batch)
            loss = loss_fn(prediction, y_batch)
            loss.backward()
            optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            for x_test_batch, y_test_batch in testing_data_loader:
                x_test_batch = x_test_batch.view(x_test_batch.size(0), -1)
                eval_output = model.forward(x_test_batch)
            
            # Calculate metrics
            loss_test = loss_fn(eval_output, y_test_batch)
            eval_output_labeled = torch.argmax(eval_output, dim=1)
            accuracy = (eval_output_labeled == y_test_batch).sum().item() / len(eval_output_labeled)

        print(f"Epoch {ep+1} ==> Loss = {loss_test} | Accuracy = {round(accuracy, 2)}")


In [54]:
# Training
train(
    model,
    training_dataloader,
    testing_dataloader,
    optimizer,
    loss_fn,
    30
)

Epoch 1 ==> Loss = 0.4657773971557617 | Accuracy = 0.75
Epoch 2 ==> Loss = 0.27131006121635437 | Accuracy = 0.94
Epoch 3 ==> Loss = 0.5274918675422668 | Accuracy = 0.88
Epoch 4 ==> Loss = 0.3444649875164032 | Accuracy = 0.94
Epoch 5 ==> Loss = 0.3275434672832489 | Accuracy = 0.88
Epoch 6 ==> Loss = 0.3738173544406891 | Accuracy = 0.81
Epoch 7 ==> Loss = 0.5602895021438599 | Accuracy = 0.81
Epoch 8 ==> Loss = 0.3862707018852234 | Accuracy = 0.75
Epoch 9 ==> Loss = 0.31783944368362427 | Accuracy = 0.94
Epoch 10 ==> Loss = 0.3195052742958069 | Accuracy = 0.81
Epoch 11 ==> Loss = 0.7905294299125671 | Accuracy = 0.75
Epoch 12 ==> Loss = 0.3970736265182495 | Accuracy = 0.88
Epoch 13 ==> Loss = 0.42180317640304565 | Accuracy = 0.75
Epoch 14 ==> Loss = 0.6111569404602051 | Accuracy = 0.62
Epoch 15 ==> Loss = 0.3974618911743164 | Accuracy = 0.88
Epoch 16 ==> Loss = 0.3167000114917755 | Accuracy = 0.94
Epoch 17 ==> Loss = 0.21949374675750732 | Accuracy = 0.94
Epoch 18 ==> Loss = 0.59560716152191

# **Pruning**

After training the model and achieving the highest accuracy and the lowest loss value, then we perform pruning to create a sparse matrix (turn off the effect of few neurons within the big network).

In [64]:
model.state_dict()

OrderedDict([('linear_1.weight',
              tensor([[ 0.0950,  0.0353,  0.0237,  ..., -0.0350,  0.0286,  0.0392],
                      [-0.0019, -0.0829,  0.0020,  ...,  0.0378,  0.0912,  0.0309],
                      [-0.0643, -0.0754, -0.0452,  ..., -0.0726, -0.1302, -0.0625],
                      ...,
                      [-0.0724, -0.0226, -0.0903,  ...,  0.0350,  0.0261, -0.0487],
                      [ 0.0541, -0.0775, -0.0612,  ..., -0.0666,  0.0562, -0.0386],
                      [ 0.0415,  0.0179, -0.0985,  ..., -0.0396, -0.0039, -0.0418]])),
             ('linear_1.bias',
              tensor([ 2.0831e-01, -2.1906e-02,  4.1102e-02, -6.6021e-02, -3.6502e-02,
                       2.8309e-03,  4.5656e-02, -3.7591e-02, -3.4435e-02, -4.4073e-02,
                       2.0710e-02, -1.2697e-01,  1.0828e-02,  1.1019e-01,  4.3496e-03,
                       1.7435e-01,  1.3645e-01,  4.6424e-02, -3.3416e-02,  3.0224e-01,
                      -2.4514e-01,  1.1537e-01,  3.174

In [75]:
model._modules

{'linear_1': Linear(in_features=784, out_features=256, bias=True),
 'linear_2': Linear(in_features=256, out_features=128, bias=True),
 'linear_3': Linear(in_features=128, out_features=32, bias=True),
 'linear_4': Linear(in_features=32, out_features=10, bias=True),
 'relu': ReLU()}

In [81]:
import torch
import torch.nn.utils.prune as prune

for module in model._modules.keys():
    if "linear" in module:
        prune.l1_unstructured(model._modules[module], name="weight", amount=0.2)

In [79]:
for module in model._modules.keys():
    if "linear" in module:
        prune.remove(model._modules[module], name="weight")

In [82]:
model.state_dict()

OrderedDict([('linear_1.bias',
              tensor([ 2.0831e-01, -2.1906e-02,  4.1102e-02, -6.6021e-02, -3.6502e-02,
                       2.8309e-03,  4.5656e-02, -3.7591e-02, -3.4435e-02, -4.4073e-02,
                       2.0710e-02, -1.2697e-01,  1.0828e-02,  1.1019e-01,  4.3496e-03,
                       1.7435e-01,  1.3645e-01,  4.6424e-02, -3.3416e-02,  3.0224e-01,
                      -2.4514e-01,  1.1537e-01,  3.1745e-02,  1.8782e-01,  7.1748e-02,
                       6.9810e-02,  3.8171e-02,  9.6300e-02, -2.6342e-02,  5.9144e-02,
                      -1.3105e-01,  1.6288e-01, -4.4566e-02, -1.0080e-01,  4.6183e-02,
                       4.4561e-02,  6.9640e-02, -1.9696e-02, -5.0873e-02,  9.1190e-02,
                       5.0227e-03, -2.1194e-02, -1.8817e-02,  4.2949e-02,  8.3144e-03,
                       3.7185e-02,  1.0158e-01, -4.3520e-02,  5.7717e-02,  9.2958e-02,
                       1.9421e-02, -4.1240e-02, -3.5855e-02,  4.2309e-02,  8.9072e-02,
            