# Waste Classifier PyTorch Model

## Goal
Create a PyTorch model that can distinguish between 6 classes:
- cardboard
- glass
- metal
- paper
- plastic
- trash

## 0. Setup

In [1]:
import torch
import torchvision

print(torch.__version__) # 1.12+
print(torchvision.__version__) # 0.13+

import matplotlib.pyplot as plt

from torch import nn
from torchvision import transforms

try:
    from scripts import data_setup, engine, utils
    from helper_functions import download_data, set_seeds, plot_loss_curves
    print("Base imports done.")
except:
    print("Couldn't find helper scripts, downloading from Github...")
    !git clone https://github.com/tznpau/waste-classifier
    !mv waste-classifier/scripts .
    !mv waste-classifier/helper_functions.py .
    !rm -rf waste-classifier
    from scripts import data_setup, engine, utils
    from helper_functions import download_data, set_seeds, plot_loss_curves

2.1.1
0.16.1
Base imports done.


In [2]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [3]:
# Sanity check: show whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

## 1. Getting data

In [4]:
# Fetch the zipped dataset from the GitHub repository and keep the path that
# download_data returns.
# NOTE(review): download_data is a project helper; judging by its logged output
# it skips the download when the target directory already exists — confirm in
# helper_functions.py.
trash_dataset_path = download_data(source="https://github.com/tznpau/waste-classifier/raw/main/data/trash_dataset.zip",
                                   destination="trash_dataset")
trash_dataset_path

[INFO] data\trash_dataset directory exists, skipping download.


WindowsPath('data/trash_dataset')

In [5]:
# Derive the per-split image directories from the dataset root.
train_dir, test_dir = (trash_dataset_path / split for split in ("train", "test"))

train_dir, test_dir

(WindowsPath('data/trash_dataset/train'),
 WindowsPath('data/trash_dataset/test'))

## 2. Deployment questions
1. What is the ideal model scenario?
2. Where is the model going to be deployed?
3. How is the model going to function?

**Ideal use case**
- Performs at 90%+ accuracy
- Performs fast: 30 FPS (roughly 0.03 seconds per prediction)

## 3. EffNetB2 feature extractor

https://pytorch.org/vision/stable/models/generated/torchvision.models.efficientnet_b2.html#torchvision.models.EfficientNet_B2_Weights

In [6]:
# Instantiate EfficientNet-B2 without a `weights` argument, purely to inspect
# its layer structure (the pretrained version is built later in the notebook).
effnetb2 = torchvision.models.efficientnet_b2()
effnetb2

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [7]:
try:
    from torchinfo import summary
except:
    print("[INFO] Couldn't find torchinfo... installing it.")
    !pip install -q torchinfo
    from torchinfo import summary

In [8]:


# summary(effnetb2,
#         input_size=(32, 3, 224, 224), # make sure this is "input_size", not "input_shape" (batch_size, color_channels, height, width)
#         verbose=0,
#         col_names=["input_size", "output_size", "num_params", "trainable"],
#         col_width=20,
#         row_settings=["var_names"]
# )

The last layer of EffNetB2 is called **`classifier`**.

In [9]:
# Inspect the current classification head of the model.
effnetb2.classifier

Sequential(
  (0): Dropout(p=0.3, inplace=True)
  (1): Linear(in_features=1408, out_features=1000, bias=True)
)

In [10]:
# Seed first so the new head's random initialisation is repeatable
# (presumably set_seeds seeds the torch RNGs — see helper_functions.py).
set_seeds()

# Swap the 1000-output head for a 6-output head, one output per waste class.
effnetb2.classifier = nn.Sequential(
    nn.Dropout(0.3, inplace=True),
    nn.Linear(1408, 6),
)

In [11]:
# Confirm the head now ends in a Linear layer with 6 outputs.
effnetb2.classifier

Sequential(
  (0): Dropout(p=0.3, inplace=True)
  (1): Linear(in_features=1408, out_features=6, bias=True)
)

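So I replaced the 1000-class head with a 6-class head to adapt the EffNetB2 architecture to my use case. Note that this inspection copy of the model has not been frozen; freezing the base layers is done inside `create_effnetb2_model` below.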

### 3.1 Creating an EffNetB2 feature extractor

In [12]:
def create_effnetb2_model(num_classes:int=6,
                          seed:int=42):
    """Build a frozen, pretrained EfficientNet-B2 with a fresh classifier head.

    Args:
        num_classes: Number of outputs of the new classifier head.
        seed: Value passed to ``torch.manual_seed`` right before the new
            head is created.

    Returns:
        A ``(model, transforms)`` tuple: the EfficientNet-B2 model moved to
        the notebook-level ``device``, and the preprocessing transforms that
        come with the DEFAULT pretrained weights.
    """
    pretrained_weights = torchvision.models.EfficientNet_B2_Weights.DEFAULT
    preprocess = pretrained_weights.transforms()

    # Load the pretrained network and freeze every existing parameter.
    model = torchvision.models.efficientnet_b2(weights=pretrained_weights).to(device)
    model.requires_grad_(False)

    # The replacement head is created after the freeze, so its parameters
    # keep requires_grad=True and are the only ones that will be trained.
    torch.manual_seed(seed)
    head = nn.Sequential(
        nn.Dropout(p=0.3, inplace=True),
        nn.Linear(in_features=1408, out_features=num_classes),
    )
    model.classifier = head.to(device)

    return model, preprocess

In [13]:
# Build the EffNetB2 feature extractor together with its preprocessing transforms.
effnetb2, effnetb2_transforms = create_effnetb2_model(num_classes=6, seed=42)

In [14]:
# Inspect the preprocessing pipeline returned alongside the model.
effnetb2_transforms

ImageClassification(
    crop_size=[288]
    resize_size=[288]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BICUBIC
)

### 3.2 DataLoaders for EffNetB2

In [15]:
from scripts import data_setup

# Create train/test DataLoaders that apply the EffNetB2 preprocessing.
(train_dataloader_effnetb2,
 test_dataloader_effnetb2,
 class_names) = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=effnetb2_transforms,
    batch_size=32,
)

Let's check that the split was done correctly during `data_setup.create_dataloaders`.

In [16]:
# Number of batches in each DataLoader, plus the class names returned by create_dataloaders.
len(train_dataloader_effnetb2), len(test_dataloader_effnetb2), class_names

(64, 16, ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash'])

We have 64 batches for the train dataloader and 16 batches for the test dataloader.

### 3.3 Training EffNetB2 feature extractor

In [17]:
from scripts import engine

# Hyperparameters for the EffNetB2 feature-extractor run.
LEARNING_RATE = 1e-3
EPOCHS = 10

# Loss and optimizer for multi-class classification.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(effnetb2.parameters(), lr=LEARNING_RATE)

set_seeds(42)

# Training call, currently disabled:
# effnetb2_results = engine.train(model=effnetb2,
#                                 train_dataloader=train_dataloader_effnetb2,
#                                 test_dataloader=test_dataloader_effnetb2,
#                                 epochs=EPOCHS,
#                                 optimizer=optimizer,
#                                 loss_fn=loss_fn,
#                                 device=device)

In [18]:
from helper_functions import plot_loss_curves

# plot_loss_curves(effnetb2_results)

## 4. ViT feature extractor

https://pytorch.org/vision/main/models/generated/torchvision.models.vit_b_16.html#torchvision.models.ViT_B_16_Weights

In [19]:
# Instantiate ViT-B/16 without a `weights` argument just to look at its output head.
vit = torchvision.models.vit_b_16()
vit.heads

Sequential(
  (head): Linear(in_features=768, out_features=1000, bias=True)
)

The last layer of ViT_b_16 is called **`heads`**.

### 4.1 Creating a ViT feature extractor

In [20]:
def create_vit_model(num_classes:int=6,
                     seed:int=42):
    """Build a frozen, pretrained ViT-B/16 with a fresh classification head.

    Args:
        num_classes: Number of outputs of the new ``heads`` layer.
        seed: Value passed to ``torch.manual_seed`` right before the new
            head is created.

    Returns:
        A ``(model, transforms)`` tuple: the ViT-B/16 model moved to the
        notebook-level ``device``, and the preprocessing transforms that come
        with the DEFAULT pretrained weights.
    """
    weights = torchvision.models.ViT_B_16_Weights.DEFAULT
    # Named `vit_preprocess` so it does not shadow the imported `transforms` module.
    vit_preprocess = weights.transforms()
    model = torchvision.models.vit_b_16(weights=weights).to(device)

    # Freeze the pretrained backbone; only the new head will be trainable.
    for param in model.parameters():
        param.requires_grad = False

    torch.manual_seed(seed)
    # The new head is created on the CPU, so it must be moved to `device` like
    # the rest of the model; otherwise a CUDA run fails with a device mismatch.
    model.heads = nn.Sequential(
        nn.Linear(in_features=768, out_features=num_classes)
    ).to(device)

    return model, vit_preprocess

In [21]:
# Build the ViT feature extractor using the function's defaults (num_classes=6, seed=42).
vit, vit_transforms = create_vit_model()

In [22]:
# Display the structure of the ViT model that was just built.
vit

VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_a

In [23]:
from torchinfo import summary

# Print the ViT feature extractor summary; the "trainable" column shows which
# layers are frozen.
summary(vit,
        input_size=(1, 3, 224, 224), # (batch_size, color_channels, height, width)
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"])

Layer (type (var_name))                                      Input Shape          Output Shape         Param #              Trainable
VisionTransformer (VisionTransformer)                        [1, 3, 224, 224]     [1, 6]               768                  Partial
├─Conv2d (conv_proj)                                         [1, 3, 224, 224]     [1, 768, 14, 14]     (590,592)            False
├─Encoder (encoder)                                          [1, 197, 768]        [1, 197, 768]        151,296              False
│    └─Dropout (dropout)                                     [1, 197, 768]        [1, 197, 768]        --                   --
│    └─Sequential (layers)                                   [1, 197, 768]        [1, 197, 768]        --                   False
│    │    └─EncoderBlock (encoder_layer_0)                   [1, 197, 768]        [1, 197, 768]        (7,087,872)          False
│    │    └─EncoderBlock (encoder_layer_1)                   [1, 197, 768]        [1, 1

### 4.2 Creating DataLoaders for ViT

In [24]:
from scripts import data_setup

# Create train/test DataLoaders that apply the ViT preprocessing.
(train_dataloader_vit,
 test_dataloader_vit,
 class_names) = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=vit_transforms,
    batch_size=32,
)

In [25]:
# Number of batches in each DataLoader, plus the class names returned by create_dataloaders.
len(train_dataloader_vit), len(test_dataloader_vit), class_names

(64, 16, ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash'])

### 4.3 Training ViT feature extractor

In [26]:
# from scripts import engine

# loss_fn = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(params=vit.parameters(),
#                              lr=1e-3)

# set_seeds()

# vit_results = engine.train(
#     model=vit,
#     train_dataloader=train_dataloader_vit,
#     test_dataloader=test_dataloader_vit,
#     epochs=10,
#     optimizer=optimizer,
#     loss_fn=loss_fn,
#     device=device
# )

## 5. ResNet50 feature extractor

https://pytorch.org/vision/main/models/generated/torchvision.models.resnet50.html

In [27]:
# Instantiate ResNet50 without a `weights` argument, purely to inspect its layer structure.
resnet50 = torchvision.models.resnet50()
resnet50

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

The last layer of ResNet50 is called **`fc`**.

### 5.1 Creating a ResNet50 feature extractor

In [29]:
def create_resnet50_model(num_classes:int=6,
                          seed:int=42):
    """Build a frozen, pretrained ResNet50 with a fresh final ``fc`` layer.

    Args:
        num_classes: Number of outputs of the new ``fc`` layer.
        seed: Value passed to ``torch.manual_seed`` right before the new
            layer is created.

    Returns:
        A ``(model, transforms)`` tuple: the ResNet50 model moved to the
        notebook-level ``device``, and the preprocessing transforms that come
        with the DEFAULT pretrained weights.
    """
    pretrained_weights = torchvision.models.ResNet50_Weights.DEFAULT
    preprocess = pretrained_weights.transforms()

    backbone = torchvision.models.resnet50(weights=pretrained_weights)
    backbone = backbone.to(device)

    # Freeze all pretrained parameters.
    for frozen_param in backbone.parameters():
        frozen_param.requires_grad_(False)

    # The new output layer is created after the freeze, so it stays trainable.
    torch.manual_seed(seed)
    new_fc = nn.Linear(2048, num_classes)
    backbone.fc = new_fc.to(device)

    return backbone, preprocess

In [30]:
# Build the ResNet50 feature extractor together with its preprocessing transforms.
resnet50, resnet50_transforms = create_resnet50_model(num_classes=6, seed=42)

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to C:\Users\paula/.cache\torch\hub\checkpoints\resnet50-11ad3fa6.pth
100%|█████████████████████████████████████████████████████████████████████████████| 97.8M/97.8M [00:13<00:00, 7.64MB/s]


In [31]:
# Inspect the final fully connected layer of the model that was just built.
resnet50.fc

Linear(in_features=2048, out_features=6, bias=True)

In [32]:
# Inspect the preprocessing pipeline returned alongside the model.
resnet50_transforms

ImageClassification(
    crop_size=[224]
    resize_size=[232]
    mean=[0.485, 0.456, 0.406]
    std=[0.229, 0.224, 0.225]
    interpolation=InterpolationMode.BILINEAR
)