<a href="https://colab.research.google.com/github/vjhawar12/FreshNET-A-mobileNET-adaptation/blob/main/FreshNET.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip -q install lightning-bolts

In [None]:
import torch
import torch.nn as nn
from torch import optim
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
from pl_bolts.optimizers.lr_scheduler import LinearWarmupCosineAnnealingLR

# Loading dataset from Google Cloud Storage

In [None]:
!gcloud auth login

In [None]:
!gcloud auth application-default login

In [None]:
!gcloud config set project freshnet-466505

In [None]:
def download_bucket_with_transfer_manager(bucket_name, destination_directory="", workers=8, max_results=1000):

    from google.cloud.storage import Client, transfer_manager

    client = Client()
    bucket = client.bucket(bucket_name)

    blob_names = [blob.name for blob in bucket.list_blobs(max_results=max_results)]

    results = transfer_manager.download_many_to_path(
        bucket, blob_names, destination_directory=destination_directory, max_workers=workers
    )

    for name, result in zip(blob_names, results):
        if isinstance(result, Exception):
            print("Failed to download {} due to exception: {}".format(name, result))
        else:
            print("Downloaded {} to {}.".format(name, destination_directory + name))

download_bucket_with_transfer_manager("freshnet-images", destination_directory="/content/dataset/", workers=8, max_results=None)

In [None]:
!cd /content/dataset/train/rotten && unzip -j train_rotten_images.zip

In [None]:
!cd /content/dataset/train/fresh && unzip -j train_fresh_images.zip

In [None]:
!cd /content/dataset/test/fresh && unzip -j test_fresh_images.zip

In [None]:
!cd /content/dataset/test/rotten && unzip -j test_rotten_images.zip

# Hyperparameters

In [11]:
BATCH_SIZE = 256
RANDOM_SEED = 42
TRAIN_SIZE, VAL_SIZE = 0.8, 0.2
IMG_SIZE = 320 # Original images are ~400*400 px so resizing them to 320 retains detail while reducing computational cost
EPOCHS = 40
WARMUP_DUR = 10 # num of warmup epochs
LR_MIN = 0.0001 # minimum learning rate
MILD_DROPOUT_RATE = 0.10

# Preprocessing

In [12]:
"""
    Mild preprocessing only. No MixUp, CutMix, RandomCrop, or ColorJitter because a lightweight model like this one
    is less likely to overfit. Also, this model needs to perform fine-grained classification,
    so aggressive augmentations could distort the small image regions (like signs of fungi or discoloration)
    that are crucial for accurate prediction.
"""

train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation([-180, 180]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),

])

test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Loading data into Dataloader

In [13]:
path_to_train_imgs = "/content/dataset/train"
path_to_test_imgs = "/content/dataset/test"

full_train_data = ImageFolder(path_to_train_imgs, transform=train_transform)
test_data = ImageFolder(path_to_test_imgs, transform=test_transform)
train_data, val_data = random_split(full_train_data, [TRAIN_SIZE, VAL_SIZE], generator=torch.Generator().manual_seed(RANDOM_SEED))

print(f"{full_train_data.class_to_idx} \n {test_data.class_to_idx}")

{'fresh': 0, 'rotten': 1} 
 {'fresh': 0, 'rotten': 1}


In [14]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=10)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# CUDA optimizations

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if torch.cuda.is_available():
  torch.backends.cuda.matmul.allow_tf32 = True
  torch.backends.cudnn.allow_tf32 = True
  torch.backends.cuda.enable_flash_sdp(True)
  torch.backends.cuda.enable_mem_efficient_sdp(True)
  torch.backends.cuda.enable_math_sdp(True)

# Depthwise Seperable Convolution implementation

In [16]:
"""
Helper class for FreshNET. This is the implementation of the "Depthwise seperable convolution" as outlined in the MobileNET paper. Instances of this class are stacked
together and each stack constitutes a hidden layer in the NN. A DepthwiseSeperableConvolution instance is structured as follows:

- Pointwise convolution (applied along all channels at a single pixel) for channel expansion.
- Depthwise convolution (applies across each channel individually) for the model to efficiently learn features in a high dimensional space.
- Pointwise convolution to compress channels slightly.

In between the convolutional layers are batch normalizations and after the final layer is an activation function.
"""

class DepthwiseSeperableConvolution(nn.Module):
  def __init__(self, in_channels, out_channels, exp_factor, downsample_factor, kernel_size=3):
    super().__init__()

    self.in_channels = in_channels
    self.out_channels = out_channels
    self.exp_factor = exp_factor
    self.downsample_factor = downsample_factor
    self.kernel_size = kernel_size

    out = self.in_channels * self.exp_factor

    self.preserve = self.in_channels == self.out_channels
    """
        batchnorm normalizes inputs which has shown to improve training. It's typically applied after the convolutional layer output and before the activation function
        The MobileNET paper also mentions, with the exception of the final fully connected linear layer, "all layers are followed by a batchnorm and ReLU nonlinearity"
    """

    self.block = nn.Sequential(
        nn.Conv2d(in_channels=self.in_channels, out_channels=out, kernel_size=1, stride=1, groups=1), # expansion
        nn.BatchNorm2d(out),

        # adding padding because the image downsamples, but skip connections require same size
        nn.Conv2d(in_channels=out, out_channels=out, kernel_size=self.kernel_size, padding=self.kernel_size // 2, stride=self.downsample_factor, groups=out), # depthwise
        nn.BatchNorm2d(out),
        nn.Conv2d(in_channels=out, out_channels=self.out_channels, kernel_size=1, stride=1, groups=1), # pointwise
        nn.BatchNorm2d(self.out_channels),

        nn.ReLU6(), # output ∈ [0, 6] ==> more efficient than regular ReLU
    )

  def forward(self, x):
    x = self.block(x)
    return x

# Skip connection

In [17]:
# a simple skip connection implementation

"""
FreshNET only applies skip connections between DepthwiseSeperableConvolution instances which preserve channel dimension in order to adhere to the laws of vector addition.
"""
class SkipConnection(nn.Module):
  def __init__(self, seq): # seq is an nn.Sequential instance
    super().__init__()

    self.seq = seq

    for module in seq:
      if not hasattr(module, 'preserve'):
        raise Exception("Cannot apply skip connection between layers of different channel dimensions")

    self.seq = seq

  def forward(self, x):
    self.gradient = x

    for depthwise_sep_conv in self.seq:
      x = depthwise_sep_conv(x)

    return x + self.gradient

# FreshNet: A MobileNET adaptation

In [18]:
# A MobileNET adaptation
class FreshNET(nn.Module):

  """
  FreshNET applies the concept of depthwise seperable convolutions as mentioned in the MobileNet paper, but the image dimensions are slightly increased
  to better suit the dataset and leverage higher image quality. Specifically, FreshNET decouples spatial filtering and channel mixing,
  the two operations traditionally combined in standard convolutions, and performs them as separate, more efficient operations. As the authors of MobileNet noted,
  it "drastically reduc[es] computation and model size". Despite this computational efficiency, accuracy is largely preserved -- the original MobileNet paper reports
  only about a 1% drop in accuracy compared to standard convolutions.

  This separation is implemented in DepthwiseSeperableConvolution class. Each instance consists of:
  - A pointwise 1*1 convolution for channel expansion, increasing feature dimensionality.
  - A depthwise 3*3 convolution applied independently per channel for spatial filtering.
  - Another pointwise convolution to project back to a lower-dimensional space.

  This design significantly reduces computational cost while maintaining high representational power. There are 10 layers, of which 7 are DepthwiseSeperableConvolution layers.
  Each DepthwiseSeperableConvolution layer consists of 1-4 DepthwiseSeperableConvolution instances. Within the DepthwiseSeperableConvolution layer, there are non-linear activation functions.

  Skip connections are also applied where possible to improve gradient flow and preserve fine-grained information. For a task like fresh/rotten classification,
  fine-grained information can be very valuable.
  """

  def __init__(self):
    super().__init__()

    # SiLU is good for vanishing gradients and can be applied in the later layers where ReLU might not be as effective (dead neurons)
    silu = nn.SiLU()

    # Regularization
    """
    The original paper notes "we use less regularization and data augmentation techniques because small models have less trouble with overfitting".
    This is why I chose to use dropout with 15% probability since that's on the lower end of the probability spectrum.
    """
    dropout_mild = nn.Dropout(MILD_DROPOUT_RATE)

    # initial regular convolution
    initial_conv = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=2) # 320*320*3 --> 160*160*32

    # inverted residual block 1: 160*160*32 --> 160*160*16, channel expansion = 1
    block_1 = nn.Sequential(
        DepthwiseSeperableConvolution(32, 16, 1, 1)
    )

    # inverted residual block 2: 160*160*16 --> 80*80*24, channel expansion = 6
    block_2 = nn.Sequential(
        DepthwiseSeperableConvolution(16, 24, 6, 2),
        DepthwiseSeperableConvolution(24, 24, 6, 1)
    )

    # inverted residual block 3: 80*80*24 --> 40*40*32, channel expansion = 6
    block_3 = nn.Sequential(
        DepthwiseSeperableConvolution(24, 32, 6, 2),
        SkipConnection(nn.Sequential(
              DepthwiseSeperableConvolution(32, 32, 6, 1),
              DepthwiseSeperableConvolution(32, 32, 6, 1),
          )
        )
    )

    # inverted residual block 4: 40*40*32 --> 20*20*64, channel expansion = 6
    block_4 = nn.Sequential(
        DepthwiseSeperableConvolution(32, 64, 6, 2),
        SkipConnection(nn.Sequential(
              DepthwiseSeperableConvolution(64, 64, 6, 1),
              DepthwiseSeperableConvolution(64, 64, 6, 1),
              DepthwiseSeperableConvolution(64, 64, 6, 1),
            )
        )
    )

    # inverted residual block 5: 20*20*64 --> 20*20*96, channel expansion = 6
    block_5 = nn.Sequential(
        DepthwiseSeperableConvolution(64, 96, 6, 1),
        SkipConnection(nn.Sequential(
              DepthwiseSeperableConvolution(96, 96, 6, 1),
              DepthwiseSeperableConvolution(96, 96, 6, 1),
            )
        )
    )

    # inverted residual block 6: 20*20*96 --> 10*10*160, channel expansion = 6
    block_6 = nn.Sequential(
        DepthwiseSeperableConvolution(96, 160, 6, 2),
        SkipConnection(nn.Sequential(
              DepthwiseSeperableConvolution(160, 160, 6, 1),
              DepthwiseSeperableConvolution(160, 160, 6, 1),
            )
        )
    )

    # inverted residual block 7: 10*10*160 --> 10*10*320, channel expansion = 6
    block_7 = nn.Sequential(
        DepthwiseSeperableConvolution(160, 320, 6, 1)
    )

    # final regular convolution
    final_conv = nn.Conv2d(in_channels=320, out_channels=1280, kernel_size=1, stride=1) # 10*10*320 --> 10*10*1280

    """
      MaxPooling is often preferred for highlighting dominant features, but it can be too aggressive in lightweight models
      like MobileNet or EfficientNet, which already have low spatial resolution. It can discard valuable spatial
      information. Average Pooling computes the average of nearby pixels, preserving more context. This makes it more
      suitable for compact architectures like this one.
    """

    avg_pool = nn.AvgPool2d(10) # 10*10*1280 --> 1*1*1280
    flatten = nn.Flatten() # fc layers expect a single tensor not a feature map

    # fully connected layers
    """
    The MobileNET paper mapped from 1280 directly to 1000 since it was being trained on ImageNET. My dataset, however, only has 2
    output classes: fresh (0) or rotten (1). Mapping directly from 1280 to 2 is an abrupt jump which could limit the model's ability to
    distinguish between classes, so I introduced an additional layer to map from 1280 to 500 then applied nonlinearity and
    mild dropout before going from 500 to 2.
    """
    fc_1 = nn.Linear(in_features=1280, out_features=500)
    fc_2 = nn.Linear(in_features=500, out_features=2)

    """
    The sequence of layers is as follows:

      - intial regular convolution
      - 4 stacks of DepthwiseSeperableConvolution
      - mild dropout for regularization
      - 3 more stacks of DepthwiseSeperableConvolution
      - 1 final regular convolution
      - average pooling
      - activation function
      - first fully connected linear layer
      - activation function
      - mild dropout for regularization
      - final fully connected linear layer

    There are skip connections in blocks 3-6.
    """

    self.layers = nn.Sequential(
        initial_conv,

        block_1,
        block_2,
        block_3,
        block_4,
        dropout_mild,

        block_5,
        block_6,
        block_7,

        final_conv,
        avg_pool,
        flatten,
        silu,
        dropout_mild,

        fc_1,
        silu,

        fc_2
    )


  def forward(self, x):
    for layer in self.layers:
      x = layer(x)

    return x

# Setup

In [19]:
cnn = FreshNET()
cnn = torch.compile(cnn)
cnn.to(device)

loss_fn = nn.CrossEntropyLoss() # std loss function for classification

optimizer = optim.Adam(cnn.parameters()) # RMSProp is pretty sensitive to changes in LR so i wanted to try Adam

scaler = torch.amp.GradScaler()

"""
Not mentioned in the original paper, but this scheduler was used because linear warmup reduces volatility in the earlier epochs.
Cosine annealing can improve training stability and convergence and declining LR steadily means the model will rely more on the features
it learns early on -- the key differentiators -- rather than picking up potential noise and overfitting.
"""
scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=WARMUP_DUR, eta_min=LR_MIN, max_epochs=EPOCHS)

  scheduler = LinearWarmupCosineAnnealingLR(optimizer, warmup_epochs=WARMUP_DUR, eta_min=LR_MIN, max_epochs=EPOCHS)


# Train and validate

In [20]:
for i in range(EPOCHS):
    torch.cuda.empty_cache()
    correct, total = 0, 0
    running_loss = 0

    cnn.train(True)

    loop = tqdm(train_dataloader, desc=f"Epoch {i+1}/{EPOCHS}", leave=True, disable=False)

    for j, (input, labels) in enumerate(loop, 1):
        input, labels = input.to(device), labels.to(device)
        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
          output = cnn(input)
          loss = loss_fn(output, labels)
          running_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        avg_loss = running_loss / j
        loop.set_postfix(loss=avg_loss)

        pred = torch.argmax(output, dim=1)

        total += labels.size(0)
        correct += (pred == labels).sum().item()


    cnn.eval()
    val_total, val_correct = 0, 0

    with torch.no_grad():
        for image, label in val_dataloader:
            image, label = image.to(device), label.to(device)

            with torch.amp.autocast(device_type='cuda', dtype=torch.float16):
              output = cnn(image)

            pred = torch.argmax(output, dim=1)
            val_total += label.size(0)
            val_correct += (pred == label).sum().item()

        val_accuracy = val_correct / val_total
        accuracy = correct / total


    scheduler.step()

    current_lr = optimizer.param_groups[0]['lr']
    print(f"Epoch {i + 1}: LR={current_lr:.6f} \t Train Acc: {accuracy:.4f} \t Val Acc: {val_accuracy:.4f}")

Epoch 1/40: 100%|██████████| 74/74 [03:26<00:00,  2.80s/it, loss=0.694]


Epoch 1: LR=0.000111 	 Train Acc: 0.4751 	 Val Acc: 0.4711


Epoch 2/40: 100%|██████████| 74/74 [00:26<00:00,  2.74it/s, loss=0.419]


Epoch 2: LR=0.000222 	 Train Acc: 0.7976 	 Val Acc: 0.8721


Epoch 3/40: 100%|██████████| 74/74 [00:27<00:00,  2.71it/s, loss=0.279]


Epoch 3: LR=0.000333 	 Train Acc: 0.8914 	 Val Acc: 0.8747


Epoch 4/40: 100%|██████████| 74/74 [00:27<00:00,  2.71it/s, loss=0.254]


Epoch 4: LR=0.000444 	 Train Acc: 0.9003 	 Val Acc: 0.8503


Epoch 5/40: 100%|██████████| 74/74 [00:27<00:00,  2.66it/s, loss=0.218]


Epoch 5: LR=0.000556 	 Train Acc: 0.9153 	 Val Acc: 0.8973


Epoch 6/40: 100%|██████████| 74/74 [00:27<00:00,  2.73it/s, loss=0.216]


Epoch 6: LR=0.000667 	 Train Acc: 0.9143 	 Val Acc: 0.9176


Epoch 7/40: 100%|██████████| 74/74 [00:27<00:00,  2.72it/s, loss=0.2]


Epoch 7: LR=0.000778 	 Train Acc: 0.9170 	 Val Acc: 0.8694


Epoch 8/40: 100%|██████████| 74/74 [00:27<00:00,  2.70it/s, loss=0.199]


Epoch 8: LR=0.000889 	 Train Acc: 0.9178 	 Val Acc: 0.8935


Epoch 9/40: 100%|██████████| 74/74 [00:27<00:00,  2.69it/s, loss=0.183]


Epoch 9: LR=0.001000 	 Train Acc: 0.9246 	 Val Acc: 0.9263


Epoch 10/40: 100%|██████████| 74/74 [00:27<00:00,  2.71it/s, loss=0.169]


Epoch 10: LR=0.001000 	 Train Acc: 0.9310 	 Val Acc: 0.9049


Epoch 11/40: 100%|██████████| 74/74 [00:27<00:00,  2.73it/s, loss=0.148]


Epoch 11: LR=0.000998 	 Train Acc: 0.9403 	 Val Acc: 0.9339


Epoch 12/40: 100%|██████████| 74/74 [00:27<00:00,  2.71it/s, loss=0.136]


Epoch 12: LR=0.000990 	 Train Acc: 0.9492 	 Val Acc: 0.9291


Epoch 13/40: 100%|██████████| 74/74 [00:27<00:00,  2.67it/s, loss=0.121]


Epoch 13: LR=0.000978 	 Train Acc: 0.9541 	 Val Acc: 0.9208


Epoch 14/40: 100%|██████████| 74/74 [00:27<00:00,  2.70it/s, loss=0.123]


Epoch 14: LR=0.000961 	 Train Acc: 0.9527 	 Val Acc: 0.9553


Epoch 15/40: 100%|██████████| 74/74 [00:26<00:00,  2.77it/s, loss=0.113]


Epoch 15: LR=0.000940 	 Train Acc: 0.9559 	 Val Acc: 0.9358


Epoch 16/40: 100%|██████████| 74/74 [00:27<00:00,  2.67it/s, loss=0.109]


Epoch 16: LR=0.000914 	 Train Acc: 0.9594 	 Val Acc: 0.9388


Epoch 17/40: 100%|██████████| 74/74 [00:27<00:00,  2.69it/s, loss=0.105]


Epoch 17: LR=0.000884 	 Train Acc: 0.9600 	 Val Acc: 0.9627


Epoch 18/40:  93%|█████████▎| 69/74 [00:25<00:01,  3.35it/s, loss=0.0858]Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7abc1874c360>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1618, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/dataloader.py", line 1582, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/multiprocessing/connection.py", line 948, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/selectors.py", l

KeyboardInterrupt: 

# Model size (no quantization)

In [22]:
param_size = 0
for param in cnn.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in cnn.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 11.111MB


# Model size (with quantization)

# Test

In [None]:
def test(test_dataloader):
  accuracy = 0
  total = 0

  for batch in test_dataloader:
    images, labels = batch
    images = images.to(device)
    labels = labels.to(device)

    predicted = torch.argmax(cnn(images), dim=1)
    accuracy += (predicted == labels).sum().item()
    total += labels.size(0)

  avg_acc = accuracy / total

  return avg_acc

In [None]:
cnn.eval()

with torch.no_grad():
  with torch.autocast(device_type="cuda", dtype=torch.float16)
    test_acc = test(test_dataloader)
    print(f"Final test accuracy: {100 * test_acc:.2f}%")