<a href="https://colab.research.google.com/github/vanshtibrewal/CLIP-classifier/blob/main/Vision_Lab_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Requirements

In [None]:
!pip install pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.2-py3-none-any.whl (776 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch_lightning)
  Downloading torchmetrics-1.2.1-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from pytorch_lightning)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.10.0 pytorch_lightning-2.1.2 torchmetrics-1.2.1


# Import Requirements

In [None]:
import torchvision
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader, TensorDataset
import os
from transformers import CLIPProcessor, CLIPModel
import pytorch_lightning as pl

# Mount Google Drive

In [None]:
# Colab-specific: mount Google Drive at /content/drive. The dataset and the cached
# preprocessed tensors used by later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Hyperparameters / Arbitrary Constants

In [None]:
batch_size = 32  # samples per batch, shared by the train and test DataLoaders
train_test_split = 0.8  # fraction of the dataset assigned to the training split
pl.seed_everything(42, workers=True)  # seed the RNGs through Lightning; workers=True extends seeding to DataLoader workers
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN kernels
num_workers = 2  # worker processes used by each DataLoader

INFO:lightning_fabric.utilities.seed:Seed set to 42


# Load / Download / Process / Save Data

## Initial Download, Preprocessing, and Saving (first time only)

In [None]:
# Location of the raw Caltech-101 files on the mounted Drive.
dataset_dir = '/content/drive/My Drive/VisionLab/Caltech101'
# download=True fetches the archive on the very first run; when the files are already
# present under dataset_dir torchvision skips the download, so re-running is harmless.
caltech_dataset = torchvision.datasets.Caltech101(root=dataset_dir, download=True)

In [None]:
# Directory on the mounted Drive where the preprocessed tensors are written;
# exist_ok=True makes this cell safe to re-run.
save_dir = '/content/drive/My Drive/VisionLab/PreprocessedCaltech101'
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Accumulators filled by the preprocessing loop: one entry per dataset sample.
processed_images, labels = [], []

# CLIP's own preprocessing pipeline, matching the checkpoint used by the model.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# Always start from empty accumulators so this cell is idempotent: re-running it must
# not append a second copy of the dataset, and it must still work after a later cell
# has rebound these names to tensors.
processed_images = []
labels = []

total_len = len(caltech_dataset)
for idx, (image, label) in enumerate(caltech_dataset):
    # The processor output for a single image carries a leading batch axis of size 1;
    # squeeze(0) removes exactly that axis and never any other size-1 dimension.
    inputs = processor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)
    processed_images.append(inputs)
    labels.append(label)
    # Report the percentage completed every 1000 samples.
    if idx % 1000 == 0:
        print("Progress Update:", (100.0*idx/total_len))

Progress Update: 0.0
Progress Update: 11.524720525527256
Progress Update: 23.04944105105451
Progress Update: 34.574161576581766
Progress Update: 46.09888210210902
Progress Update: 57.62360262763628
Progress Update: 69.14832315316353
Progress Update: 80.67304367869079
Progress Update: 92.19776420421805


In [None]:
# Collapse the per-sample Python lists into two tensors: labels become a 1-D tensor,
# and the per-image tensors are stacked along a new leading axis.
labels = torch.tensor(labels)
processed_images = torch.stack(processed_images, dim=0)

In [None]:
# Persist both tensors into save_dir so later sessions can skip preprocessing.
for file_name, tensor in (('processed_images.pt', processed_images), ('labels.pt', labels)):
    torch.save(tensor, os.path.join(save_dir, file_name))

## Load Preprocessed Data

In [None]:
# Read back the cached tensors written by the preprocessing section.
save_dir = '/content/drive/My Drive/VisionLab/PreprocessedCaltech101'
images, labels = (
    torch.load(os.path.join(save_dir, file_name))
    for file_name in ('processed_images.pt', 'labels.pt')
)

In [None]:
# Pair every preprocessed image with its label.
dataset = TensorDataset(images, labels)

# Split sizes: train_test_split of the samples for training, the remainder for testing.
train_size = int(train_test_split*len(dataset))
test_size = len(dataset) - train_size

# Use a dedicated, explicitly seeded generator so the partition is identical every time
# this cell runs, independent of how much of the global RNG stream earlier cells consumed.
# Without it, re-running the cell reshuffles the split and can move samples the model has
# already trained on into the test set.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

In [None]:
# Settings common to both loaders; only the training loader shuffles its samples.
loader_kwargs = dict(batch_size=batch_size, num_workers=num_workers)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_dataloader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Create Model

In [None]:
class CLIPClassifier(pl.LightningModule):
  """Frozen CLIP image encoder followed by a small trainable MLP classification head.

  The encoder's parameters are frozen; only ``self.classifier`` is optimised. The head
  takes 512 input features and produces 101 class logits.

  ``training_step`` / ``validation_step`` expect batches of already-preprocessed pixel
  values plus integer labels, whereas ``forward`` accepts raw images and runs the CLIP
  processor itself.
  """

  def __init__(self):
    super().__init__()
    self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    self.encoder = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    self.classifier = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 128), nn.ReLU(), nn.Linear(128, 101))

    # Freeze the encoder so that only the classification head receives gradients.
    for param in self.encoder.parameters():
      param.requires_grad = False

  def _logits(self, pixel_values):
    """Return class logits for a batch of preprocessed pixel values."""
    # The encoder is frozen, so skip autograd bookkeeping for it entirely.
    with torch.no_grad():
      embedding = self.encoder.get_image_features(pixel_values)
    return self.classifier(embedding)

  def _loss(self, batch):
    """Compute the mean cross-entropy loss for a ``(pixel_values, labels)`` batch."""
    pixel_values, labels = batch
    # cross_entropy is log_softmax followed by NLL loss in a single call.
    return F.cross_entropy(self._logits(pixel_values), labels)

  def forward(self, x):
    """Predict class indices for raw images.

    Args:
      x: a single image or a list of images in a format accepted by ``CLIPProcessor``.

    Returns:
      A 1-D tensor of predicted class indices, one per input image.
    """
    # Pass the images by keyword: depending on the transformers version, the processor's
    # first positional parameter can be ``text``. The batch axis is kept (no squeeze) so
    # a single image still reaches the encoder as a batch of one.
    pixel_values = self.processor(images=x, return_tensors="pt")["pixel_values"]
    # The processor returns CPU tensors; move them to wherever the model lives.
    pixel_values = pixel_values.to(self.device)
    # argmax over logits equals argmax over softmax probabilities.
    return torch.argmax(self._logits(pixel_values), dim=1)

  def training_step(self, batch, batch_idx):
    """Compute and log the epoch-aggregated training loss for one batch."""
    loss = self._loss(batch)
    self.log("train_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
    return loss

  def validation_step(self, batch, batch_idx):
    """Compute and log the epoch-aggregated validation loss for one batch."""
    loss = self._loss(batch)
    self.log("val_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
    return loss

  def configure_optimizers(self):
    """Optimise only the classification head with Adam (lr=1e-3)."""
    return torch.optim.Adam(self.classifier.parameters(), lr=1e-3)

# Training

In [None]:
# Build the model and a Trainer capped at 500 epochs, then fit. The held-out test
# split doubles as the validation set during training.
model = CLIPClassifier()
trainer = pl.Trainer(max_epochs=500)
trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=test_dataloader,
)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name       | Type       | Params
------------------------------------------
0 | encoder    | CLIPModel  | 151 M 
1 | classifier | Sequential | 177 K 
------------------------------------------
177 K     Trainable params
151 M     Non-trainable params
151 M     Total params
605.818   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]