In [1]:
#Import packages
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from torchvision.transforms import ToTensor
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.optim

  Referenced from: <85A36C65-3F71-3C3B-B529-961AE17DBE73> /Users/szaboreka/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [2]:
# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
#elif torch.backends.mps.is_available():
#    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

device(type='cpu')

In [3]:
#Load CLIP model - ViT B32
model, preprocess = clip.load('ViT-B/32', device, jit=False)

In [4]:
#Function to create one long image from video frames
def preprocess_video_to_image(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    #Handle if video can't be opened
    if not video.isOpened():
        print("Video file couldn't be opened")
    #If yes, read all video frames until the end of the video and append every frame to the frames list
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        #Release video
        video.release()
    #Concetanate the frames in the list together
    concatenated_frame = np.concatenate(frames, axis=1)
    return concatenated_frame

In [5]:
#Function to create a square-shaped image from the video (similar to 1 long image)
def preprocess_video_to_image_grid_version(video_path, num_rows=6, num_cols=6):
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    
    # Create grids and store rows in the grids
    grid = []
    for i in range(num_rows):
        row = np.concatenate(frames[i * num_cols: (i + 1) * num_cols], axis=1)
        grid.append(row)
    
    # Concatenate grid vertically to create a single square-shaped image from the smoke video
    concatenated_frame = np.concatenate(grid, axis=0)
    return concatenated_frame

In [6]:
# Load the JSON metadata
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    data = json.load(f)

# Convert the dataset to a Pandas DataFrame
ijmond_data = pd.DataFrame(data)

#Define Torch Dataset class
class ImageTitleDataset(Dataset):
    def __init__(self, list_video_path, list_labels, class_names):
        #Initalize image paths and corresponding texts
        self.video_path = list_video_path
        #Initialize labels (0 or 1)
        self.labels = list_labels
        #Initialize class names
        self.class_names = class_names
        #Transform to tensor
        #self.transforms = ToTensor()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        #tranform videos into images and preprocess with clip's function
        image = preprocess_video_to_image_grid_version(self.video_path[idx])
        image = Image.fromarray(image)
        image = preprocess(image)
        #get the corresponding class names and tokenize
        true_label = self.labels[idx]
        label = self.class_names[true_label]
        label = clip.tokenize(label, context_length=77, truncate=True)
        return image, label, true_label

# Prepare the list of video file paths and labels
list_video_path = [os.path.join("data/ijmond_videos/", f"{fn}.mp4") for fn in ijmond_data['file_name']]
#list_labels = dataset['label'].tolist()
list_labels = [int(label) for label in ijmond_data['label']]
#Define class names in a list - it needs prompt engineering
class_names = ["a photo of industrial plants with clear sky above chimney", "a photo of industrial plants emiting smoke from chimney"]

In [7]:
#try
print(class_names[1])
print(list_labels)
print(class_names[list_labels[0]])

a photo of industrial plants emiting smoke from chimney
[1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0]
a photo of industrial plants emiting smoke from chimney


In [8]:
# Create dataset and data loader
dataset = ImageTitleDataset(list_video_path, list_labels, class_names)
train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [9]:
# Function to convert model's parameters to FP32 format
#This is done so that our model loads in the provided memory
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

# Check if the device is set to CPU
if device == "cpu":
  model.float()

# Prepare the optimizer - weight from other user (https://www.labellerr.com/blog/fine-tuning-clip-on-custom-dataset/)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) # the lr is smaller, more safe for fine tuning to new dataset

# Adam optimizer is used with specific hyperparameters
# lr (learning rate) is set to 5e-5, which is considered safe for fine-tuning to a new dataset
# betas are used for the optimization algorithm
# eps is a small value to prevent division by zero
# weight_decay adds L2 regularization to the optimizer

# Specify the loss functions - for images and for texts
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

In [12]:
# Model training
num_epochs = 5
for epoch in range(num_epochs):
  pbar = tqdm(train_dataloader, total=len(train_dataloader))
  for batch in pbar:
      # Zero out gradients for the optimizer (Adam)
      optimizer.zero_grad()

      # Extract images and texts from the batch
      images,texts, true_label = batch 
      print('Texts: ', texts)

      # Print the current device (CPU or GPU)
      print("Used device: ", device)

      # Move images and texts to the specified device (CPU or GPU)
      images= images.to(device)
      texts = texts.to(device)

      #Squeeze texts tensor to match the required size
      texts = texts.squeeze(dim = 1)
      #print("Shape of input tensor before forward pass: ", texts.shape)
      #images = torch.stack([img for img in images],dim=0)

      # Forward pass
      logits_per_image, logits_per_text = model(images, texts)
      print('Logits_per_text after forward passing: ', logits_per_text)

      # Compute loss
      ground_truth = torch.tensor(true_label, dtype=torch.long, device=device)
      #ground_truth = torch.tensor(texts[batch], dtype=torch.long, device=device)
      #ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
      print('Ground truth: ', ground_truth)

      #Transform logits to flote to match required dtype
      logits_per_image = logits_per_image.float()
      logits_per_text = logits_per_text.float()

      total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
      
      # Backward pass
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else : 
        # Convert model's parameters to FP32 format, update, and convert back
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)
      # Update the progress bar with the current epoch and loss
      pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

  0%|          | 0/7 [00:00<?, ?it/s]

Texts:  tensor([[[49406,   320,  1125,   539,  7520,  5829,   908,  1257,  6664,   633,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]],

        [[49406,   320,  1125,   539,  7520,  5829,   593,  3143,  2390,  4348,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0




RuntimeError: Expected 3D (unbatched) or 4D (batched) input to conv2d, but got input of size: [1, 4, 3, 224, 224]

In [30]:
#Inspect a few examples in dataset

# Create dataset
dataset = ImageTitleDataset(list_video_path, list_labels)
print("Dataset Length:", len(dataset))

# Inspect 3 samples
for i in range(3):
    image, label = dataset[i]
    print("Sample:", i)
    print("Image Shape:", image.shape)
    print("Label:", label)
    

Dataset Length: 26
Sample: 0
Image Shape: torch.Size([3, 224, 224])
Label: tensor([[49406,   320,  1125,   539,  7520,  5829,   908,  1257,  6664,   633,
         26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)
Sample: 1
Image Shape: torch.Size([3, 224, 224])
Label: tensor([[49406,   320,  1125,   539,  7520,  5829,   593,   871,  6664,  4348,
         26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,


In [15]:
#Inspect Batch sizes
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Iterate over a few batches
for images, labels in dataloader:
    print("Batch Images Shape:", images.shape)
    print("Batch Labels:", labels)
    break  # Stop after first batch

# (batch_size, channel, time, height, width)

Batch Images Shape: torch.Size([4, 3, 224, 224])
Batch Labels: tensor([[[49406,   320,  1125,   539,  7520,  5829,   908,  1257,  6664,   633,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]],

        [[49406,   320,  1125,   539,  7520,  5829,   593,   871,  6664,  4348,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,   

In [11]:
#code to examine the preprocess function
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x13fe39e40>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [None]:
#code to save the trained model
torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': total_loss,
        }, f"model_checkpoint/model_clip_1.pt") #just change to your preferred folder/filename

In [None]:
#Code to load the saved model :
model, preprocess = clip.load("ViT-B/32",device=device,jit=False) #Must set jit=False for training
checkpoint = torch.load("model_checkpoint/model_clip_1.pt")

# Use these 3 lines if you use default model setting(not training setting) of the clip. For example, if you set context_length to 100 since your string is very long during training, then assign 100 to checkpoint['model_state_dict']["context_length"] 
checkpoint['model_state_dict']["input_resolution"] = model.input_resolution #default is 224
checkpoint['model_state_dict']["context_length"] = model.context_length # default is 77
checkpoint['model_state_dict']["vocab_size"] = model.vocab_size 

model.load_state_dict(checkpoint['model_state_dict'])