In [85]:
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from torchvision.transforms import ToTensor
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.optim

In [86]:
# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

device(type='mps')

In [87]:
#Load CLIP model
model, preprocess = clip.load('ViT-B/32', device)

In [88]:
#Define class names in a list - it need prompt engineering
class_names = ["a photo of industrial plants with no smoke above chimney", "a photo of industrial plants emiting smoke from chimney"]

In [89]:
def preprocess_video_to_image(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    # Concatenate frames horizontally to create a single image
    concatenated_frame = np.concatenate(frames, axis=1)
    return concatenated_frame

In [90]:
#Function to create an image from the video
def preprocess_video_to_image_grid_version(video_path, num_rows=6, num_cols=6):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    
        # Split frames into grid
    grid = []
    for i in range(num_rows):
        row = np.concatenate(frames[i * num_cols: (i + 1) * num_cols], axis=1)
        grid.append(row)
    
    # Concatenate grid vertically to create a single image
    concatenated_frame = np.concatenate(grid, axis=0)
    return concatenated_frame

In [91]:
# Load the dataset
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    data = json.load(f)

# Convert to DataFrame
ijmond_data = pd.DataFrame(data)

class ImageTitleDataset(Dataset):
    def __init__(self, list_video_path, list_labels):

        self.video_path = list_video_path
        self.labels = list_labels
        self.transforms = ToTensor()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = preprocess_video_to_image_grid_version(self.video_path[idx])
        image = Image.fromarray(image)
        image = preprocess(image)
        label = torch.tensor(self.labels[idx])
        return image, label

# Prepare the list of video file paths and labels
list_video_path = [os.path.join("data/ijmond_videos/", f"{fn}.mp4") for fn in ijmond_data['file_name']]
#list_labels = dataset['label'].tolist()
list_labels = [int(label) for label in ijmond_data['label']]

In [103]:
# Create dataset and data loader
dataset = ImageTitleDataset(list_video_path, list_labels)
train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [104]:
# Create dataset
dataset = ImageTitleDataset(list_video_path, list_labels)

# Verify dataset length
print("Dataset Length:", len(dataset))

# Inspect a few samples
for i in range(3):
    image, label = dataset[i]
    print("Sample:", i)
    print("Image Shape:", image.shape)
    print("Label:", label)
    

Dataset Length: 26
Sample: 0
Image Shape: torch.Size([3, 224, 224])
Label: tensor(1)
Sample: 1
Image Shape: torch.Size([3, 224, 224])
Label: tensor(0)
Sample: 2
Image Shape: torch.Size([3, 224, 224])
Label: tensor(0)


In [105]:
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Iterate over a few batches
for images, labels in dataloader:
    print("Batch Images Shape:", images.shape)
    print("Batch Labels:", labels)
    break  # Stop after first batch

# (batch_size, channel, time, height, width)

Batch Images Shape: torch.Size([4, 3, 224, 224])
Batch Labels: tensor([0, 0, 1, 1])


In [106]:
# Function to convert model's parameters to FP32 format
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 


if device == "cpu":
  model.float()

# Prepare the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) # the lr is smaller, more safe for fine tuning to new dataset


# Specify the loss function
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

In [107]:
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) #Params used from paper, the lr is smaller, more safe for fine tuning to new dataset

# add your own code to track the training progress.
for epoch in range(4):
  for batch in train_dataloader :
      optimizer.zero_grad()

      images,texts = batch 
    
      images= images.to(device)
      texts = texts.to(device)
    
      logits_per_image, logits_per_text = model(images, texts)

      ground_truth = torch.tensor(texts, dtype=torch.long, device=device)

      total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else : 
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)

RuntimeError: The size of tensor a (26) must match the size of tensor b (77) at non-singleton dimension 0