In [1]:
#Import packages
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from torchvision.transforms import ToTensor
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.optim

  Referenced from: <85A36C65-3F71-3C3B-B529-961AE17DBE73> /Users/szaboreka/anaconda3/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [2]:
# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
#elif torch.backends.mps.is_available():
#    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device

device(type='cpu')

In [3]:
#Load CLIP model - ViT B32
model, preprocess = clip.load('ViT-B/32', device, jit=False)

In [4]:
#Define class names in a list - it needs prompt engineering
class_names = ["a photo of industrial plants with no smoke above chimney", "a photo of industrial plants emiting smoke from chimney"]

In [5]:
#Function to create one long image from video frames
def preprocess_video_to_image(video_path):
    # Open the video file
    video = cv2.VideoCapture(video_path)
    frames = []
    #Handle if video can't be opened
    if not video.isOpened():
        print("Video file couldn't be opened")
    #If yes, read all video frames until the end of the video and append every frame to the frames list
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        #Release video
        video.release()
    #Concetanate the frames in the list together
    concatenated_frame = np.concatenate(frames, axis=1)
    return concatenated_frame

In [6]:
#Function to create a square-shaped image from the video (similar to 1 long image)
def preprocess_video_to_image_grid_version(video_path, num_rows=6, num_cols=6):
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        while True:
            ret, frame = video.read()
            if not ret:
                break
            frames.append(frame)
        video.release()
    
    # Create grids and store rows in the grids
    grid = []
    for i in range(num_rows):
        row = np.concatenate(frames[i * num_cols: (i + 1) * num_cols], axis=1)
        grid.append(row)
    
    # Concatenate grid vertically to create a single square-shaped image from the smoke video
    concatenated_frame = np.concatenate(grid, axis=0)
    return concatenated_frame

In [7]:
# Load the JSON metadata
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    data = json.load(f)

# Convert the dataset to a Pandas DataFrame
ijmond_data = pd.DataFrame(data)

#Define Torch Dataset class
class ImageTitleDataset(Dataset):
    def __init__(self, list_video_path, list_labels):

        self.video_path = list_video_path
        self.labels = list_labels
        self.class_names = class_names
        #self.transforms = ToTensor()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        image = preprocess_video_to_image_grid_version(self.video_path[idx])
        image = Image.fromarray(image)
        image = preprocess(image)
        label = self.labels[idx]
        label = self.class_names[label]
        label = clip.tokenize(label, context_length=77, truncate=True)
        return image, label

# Prepare the list of video file paths and labels
list_video_path = [os.path.join("data/ijmond_videos/", f"{fn}.mp4") for fn in ijmond_data['file_name']]
#list_labels = dataset['label'].tolist()
list_labels = [int(label) for label in ijmond_data['label']]

In [8]:
print(list_labels)

[1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0]


In [9]:
# Create dataset and data loader
dataset = ImageTitleDataset(list_video_path, list_labels)
train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [10]:
# Function to convert model's parameters to FP32 format
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 


if device == "cpu":
  model.float()

# Prepare the optimizer - weight from other user
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2) # the lr is smaller, more safe for fine tuning to new dataset


# Specify the loss function
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

In [12]:
# Training
num_epochs = 5
for epoch in range(num_epochs):
  pbar = tqdm(train_dataloader, total=len(train_dataloader))
  for batch in train_dataloader :
      optimizer.zero_grad()

      images,texts = batch 
    
      images= images.to(device)
      texts = texts.to(device)

      # Debugging: Print the shape of the input tensor
      texts = texts.squeeze(dim = 1)
      print("Shape of input tensor before forward pass", texts.shape)
      #images = torch.stack([img for img in images],dim=0)

      # Forward pass
      logits_per_image, logits_per_text = model(images, texts)
      print(logits_per_text)

      # Compute loss
      #ground_truth = torch.tensor(texts, dtype=torch.long, device=device)
      ground_truth = torch.arange(len(images),dtype=torch.long,device=device)
      print(ground_truth)

      logits_per_image = logits_per_image.float()
      logits_per_text = logits_per_text.float()

      total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2
      
      # Backward pass
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else : 
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)

      pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")

Epoch 0/5, Loss: 1.9579:   0%|          | 0/7 [02:12<?, ?it/s]


Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[18.5469, 18.6719, 19.3438, 18.6250],
        [19.2969, 19.0156, 19.5781, 19.4062],
        [19.2969, 19.0156, 19.5781, 19.4062],
        [18.5469, 18.6719, 19.3438, 18.6250]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[20.5156, 20.2031, 17.4062, 20.5156],
        [19.0156, 18.6719, 16.5781, 19.0781],
        [19.0156, 18.6719, 16.5781, 19.0781],
        [19.0156, 18.6719, 16.5781, 19.0781]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[21.1875, 23.0312, 23.1094, 23.8750],
        [20.2969, 22.1562, 22.1094, 22.9375],
        [21.1875, 23.0312, 23.1094, 23.8750],
        [21.1875, 23.0312, 23.1094, 23.8750]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.1094, 23.1250, 23.1406, 23.3594],
        [23.1094, 23.1250, 23.1406, 23.3594],
        [23.1094, 23.1250, 23.1406, 23.3594],
        [23.3125, 23.3594, 23.3281, 23.5625]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.2812, 23.7812, 23.4375, 23.1875],
        [23.2812, 23.7812, 23.4375, 23.1875],
        [23.2812, 23.7812, 23.4375, 23.1875],
        [23.0625, 23.5625, 23.2344, 23.0938]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.4844, 23.2031, 22.7344, 22.5469],
        [22.4844, 23.2031, 22.7344, 22.5469],
        [22.4844, 23.2031, 22.7344, 22.5469],
        [22.7656, 23.4531, 23.0156, 22.8906]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([2, 77])
tensor([[23.5000, 24.0156],
        [23.5000, 24.0156]], dtype=torch.float16, grad_fn=<TBackward0>)
tensor([0, 1])


Epoch 0/5, Loss: 0.7096:   0%|          | 0/7 [10:54<?, ?it/s]


Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.4688, 23.5938, 23.2812, 23.5625],
        [23.4688, 23.5938, 23.2812, 23.5625],
        [23.4688, 23.5938, 23.2812, 23.5625],
        [23.3125, 23.5469, 23.1875, 23.5312]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3781:   0%|          | 0/7 [01:40<?, ?it/s]

Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[24.1719, 23.8594, 23.0312, 23.6094],
        [24.1719, 23.8594, 23.0312, 23.6094],
        [23.9375, 23.6406, 22.9375, 23.5000],
        [23.9375, 23.6406, 22.9375, 23.5000]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3926:   0%|          | 0/7 [03:21<?, ?it/s]

Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.2188, 23.6094, 23.3281, 23.6094],
        [23.2500, 23.7969, 23.4375, 23.6094],
        [23.2500, 23.7969, 23.4375, 23.6094],
        [23.2188, 23.6094, 23.3281, 23.6094]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3626:   0%|          | 0/7 [05:05<?, ?it/s]

Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.5312, 23.6562, 23.2812, 23.6250],
        [23.5000, 23.6875, 23.3906, 23.6094],
        [23.5000, 23.6875, 23.3906, 23.6094],
        [23.5000, 23.6875, 23.3906, 23.6094]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3764:   0%|          | 0/7 [06:50<?, ?it/s]

Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.2500, 22.5312, 23.2812, 23.4844],
        [23.2500, 22.5312, 23.2812, 23.4844],
        [23.2500, 22.5312, 23.2812, 23.4844],
        [23.1719, 22.6094, 23.1875, 23.5625]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3945:   0%|          | 0/7 [08:35<?, ?it/s]

Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.9375, 22.6094, 23.6094, 23.1562],
        [22.9844, 22.7344, 23.4688, 23.1562],
        [22.9375, 22.6094, 23.6094, 23.1562],
        [22.9375, 22.6094, 23.6094, 23.1562]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


Epoch 1/5, Loss: 1.3870:   0%|          | 0/7 [10:16<?, ?it/s]

Shape of input tensor before forward pass torch.Size([2, 77])
tensor([[22.8125, 22.8594],
        [22.7656, 22.8906]], dtype=torch.float16, grad_fn=<TBackward0>)
tensor([0, 1])


Epoch 1/5, Loss: 0.6743:   0%|          | 0/7 [11:14<?, ?it/s]


Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.2344, 23.1406, 23.1562, 22.7656],
        [22.8906, 23.0312, 23.0000, 22.4219],
        [22.8906, 23.0312, 23.0000, 22.4219],
        [22.8906, 23.0312, 23.0000, 22.4219]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.8125, 22.6875, 22.8125, 22.8125],
        [22.8125, 22.6875, 22.8125, 22.8125],
        [22.8125, 22.6875, 22.8125, 22.8125],
        [22.8125, 22.6875, 22.8125, 22.8125]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[23.0156, 22.9219, 22.8906, 22.7969],
        [23.0156, 22.9219, 22.8906, 22.7969],
        [23.0156, 22.9219, 22.8906, 22.7969],
        [23.0156, 22.9219, 22.8906, 22.7969]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.5625, 22.5625, 22.4219, 22.8125],
        [22.5625, 22.5625, 22.4219, 22.8125],
        [22.6406, 22.7656, 22.7969, 22.7344],
        [22.5625, 22.5625, 22.4219, 22.8125]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.6562, 22.8125, 22.5000, 22.7031],
        [22.6562, 22.8125, 22.5000, 22.7031],
        [22.6562, 22.8125, 22.5000, 22.7031],
        [22.6562, 22.8125, 22.5000, 22.7031]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])




Shape of input tensor before forward pass torch.Size([4, 77])
tensor([[22.5000, 22.6094, 23.0000, 22.7188],
        [22.5000, 22.6094, 23.0000, 22.7188],
        [22.5000, 22.6094, 23.0000, 22.7188],
        [22.5000, 22.6094, 23.0000, 22.7188]], dtype=torch.float16,
       grad_fn=<TBackward0>)
tensor([0, 1, 2, 3])


In [30]:
#Inspect a few examples in dataset

# Create dataset
dataset = ImageTitleDataset(list_video_path, list_labels)
print("Dataset Length:", len(dataset))

# Inspect 3 samples
for i in range(3):
    image, label = dataset[i]
    print("Sample:", i)
    print("Image Shape:", image.shape)
    print("Label:", label)
    

Dataset Length: 26
Sample: 0
Image Shape: torch.Size([3, 224, 224])
Label: tensor([[49406,   320,  1125,   539,  7520,  5829,   908,  1257,  6664,   633,
         26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)
Sample: 1
Image Shape: torch.Size([3, 224, 224])
Label: tensor([[49406,   320,  1125,   539,  7520,  5829,   593,   871,  6664,  4348,
         26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,


In [15]:
#Inspect Batch sizes
# Create DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Iterate over a few batches
for images, labels in dataloader:
    print("Batch Images Shape:", images.shape)
    print("Batch Labels:", labels)
    break  # Stop after first batch

# (batch_size, channel, time, height, width)

Batch Images Shape: torch.Size([4, 3, 224, 224])
Batch Labels: tensor([[[49406,   320,  1125,   539,  7520,  5829,   908,  1257,  6664,   633,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0]],

        [[49406,   320,  1125,   539,  7520,  5829,   593,   871,  6664,  4348,
          26821, 49407,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,   