CLIP as binary image classification:

In [None]:
#Import packages
import os
import clip
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import json
import cv2
from torchvision.transforms import ToTensor
import pandas as pd
from PIL import Image
import torch.nn as nn
import torch.optim

In [19]:
# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M2 chips)
else:
    device = torch.device("cpu") # use CPU device
device
torch.manual_seed(42) # Setting the seed 

<torch._C.Generator at 0x107e23330>

In [5]:
#Load CLIP model - ViT B32
model, preprocess = clip.load('ViT-B/16', device, jit=False)

In [21]:
#Define Torch Dataset class
class ImageTitleDataset(Dataset):
    def __init__(self, list_video_path, list_labels, class_names):
        #to handle the parent class
        super().__init__()
        #Initalize image paths and corresponding texts
        self.video_path = list_video_path
        #Initialize labels (0 or 1)
        self.labels = list_labels
        #Initialize class names (no smoke or smoke)
        self.class_names = class_names
        #Transform to tensor
        #self.transforms = ToTensor()

    @staticmethod
    #Function to create a square-shaped image from the video (similar to 1 long image)
    #To do: what if the video has more frames than 36?
    def preprocess_video_to_image_grid_version(video_path, num_rows=6, num_cols=6):
        #Open the video file
        video = cv2.VideoCapture(video_path)
        #Create list for extracted frames
        frames = []
        #Handle if video can't be opened
        if not video.isOpened():
            print("Error: Could not open video file")
        else:
            while True:
                is_read, frame = video.read()
                if not is_read:
                    break
                frames.append(frame)
            video.release()
        
        if len(frames) != 36:
            print("Num of frames are not 36")
            print("Num of frames for video on ", video_path, "is ", len(frames))
        
        # Create  and store rows in the grids
        rows_list = []
        for i in range(num_rows):
            #create rows from the frames using indexes -- for example, if i=0, then between the 0th and 6th frame
            row = np.concatenate(frames[i * num_cols: (i + 1) * num_cols], axis=1)
            rows_list.append(row)
        
        # Concatenate grid vertically to create a single square-shaped image from the smoke video
        concatenated_frames = np.concatenate(rows_list, axis=0)
        return concatenated_frames
    

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        #tranform videos into images and preprocess with clip's function
        video_path = self.video_path[idx]
        image = self.preprocess_video_to_image_grid_version(video_path)
        image = Image.fromarray(image)
        image = preprocess(image)
        #get the corresponding class names and tokenize
        true_label = self.labels[idx]
        label = self.class_names[true_label]
        label = clip.tokenize(label, context_length=77, truncate=True)
        return image, label, true_label

In [22]:
#Define training data
# Load the JSON metadata
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    train_data = json.load(f)
# Convert the dataset to a Pandas DataFrame
train_data = pd.DataFrame(train_data)
# Prepare the list of video file paths and labels
list_video_path = [os.path.join("data/ijmond_videos/", f"{fn}.mp4") for fn in train_data['file_name']]
#list_labels = dataset['label'].tolist()
list_labels = [int(label) for label in train_data['label']]
#Define class names in a list - it needs prompt engineering
class_names = ["a photo of factories with clear sky above chimney", "a photo of factories emiting smoke from chimney"]

In [23]:
# Create dataset and data loader for training
dataset = ImageTitleDataset(list_video_path, list_labels, class_names)
train_dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [8]:
#Define validation data
# Load the JSON metadata
with open('data/datasets/experimental_ijmond_dataset.json', 'r') as f:
    val_data = json.load(f)
# Convert the dataset to a Pandas DataFrame
val_data = pd.DataFrame(val_data)
# Prepare the list of video file paths and labels
list_val_video_path = [os.path.join("data/ijmond_videos/", f"{fn}.mp4") for fn in val_data['file_name']]
#list_labels = dataset['label'].tolist()
list_val_labels = [int(label) for label in val_data['label']]

In [9]:
val_dataset = ImageTitleDataset(list_val_video_path, list_val_labels, class_names)
validation_dataloader = DataLoader(dataset, batch_size=4, shuffle=False)

In [24]:
# Function to convert model's parameters to FP32 format
#This is done so that our model loads in the provided memory
def convert_models_to_fp32(model): 
    for p in model.parameters(): 
        p.data = p.data.float() 
        p.grad.data = p.grad.data.float() 

# Check if the device is set to CPU
if device == "cpu":
  model.float()

# Prepare the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4,betas=(0.9,0.98),eps=1e-6,weight_decay=0.2)

#The lr, betas, eps and weight decay are from the CLIP paper

# Specify the loss functions - for images and for texts
loss_img = nn.CrossEntropyLoss()
loss_txt = nn.CrossEntropyLoss()

In [35]:
# Model training
num_epochs = 5
for epoch in range(num_epochs):
  model.train()
  pbar = tqdm(train_dataloader, total=len(train_dataloader))
  for batch in pbar:
      # Extract images and texts from the batch
      images, labels, true_label = batch 
      #texts = clip.tokenize(class_names).to(device)
      print('Text len: ', len(labels))
      print('Image len: ', len(images))
      print('True labels: ', len(true_label))

      # Move images and texts to the specified device (CPU or GPU)
      images= images.to(device)
      texts = labels.to(device)
      true_label = true_label.to(device)
      text_inputs = clip.tokenize(class_names).to(device)

      #Squeeze texts tensor to match the required size
      texts = texts.squeeze(dim = 1)
      text_inputs.squeeze(dim = 1)
      #images = images.unsqueeze(0)
      print('Text input shape', text_inputs.shape)

      # Forward pass - Run the model on the input data (images and texts)
      #logits_per_image, logits_per_text = model(images, texts)
      logits_per_image, logits_per_text = model(images, text_inputs)

      #Inspect logits
      print('Logits_per_text shape after forward passing: ', logits_per_text.shape)
      print('Logits_per_image shape after forward passing: ', logits_per_image.shape)
      print('Logits_per_text after forward passing: ', logits_per_text)
      print('Logits_per_image after forward passing: ', logits_per_image)

      # Compute loss
      ground_truth = torch.tensor(true_label, dtype=torch.long, device=device)
      #ground_truth = torch.arange(len(images), dtype=torch.long, device=device)
      print('Ground Truth: ', ground_truth)

      #Transform logits to float to match required dtype 
      logits_per_image = logits_per_image.float()
      logits_per_text = logits_per_text.float()

      #Compute loss - contrastive loss to pull similar pairs closer together
      #total_loss = (loss_img(logits_per_image,ground_truth) + loss_txt(logits_per_text,ground_truth))/2

      #One image should match 1 label, but 1 label can match will multiple images (when single label classification)
      total_loss = loss_img(logits_per_image,ground_truth) 
      
      # Zero out gradients for the optimizer (Adam) - to prevent adding gradients to previous ones
      optimizer.zero_grad()

      # Backward pass
      total_loss.backward()
      if device == "cpu":
         optimizer.step()
      else : 
        # Convert model's parameters to FP32 format, update, and convert back
        convert_models_to_fp32(model)
        optimizer.step()
        clip.model.convert_weights(model)
      # Update the progress bar with the current epoch and loss
      pbar.set_description(f"Epoch {epoch}/{num_epochs}, Loss: {total_loss.item():.4f}")
  #model.eval(True)
  

  0%|          | 0/7 [00:00<?, ?it/s]

Text len:  4
Image len:  4
True labels:  4
Text input shape torch.Size([2, 77])


  ground_truth = torch.tensor(true_label, dtype=torch.long, device=device)
  0%|          | 0/7 [00:01<?, ?it/s]

Logits_per_text len after forward passing:  2
Logits_per_image len after forward passing:  4
Logits_per_text after forward passing:  tensor([[24.3423, 20.6870, 21.1762, 22.1126],
        [22.1699, 16.9637, 18.9174, 20.2172]], grad_fn=<TBackward0>)
Logits_per_image after forward passing:  tensor([[24.3423, 22.1699],
        [20.6870, 16.9637],
        [21.1762, 18.9174],
        [22.1126, 20.2172]], grad_fn=<MmBackward0>)
Ground Truth:  tensor([1, 0, 1, 0])





ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 2])) is deprecated. Please ensure they have the same size.

Examine variables and model

In [7]:
#try labels and class names
print(class_names[1])
print(list_labels)
print(class_names[list_labels[0]])

a photo of industrial plants emiting smoke from chimney
[1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0]
a photo of industrial plants emiting smoke from chimney


In [3]:
import torch
import clip
from PIL import Image
import cv2
import os

# Define device
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/16', device)
example_path = 'data/ijmond_videos/5PurGkmy0aw-1.mp4'

#define class names in a list - it need prompt engineering
class_names = ["a photo of industrial plants with clear sky above chimney", "a photo of industrial plants emiting smoke from chimney"]

#Crete a list of images from video
def preprocess_video(video_path):
    # Open the video file
    # example video : video_path = 'data/ijmond_videos/5PurGkmy0aw-1.mp4'
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        i = 1
        while True:
            ret, image = video.read()
            if ret == False:
                print('End of video reached during preprocessing')
                break
            frames.append(image)
            i += 1
        video.release()
    return frames


def vanilla_clip(video_path):
    #Create image list from video
    frames = preprocess_video(video_path)

    # Loop over each frame in video (36 frames in 1 video)
    i = 1
    for frame in frames:
        # Read image and preprocess
        image = preprocess(Image.fromarray(frame)).unsqueeze(0).to(device)

        # Prepare text inputs based on class names list
        text_inputs = clip.tokenize(class_names).to(device)

        # Calculate features
        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text_inputs)

        # Calculate similarity
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        #values are the probabilities, indicies are the classes
        print(similarity)
        values, indices = similarity[0].topk(2)

        # Print predictions for each frame
        print(f"\nPredictions for frame {i}:\n")
        for value, index in zip(values, indices):
            print(f"{class_names[index]:>16s}: {100 * value.item():.2f}%")
        i+=1

vanilla_clip(example_path)

End of video reached during preprocessing
tensor([[0.8315, 0.1685]])

Predictions for frame 1:

a photo of industrial plants with clear sky above chimney: 83.15%
a photo of industrial plants emiting smoke from chimney: 16.85%
tensor([[0.7511, 0.2489]])

Predictions for frame 2:

a photo of industrial plants with clear sky above chimney: 75.11%
a photo of industrial plants emiting smoke from chimney: 24.89%
tensor([[0.5893, 0.4107]])

Predictions for frame 3:

a photo of industrial plants with clear sky above chimney: 58.93%
a photo of industrial plants emiting smoke from chimney: 41.07%
tensor([[0.6620, 0.3380]])

Predictions for frame 4:

a photo of industrial plants with clear sky above chimney: 66.20%
a photo of industrial plants emiting smoke from chimney: 33.80%
tensor([[0.5639, 0.4361]])

Predictions for frame 5:

a photo of industrial plants with clear sky above chimney: 56.39%
a photo of industrial plants emiting smoke from chimney: 43.61%
tensor([[0.7712, 0.2288]])

Prediction

KeyboardInterrupt: 

Some techniques for later use:

In [None]:
 # Store some constant - code to define params
'''num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count() * training_args.gradient_accumulation_steps
    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
    steps_per_epoch = len(train_dataset) // train_batch_size
    total_train_steps = steps_per_epoch * num_epochs

# Create learning rate schedule
    linear_decay_lr_schedule_fn = create_learning_rate_fn(
        len(train_dataset),
        train_batch_size,
        training_args.num_train_epochs,
        training_args.warmup_steps,
        training_args.learning_rate,
    )'''

In [3]:
import torch
import clip
from PIL import Image
import cv2
import os
import pandas as pd
from EDA.eda_functions import get_label

#This model is able to predict the label of a video based on the frames + get the true label of the video

# Define device
if torch.cuda.is_available():
    device = torch.device("cuda") # use CUDA device
elif torch.backends.mps.is_available():
    device = torch.device("mps") # use MacOS GPU device (e.g., for M1 chips)
else:
    device = torch.device("cpu") # use CPU device

#load model and ijmond dataset
model, preprocess = clip.load('ViT-B/16', device)
ijmond_df = pd.read_json('data/datasets/ijmond_dataset.json')

#define class names in a list - it need prompt engineering
class_names = ["a photo of industrial plants with clear sky above chimney", "a photo of industrial plants emiting smoke from chimney"]

#Func to create a list of images from video
def preprocess_video(video_path):
    # Open the video file
    # example video : video_path = 'data/ijmond_videos/5PurGkmy0aw-1.mp4'
    video = cv2.VideoCapture(video_path)
    frames = []
    if not video.isOpened():
        print("Error: Could not open video file")
    else:
        i = 1
        while True:
            ret, image = video.read()
            if ret == False:
                print('End of video reached during preprocessing')
                break
            frames.append(image)
            i += 1
        video.release()
    return frames

#func to get the true label of the video
def get_true_label(file_name):
    row = ijmond_df[ijmond_df['file_name'] == file_name].iloc[0]
    return get_label(row)

#clip model to predict class for each frame in video
def vanilla_clip(video_path, file_name):
    #Create image list from video
    frames = preprocess_video(video_path)

    # Loop over each frame in video (36 frames in 1 video)
    i = 1
    prediction_list= []
    for frame in frames:
        # Read image and preprocess
        image = preprocess(Image.fromarray(frame)).unsqueeze(0).to(device)

        # Prepare text inputs based on class names list
        text_inputs = clip.tokenize(class_names).to(device)

        # Calculate features
        with torch.no_grad():
            image_features = model.encode_image(image)
            text_features = model.encode_text(text_inputs)

        # Calculate similarity
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
        values, indices = similarity[0].topk(1)
        if class_names[indices] == "a photo of industrial plants with clear sky above chimney":
            prediction_list.append(0)
        else:
            prediction_list.append(1)       

        # Print predictions for each frame
        #print(f"\nPredictions for frame {i}:\n")
        #for value, index in zip(values, indices):
        #    print(f"{class_names[index]:>16s}: {100 * value.item():.2f}%")
        i+=1
    print(file_name)
    print(prediction_list)
    print("True label:", get_true_label(file_name))
    if sum(prediction_list) >= 3:
        return 1
    else:
        return 0

#predict label for each video in ijmond dir
files = os.listdir("data/ijmond_videos/")
for file in files:
    video_path = f"data/ijmond_videos/{file}"
    file_name = file.split('.')[0]
    vanilla_clip(video_path, file_name)


End of video reached during preprocessing
A9W8G55JucU-3
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
True label: 1
End of video reached during preprocessing
tT4vETXW7Og-2
[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
True label: 1
End of video reached during preprocessing
aINMnqmwSUg-1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
True label: 1
End of video reached during preprocessing
cxfcZFPpflE-0
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
True label: 0
End of video reached during preprocessing
Qv9-nS5BloI-2
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1]
True label: 1
End of video reached during preprocessing
9J-4qvCueZw-1
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 