In [None]:
import os
# Configure GPU-related environment variables in one step; this cell sits
# before the framework imports in the following cells.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "XLA_FLAGS": "--xla_gpu_cuda_data_dir='/usr/lib/cuda'",
})

In [None]:
import cv2 
import pandas as pd
import numpy as np
import tensorflow as tf
from PIL import Image
from torchvision import transforms
from transformers import OFATokenizer, OFAModel
from transformers.models.ofa.generate import sequence_generator

In [None]:
import torch
# Sanity check: ask PyTorch whether a CUDA device is usable. As the last
# expression of the cell, the boolean is displayed as the cell output.
torch.cuda.is_available()

In [None]:
# Per-channel normalization statistics and the square side length (in pixels)
# that patch_resize_transform feeds to Normalize / Resize.
mean = [0.5, 0.5, 0.5]
std = [0.5, 0.5, 0.5]
resolution = 256

In [None]:
def _to_rgb(image):
    """Return `image` converted to RGB mode via its `convert` method."""
    return image.convert("RGB")


# Image preprocessing pipeline: force RGB, resize to a
# resolution x resolution square with bicubic interpolation, convert to a
# tensor, then normalize each channel with `mean` / `std`.
patch_resize_transform = transforms.Compose([
    _to_rgb,
    transforms.Resize((resolution, resolution), interpolation=Image.BICUBIC),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

In [None]:
# Checkpoint identifier handed to from_pretrained; the model cell below loads
# from the same value.
ckpt_dir='OFA-tiny'
# OFA tokenizer: encodes the caption prompt and decodes generated token ids.
tokenizer = OFATokenizer.from_pretrained(ckpt_dir)

In [None]:
# Load the OFA model from the same checkpoint as the tokenizer, with
# use_cache=False passed through to from_pretrained.
model = OFAModel.from_pretrained(ckpt_dir, use_cache=False)

In [None]:
# Build an OFA SequenceGenerator configured for beam search (5 beams) with
# 3-gram repetition blocking.
# NOTE(review): `generator` is not referenced by any other visible cell; the
# captioning loop calls model.generate(...) directly — confirm whether this
# object is still needed.
generator = sequence_generator.SequenceGenerator(
    tokenizer=tokenizer,
    beam_size=5,
    max_len_b=16,
    min_len=0,
    no_repeat_ngram_size=3,
)

In [None]:
# Directory holding the per-category Excel sheet ("<CATEGORY>.xlsx") that
# lists the video IDs.
FOLDER_NAME = '/path/to/folder/containing/videos'
# Root directory of extracted frames, laid out as
# <IMAGE_DIR>/<CATEGORY>/<Video_ID>/<frame files>.
IMAGE_DIR = '/path/to/folder/containing/image frames'
# Category processed by this run; it selects the Excel sheet, the frame
# sub-directory and the name of the captions CSV.
CATEGORY =  "Explicit Hate Videos" # change to "Implicit Hate Videos" or "Non Hate Videos" as required

In [None]:
# Caption every FRAME_STRIDE-th frame of each video listed in the category
# spreadsheet. `captions` and `image_folders` are filled in lockstep:
# captions[k] is the caption of one frame and image_folders[k] is the video
# ID that frame belongs to.
image_folders = []
captions = []

df1 = pd.read_excel(f'{FOLDER_NAME}/{CATEGORY}.xlsx') #Excel file with video names and IDs

# The same prompt is used for every frame, so tokenize it once.
txt = " what does the image describe?"
inputs = tokenizer([txt], return_tensors="pt").input_ids

# List the category directory once instead of once per video, and keep the
# names in a set for O(1) membership tests.
frames_root = os.path.join(IMAGE_DIR, CATEGORY)
available_folders = set(os.listdir(frames_root))

# Only every FRAME_STRIDE-th frame is captioned.
FRAME_STRIDE = 4

for filename in df1['Video_ID']:
    # Skip videos for which no frame folder exists.
    if filename not in available_folders:
        continue
    # os.path.join inserts the separator between the category directory and
    # the video folder (plain string concatenation dropped it).
    file_path = os.path.join(frames_root, filename)
    # os.listdir returns entries in arbitrary order; sort (lexicographically)
    # so the sampled frames are the same on every run.
    list_of_frames = sorted(os.listdir(file_path))
    for frame_name in list_of_frames[::FRAME_STRIDE]:
        path = os.path.join(file_path, frame_name)
        # The context manager closes the image file once the tensor is built.
        with Image.open(path) as img:
            patch_img = patch_resize_transform(img).unsqueeze(0)
        gen = model.generate(inputs, patch_images=patch_img, num_beams=5, no_repeat_ngram_size=3)
        captions.append(tokenizer.batch_decode(gen, skip_special_tokens=True)[0].strip())
        image_folders.append(filename)
    print(f'{filename} captions complete')

In [None]:
destination_folder = '/path/to/folder/for/storing/captions'
# Names of the entries already present in the destination folder, with any
# '.npy' substring removed.
files = [entry.replace('.npy', '') for entry in os.listdir(destination_folder)]

In [None]:
# One row per captioned frame: the video the frame came from and its caption.
# `image_folders` is the list built in lockstep with `captions` by the
# captioning loop, so both columns have the same length and stay aligned.
# (The directory listing `files` has no per-caption correspondence, and the
# later pooling step relies on consecutive rows sharing the same 'Video'.)
caption_df = pd.DataFrame({
    'Video': image_folders,
    'Caption': captions})

In [None]:
# Persist the captions as "<CATEGORY> captions.csv" in the destination folder;
# the later cells read this file back.
caption_df.to_csv(f'{destination_folder}/{CATEGORY} captions.csv')

In [None]:
from transformers import BertModel, BertTokenizer
import torch

In [None]:
# Load pre-trained BERT model and tokenizer
# NOTE: these assignments rebind the names `tokenizer` and `model`, which the
# earlier cells bound to the OFA tokenizer/model. Re-running the captioning
# cell after this one would therefore use the BERT objects instead.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
model = BertModel.from_pretrained("bert-large-uncased")

In [None]:
# Function to generate BERT embeddings for a list of texts
def get_bert_embeddings(text_list):
    """Return the pooled output of the global `model` for `text_list`.

    The input is tokenized with the global `tokenizer` (padding enabled,
    truncated to at most 128 tokens) and run through `model` with gradient
    tracking disabled.

    Args:
        text_list: Text (or list of texts) accepted by the tokenizer.

    Returns:
        The model's `pooler_output` converted to a NumPy array.
    """
    encoded = tokenizer(
        text_list,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    with torch.no_grad():
        pooled = model(**encoded).pooler_output
    return pooled.detach().numpy()

In [None]:
# Reload the saved captions and keep only the 'Caption' column.
captions_csv_path = f'{destination_folder}/{CATEGORY} captions.csv'
captions = pd.read_csv(captions_csv_path)['Caption']

In [None]:
# Embed the captions one at a time; embeddings[k] is the array returned by
# get_bert_embeddings for captions[k].
embeddings = [get_bert_embeddings(captions[idx]) for idx in range(len(captions))]

In [None]:
# Stack the per-caption embedding arrays into a single NumPy array.
arr1 = np.array(embeddings)

In [None]:
# Flatten to one row per caption. The row count is inferred (-1) and the
# width is taken from the last axis of arr1, so this works for any number of
# captions instead of one hard-coded dataset size.
arr2 = arr1.reshape(-1, arr1.shape[-1])

In [None]:
def max_pool_embeddings(embeddings):
    """Return the element-wise maximum of `embeddings` along dimension 1.

    Args:
        embeddings: A torch tensor with at least two dimensions.

    Returns:
        The max values tensor (dimension 1 reduced away); the argmax
        indices produced by torch.max are discarded.
    """
    return torch.max(embeddings, dim=1).values

In [None]:
# Function to apply max pooling to a single embedding tensor
def max_pool_single_embedding(embedding):
    """Max-pool one embedding along its first dimension.

    Args:
        embedding: A torch tensor or a NumPy array; arrays are converted
            to a tensor first.

    Returns:
        A torch tensor holding the maximum over dimension 0.
    """
    tensor = torch.tensor(embedding) if isinstance(embedding, np.ndarray) else embedding
    return torch.max(tensor, dim=0).values

# Function to apply max pooling to a list of embeddings
def max_pool_embeddings_list(embeddings):
    """Max-pool every embedding in `embeddings` and stack the results.

    Args:
        embeddings: Iterable of tensors / NumPy arrays, each accepted by
            max_pool_single_embedding.

    Returns:
        A tensor built with torch.stack from the pooled embeddings, in
        input order.
    """
    return torch.stack(list(map(max_pool_single_embedding, embeddings)))

In [None]:
# Pool each caption embedding along its first dimension; emb_stack[k]
# corresponds to embeddings[k].
emb_stack = [max_pool_single_embedding(emb) for emb in embeddings]

In [None]:
# Mark rows that must be dropped: overwrite the embedding row with the
# sentinel value -1000 and tag the matching folder entry as 'delete'.
# A scalar assignment broadcasts across the whole row, so it works for any
# embedding width (a fixed-length fill array only fits one specific width).
# NOTE(review): `missing_index` is not defined in any visible cell — it must
# be defined before this cell is run.
for i in missing_index:
    arr2[i] = -1000
    image_folders[i] = 'delete'

In [None]:
# Keep only the rows that have at least one value different from the -1000
# sentinel, and only the folder entries not tagged 'delete'.
filtered_list = [row for row in arr2 if (row != -1000).any()]
filtered_image_folders = [name for name in image_folders if name != 'delete']

In [None]:
# Max-pool the per-frame caption embeddings into one vector per video.
#
# Row k of the captions CSV and row k of arr2 are expected to describe the
# same frame, with all frames of a video stored in consecutive rows. Each run
# of consecutive rows sharing the same 'Video' value is reduced with an
# element-wise max over *all* rows of that run, including its last row and
# including the final run of the file.
df = pd.read_csv(f'{destination_folder}/{CATEGORY} captions.csv')

video_ids = df['Video'].to_numpy()
if len(video_ids) != len(arr2):
    raise ValueError(
        f'captions CSV has {len(video_ids)} rows but arr2 has {len(arr2)} rows'
    )

max_pool_emb = []
cnt = 0  # number of boundaries between consecutive videos

if len(video_ids) > 0:
    # Index of the first row of every run except the first one.
    boundaries = np.flatnonzero(video_ids[1:] != video_ids[:-1]) + 1
    cnt = len(boundaries)
    starts = np.concatenate(([0], boundaries))
    ends = np.concatenate((boundaries, [len(video_ids)]))
    max_pool_emb = [arr2[start:end].max(axis=0) for start, end in zip(starts, ends)]

In [None]:
# Convert the list of pooled per-video vectors into a single NumPy array.
arr4 = np.array(max_pool_emb)

In [None]:
# Write the pooled embeddings to disk.
# NOTE(review): the output path is hard-coded and its file name says
# "NON_HATE" regardless of the CATEGORY selected earlier in the notebook —
# confirm it matches the category being processed before running.
pooled_embeddings_path = "/home/ankr/Notebook/HateMM/Own Dataset/Transcription Features/OWN_NON_HATE_captions_pooled_embeddings.npy"
np.save(pooled_embeddings_path, arr4)