1. Data Loading and Preprocessing
We'll assume you have a dataset in the form of video files, text descriptions, and speech captions. We'll need to preprocess these inputs and extract features.


In [1]:
import os
import cv2
import torch
import pandas as pd
import torchvision.transforms as transforms
from transformers import BertTokenizer, BertModel
import torchaudio
from torch.utils.data import DataLoader, TensorDataset
from torchvision.models import resnet50
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
# Function to load video data
def load_video_data(directory):
    video_paths = [os.path.join(directory, file) for file in os.listdir(directory) if file.endswith('.mp4')]
    return video_paths

# Load video data
video_directory = 'C:/Users/Manikandan/Desktop/RA/final_cut/sample/sample'
video_paths = load_video_data(video_directory)


In [3]:
# Function to preprocess video frames
def preprocess_video(video_path, frame_count=16):
    cap = cv2.VideoCapture(video_path)
    frames = []
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])
    
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    interval = max(1, total_frames // frame_count)
    
    frame_idx = 0
    while len(frames) < frame_count and cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_idx % interval == 0:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(transform(frame))
        frame_idx += 1
    
    cap.release()
    
    while len(frames) < frame_count:
        frames.append(frames[-1])
    
    return torch.stack(frames)

# Preprocess video frames
video_features = torch.stack([preprocess_video(video_path) for video_path in video_paths])


In [4]:
# Function to load text descriptions and speech captions from CSV
def load_text_and_speech(csv_path):
    df = pd.read_csv(csv_path)
    descriptions = df['creative_data_description'].tolist()
    speech_captions = df['speech'].tolist()
    return descriptions, speech_captions

# Load text descriptions and speech captions
descriptions, speech_captions = load_text_and_speech('C:/Users/Manikandan/Desktop/RA/final_cut/Sample.csv')


In [5]:
# Function to preprocess text descriptions using BERT tokenizer
def preprocess_text(text, tokenizer, max_length=128):
    tokens = tokenizer(text, return_tensors='pt', padding='max_length', truncation=True, max_length=max_length)
    return tokens['input_ids'], tokens['attention_mask']

# Preprocess text descriptions using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_features = [preprocess_text(desc, tokenizer) for desc in descriptions]
text_input_ids = torch.cat([feat[0] for feat in text_features])
text_attention_masks = torch.cat([feat[1] for feat in text_features])


In [6]:
# Function to preprocess speech captions using BERT tokenizer
def preprocess_speech(speech_captions):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenized_captions = tokenizer(speech_captions, return_tensors='pt', padding=True, truncation=True)
    input_ids = tokenized_captions['input_ids']
    attention_mask = tokenized_captions['attention_mask']
    return input_ids, attention_mask

# Preprocess speech captions using BERT tokenizer
speech_input_ids, speech_attention_mask = preprocess_speech(speech_captions)


In [7]:
# Define the multimodal classifier model
class MultiModalClassifier(nn.Module):
    def __init__(self, video_feature_size, text_feature_size, hidden_size, num_questions):
        super(MultiModalClassifier, self).__init__()
        self.video_model = resnet50(pretrained=True)
        self.video_model.fc = nn.Linear(self.video_model.fc.in_features, video_feature_size)
        
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.speech_rnn = nn.RNN(input_size=1, hidden_size=hidden_size, batch_first=True)
        
        self.fc = nn.Linear(video_feature_size + text_feature_size + hidden_size, num_questions)

    def forward(self, video_feat, text_feat, text_mask, speech_feat):
        batch_size, num_frames, channels, height, width = video_feat.shape
        video_feat = video_feat.view(-1, channels, height, width)
        video_out = self.video_model(video_feat)
        video_out = video_out.view(batch_size, num_frames, -1).mean(1)
        
        text_feat = text_feat.squeeze(1)
        text_mask = text_mask.squeeze(1)
        text_out = self.text_model(input_ids=text_feat.long(), attention_mask=text_mask.long())[1]
        
        speech_out, _ = self.speech_rnn(speech_feat.unsqueeze(-1).float())
        speech_out = speech_out[:, -1, :]
        
        combined_feat = torch.cat((video_out, text_out, speech_out), dim=1)
        
        output = torch.sigmoid(self.fc(combined_feat))
        
        return output

# Initialize the multi-modal classifier model
video_feature_size = 2048
text_feature_size = 768
hidden_size = 128
num_questions = 21
model = MultiModalClassifier(video_feature_size, text_feature_size, hidden_size, num_questions)




In [8]:
# Simulated labels for each question (replace with actual labels)
labels = torch.randint(0, 2, (150, num_questions)).float()


In [9]:
# Function to split data into train, val, and test sets
def split_data(data, labels, test_size=0.2, val_size=0.1):
    data_train, data_test, labels_train, labels_test = train_test_split(data, labels, test_size=test_size, random_state=42)
    data_train, data_val, labels_train, labels_val = train_test_split(data_train, labels_train, test_size=val_size, random_state=42)
    return data_train, data_val, data_test, labels_train, labels_val, labels_test

# Split data into train, val, and test sets
video_feats_train, video_feats_val, video_feats_test, labels_train, labels_val, labels_test = split_data(video_features, labels)
text_feats_train, text_feats_val, text_feats_test, _, _, _ = split_data(text_input_ids, labels)
text_masks_train, text_masks_val, text_masks_test, _, _, _ = split_data(text_attention_masks, labels)
speech_feats_train, speech_feats_val, speech_feats_test, _, _, _ = split_data(speech_input_ids, labels)


In [10]:
# Function to create data loaders
def create_dataloader(video_feats, text_feats, text_masks, speech_feats, labels, batch_size=1):
    dataset = TensorDataset(video_feats, text_feats, text_masks, speech_feats, labels)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader

# Create data loaders
train_loader = create_dataloader(video_feats_train, text_feats_train, text_masks_train, speech_feats_train, labels_train)
val_loader = create_dataloader(video_feats_val, text_feats_val, text_masks_val, speech_feats_val, labels_val)
test_loader = create_dataloader(video_feats_test, text_feats_test, text_masks_test, speech_feats_test, labels_test)


In [11]:
# Training setup
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 15


In [12]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for video_batch, text_batch, text_mask_batch, speech_batch, label_batch in train_loader:
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(video_batch, text_batch, text_mask_batch, speech_batch)
        
        # Calculate loss
        loss = criterion(outputs, label_batch)
        
        # Backward pass
        loss.backward()
        optimizer.step()
    
    # Validate the model
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for video_batch, text_batch, text_mask_batch, speech_batch, label_batch in val_loader:
            outputs = model(video_batch, text_batch, text_mask_batch, speech_batch)
            val_loss += criterion(outputs, label_batch).item()
    val_loss /= len(val_loader)
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss}')


Epoch 1, Validation Loss: 0.9468960960706075
Epoch 2, Validation Loss: 0.7040195514758428
Epoch 3, Validation Loss: 0.8068806628386179
Epoch 4, Validation Loss: 0.761273980140686
Epoch 5, Validation Loss: 0.6988709072271982
Epoch 6, Validation Loss: 0.7160429507493973
Epoch 7, Validation Loss: 0.7185488790273666
Epoch 8, Validation Loss: 0.7238199909528097
Epoch 9, Validation Loss: 0.7111341108878454
Epoch 10, Validation Loss: 0.704804057876269
Epoch 11, Validation Loss: 0.7077264984448751
Epoch 12, Validation Loss: 0.7103284895420074
Epoch 13, Validation Loss: 0.694024403889974
Epoch 14, Validation Loss: 0.7389432787895203
Epoch 15, Validation Loss: 0.7033341477314631


In [1]:
import pandas as pd
import torch

# Define the questions
questions = [
    "Is there a call to go online (e.g., shop online, visit the Web)? ",
    "Is there online contact information provided (e.g., URL, website)? ",
    "Is there a visual or verbal call to purchase (e.g., buy now, order now)?",
    "Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? ",
    "Is there an incentive to buy (e.g., a discount, a coupon, a sale or \"limited time offer\")? " ,
    "Is there offline contact information provided (e.g., phone, mail, store location)?",
    "Is there mention of something free? ",
    "Does the ad mention at least one specific product or service (e.g., model, type, item)? ",
    "Is there any verbal or visual mention of the price?",
    "Does the ad show the brand (logo, brand name) or trademark (something that most people know is the brand) multiple times?",
    "Does the ad show the brand or trademark exactly once at the end of the ad?",
    "Is the ad intended to affect the viewer emotionally, either with positive emotion (fun, joy), negative emotion (sad, anxious) or another type of emotion? (Note: You may not personally agree, but assess if that was the intention.)",
    "Does the ad give you a positive feeling about the brand? ",
    "Does the ad have a story arc, with a beginning and an end? ",
    "Does the ad have a reversal of fortune, where something changes for the better, or changes for the worse?",
    "Does the ad have relatable characters? ",
    "Is the ad creative/clever?",
    "Is the ad intended to be funny? (Note: You may not personally agree, but assess if that was the intention.) ",
    "Does this ad provide sensory stimulation (e.g., cool visuals, arousing music, mouth-watering)? ",
    "Is the ad visually pleasing?",
    "Does the ad have cute elements like animals, babies, animated, characters, etc?"
]

# Initialize lists to store video answers, predictions, and labels
video_answers = []
all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for i in range(len(video_paths)):
        video_feat = preprocess_video(video_paths[i]).unsqueeze(0)
        text_feat, text_mask = preprocess_text(descriptions[i], tokenizer)
        text_feat = text_feat.unsqueeze(0)
        text_mask = text_mask.unsqueeze(0)
        speech_feat = speech_input_ids[i].unsqueeze(0)
        
        output = model(video_feat, text_feat, text_mask, speech_feat)
        
        answers = [( int(output[0][j].item() > 0.5)) for j in range(len(questions))]
        video_answers.append(answers)
        
        all_preds.append((output[0] > 0.5).int().tolist())
        all_labels.append(labels[i].tolist())

# Print answers for all videos
for i in range(len(video_answers)):
    print(f"Video {i+1} answers:")
    for  answer in video_answers[i]:
        answer_str = "yes" if answer == 1 else "no"
        print(f" {answer_str}")




# Assuming `video_answers` is a list of lists containing the binary answers for each video
# Convert the binary answers (1/0) to 'yes'/'no'
video_answers_str = [['yes' if answer == 1 else 'no' for answer in video] for video in video_answers]

# Create a DataFrame from the list of lists
df = pd.DataFrame(video_answers_str)

# Write the DataFrame to an Excel file
df.to_excel('video_answers.xlsx', index=False, header=False)

print("Video answers have been written to video_answers.xlsx")


NameError: name 'model' is not defined

In [14]:

import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score


# Load ground truth data
ground_truth_df = pd.read_csv("C:/Users/Manikandan/Desktop/RA/final_cut/ground-truth.xlsx - Form Responses 1.csv")

# Extract relevant columns including 'creative_data_id'
ground_truth_df = ground_truth_df[['creative_data_id'] + questions]

# Define a function to take the majority vote for each question
def majority_vote(df):
    return df.mode().iloc[0]

# Group by creative_data_id and apply majority vote
ground_truth_agg = ground_truth_df.groupby('creative_data_id').apply(lambda x: x.iloc[0]).reset_index(drop=True)
ground_truth_agg = ground_truth_agg.drop('creative_data_id', axis=1)

# Prepare predictions DataFrame
predictions_df = pd.DataFrame(video_answers, columns= questions)

# Ensure the data types are consistent
predictions_df = predictions_df.astype(str)
ground_truth_agg = ground_truth_agg.astype(str)

# Merge predictions with ground truth on creative_data_id
comparison_df = pd.concat([predictions_df, ground_truth_agg.reset_index(drop=True)], axis=1, keys=['pred', 'true'])
comparison_df.columns = [f"{col}_pred" if idx < len(predictions_df.columns) else f"{col}_true" for idx, col in enumerate(comparison_df.columns)]
#print(comparison_df)
comparison_df.replace({"Yes": 1, "No": 0}, inplace=True)





  ground_truth_agg = ground_truth_df.groupby('creative_data_id').apply(lambda x: x.iloc[0]).reset_index(drop=True)
  comparison_df.replace({"Yes": 1, "No": 0}, inplace=True)


In [15]:
#comparison_df = comparison_df.astype(int)

# Initialize lists to store all predictions and ground truth labels
all_preds_flat = []
all_labels_flat = []

# Loop through each question to calculate metrics
for question in questions:
    preds = comparison_df[f"('pred', '{question}')_pred"]
    true = comparison_df[f"('true', '{question}')_true"]
    preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
    true.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)

    flag=False
    for i in preds:
        if i not in [1,0]:
            flag=True
    
    for i in true:
        if i not in [1,0]:
            flag=True

    if flag:
        print("Error in question: ", question)
        print("")
        continue



    
    all_preds_flat.extend(preds)
    all_labels_flat.extend(true)

    # Calculate precision, recall, and F1 score for the current question
    precision = precision_score(true, preds, pos_label=1, average='binary')
    recall = recall_score(true, preds, pos_label=1, average='binary')
    f1 = f1_score(true, preds, pos_label=1, average='binary')
    
    print(f"Metrics for {question}:")
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}\n")

# Calculate overall metrics
overall_precision = precision_score(all_labels_flat, all_preds_flat, average='macro')
overall_recall = recall_score(all_labels_flat, all_preds_flat, average='macro')
overall_f1 = f1_score(all_labels_flat, all_preds_flat, average='macro')

agreement_percentage = sum([pred == label for pred, label in zip(all_preds_flat, all_labels_flat)]) / len(all_preds_flat) * 100

print(f"Overall Agreement Percentage: {agreement_percentage}%")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall F1 Score: {overall_f1}")

Metrics for Is there a call to go online (e.g., shop online, visit the Web)? :

  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1,"Yes, both": 1, "0": 0}, inplace=True)
  preds.replace({"1": 1,"Yes, visual": 1


  Precision: 1.0
  Recall: 0.013888888888888888
  F1 Score: 0.0273972602739726

Metrics for Is there online contact information provided (e.g., URL, website)? :
  Precision: 0.45217391304347826
  Recall: 0.7222222222222222
  F1 Score: 0.5561497326203209

Metrics for Is there a visual or verbal call to purchase (e.g., buy now, order now)?:
  Precision: 0.5
  Recall: 0.5076923076923077
  F1 Score: 0.5038167938931297

Metrics for Does the ad portray a sense of urgency to act (e.g., buy before sales ends, order before ends)? :
  Precision: 0.225
  Recall: 0.20454545454545456
  F1 Score: 0.21428571428571427

Metrics for Is there an incentive to buy (e.g., a discount, a coupon, a sale or "limited time offer")? :
  Precision: 0.38095238095238093
  Recall: 0.49230769230769234
  F1 Score: 0.42953020134228187

Metrics for Is there offline contact information provided (e.g., phone, mail, store location)?:
  Precision: 0.25
  Recall: 0.6
  F1 Score: 0.35294117647058826

Metrics for Is there menti