In [93]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import os

In [94]:
# Load the ImageNet-pretrained ResNet-50 backbone.
# Newer torchvision releases select weights through an explicit enum and warn on
# `pretrained=True`; the enum member below is the checkpoint that `pretrained=True`
# maps to, so the loaded weights are unchanged. Older releases have no
# `ResNet50_Weights` attribute, in which case we fall back to the original call.
try:
    resnet50_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
except AttributeError:
    resnet50_model = models.resnet50(pretrained=True)

class ResNet50FeatureExtractor(nn.Module):
    """Backbone with its final child module removed, producing flat feature vectors.

    Args:
        backbone: Optional module whose children (all but the last) form the
            extractor. When omitted, the module-level `resnet50_model` is used,
            which preserves the original zero-argument construction.
    """

    def __init__(self, backbone=None):
        super().__init__()
        if backbone is None:
            # Backward-compatible default: the backbone loaded earlier in the notebook.
            backbone = resnet50_model
        # Keep every child except the last one (the classification layer).
        self.resnet50 = nn.Sequential(*list(backbone.children())[:-1])

    def forward(self, x):
        """Return the backbone output reshaped to (batch_size, -1)."""
        x = self.resnet50(x)
        # Collapse all non-batch dimensions into a single feature axis.
        return x.view(x.size(0), -1)
    
# Feature-extractor instance built from the module-level `resnet50_model`;
# it is handed to MultimodalModel as its `feature_extractor`.
resnet50_extractor = ResNet50FeatureExtractor()

class MultimodalModel(nn.Module):
    """Linear classification head on top of an image feature extractor.

    Despite the class name, `forward` consumes a single input: the image batch.

    Args:
        feature_extractor: Module mapping an image batch to a 2-D tensor of
            shape (batch, feature_dim).
        feature_dim: Width of the vectors produced by `feature_extractor`.
            Defaults to 2048, the value that was previously hard-coded.
        num_classes: Number of output logits. Defaults to 3, the value that
            was previously hard-coded.
    """

    def __init__(self, feature_extractor, feature_dim=2048, num_classes=3):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.fc = nn.Linear(feature_dim, num_classes)

    def forward(self, image):
        """Return unnormalised class scores of shape (batch, num_classes)."""
        img_features = self.feature_extractor(image)
        logits = self.fc(img_features)
        return logits

# Assemble the full classifier: feature extractor followed by the linear head.
# NOTE(review): no checkpoint / state_dict is loaded in the visible cells, so the
# `fc` head presumably still holds its random initial weights — confirm that
# trained weights are restored somewhere before trusting the reported metrics.
model = MultimodalModel(resnet50_extractor)
# Put the model in evaluation mode for inference. As the last expression of the
# cell, the returned module is also echoed as the cell output.
model.eval()



MultimodalModel(
  (feature_extractor): ResNet50FeatureExtractor(
    (resnet50): Sequential(
      (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (4): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (

In [95]:
# Identifier of the video to evaluate. It selects the dataset folder, the CSV file,
# the frame file-name prefix, and the `video` value written to the results file.
video = 'ACCFP'  # edit this value to evaluate a different video

In [96]:
# Load the annotation CSV for the selected video, skipping its first line and
# supplying the column names explicitly.
# NOTE(review): the printed index is 1, 0, 0, 1, 0 rather than 0..4 — presumably
# the file has three columns, so pandas takes the first one as the index. The
# index is therefore not a row counter; verify that `label` is the intended
# column and that nothing downstream treats the index as a frame number.
data = pd.read_csv(f'dataset/{video}/{video}.csv', header=None, skiprows=1, names=['label', 'text']) 
print(data.head())

   label                                               text
1      1   Addressing Climate Change through Fiscal Poli...
0      0                                            [Music]
0      1                                            [Music]
1      1                                            [Music]
0      0   Indonesia is the world's largest island count...


In [97]:
# Folder the frame images of the selected video are read from.
image_folder = f'dataset/{video}/{video}_frames' 

# Image pipeline applied to every frame: resize, centre-crop to 224x224,
# convert to a tensor, then normalise each of the three channels.
# presumably these are the ImageNet mean/std the pretrained backbone was trained with — confirm
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_image(image_path):
    """Load an image file and turn it into a single-item batch for the model.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The result of the module-level `preprocess` pipeline applied to the
        RGB-converted image, with a leading batch axis of size 1 added.
    """
    # The context manager releases the underlying file handle deterministically,
    # which matters when this is called once per frame over a long video.
    with Image.open(image_path) as raw_image:
        # convert() returns a new, fully loaded image, so it stays usable
        # after the source file has been closed.
        image = raw_image.convert("RGB")
    inputs = preprocess(image).unsqueeze(0)
    return inputs

In [98]:
def evaluate_model(data, image_folder, video, model):
    """Run `model` on one frame per row of `data` and score its predictions.

    Row k of `data` (counting from 1, in iteration order) is paired with the
    image ``{video}-{k:03d}.jpg`` inside `image_folder`. Rows whose image file
    is missing are reported on stdout and left out of the metrics.

    Args:
        data: DataFrame with a 'label' column holding the true class per row.
        image_folder: Directory containing the frame images.
        video: Video identifier used as the file-name prefix of the frames.
        model: Callable returning a logits tensor for a batched image tensor.

    Returns:
        Tuple ``(accuracy, f1)`` where `f1` is the weighted-average F1 score,
        both computed over the rows whose image was found.
    """
    true_labels = []
    predictions = []

    # Number the rows by position instead of using the DataFrame index label:
    # the index is not guaranteed to be 0..n-1 (it can be a data column or be
    # non-unique), which would map many rows onto the same one or two frames.
    # For a default RangeIndex this is identical to the former `index + 1`.
    for frame_number, (_, row) in enumerate(data.iterrows(), start=1):
        image_path = os.path.join(image_folder, f"{video}-{frame_number:03d}.jpg")

        if not os.path.exists(image_path):
            print(f"Image {image_path} does not exist.")
            continue

        image_inputs = preprocess_image(image_path)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            logits = model(image_inputs)

        # The batch holds a single image, so argmax over dim 1 yields one class id.
        true_labels.append(row['label'])
        predictions.append(torch.argmax(logits, dim=1).item())

    true_labels = np.array(true_labels)
    predictions = np.array(predictions)

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    return accuracy, f1

In [99]:
# Score the selected video and keep two decimals for reporting.
accuracy, f1 = evaluate_model(data, image_folder, video, model)

accuracy = round(accuracy, 2)
f1 = round(f1, 2)

# One-row summary for this video.
results = pd.DataFrame({
    'video': [video],
    'accuracy': [accuracy],
    'f1': [f1]
})

filename='results/ResNet50_test_results.csv'
# Make sure the output folder exists so the first run does not fail on a missing directory.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# Append to the shared results file; the header is written only when the file
# does not exist yet (append mode creates it in that case).
results.to_csv(filename, mode='a', header=not os.path.exists(filename), index=False)
