In [55]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel

from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import numpy as np
import os

In [56]:
# The tokenizer and the encoder are loaded from one shared checkpoint name so
# that both always refer to the same pretrained BERT variant.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
bert_model = BertModel.from_pretrained(PRETRAINED_NAME)

class MultimodalModel(nn.Module):
    """Three-way text classifier: a BERT encoder followed by one linear layer.

    Despite the class name, only text is consumed by this implementation.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called with the tokenizer output as keyword arguments, return
            an object that has a ``pooler_output`` attribute.
        tokenizer: Optional callable that converts raw text into the encoder's
            keyword inputs. When omitted, the module-level ``tokenizer`` is
            looked up at call time, which is what the original code relied on.
    """

    def __init__(self, bert_model, tokenizer=None):
        super().__init__()
        self.bert_model = bert_model
        self.tokenizer = tokenizer
        # Linear head mapping the pooled encoder output to 3 class logits.
        self.fc = nn.Linear(bert_model.config.hidden_size, 3)

    def forward(self, text):
        """Tokenize ``text``, encode it, and return the class logits.

        Args:
            text: Raw text handed straight to the tokenizer.

        Returns:
            Tensor of logits whose last dimension has size 3.
        """
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        text_inputs = tokenize(text, return_tensors='pt', padding=True, truncation=True, max_length=512)

        # Tokenizer output is created on the CPU. Move it to the device that
        # holds the weights so the forward pass also works after
        # ``model.to(device)`` instead of failing with a device mismatch.
        device = self.fc.weight.device
        text_inputs = {
            name: value.to(device) if hasattr(value, 'to') else value
            for name, value in text_inputs.items()
        }

        text_outputs = self.bert_model(**text_inputs)
        text_features = text_outputs.pooler_output
        logits = self.fc(text_features)
        return logits

# Wrap the pretrained encoder in the classifier and put it in inference mode.
# nn.Module.eval() returns the module itself, so the chained call leaves
# `model` bound to the classifier; the bare name below displays its structure.
# NOTE(review): no checkpoint is loaded into the model in the cells shown, so
# the linear head `fc` still has its initial weights when it is evaluated -
# confirm this is intended.
model = MultimodalModel(bert_model).eval()
model

MultimodalModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [57]:
# Identifier of the video under evaluation. It selects the input file
# dataset/<video>/<video>.csv and is written to the 'video' column of the
# results file. Change it here to evaluate another video.
video = 'WICC'

In [58]:
# Each video has its own folder holding a CSV named after the video.
csv_path = f'dataset/{video}/{video}.csv'

# skiprows=1 drops the first line of the file; header=None together with
# `names` supplies the column labels instead.
# NOTE(review): the preview printed below shows index values 0, 0, 0, 2, 2
# rather than 0..4, which suggests the file has one more column than `names`
# and pandas is using the first column as the index - confirm that 'label' is
# read from the intended column.
data = pd.read_csv(csv_path, names=['label', 'text'], header=None, skiprows=1)
print(data.head())

   label                                               text
0      0                            What is Climate Change?
0      1  goal 13 of the sustainable development goals c...
0      0  me Tom Tom is a college professor that teaches...
2      2  during one of his lectures one of his students...
2      2  during one of his lectures one of his students...


In [59]:
def evaluate_model(data, model):
    """Score ``model`` on every row of ``data``.

    Args:
        data: DataFrame with a 'label' column (true class of each row) and a
            'text' column (input handed to the model, one row at a time).
        model: Callable mapping one text to a logits tensor. The argmax over
            ``dim=1`` must contain exactly one value, i.e. the logits carry a
            batch dimension of size one.

    Returns:
        Tuple ``(accuracy, f1)`` where ``f1`` is the weighted-average F1 score.
    """
    # The metrics must not depend on the mode the caller left the model in:
    # in training mode dropout stays active and predictions become random.
    # Objects without a `training` flag (plain callables) are left untouched.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    predictions = []
    try:
        # One no-grad context for the whole pass; gradients are never needed.
        with torch.no_grad():
            for transcript in data['text']:
                logits = model(transcript)
                predictions.append(torch.argmax(logits, dim=1).item())
    finally:
        # Hand the model back in the mode it arrived in, even on failure.
        if was_training:
            model.train()

    true_labels = np.array(data['label'].tolist())
    predictions = np.array(predictions)

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    return accuracy, f1

In [60]:
# Evaluate the selected video and round both metrics to two decimals for the report.
accuracy, f1 = evaluate_model(data, model)

accuracy = round(accuracy, 2)
f1 = round(f1, 2)

results = pd.DataFrame({
    'video': [video],
    'accuracy': [accuracy],
    'f1': [f1]
})

filename='results/BERT_test_results.csv'

# to_csv does not create missing directories, so make sure the folder exists.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Append one row per run. The header is written only when the file is missing
# or still empty, so the CSV ends up with exactly one header line.
# Note: re-running this cell for the same video appends another row.
write_header = not os.path.exists(filename) or os.path.getsize(filename) == 0
results.to_csv(filename, mode='a', header=write_header, index=False)
