In [53]:
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import BertModel, BertTokenizer
from sklearn.metrics import classification_report, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms
import os
import pandas as pd


In [55]:
def load_mvsa_dataset(base_path="MVSA_Single"):
    """Load text/image pairs and their text-sentiment labels into a DataFrame.

    Reads ``<base_path>/labelResultAll.txt``. The first line is skipped as a
    header; every other line is expected to have the form
    ``<id><TAB><text_sentiment>,<image_sentiment>``. For each id the text is
    read from ``<base_path>/data/<id>.txt`` and the image is expected at
    ``<base_path>/data/<id>.jpg``.

    A line is silently skipped when it is malformed, when its text sentiment
    is not one of negative/neutral/positive, or when either data file is
    missing. Only the text sentiment is used as the label; the image
    sentiment is parsed but ignored.

    Args:
        base_path: Directory containing ``labelResultAll.txt`` and ``data/``.

    Returns:
        pandas.DataFrame with columns ``text`` (stripped file content),
        ``image_path`` and ``label`` (0 = negative, 1 = neutral,
        2 = positive). The three columns are present even when no row
        qualifies, so downstream column access does not fail on an empty
        dataset.

    Raises:
        FileNotFoundError: If the label file does not exist.
    """
    label_file = os.path.join(base_path, "labelResultAll.txt")
    data_dir = os.path.join(base_path, "data")

    # Built once instead of once per line.
    label_map = {"negative": 0, "neutral": 1, "positive": 2}
    columns = ["text", "image_path", "label"]

    records = []

    with open(label_file, "r", encoding="utf-8", errors="ignore") as f:
        next(f, None)  # skip the header line (no-op on an empty file)

        # Stream the file rather than materialising every line up front.
        for line in f:
            parts = line.strip().split("\t")
            if len(parts) != 2:
                continue

            img_id = parts[0]
            sentiments = parts[1].split(",")
            if len(sentiments) != 2:
                continue

            text_sentiment = sentiments[0].strip().lower()
            if text_sentiment not in label_map:
                continue

            text_path = os.path.join(data_dir, f"{img_id}.txt")
            image_path = os.path.join(data_dir, f"{img_id}.jpg")

            # Keep only samples for which both modalities are on disk.
            if not os.path.exists(text_path) or not os.path.exists(image_path):
                continue

            with open(text_path, "r", encoding="utf-8", errors="ignore") as t:
                text = t.read().strip()

            records.append({
                "text": text,
                "image_path": image_path,
                "label": label_map[text_sentiment]
            })

    # Explicit columns keep the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)

# Build the sample table from the default "MVSA_Single" directory and print
# its (rows, columns) shape as a quick sanity check on how many samples loaded.
df = load_mvsa_dataset()
print(df.shape)


(4869, 3)


In [56]:
from sklearn.model_selection import train_test_split

# Only the held-out 20% is used in this notebook; the larger partition is
# discarded. Stratifying on the label keeps the class proportions in the
# split, and the fixed seed makes the split repeatable.
_, val_df = train_test_split(df, stratify=df["label"], test_size=0.2, random_state=42)

# Tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Image preprocessing: fixed 224x224 resize, conversion to a tensor, then
# per-channel normalization with the mean/std values given below.
image_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

class MVSADataset(Dataset):
    """Dataset yielding tokenized text, a preprocessed image and a label.

    Each row of the wrapped DataFrame must provide the columns ``text``,
    ``image_path`` and ``label``. Tokenization and image preprocessing use
    the module-level ``tokenizer`` and ``image_transform`` objects.

    Args:
        df: DataFrame of samples. Its index is reset so positional access
            works regardless of how the frame was sliced or shuffled.
        max_length: Token length every text is padded/truncated to
            (default 128).
    """

    def __init__(self, df, max_length=128):
        self.df = df.reset_index(drop=True)
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, image, label)`` for row ``idx``.

        ``input_ids`` and ``attention_mask`` are the tokenizer outputs with
        the leading batch dimension removed, ``image`` is the result of
        ``image_transform`` and ``label`` is a scalar ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]

        encoding = tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Image.open is lazy and keeps the file handle open; the context
        # manager closes it as soon as the pixels have been converted instead
        # of leaving one open file per sample until garbage collection.
        with Image.open(row["image_path"]) as img:
            image = image_transform(img.convert("RGB"))

        label = torch.tensor(row["label"], dtype=torch.long)

        return (
            # return_tensors="pt" adds a batch dimension of 1; drop it so the
            # DataLoader can stack samples itself.
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            image,
            label
        )

# Batch the held-out split for evaluation, 8 samples at a time; shuffling is
# left at the DataLoader default.
val_loader = DataLoader(MVSADataset(val_df), batch_size=8)


In [59]:
class MultiModalSentimentModel(nn.Module):
    """Late-fusion classifier over BERT text features and ResNet-50 image features.

    The pooled BERT output (768 features) and the ResNet-50 output with its
    final ``fc`` layer replaced by an identity (2048 features) are
    concatenated and passed through a two-layer MLP head.

    Args:
        num_classes: Number of output classes (default 3).
    """

    def __init__(self, num_classes=3):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")
        # `pretrained=True` is the deprecated spelling; the explicit weights
        # enum below selects the same ImageNet checkpoint without the
        # deprecation warning.
        self.resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        # Drop the ImageNet classification layer so the network returns the
        # features that would have fed it.
        self.resnet.fc = nn.Identity()

        self.fc1 = nn.Linear(768 + 2048, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask, images):
        """Return unnormalized class scores for a batch.

        Args:
            input_ids: Token ids passed to BERT.
            attention_mask: Attention mask passed to BERT.
            images: Image batch passed to the ResNet backbone.

        Returns:
            Output of the final linear layer, with ``num_classes`` scores per
            sample; no softmax is applied.
        """
        text_features = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).pooler_output

        image_features = self.resnet(images)

        # Fuse the two modalities by concatenating along the feature axis.
        fused = torch.cat((text_features, image_features), dim=1)
        x = self.fc1(fused)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)

# Evaluate on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiModalSentimentModel().to(device)
# torch.load unpickles the file; weights_only=True restricts that to tensors
# and plain containers, which is all a state dict needs, so a tampered
# checkpoint cannot run arbitrary code on load.
model.load_state_dict(
    torch.load("multimodal_model.pth", map_location=device, weights_only=True)
)
# Switch to inference mode (disables dropout) before evaluating.
model.eval()




MultiModalSentimentModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [60]:
# Ground-truth labels and predicted class indices, accumulated batch by batch.
y_true, y_pred = [], []

# Gradients are not needed for evaluation.
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, images, labels = batch

        # Only the model inputs are moved to the device; labels stay on the CPU.
        outputs = model(
            input_ids.to(device),
            attention_mask.to(device),
            images.to(device),
        )

        # The predicted class is the index of the highest score per sample.
        y_pred.extend(outputs.argmax(dim=1).cpu().numpy())
        y_true.extend(labels.numpy())


In [61]:
# Display names for label ids 0, 1 and 2, in that order.
class_names = ["Negative", "Neutral", "Positive"]

# Per-class precision/recall/F1 plus the averaged summary rows.
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

# Rows are true classes, columns are predicted classes; left as the final
# expression so the notebook displays the array.
confusion_matrix(y_true, y_pred)


              precision    recall  f1-score   support

    Negative       0.65      0.76      0.70       244
     Neutral       0.76      0.71      0.74       384
    Positive       0.80      0.76      0.78       346

    accuracy                           0.74       974
   macro avg       0.74      0.75      0.74       974
weighted avg       0.75      0.74      0.74       974



array([[186,  39,  19],
       [ 65, 274,  45],
       [ 36,  47, 263]])