In [49]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os
import pandas as pd


In [51]:
def load_mvsa_dataset(base_path="MVSA_Single"):
    label_file = os.path.join(base_path, "labelResultAll.txt")
    data_dir = os.path.join(base_path, "data")

    records = []

    with open(label_file, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.readlines()

    for line in lines[1:]:
        parts = line.strip().split("\t")
        if len(parts) != 2:
            continue

        img_id = parts[0]
        sentiments = parts[1].split(",")

        if len(sentiments) != 2:
            continue

        text_sentiment = sentiments[0].strip().lower()
        label_map = {"negative": 0, "neutral": 1, "positive": 2}

        if text_sentiment not in label_map:
            continue

        text_path = os.path.join(data_dir, f"{img_id}.txt")
        image_path = os.path.join(data_dir, f"{img_id}.jpg")

        if not os.path.exists(text_path) or not os.path.exists(image_path):
            continue

        with open(text_path, "r", encoding="utf-8", errors="ignore") as t:
            text = t.read().strip()

        records.append({
            "text": text,
            "image_path": image_path,
            "label": label_map[text_sentiment]
        })

    return pd.DataFrame(records)

df = load_mvsa_dataset()
print(df.shape)


(4869, 3)


In [53]:
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

print(train_df.shape, val_df.shape)


(3895, 3) (974, 3)


In [55]:
from transformers import BertTokenizer
from PIL import Image
from torchvision import transforms

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

class MVSADataset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        encoding = tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
            return_tensors="pt"
        )

        image = Image.open(row["image_path"]).convert("RGB")
        image = image_transform(image)

        label = torch.tensor(row["label"], dtype=torch.long)

        return (
            encoding["input_ids"].squeeze(0),
            encoding["attention_mask"].squeeze(0),
            image,
            label
        )


In [57]:
train_loader = DataLoader(
    MVSADataset(train_df),
    batch_size=8,
    shuffle=True
)

val_loader = DataLoader(
    MVSADataset(val_df),
    batch_size=8
)


In [59]:
import torch
import torch.nn as nn
import torchvision.models as models
from transformers import BertModel

class MultiModalSentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()

        # Text encoder
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Image encoder
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Identity()

        # Fusion + classifier
        self.fc1 = nn.Linear(768 + 2048, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask, images):
        text_features = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        ).pooler_output

        image_features = self.resnet(images)

        fused = torch.cat((text_features, image_features), dim=1)
        x = self.fc1(fused)
        x = self.relu(x)
        x = self.dropout(x)
        return self.fc2(x)


In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = MultiModalSentimentModel().to(device)

# Class-weighted loss
class_counts = df["label"].value_counts().sort_index()
weights = 1.0 / class_counts
weights = torch.tensor(weights.values, dtype=torch.float).to(device)

criterion = torch.nn.CrossEntropyLoss(weight=weights)

# optimizer = torch.optim.AdamW(
#     filter(lambda p: p.requires_grad, model.parameters()),
#     lr=1e-3
# )
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=2e-5
)




In [63]:
# # Freeze BERT
# for param in model.bert.parameters():
#     param.requires_grad = False

# Unfreeze ONLY the last BERT encoder layer
for param in model.bert.encoder.layer[-1].parameters():
    param.requires_grad = True

# Freeze ResNet
for param in model.resnet.parameters():
    param.requires_grad = False


In [65]:
# epochs = 5
epochs = 2

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for input_ids, attention_mask, images, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1} Loss:", total_loss / len(train_loader))


Epoch 1 Loss: 0.8516178910737165
Epoch 2 Loss: 0.5651495594538947


In [66]:
# Save trained model
torch.save(model.state_dict(), "multimodal_model.pth")

print("Model saved as multimodal_model.pth")


Model saved as multimodal_model.pth
