In [None]:
!pip install -q torch torchvision transformers pillow pandas scikit-learn

In [None]:
ZIP_PATH = "/content/mvsa.zip"   # change if filename is different

!mkdir -p /content/mvsa_raw
!unzip -o "$ZIP_PATH" -d /content/mvsa_raw

!ls -R /content/mvsa_raw

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/mvsa_raw/MVSA_Single/data/3916.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3916.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3917.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3917.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3918.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3918.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3919.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3919.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/392.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/392.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3920.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3920.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3921.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3921.txt  
  inflating: /content/mvsa_raw/MVSA_Single/data/3922.jpg  
  inflating: /content/mvsa_raw/MVSA_Single/data/3922

In [None]:
import os
from pathlib import Path
import pandas as pd

# Locations of the extracted dataset: per-sample files live in DATA_DIR,
# and LABEL_FILE holds one row of annotations per sample ID.
ROOT_DIR   = "/content/mvsa_raw/MVSA_Single"
DATA_DIR   = os.path.join(ROOT_DIR, "data")
LABEL_FILE = os.path.join(ROOT_DIR, "labelResultAll.txt")

print("ROOT_DIR :", ROOT_DIR)
print("DATA_DIR :", DATA_DIR)
print("LABEL_FILE:", LABEL_FILE)

# 1) Read labelResultAll.txt
# First column: ID
# Second column header: "text,image"
labels_raw = pd.read_csv(LABEL_FILE, sep=r"\s+", engine="python")
print(labels_raw.head())
print("Columns:", labels_raw.columns)

# 2) Split "text,image" into two sentiment columns
sent_col = labels_raw.columns[1]  # should be "text,image"
labels_raw[["text_sent", "image_sent"]] = labels_raw[sent_col].str.split(",", expand=True)

# 3) Use text sentiment as label
labels = labels_raw[["ID", "text_sent"]].copy()
labels["ID"] = labels["ID"].astype(int)
labels["text_sent"] = labels["text_sent"].str.lower().str.strip()

sent_to_id = {"negative": 0, "neutral": 1, "positive": 2}
labels["label"] = labels["text_sent"].map(sent_to_id)

# Series.map yields NaN for any value missing from sent_to_id, which would
# silently turn the label column into floats and surface much later as an
# obscure failure. Fail fast here with the offending values instead.
unmapped = labels.loc[labels["label"].isna(), "text_sent"].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unrecognised text sentiment value(s) in {LABEL_FILE}: "
        f"{sorted(map(str, unmapped))}"
    )
labels["label"] = labels["label"].astype(int)

print("Sentiment mapping:", sent_to_id)
print(labels.head())

ROOT_DIR : /content/mvsa_raw/MVSA_Single
DATA_DIR : /content/mvsa_raw/MVSA_Single/data
LABEL_FILE: /content/mvsa_raw/MVSA_Single/labelResultAll.txt
   ID         text,image
0   1   neutral,positive
1   2   neutral,positive
2   3   neutral,positive
3   4  positive,positive
4   5  positive,positive
Columns: Index(['ID', 'text,image'], dtype='object')
Sentiment mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
   ID text_sent  label
0   1   neutral      1
1   2   neutral      1
2   3   neutral      1
3   4  positive      2
4   5  positive      2


In [None]:
# Collect one record per labelled ID that has both an image and non-empty text.
rows = []

for _, label_row in labels.iterrows():
    sample_id = label_row["ID"]
    stem = str(sample_id)

    img_path = os.path.join(DATA_DIR, stem + ".jpg")
    txt_path = os.path.join(DATA_DIR, stem + ".txt")

    # Skip IDs for which either modality is missing on disk.
    if not os.path.exists(img_path) or not os.path.exists(txt_path):
        continue

    # Undecodable bytes are dropped rather than raising (errors="ignore").
    with open(txt_path, "r", encoding="utf-8", errors="ignore") as handle:
        text = handle.read().strip()

    # Samples whose text is empty after stripping are not usable.
    if text:
        rows.append(
            dict(
                id=sample_id,
                text=text,
                image_path=img_path,
                label=label_row["label"],
            )
        )

df = pd.DataFrame(rows)
print(df.head())
print("Total usable samples:", len(df))

# Optional: subsample for quicker training
N_SAMPLES = 4000  # adjust as you like
if len(df) > N_SAMPLES:
    df = df.sample(N_SAMPLES, random_state=42).reset_index(drop=True)

print("Using samples:", len(df))
df["label"].value_counts()

   id                                               text  \
0   1       How I feel today #legday #jelly #aching #gym   
1   2  grattis min griskulting!!!???? va bara tvungen...   
2   3  RT @polynminion: The moment I found my favouri...   
3   4  #escort We have a young and energetic team and...   
4   5  RT @chrisashaffer: Went to SSC today to be a "...   

                                 image_path  label  
0  /content/mvsa_raw/MVSA_Single/data/1.jpg      1  
1  /content/mvsa_raw/MVSA_Single/data/2.jpg      1  
2  /content/mvsa_raw/MVSA_Single/data/3.jpg      1  
3  /content/mvsa_raw/MVSA_Single/data/4.jpg      2  
4  /content/mvsa_raw/MVSA_Single/data/5.jpg      2  
Total usable samples: 4869
Using samples: 4000


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,1569
2,1427
0,1004


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch

# Text side: tokenizer for the BERT checkpoint and the fixed sequence length
# every sample is padded/truncated to.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 64

# Per-channel normalisation constants shared by both pipelines. These match the
# ImageNet channel statistics commonly used with torchvision's pretrained models.
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]


def _tensor_and_normalize():
    """Return fresh ToTensor + Normalize steps that end each image pipeline."""
    return [transforms.ToTensor(), transforms.Normalize(NORM_MEAN, NORM_STD)]


# Training pipeline: resize, then light random augmentation before normalising.
train_transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.02),
        *_tensor_and_normalize(),
    ]
)

# Validation pipeline: deterministic resize straight to the model input size.
val_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        *_tensor_and_normalize(),
    ]
)

class MVSASingleDataset(Dataset):
    """Paired text/image samples backed by a DataFrame.

    The frame must provide the columns ``text``, ``image_path`` and ``label``.
    Tokenisation uses the module-level ``tokenizer`` and ``MAX_LEN``.

    Args:
        df: DataFrame of samples; its index is reset so positions run 0..len-1.
        transform: Callable applied to the RGB PIL image of each sample.
    """

    def __init__(self, df, transform):
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, tokenise and transform the sample at position ``idx``.

        Returns:
            dict with
              ``input_ids`` / ``attention_mask``: 1-D tensors of length MAX_LEN,
              ``image``: the result of ``transform`` on the RGB image,
              ``label``: scalar ``torch.long`` tensor.
        """
        row = self.df.iloc[idx]

        # text → tokens (always padded/truncated to exactly MAX_LEN)
        enc = tokenizer(
            str(row["text"]),
            padding="max_length",
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        # image: open inside a context manager so the underlying file handle is
        # released deterministically; convert() returns an independent copy that
        # stays valid after the file is closed.
        with Image.open(row["image_path"]) as img:
            image = img.convert("RGB")
        image = self.transform(image)

        label = torch.tensor(row["label"], dtype=torch.long)

        return {
            # return_tensors="pt" adds a leading batch dim of 1; drop it so the
            # DataLoader can stack samples itself.
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "image": image,
            "label": label
        }

BATCH_SIZE = 16

# Hold out 20% for validation, keeping label proportions equal in both parts.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# Training data gets the augmenting pipeline; validation the deterministic one.
train_dataset = MVSASingleDataset(train_df, transform=train_transform)
val_dataset = MVSASingleDataset(val_df, transform=val_transform)

# Only the training loader reshuffles between epochs.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Number of batches per epoch for each loader.
(len(train_loader), len(val_loader))

(200, 50)

In [None]:
import torch.nn as nn
from transformers import AutoModel
from torchvision import models

# Prefer the GPU when one is visible to torch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

class MultimodalSentimentNet(nn.Module):
    """Fusion classifier over a BERT text embedding and a ResNet-18 image embedding.

    The two embeddings are concatenated and passed through a small MLP head
    that produces ``num_classes`` logits.
    """

    def __init__(self, num_classes=3):
        super().__init__()

        # Text branch: pretrained BERT encoder.
        self.text_encoder = AutoModel.from_pretrained("bert-base-uncased")
        text_width = self.text_encoder.config.hidden_size  # 768

        # Image branch: pretrained ResNet-18 with its final FC layer replaced
        # by an identity, so the network outputs the features that fed that layer.
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        image_width = backbone.fc.in_features  # 512
        backbone.fc = nn.Identity()
        self.image_encoder = backbone

        # Classification head over the concatenated features (768 + 512 inputs).
        head_layers = [
            nn.Linear(text_width + image_width, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask, images):
        """Return class logits for a batch of (tokenised text, image) pairs."""
        encoded = self.text_encoder(
            input_ids=input_ids, attention_mask=attention_mask
        )
        # Hidden state at sequence position 0 (the CLS token) summarises the text.
        cls_vector = encoded.last_hidden_state[:, 0, :]

        visual_vector = self.image_encoder(images)

        # Concatenate along the feature axis and classify.
        joint = torch.cat((cls_vector, visual_vector), dim=1)
        return self.classifier(joint)

NUM_CLASSES = 3  # negative / neutral / positive

# Seed torch's global RNG before any random initialisation happens so the
# classifier-head init and (when cells run in order) the later shuffling,
# augmentation and dropout are repeatable across "Restart & Run All".
# Some GPU kernels may still be nondeterministic.
SEED = 42
torch.manual_seed(SEED)

model = MultimodalSentimentNet(num_classes=NUM_CLASSES).to(device)

Device: cuda


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:04<00:00, 10.5MB/s]


In [None]:
import torch.nn as nn
from torch.optim import AdamW

# Class weights (handle imbalance): inverse class frequency on the training
# split, rescaled so the weights sum to the number of classes.
label_freq = train_df["label"].value_counts().sort_index()
class_counts = label_freq.values.astype(np.float32)
inv = 1.0 / class_counts
weights = (inv / inv.sum()) * len(class_counts)

class_weights = torch.tensor(weights, dtype=torch.float32).to(device)
print("Class counts :", class_counts)
print("Class weights:", class_weights)

# Weighted cross-entropy plus AdamW over every parameter of the model.
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = AdamW(params=model.parameters(), lr=1e-4, weight_decay=1e-2)

Class counts : [ 803. 1255. 1142.]
Class weights: tensor([1.2804, 0.8193, 0.9003], device='cuda:0')


In [None]:
from torch.nn.functional import cross_entropy

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return ``(avg_loss, accuracy)``.

    Uses the module-level ``model``, ``criterion``, ``optimizer`` and ``device``.

    Args:
        loader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_mask``, ``image`` and ``label``.
        train: When True the model is put in training mode and its parameters
            are updated; when False the model is put in eval mode and no
            gradients are computed.

    Returns:
        Tuple ``(avg_loss, acc)`` where ``avg_loss`` is the per-batch loss
        averaged with each batch weighted by its number of samples, and
        ``acc`` is the fraction of correctly predicted samples.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train(train)  # train(False) is equivalent to eval()

    loss_sum = 0.0
    correct = 0
    total = 0

    for batch in loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        images = batch["image"].to(device)
        labels = batch["label"].to(device)
        batch_size = labels.size(0)

        if train:
            optimizer.zero_grad()

        # Gradients are only tracked while training.
        with torch.set_grad_enabled(train):
            logits = model(input_ids, attention_mask, images)
            loss = criterion(logits, labels)

            if train:
                loss.backward()
                # Cap the global gradient norm before the optimizer step.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

        preds = torch.argmax(logits, dim=1)
        correct += (preds == labels).sum().item()
        total += batch_size
        # Weight each batch's loss by its size so a smaller final batch does
        # not count as much as a full one in the epoch average.
        loss_sum += loss.item() * batch_size

    if total == 0:
        raise ValueError("run_epoch received an empty loader (no samples to process)")

    avg_loss = loss_sum / total
    acc = correct / total
    return avg_loss, acc


# Training schedule: at most EPOCHS epochs, stopping early once validation
# accuracy has failed to improve for `patience` consecutive epochs.
EPOCHS = 15
patience = 3
best_path = "/content/mvsa_multimodal_best.pt"

best_val_acc = 0.0
patience_counter = 0

for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = run_epoch(train_loader, train=True)
    val_loss, val_acc = run_epoch(val_loader, train=False)

    print(
        f"Epoch {epoch:02d}: "
        f"train_loss={train_loss:.4f}, train_acc={train_acc:.3f}, "
        f"val_loss={val_loss:.4f}, val_acc={val_acc:.3f}"
    )

    # Early stopping on val_acc: only a strictly higher accuracy counts.
    improved = val_acc > best_val_acc
    if not improved:
        patience_counter += 1
        if patience_counter >= patience:
            print("  🔻 Early stopping triggered.")
            break
        continue

    # New best: remember it, reset the counter and checkpoint the weights.
    best_val_acc = val_acc
    patience_counter = 0
    torch.save(model.state_dict(), best_path)
    print(f"  🔸 New best model saved (val_acc={best_val_acc:.3f})")

print("Best validation accuracy:", best_val_acc)

Epoch 01: train_loss=0.9028, train_acc=0.583, val_loss=0.7931, val_acc=0.679
  🔸 New best model saved (val_acc=0.679)
Epoch 02: train_loss=0.6792, train_acc=0.747, val_loss=0.8670, val_acc=0.693
  🔸 New best model saved (val_acc=0.693)
Epoch 03: train_loss=0.4954, train_acc=0.841, val_loss=1.0891, val_acc=0.698
  🔸 New best model saved (val_acc=0.698)
Epoch 04: train_loss=0.3749, train_acc=0.891, val_loss=1.0736, val_acc=0.703
  🔸 New best model saved (val_acc=0.703)
Epoch 05: train_loss=0.3116, train_acc=0.913, val_loss=1.3244, val_acc=0.693
Epoch 06: train_loss=0.2940, train_acc=0.922, val_loss=1.4329, val_acc=0.682
Epoch 07: train_loss=0.2115, train_acc=0.947, val_loss=1.8094, val_acc=0.686
  🔻 Early stopping triggered.
Best validation accuracy: 0.7025


In [None]:
FINAL_SAVE = "/content/mvsa_multimodal_sentiment.pt"

# Restore the weights from the best checkpoint written during training and
# switch the model to eval mode.
best_state = torch.load(best_path, map_location=device)
model.load_state_dict(best_state)
model.eval()

# Write the restored weights to the final export path.
torch.save(model.state_dict(), FINAL_SAVE)
print("Final best model saved at:", FINAL_SAVE)

Final best model saved at: /content/mvsa_multimodal_sentiment.pt
