In [6]:
!pip install torch==2.3.1 torchvision==0.18.1 torchaudio==2.3.1 transformers==4.45.0 datasets==2.18.0 librosa==0.11.0 scikit-learn==1.7.2 pandas==2.2.3 numpy==2.1.2 matplotlib==3.9.2 seaborn==0.13.2 opencv-python==4.10.0.84 tqdm==4.67.1 soundfile==0.13.0 scipy==1.13.0 pydub==0.25.1 Pillow==11.0.0 tensorboard==2.17.0 wget==3.2


     

 Collecting torch==2.3.1 

   Downloading torch-2.3.1-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB) 

 Collecting torchvision==0.18.1 

   Downloading torchvision-0.18.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.6 kB) 

 Collecting torchaudio==2.3.1 

   Downloading torchaudio-2.3.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.4 kB) 

 Collecting transformers==4.45.0 

   Downloading transformers-4.45.0-py3-none-any.whl.metadata (44 kB) 

      ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.4/44.4 kB 1.7 MB/s eta 0:00:00 

 Collecting datasets==2.18.0 

   Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB) 


 Collecting scikit-learn==1.7.2 

   Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB) 

 Collecting pandas==2.2.3 

   Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB) 

      ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 89.9/89.9 kB 3.9 MB/s eta 0:00

In [7]:
import os
import tarfile
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
import torchaudio
import transformers
from transformers import AutoModel, AutoTokenizer
import cv2
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score, classification_report
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("Checking GPU...")
print(f"GPU available: {torch.cuda.is_available()}")
print(f"GPU name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Install required libraries
!pip install -q transformers torchaudio torchvision opencv-python pillow seaborn librosa

# Check if dataset is already downloaded
if not os.path.exists("MELD_Raw"):
    print("Dataset not found. Downloading...")

    # Download raw MELD archive
    if not os.path.exists("MELD.Raw.tar.gz"):
        !wget -q https://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz

    # Download CSV files
    if not os.path.exists("train_sent_emo.csv"):
        !wget -q https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/train_sent_emo.csv
    if not os.path.exists("dev_sent_emo.csv"):
        !wget -q https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/dev_sent_emo.csv
    if not os.path.exists("test_sent_emo.csv"):
        !wget -q https://raw.githubusercontent.com/declare-lab/MELD/master/data/MELD/test_sent_emo.csv

    # Extract only if not yet extracted
    print("Extracting MELD.Raw.tar.gz...")
    with tarfile.open('MELD.Raw.tar.gz', 'r:gz') as tar:
        tar.extractall('MELD_Raw')

else:
    print("Dataset already exists. Skipping download and extraction.")


with tarfile.open('MELD.Raw.tar.gz.1', 'r:gz') as tar:
    tar.extractall('MELD_Raw')

train_df = pd.read_csv('train_sent_emo.csv')
dev_df = pd.read_csv('dev_sent_emo.csv')
test_df = pd.read_csv('test_sent_emo.csv')

emotion_labels = {'neutral': 0, 'joy': 1, 'surprise': 2, 'sadness': 3, 'fear': 4, 'disgust': 5, 'anger': 6}
sentiment_labels = {'neutral': 0, 'positive': 1, 'negative': 2}

class MELDDataset(Dataset):
    """Serve MELD utterances as text, audio and/or video tensors plus an emotion label.

    Every item is built from one row of ``dataframe`` (the columns ``Dialogue_ID``,
    ``Utterance_ID``, ``Utterance`` and ``Emotion`` are read) together with the media
    files ``dia{Dialogue_ID}_utt{Utterance_ID}.wav`` / ``.mp4`` below ``base_path``.

    Args:
        dataframe: Table of utterances, one row per sample.
        base_path: Directory that contains the split sub-directories.
        modality: Which streams ``__getitem__`` returns. One of ``'text'``,
            ``'audio'``, ``'video'``, ``'text_audio'``, ``'text_video'``,
            ``'audio_video'`` or ``'all'``.
        max_length: Token length every utterance is padded/truncated to.
        sr: Target audio sample rate.
        max_frames: Number of video frames returned per clip.
        split: Optional ``'train'``, ``'dev'`` or ``'test'``. When given, media is
            looked up only in that split's directory. When ``None`` (default, the
            historical behaviour) the train, dev and test directories are probed in
            that order and the first existing file wins.
            NOTE(review): if dialogue/utterance ids repeat across splits (believed to
            be the case for MELD -- confirm), the default makes dev/test rows pick up
            *training* media; pass ``split`` to avoid that.

    Raises:
        ValueError: If ``split`` is not ``None`` and not a known split name.
    """

    # Seconds of audio every clip is padded or truncated to.
    CLIP_SECONDS = 3
    # Side length in pixels of the square frames that are returned.
    FRAME_SIZE = 224
    # Split name -> sub-directory of ``base_path``; insertion order is the probe order.
    SPLIT_DIRS = {
        'train': 'train_splits',
        'dev': 'dev_splits_complete',
        'test': 'test_splits_complete',
    }

    def __init__(self, dataframe, base_path, modality='all', max_length=128, sr=22050,
                 max_frames=75, split=None):
        if split is not None and split not in self.SPLIT_DIRS:
            raise ValueError(f"split must be one of {sorted(self.SPLIT_DIRS)} or None, got {split!r}")
        self.dataframe = dataframe
        self.base_path = base_path
        self.modality = modality
        self.max_length = max_length
        self.sr = sr
        self.max_frames = max_frames
        self.split = split
        if split is None:
            self.search_dirs = list(self.SPLIT_DIRS.values())
        else:
            self.search_dirs = [self.SPLIT_DIRS[split]]
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    def __len__(self):
        return len(self.dataframe)

    def _resolve_media_path(self, dialogue_id, utterance_id, extension):
        """Return the first existing media path among the searched split directories.

        If no candidate exists the last one is returned, so the caller's load attempt
        fails and its zero-tensor fallback is used.
        """
        file_name = f"dia{dialogue_id}_utt{utterance_id}.{extension}"
        candidates = [f"{self.base_path}/{split_dir}/{file_name}" for split_dir in self.search_dirs]
        for candidate in candidates:
            if os.path.exists(candidate):
                return candidate
        return candidates[-1]

    def load_audio(self, dialogue_id, utterance_id):
        """Load one utterance's waveform as a mono 1-D tensor of ``sr * 3`` samples.

        The audio is resampled to ``self.sr``, averaged over channels, then
        zero-padded or truncated to three seconds. Any loading failure yields an
        all-zero tensor of the same length instead of raising.
        """
        audio_path = self._resolve_media_path(dialogue_id, utterance_id, "wav")
        target_len = self.sr * self.CLIP_SECONDS

        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            if sample_rate != self.sr:
                waveform = torchaudio.transforms.Resample(sample_rate, self.sr)(waveform)
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            num_samples = waveform.shape[1]
            if num_samples < target_len:
                waveform = torch.nn.functional.pad(waveform, (0, target_len - num_samples))
            else:
                waveform = waveform[:, :target_len]
            return waveform.squeeze(0)
        except Exception:
            # Deliberate best effort (missing/corrupt file): substitute silence.
            # `Exception` rather than a bare except so Ctrl-C still interrupts.
            return torch.zeros(target_len)

    def load_video_frames(self, dialogue_id, utterance_id):
        """Return ``max_frames`` evenly spaced RGB frames as a float tensor.

        The result has shape ``(max_frames, 3, 224, 224)`` with values scaled to
        ``[0, 1]``. Clips that yield fewer frames (or cannot be read at all) are
        padded with all-zero frames.
        """
        video_path = self._resolve_media_path(dialogue_id, utterance_id, "mp4")
        size = self.FRAME_SIZE

        frames = []
        cap = None
        try:
            cap = cv2.VideoCapture(video_path)
            frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            if frame_count > 0:
                # A set gives O(1) membership tests while reading sequentially.
                wanted = set(
                    np.linspace(0, frame_count - 1, min(self.max_frames, frame_count), dtype=int).tolist()
                )
                for i in range(max(wanted) + 1):
                    ret, frame = cap.read()
                    if not ret:
                        break
                    if i in wanted:
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frame = cv2.resize(frame, (size, size))
                        # HWC uint8 -> CHW float in [0, 1]
                        frame = torch.tensor(frame).permute(2, 0, 1).float() / 255.0
                        frames.append(frame)
        except Exception:
            # Best effort: keep whatever was decoded; the rest is zero-padded below.
            pass
        finally:
            # Release the decoder even when reading failed part-way through.
            if cap is not None:
                cap.release()

        missing = self.max_frames - len(frames)
        if missing > 0:
            frames.extend(torch.zeros(3, size, size) for _ in range(missing))
        return torch.stack(frames[:self.max_frames])

    def process_text(self, text):
        """Tokenise ``text`` to ``(input_ids, attention_mask)``, each of length ``max_length``."""
        inputs = self.tokenizer(text, return_tensors='pt', max_length=self.max_length,
                               padding='max_length', truncation=True)
        return inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return the tensors selected by ``self.modality`` followed by the emotion index.

        Layout per modality:
            'text'        -> (input_ids, attention_mask, emotion)
            'audio'       -> (audio, emotion)
            'video'       -> (video_frames, emotion)
            'text_audio'  -> (input_ids, attention_mask, audio, emotion)
            'text_video'  -> (input_ids, attention_mask, video_frames, emotion)
            'audio_video' -> (audio, video_frames, emotion)
            'all'         -> (input_ids, attention_mask, audio, video_frames, emotion)

        Only the streams a modality actually returns are loaded. Previously the audio
        branch was keyed on the *text* modalities, so 'audio' and 'audio_video'
        silently received all-zero audio.
        """
        row = self.dataframe.iloc[idx]
        dialogue_id = row['Dialogue_ID']
        utterance_id = row['Utterance_ID']
        # Emotion strings missing from the mapping fall back to index 0.
        emotion = emotion_labels.get(row['Emotion'].lower(), 0)
        modality = self.modality

        if modality == 'audio':
            return self.load_audio(dialogue_id, utterance_id), emotion
        if modality == 'video':
            return self.load_video_frames(dialogue_id, utterance_id), emotion
        if modality == 'audio_video':
            audio = self.load_audio(dialogue_id, utterance_id)
            video_frames = self.load_video_frames(dialogue_id, utterance_id)
            return audio, video_frames, emotion

        text_input, attention_mask = self.process_text(row['Utterance'])

        if modality == 'text':
            return text_input, attention_mask, emotion
        if modality == 'text_audio':
            return text_input, attention_mask, self.load_audio(dialogue_id, utterance_id), emotion
        if modality == 'text_video':
            return text_input, attention_mask, self.load_video_frames(dialogue_id, utterance_id), emotion
        if modality == 'all':
            audio = self.load_audio(dialogue_id, utterance_id)
            video_frames = self.load_video_frames(dialogue_id, utterance_id)
            return text_input, attention_mask, audio, video_frames, emotion

        # Unrecognised modality: keep the historical five-tuple with zero-filled media.
        audio = torch.zeros(self.sr * self.CLIP_SECONDS)
        video_frames = torch.zeros(self.max_frames, 3, self.FRAME_SIZE, self.FRAME_SIZE)
        return text_input, attention_mask, audio, video_frames, emotion

class TextModel(nn.Module):
    """Text-only emotion classifier: a BERT encoder followed by a linear head.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        encoder = AutoModel.from_pretrained("bert-base-uncased")
        self.bert = encoder
        self.dropout = nn.Dropout(0.1)
        # The head consumes the encoder's pooled output, so its width follows the config.
        self.classifier = nn.Linear(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of token ids and their attention mask."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        return self.classifier(self.dropout(pooled))

class AudioModel(nn.Module):
    """Raw-waveform emotion classifier: three strided Conv1d stages plus a linear head.

    Each stage is Conv1d (stride 2, padding = kernel // 2) -> ReLU -> MaxPool1d(8);
    a final adaptive average pool collapses the time axis to a single step.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        # (in_channels, out_channels, kernel_size) for each convolutional stage.
        stage_specs = ((1, 16, 64), (16, 32, 32), (32, 64, 16))
        layers = []
        for in_channels, out_channels, kernel in stage_specs:
            layers.append(
                nn.Conv1d(in_channels, out_channels, kernel_size=kernel, stride=2, padding=kernel // 2)
            )
            layers.append(nn.ReLU())
            layers.append(nn.MaxPool1d(8))
        layers.append(nn.AdaptiveAvgPool1d(1))
        self.cnn = nn.Sequential(*layers)
        self.classifier = nn.Linear(stage_specs[-1][1], num_classes)

    def forward(self, x):
        """Return class logits for waveforms shaped ``(batch, samples)`` or ``(samples,)``."""
        # A single un-batched waveform is promoted to a batch of one.
        batch = x.unsqueeze(0) if x.dim() == 1 else x
        # Insert the channel axis expected by Conv1d, then drop the pooled time axis.
        embedding = self.cnn(batch.unsqueeze(1)).squeeze(-1)
        return self.classifier(embedding)

class VideoModel(nn.Module):
    """Video emotion classifier: per-frame ResNet-18 features fed to a bidirectional LSTM.

    Args:
        num_classes: Number of output logits.

    NOTE(review): the frames built by the dataset in this notebook are only scaled to
    [0, 1]; the ImageNet weights presumably expect mean/std-normalised input --
    confirm whether normalisation should be added upstream.
    """

    def __init__(self, num_classes=7):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; IMAGENET1K_V1 is the weight
        # set that flag selected, so the loaded parameters are unchanged.
        self.cnn = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        # Read the backbone's feature width before its classification layer is removed.
        feature_dim = self.cnn.fc.in_features
        self.cnn.fc = nn.Identity()
        lstm_hidden = 256
        self.lstm = nn.LSTM(feature_dim, lstm_hidden, batch_first=True, bidirectional=True)
        # Bidirectional LSTM: forward and backward states are concatenated.
        self.classifier = nn.Linear(2 * lstm_hidden, num_classes)

    def forward(self, x):
        """Return class logits for clips shaped ``(batch, frames, C, H, W)``."""
        batch_size, num_frames, C, H, W = x.shape
        # Fold frames into the batch axis so the 2-D CNN sees ordinary images.
        # reshape (not view) also accepts non-contiguous inputs.
        x = x.reshape(batch_size * num_frames, C, H, W)
        features = self.cnn(x)
        features = features.reshape(batch_size, num_frames, -1)
        lstm_out, _ = self.lstm(features)
        # Average the LSTM outputs over time to obtain one vector per clip.
        pooled = torch.mean(lstm_out, dim=1)
        return self.classifier(pooled)

class MultimodalFusionModel(nn.Module):
    """Late-fusion classifier over concatenated text, audio and/or video features.

    The feature extractors of ``TextModel``, ``AudioModel`` and ``VideoModel`` are
    reused (their own classification heads are bypassed); the selected feature
    vectors are concatenated, passed through dropout and classified by one linear
    layer.

    Args:
        fusion_type: ``'all'``, ``'text_audio'``, ``'text_video'`` or ``'audio_video'``.
        num_classes: Number of output logits.

    Raises:
        ValueError: On an unknown ``fusion_type``, or when ``forward`` is called
            without an input the chosen ``fusion_type`` requires.
    """

    # Fusion types that consume each stream.
    _USES_TEXT = ('all', 'text_audio', 'text_video')
    _USES_AUDIO = ('all', 'text_audio', 'audio_video')
    _USES_VIDEO = ('all', 'text_video', 'audio_video')

    def __init__(self, fusion_type='all', num_classes=7):
        super().__init__()
        known = set(self._USES_TEXT) | set(self._USES_AUDIO) | set(self._USES_VIDEO)
        if fusion_type not in known:
            raise ValueError(f"fusion_type must be one of {sorted(known)}, got {fusion_type!r}")

        self.fusion_type = fusion_type
        # All three sub-models are always created, so parameter names in the
        # state_dict do not depend on fusion_type.
        self.text_model = TextModel(num_classes)
        self.audio_model = AudioModel(num_classes)
        self.video_model = VideoModel(num_classes)

        # Feature widths are read from the sub-models' own heads instead of being
        # hard-coded, so they cannot drift apart from the extractors.
        input_size = 0
        if fusion_type in self._USES_TEXT:
            input_size += self.text_model.bert.config.hidden_size
        if fusion_type in self._USES_AUDIO:
            input_size += self.audio_model.classifier.in_features
        if fusion_type in self._USES_VIDEO:
            input_size += self.video_model.classifier.in_features

        self.fusion_classifier = nn.Linear(input_size, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text_input=None, attention_mask=None, audio_input=None, video_input=None):
        """Return class logits from the inputs required by ``self.fusion_type``.

        Inputs belonging to streams the fusion type does not use are ignored.
        """
        features = []

        if self.fusion_type in self._USES_TEXT:
            if text_input is None:
                # Fail early: a skipped stream would otherwise surface as an opaque
                # shape mismatch inside the fusion layer.
                raise ValueError(f"fusion_type {self.fusion_type!r} requires text_input")
            text_features = self.text_model.bert(input_ids=text_input, attention_mask=attention_mask).pooler_output
            features.append(text_features)

        if self.fusion_type in self._USES_AUDIO:
            if audio_input is None:
                raise ValueError(f"fusion_type {self.fusion_type!r} requires audio_input")
            if audio_input.dim() == 1:
                audio_input = audio_input.unsqueeze(0)
            audio_input = audio_input.unsqueeze(1)  # add the Conv1d channel axis
            audio_features = self.audio_model.cnn(audio_input)
            audio_features = audio_features.squeeze(-1)  # drop the pooled time axis -> 2-D
            features.append(audio_features)

        if self.fusion_type in self._USES_VIDEO:
            if video_input is None:
                raise ValueError(f"fusion_type {self.fusion_type!r} requires video_input")
            batch_size, num_frames, C, H, W = video_input.shape
            # Fold frames into the batch axis for the per-frame CNN.
            video_flat = video_input.reshape(batch_size * num_frames, C, H, W)
            video_cnn_features = self.video_model.cnn(video_flat)
            video_cnn_features = video_cnn_features.reshape(batch_size, num_frames, -1)
            lstm_out, _ = self.video_model.lstm(video_cnn_features)
            video_features = torch.mean(lstm_out, dim=1)
            features.append(video_features)

        fused_features = torch.cat(features, dim=1)
        fused_features = self.dropout(fused_features)
        return self.fusion_classifier(fused_features)

# Names of the model inputs that precede the label in a batch, per modality, in the
# order the dataset yields them.
_BATCH_FIELDS = {
    'text': ('text_input', 'attention_mask'),
    'audio': ('audio_input',),
    'video': ('video_input',),
    'text_audio': ('text_input', 'attention_mask', 'audio_input'),
    'text_video': ('text_input', 'attention_mask', 'video_input'),
    'audio_video': ('audio_input', 'video_input'),
    'all': ('text_input', 'attention_mask', 'audio_input', 'video_input'),
}


def _forward_batch(model, batch, modality, device):
    """Move one batch to ``device``, run ``model`` on it and return ``(outputs, labels)``.

    Shared by training and evaluation so both unpack batches identically. Any
    modality not listed in ``_BATCH_FIELDS`` is treated like ``'all'``.
    """
    fields = _BATCH_FIELDS.get(modality, _BATCH_FIELDS['all'])
    *inputs, labels = batch
    if len(inputs) != len(fields):
        raise ValueError(
            f"modality {modality!r} expects {len(fields)} input tensors per batch, got {len(inputs)}"
        )
    inputs = [tensor.to(device) for tensor in inputs]
    labels = labels.to(device)

    if modality in ('text', 'audio', 'video'):
        # Single-modality models take their inputs positionally.
        outputs = model(*inputs)
    else:
        # The fusion model takes one keyword argument per stream.
        outputs = model(**dict(zip(fields, inputs)))
    return outputs, labels


def train_model(modality='all', epochs=2):
    """Train one model for ``modality`` on the train split and validate after every epoch.

    Reads the module-level ``train_df``, ``dev_df`` and ``device``. Side effects:
    writes ``model_{modality}.pth`` (weights after the final epoch) and
    ``training_plots_{modality}.png``, and shows the loss/accuracy figure.

    Args:
        modality: ``'text'``, ``'audio'``, ``'video'`` or one of the fusion types
            accepted by ``MultimodalFusionModel``.
        epochs: Number of passes over the training data.

    Returns:
        ``(model, train_losses, val_accuracies)`` with one list entry per epoch.
    """
    # NOTE(review): the dataset resolves media files by probing the training
    # directory first; if ids repeat across splits the dev set may read training
    # clips -- confirm against the data layout.
    train_dataset = MELDDataset(train_df, 'MELD_Raw', modality=modality)
    dev_dataset = MELDDataset(dev_df, 'MELD_Raw', modality=modality)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=8, shuffle=False)

    single_modality_models = {'text': TextModel, 'audio': AudioModel, 'video': VideoModel}
    if modality in single_modality_models:
        model = single_modality_models[modality]().to(device)
    else:
        model = MultimodalFusionModel(fusion_type=modality).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss()
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    train_losses = []
    val_accuracies = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}')

        for batch in progress_bar:
            optimizer.zero_grad()
            outputs, labels = _forward_batch(model, batch, modality, device)

            loss = criterion(outputs, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        scheduler.step()
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_loss = total_loss / max(len(train_loader), 1)
        train_losses.append(avg_loss)

        val_acc = evaluate_model(model, dev_loader, modality, device)
        val_accuracies.append(val_acc)

        print(f'Epoch {epoch+1}: Loss = {avg_loss:.4f}, Val Acc = {val_acc:.4f}')

    torch.save(model.state_dict(), f'model_{modality}.pth')

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))
    loss_ax.plot(train_losses)
    loss_ax.set(title=f'Training Loss - {modality}', xlabel='Epoch', ylabel='Loss')
    acc_ax.plot(val_accuracies)
    acc_ax.set(title=f'Validation Accuracy - {modality}', xlabel='Epoch', ylabel='Accuracy')
    fig.savefig(f'training_plots_{modality}.png')
    plt.show()
    # This function is called once per modality; close the figure so they do not pile up.
    plt.close(fig)

    return model, train_losses, val_accuracies

def evaluate_model(model, data_loader, modality, device):
    """Evaluate ``model`` on ``data_loader`` and return its accuracy.

    Prints accuracy, weighted F1 and a per-class classification report whose rows
    follow the module-level ``emotion_labels`` mapping.
    """
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in data_loader:
            outputs, labels = _forward_batch(model, batch, modality, device)

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}')
    # `labels` pins the report to every known class. Without it the report raises
    # whenever some class appears in neither the targets nor the predictions,
    # because the number of target_names then no longer matches.
    print(classification_report(all_labels, all_preds,
                              labels=list(emotion_labels.values()),
                              target_names=list(emotion_labels.keys())))
    return accuracy

# Train one model per modality combination and collect the per-epoch metrics.
modalities = ['text', 'audio', 'video', 'text_audio', 'text_video', 'audio_video', 'all']
results = {}

model = None
for modality in modalities:
    # Drop the previous run's model before the next one is built; otherwise both
    # stay resident on the GPU for the whole of the following training run. The
    # trained weights have already been written to disk by train_model.
    model = None
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    print(f"\n{'='*50}")
    print(f"Training {modality} model...")
    print(f"{'='*50}")
    model, train_losses, val_accuracies = train_model(modality, epochs=2)
    results[modality] = {
        'train_losses': train_losses,
        'val_accuracies': val_accuracies,
        'final_val_accuracy': val_accuracies[-1]
    }

# Validation accuracy per epoch, one line per modality combination.
fig, ax = plt.subplots(figsize=(10, 6))
for modality in modalities:
    ax.plot(results[modality]['val_accuracies'], label=modality, marker='o')
ax.set_xlabel('Epoch')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Validation Accuracy by Modality Combination')
ax.legend()
ax.grid(True)
fig.savefig('all_modalities_comparison.png')
plt.show()

# Final-epoch validation accuracy for each modality combination.
results_df = pd.DataFrame({
    'Modality': modalities,
    'Final_Accuracy': [results[m]['final_val_accuracy'] for m in modalities]
})
print("\nFinal Results Comparison:")
print(results_df)

Checking GPU...
GPU available: True
GPU name: Tesla T4

Checking required libraries...

Downloading MELD dataset...
-- 2025-11-19 04:44:57 --
Connecting to: https://web.eecs.umich.edu/~mihalcea/downloads/MELD.Raw.tar.gz
Status: 200 OK
File size: 10.13 GB
Saving to: MELD.Raw.tar.gz
Download completed in 6m 01s (20.6 MB/s)

Downloading train_sent_emo.csv...
Status: 200 OK
File size: 1.07 MB
Saved: train_sent_emo.csv

Downloading dev_sent_emo.csv...
Status: 200 OK
File size: 118 KB
Saved: dev_sent_emo.csv

Downloading test_sent_emo.csv...
Status: 200 OK
File size: 287 KB
Saved: test_sent_emo.csv


Training text model...
Epoch 1/5: 100%|██████████| 1249/1249 [06:11<00:00, 3.36it/s, loss=1.1280] Accuracy: 0.6452, Precision: 0.6263, Recall: 0.6164, F1 Score: 0.6213
Epoch 2/5: 100%|██████████| 1249/1249 [06:03<00:00, 3.44it/s, loss=1.0420] Accuracy: 0.6623, Precision: 0.6478, Recall: 0.6320, F1 Score: 0.6398
Epoch 3/5: 100%|██████████| 1249/1249 [06:00<00:00, 3.47it/s, loss=0.9840] Accuracy: 