In [1]:
import torch

from dataset import iemocap
from torch.utils.data import DataLoader, random_split
from model import MultimodalClassificationHead
from train import train



In [2]:
# Hyperparameters
MODEL_NAME = "text_frozen"
BATCH_SIZE = 64
LEARNING_RATE = 0.001
EPOCHS = 5
AUDIO_MODALITY = False
TEXT_MODALITY = True
VIDEO_MODALITY = False

In [3]:
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070'

In [5]:
import torchaudio
from pathlib import Path
import numpy as np

total_mean = 0
total_std = 0
total_samples = 0

for session in [1, 2, 3, 4, 5]:
    audio_dir = f'E:/IEMOCAP_full_release/Session{session}/audio'
    audio_paths = Path(audio_dir).glob("*/*.wav")
    for audio_path in audio_paths:
        waveform, _ = torchaudio.load(audio_path)
        waveform = waveform.numpy().squeeze()

        mean = np.mean(waveform)
        std = np.std(waveform)
        samples = waveform.size

        total_mean += mean * samples
        total_std += std * samples
        total_samples += samples

global_mean = total_mean / total_samples
global_std = total_std / total_samples

In [6]:
print(global_mean)
print(global_std)

-1.6445859429616712e-05
0.029528086858860082


In [4]:
# precalculated to save time
global_mean = -1.6445859429616712e-05
global_std = 0.029528086858860082

In [5]:
dataset = iemocap.IEMOCAP('E:/IEMOCAP_full_release', global_mean, global_std)

total_samples = len(dataset)
train_size = int(0.8 * total_samples)
val_size = int(0.2 * total_samples)
test_size = total_samples - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size]
)

In [6]:
model = MultimodalClassificationHead(audio_modality=AUDIO_MODALITY, 
                                     text_modality=TEXT_MODALITY, 
                                     video_modality=VIDEO_MODALITY)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

modalities.text.weights True
modalities.text.model.embeddings.word_embeddings.weight False
modalities.text.model.embeddings.position_embeddings.weight False
modalities.text.model.embeddings.token_type_embeddings.weight False
modalities.text.model.embeddings.LayerNorm.weight False
modalities.text.model.embeddings.LayerNorm.bias False
modalities.text.model.encoder.layer.0.attention.self.query.weight False
modalities.text.model.encoder.layer.0.attention.self.query.bias False
modalities.text.model.encoder.layer.0.attention.self.key.weight False
modalities.text.model.encoder.layer.0.attention.self.key.bias False
modalities.text.model.encoder.layer.0.attention.self.value.weight False
modalities.text.model.encoder.layer.0.attention.self.value.bias False
modalities.text.model.encoder.layer.0.attention.output.dense.weight False
modalities.text.model.encoder.layer.0.attention.output.dense.bias False
modalities.text.model.encoder.layer.0.attention.output.LayerNorm.weight False
modalities.text.mod

In [8]:
train(
    model_name=MODEL_NAME,
    model=model,
    train_data=train_dataset,
    val_data=val_dataset,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    audio_modality=AUDIO_MODALITY, 
    text_modality=TEXT_MODALITY, 
    video_modality=VIDEO_MODALITY
)

  return torch.tensor(value)
100%|██████████| 93/93 [1:41:59<00:00, 65.80s/it]


Epoch [1/5] - Train Loss: 1.6256 - Val Loss: 1.5937 - Val Acc: 0.2408


100%|██████████| 93/93 [1:42:34<00:00, 66.18s/it]


Epoch [2/5] - Train Loss: 1.6110 - Val Loss: 1.6048 - Val Acc: 0.2408


100%|██████████| 93/93 [1:42:28<00:00, 66.11s/it]


Epoch [3/5] - Train Loss: 1.5966 - Val Loss: 1.5788 - Val Acc: 0.2673


 55%|█████▍    | 51/93 [56:40<46:33, 66.52s/it]  

In [None]:
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)