In [1]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/cs229/packages')

data_file = "/content/drive/MyDrive/cs229/mosei-data/mosei_senti_data.pkl"
%ls {data_file}

model_dir = "/content/drive/MyDrive/cs229/mosei-model"
%cd /content/drive/MyDrive/cs229/CMU-MultimodalSDK-Tutorials
%pwd

Mounted at /content/drive
/content/drive/MyDrive/cs229/mosei-data/mosei_senti_data.pkl
/content/drive/MyDrive/cs229/CMU-MultimodalSDK-Tutorials


'/content/drive/MyDrive/cs229/CMU-MultimodalSDK-Tutorials'

In [2]:
!ls


cmu_mosei_sdk_tutorial.ipynb  model.std    README.md
constants		      mosei-model  tutorial_interactive.ipynb
data			      optim.std


In [3]:
#import mmsdk
import os
import re
import numpy as np
#from mmsdk import mmdatasdk as md
from subprocess import check_call, CalledProcessError
import torch.nn as nn
import torch.optim as optim

In [4]:
import torch
from torch.utils.data import Dataset

class MOSEIDataset(Dataset):
    def __init__(self, data_split):
        self.text = torch.tensor(data_split['text'], dtype=torch.float32)
        self.audio = torch.tensor(data_split['audio'], dtype=torch.float32)
        self.vision = torch.tensor(data_split['vision'], dtype=torch.float32)
        self.labels = torch.tensor(data_split['labels'], dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'text': self.text[idx],
            'audio': self.audio[idx],
            'vision': self.vision[idx],
            'label': self.labels[idx],
        }


In [5]:
from torch.utils.data import DataLoader

# load the pickle
import pickle
with open(data_file, "rb") as f:
    data = pickle.load(f)





In [6]:
train_set = MOSEIDataset(data['train'])
valid_set = MOSEIDataset(data['valid'])
test_set  = MOSEIDataset(data['test'])

train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=32)
test_loader  = DataLoader(test_set, batch_size=32)

print(f"train {len(train_loader.dataset)}, valid {len(valid_loader.dataset)}, test {len(test_loader.dataset)}")

train 16265, valid 1869, test 4643


In [7]:
class MultiModal(nn.Module):
  def __init__(self, input_dim, dim, nhead=8, nlayer=4):
    super(MultiModal, self).__init__()
    self.input_dim = input_dim
    self.dim = dim
    self.nhead = nhead
    self.nlayer = nlayer

    self.input_proj = nn.Linear(input_dim, dim)
    encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=self.nhead, batch_first=True)
    self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=self.nlayer)
    self.fc1 = nn.Linear(dim, 512)
    self.dropout = nn.Dropout(0.2)
    self.fc2 = nn.Linear(512, 1)

  def forward(self, text, audio, vision):
    text[torch.isinf(text)] = 0.0
    audio[torch.isinf(audio)] = 0.0
    vision[torch.isinf(vision)] = 0.0

    x_concat = torch.cat((text, audio, vision), dim=2)

    if torch.isinf(x_concat).any():
      inf_indices = torch.nonzero(torch.isinf(x_concat), as_tuple=False)
      print(f"infinities {inf_indices}")
      print(x_concat[inf_indices[0], inf_indices[1], :])

    x_proj = self.input_proj(x_concat)
    #print(f"x_proj = {x_proj[2:4, 2:5, 5:8]}")
    encoder_output = self.encoder(x_proj)

    #print(f"encoder output {encoder_output[2:4, 2:5, 5:8]}")
    seq_mean = encoder_output.mean(dim=1)
    #print(f"seq mean {seq_mean.shape} {seq_mean[2:4, 5:8]}")

    pred = self.fc1(seq_mean)
    pred = torch.relu(pred)
    pred = self.dropout(pred)
    pred = self.fc2(pred)
    #print(f"final pred ==== {pred[2:4, :].squeeze()}")

    return pred

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultiModal(409, 256, nhead=8, nlayer=6)
model.to(device)
#criterion = nn.L1Loss()
criterion = nn.MSELoss()

optimizer = optim.Adam(model.parameters(), lr=0.0001)

MAX_EPOCH = 35

batch_num = 0
running_loss = 0
model.train()
for epoch in range(MAX_EPOCH):
  for data_batch in train_loader:
    batch_num += 1
    text = data_batch['text'].to(device)
    audio = data_batch['audio'].to(device)
    vision = data_batch['vision'].to(device)
    label = data_batch['label'].to(device)

    '''
    print(data_batch['text'].shape)
    print(data_batch['audio'].shape)
    print(data_batch['vision'].shape)
    print(data_batch['label'].shape)
    print(pred.shape)
    '''

    pred = model(text, audio, vision)
    pred = pred.squeeze()
    label = label.squeeze()

    optimizer.zero_grad()

    loss = criterion(pred, label)
    running_loss += loss.item()

    if batch_num % 10 == 0:
      print(f"epoch {epoch}, batch {batch_num}")
      print(f"loss = {loss}, avg loss = {running_loss / batch_num}")

    loss.backward()
    optimizer.step()


epoch 0, batch 10
loss = 1.1659321784973145, avg loss = 1.2700236678123473
epoch 0, batch 20
loss = 1.891169548034668, avg loss = 1.3156824171543122
epoch 0, batch 30
loss = 0.9379133582115173, avg loss = 1.3066065589586893
epoch 0, batch 40
loss = 1.2557971477508545, avg loss = 1.2979706376791
epoch 0, batch 50
loss = 1.2434903383255005, avg loss = 1.2853982770442962
epoch 0, batch 60
loss = 0.7189827561378479, avg loss = 1.2825924187898636
epoch 0, batch 70
loss = 1.2519173622131348, avg loss = 1.2775922613484518
epoch 0, batch 80
loss = 1.3675386905670166, avg loss = 1.288374599069357
epoch 0, batch 90
loss = 1.4540151357650757, avg loss = 1.2853212104903327
epoch 0, batch 100
loss = 1.3032591342926025, avg loss = 1.2805800974369048
epoch 0, batch 110
loss = 1.2223310470581055, avg loss = 1.280404880913821
epoch 0, batch 120
loss = 0.826167106628418, avg loss = 1.2790079598625501
epoch 0, batch 130
loss = 0.8383549451828003, avg loss = 1.2753815352916718
epoch 0, batch 140
loss = 0.

In [9]:
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error

model.eval()
total_sample = 0
acc_2 = 0
acc_7 = 0
all_pred = np.array([])
all_label = np.array([])
with torch.no_grad():
  for valid_batch in valid_loader:
    text = valid_batch['text'].to(device)
    audio = valid_batch['audio'].to(device)
    vision = valid_batch['vision'].to(device)
    label = valid_batch['label'].to(device)
    #print(f"text {text.shape}")

    pred = model(text, audio, vision).squeeze()

    label = label.squeeze()

    all_pred = np.append(all_pred, pred.cpu().detach().numpy())
    all_label = np.append(all_label, label.cpu().detach().numpy())


mae = np.mean(np.abs(all_pred - all_label))
binary_pred = all_pred[all_label != 0] > 0
binary_label = all_label[all_label != 0] > 0
f1 = f1_score(binary_pred , binary_label, average="weighted")
corr = np.corrcoef(all_pred, all_label)[0,1]
pred_7 = np.clip(all_pred, a_min=-3, a_max=3)
label_7 = np.clip(all_label, a_min=-3, a_max=3)
print(f"acc_2 = {np.mean(binary_pred == binary_label)}, acc_7 = {np.mean(np.round(pred_7) == np.round(label_7))}, MAE = {mae}, f1 = {f1}, corr={corr}")

acc_2 = 0.7576601671309192, acc_7 = 0.48582129481005887, MAE = 0.6630311481337725, f1 = 0.7540810070951669, corr=0.55597135224185


In [10]:
class Encoder(nn.Module):
  def __init__(self, in_dim, dim):
    super(Encoder, self).__init__()
    self.input_dim = in_dim
    self.dim = dim

    ,

    self.transformer = nn.Sequential(
      nn.Linear(in_dim, dim),
      nn.TransformerEncoder(
          nn.TransformerEncoderLayer(d_model=dim, nhead=8, batch_first=True),
          num_layers=4,
      ),
    )

    self.fc = nn.Sequential(
      nn.Linear(dim, 512),
      nn.ReLU(),
      nn.Linear(512, 1)
    )

  def forward(self, x):
    transformer_out = self.transformer(x)
    seq_mean = transformer_out.permute(1, 0, 2).mean(dim=0) # from b, s, d to s, b, d

    out = self.fc(seq_mean)
    return out

class LateFusion(nn.Module):
  def __init__(self):
    super(LateFusion, self).__init__()

    dim = 512
    self.textEncoder = Encoder(300, dim)
    self.audioEncoder = Encoder(74, dim)
    self.visionEncoder = Encoder(35, dim)

    self.fc = nn.Linear(3, 1)

  def forward(self, x):
    text = x['text']
    audio = x['audio']
    vision = x['vision']

    text[torch.isinf(text)] = 0.0
    audio[torch.isinf(audio)] = 0.0
    vision[torch.isinf(vision)] = 0.0

    text_out = self.textEncoder(text)
    audio_out = self.audioEncoder(audio)
    vision_out = self.visionEncoder(vision)

    all = torch.cat((text_out, audio_out, vision_out), dim=1)

    pred = self.fc(all)
    print(pred.squeeze())
    return pred
