In [1]:
import torch
import torchaudio
import torchvision
import numpy as np

from transformers import (
    Wav2Vec2Model,
    RobertaModel,
    VivitModel,
    Wav2Vec2Processor,
    RobertaTokenizer,
    VivitImageProcessor
)

In [2]:
# Show the name of the first CUDA device. The lookup is guarded because
# torch.cuda.get_device_name raises when PyTorch cannot see any GPU, and the
# rest of the notebook is written to fall back to the CPU in that case.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No CUDA device available'

'NVIDIA GeForce RTX 2070'

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [4]:
def get_gpu_memory(device_index=0):
    """Print an estimate of the memory still available on a CUDA device.

    The estimate is the device's total memory minus the larger of the two
    counters PyTorch keeps for this process (allocated and reserved bytes).
    Nothing else is subtracted, so memory held outside PyTorch's own
    bookkeeping is not reflected in the printed figure.

    Args:
        device_index: Index of the CUDA device to inspect. Defaults to 0.

    Returns:
        None. The figure is printed, not returned, so calling this as the last
        statement of a cell does not add an extra output.
    """
    # Without a visible GPU the queries below raise; report that instead so
    # the notebook keeps running on CPU-only machines.
    if not torch.cuda.is_available():
        print("Available GPU Memory: n/a (CUDA is not available)")
        return

    total_memory = torch.cuda.get_device_properties(device_index).total_memory
    allocated_memory = torch.cuda.memory_allocated(device_index)
    reserved_memory = torch.cuda.memory_reserved(device_index)

    free_memory = total_memory - max(allocated_memory, reserved_memory)
    # 1024 ** 3 bytes per unit, labelled "GB" in the output.
    print(f"Available GPU Memory: {free_memory / (1024 ** 3)} GB")

get_gpu_memory()

Available GPU Memory: 7.99969482421875 GB


In [10]:
import torch.nn as nn
import torch.nn.functional as F

# Size of the leading axis shared by the weights and the hidden-state tensor.
NUM_LAYERS = 12

# Raw learnable logits. They keep their own name so the leaf parameter is
# still reachable after the softmax below produces a derived tensor.
layer_logits = nn.Parameter(torch.randn(NUM_LAYERS))

# Softmax over the only axis yields positive weights that sum to 1.
weights = F.softmax(layer_logits, dim=0)
print(weights)

# Trailing singleton axes let the weights broadcast against the last three
# axes of `hidden_states`.
weights = weights.view(NUM_LAYERS, 1, 1, 1)

# Random stand-in tensor with one slice per weight along the leading axis.
hidden_states = torch.randn(NUM_LAYERS, 2, 97, 768)


tensor([0.0579, 0.0873, 0.0681, 0.0319, 0.0161, 0.0089, 0.0077, 0.4500, 0.1338,
        0.0418, 0.0611, 0.0354], grad_fn=<SoftmaxBackward0>)


In [13]:
%%time

import torch.nn as nn
import torch.nn.functional as F

# Sampling rate announced to the processor below; the audio is brought to this
# rate first so the declaration is always true.
TARGET_SAMPLE_RATE = 16000

# Load audio files
waveform1, sample_rate1 = torchaudio.load('E:/IEMOCAP_full_release/Session1/audio/Ses01F_impro01/Ses01F_impro01_F000.wav')
waveform2, sample_rate2 = torchaudio.load('E:/IEMOCAP_full_release/Session1/audio/Ses01F_impro01/Ses01F_impro01_F001.wav')

# Resample each clip from its own native rate. When a file is already at the
# target rate the waveform is passed through unchanged.
waveform1 = torchaudio.functional.resample(waveform1, sample_rate1, TARGET_SAMPLE_RATE)
waveform2 = torchaudio.functional.resample(waveform2, sample_rate2, TARGET_SAMPLE_RATE)
sample_rate = TARGET_SAMPLE_RATE

# The processor is fed NumPy arrays; squeeze() drops size-1 axes
# (e.g. a single-channel axis).
waveform1 = waveform1.numpy().squeeze()
waveform2 = waveform2.numpy().squeeze()

# Load models
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

# Inference only: switch to eval mode and move the weights to the target device.
# Gradient checkpointing is deliberately not enabled, because no backward pass
# happens under no_grad.
wav2vec2.eval()
wav2vec2.to(device)

# Process inputs
with torch.no_grad():  # Disable gradient tracking
    # padding=True pads the shorter clip so both fit in one batch.
    inputs = audio_processor([waveform1, waveform2], return_tensors='pt', sampling_rate=TARGET_SAMPLE_RATE, padding=True)
    inputs = inputs.to(device)

    # Get outputs
    outputs = wav2vec2(**inputs, output_hidden_states=True)
    # Skip the first entry of the hidden-state tuple and stack the rest along
    # a new leading axis.
    hidden_states = torch.stack(outputs.hidden_states[1:], dim=0)

    print(hidden_states.shape)


# Report memory while the model and activations are still alive, then release them.
get_gpu_memory()

del inputs, outputs, hidden_states, wav2vec2
torch.cuda.empty_cache()


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([12, 2, 97, 768])
Available GPU Memory: 7.43914794921875 GB
CPU times: total: 1.59 s
Wall time: 1.53 s


In [7]:
%%time
# Two inline example sentences (nothing is read from disk); they are tokenized
# together with padding enabled.
text1 = "Replace me by any text you'd like."
text2 = "Replace me by any text you'd like. asdasd"


# Tokenizer and encoder come from the same 'roberta-base' checkpoint.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta = RobertaModel.from_pretrained("roberta-base")

# Inference only, so put the model in eval mode.
roberta.eval()

# Forward pass without autograd bookkeeping.
with torch.no_grad():
    inputs = tokenizer([text1, text2], return_tensors='pt', padding=True)
    print(inputs['input_ids'].shape)
    inputs.to(device)
    roberta.to(device)

    # Ask the model for every hidden state, not just the final one.
    outputs = roberta(**inputs, output_hidden_states=True)
    hidden_states, last_hidden_states = outputs.hidden_states, outputs.last_hidden_state

# Number of returned hidden-state tensors, then the shape of the final one.
print(len(hidden_states))
print(list(last_hidden_states.size()))

# Report memory while everything is still resident, then drop the references
# and release cached blocks.
get_gpu_memory()

del inputs, outputs, hidden_states, last_hidden_states, roberta
torch.cuda.empty_cache()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([2, 16])
13
[2, 16, 768]
Available GPU Memory: 7.47625732421875 GB
CPU times: total: 2.23 s
Wall time: 4.22 s


In [37]:
%%time
# Load video files (frames come back as a tensor laid out time, height, width, channel)
video1, _, _ = torchvision.io.read_video("E:/IEMOCAP_full_release/Session1/video/Ses01F_script01_1/Ses01F_script01_1_F039.avi", output_format="THWC", pts_unit='sec')
video2, _, _ = torchvision.io.read_video("E:/IEMOCAP_full_release/Session1/video/Ses01F_script01_1/Ses01F_script01_1_F038.avi", output_format="THWC", pts_unit='sec')

# Load models
image_processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
vivit = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400")


def _uniform_frame_indices(total_frames, num_samples):
    """Return `num_samples` integer frame indices evenly spaced over a clip.

    Args:
        total_frames: Number of frames in the clip.
        num_samples: How many indices to produce. Indices repeat when the clip
            has fewer frames than requested.

    Returns:
        A 1-D integer NumPy array running from 0 to ``total_frames - 1``.

    Raises:
        ValueError: If the clip has no frames.
    """
    if total_frames < 1:
        raise ValueError("Cannot sample frames from an empty video")
    return np.linspace(0, total_frames - 1, num_samples, dtype=int)


# Take the clip length from the loaded checkpoint's configuration instead of
# repeating it as a literal.
num_frames = vivit.config.num_frames

indices1 = _uniform_frame_indices(video1.shape[0], num_frames)
indices2 = _uniform_frame_indices(video2.shape[0], num_frames)

# Keep only the sampled frames and hand each clip over as a list of frames.
video1 = video1[indices1]
video1_frames = list(video1.unbind(0))
video2 = video2[indices2]
video2_frames = list(video2.unbind(0))

# Inference only: eval mode, weights on the target device.
vivit.eval()
vivit.to(device)

# Process inputs
with torch.no_grad():  # Disable gradient tracking
    # Every clip has the same number of frames, so no padding option is needed.
    inputs = image_processor([video1_frames, video2_frames], return_tensors="pt")
    inputs = inputs.to(device)

    # Get outputs
    outputs = vivit(**inputs, output_hidden_states=True)
    hidden_states = outputs.hidden_states
    last_hidden_state = outputs.last_hidden_state

# Print shape: all hidden states but the first, stacked; then the final hidden state
print(torch.stack(hidden_states[1:], dim=0).shape)
print(list(last_hidden_state.shape))

# Report memory while the model and activations are still alive, then release them.
get_gpu_memory()

del inputs, outputs, hidden_states, last_hidden_state, vivit
torch.cuda.empty_cache()

Some weights of VivitModel were not initialized from the model checkpoint at google/vivit-b-16x2-kinetics400 and are newly initialized: ['vivit.pooler.dense.weight', 'vivit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([12, 2, 3137, 768])
[2, 3137, 768]
Available GPU Memory: 5.47235107421875 GB
CPU times: total: 4.17 s
Wall time: 4.11 s


In [75]:
from dataset import iemocap

# Root folder of the local IEMOCAP release, passed to the IEMOCAP constructor.
IEMOCAP_ROOT = 'E:/IEMOCAP_full_release'
dataset = iemocap.IEMOCAP(IEMOCAP_ROOT)

In [76]:
# NOTE(review): in the recorded run this cell failed with
# "AttributeError: 'IEMOCAP' object has no attribute 'map'", so `dataset` was
# left unchanged. Presumably batching in pairs was intended - TODO confirm
# which batching API the IEMOCAP class actually supports and replace this call.
dataset = dataset.map(
    batch_size=2
)

AttributeError: 'IEMOCAP' object has no attribute 'map'

In [4]:
# Fetch the item at index 1 and unpack its four components.
sample = dataset[1]
audio_input, text_input, video_input, label = sample

In [6]:
# Inspect the text component of the sample. In the recorded run it printed a
# mapping with 'input_ids' and 'attention_mask' tensors.
print(text_input)

{'input_ids': tensor([[    0, 10105,    38,    21,  2445,    13,    47,     6,  1573,     4,
          1437, 15628,   172,    47,   393,   875,   162,     8,   172,    77,
            47,   222,     6,   157,     6,    47,   686,    64,    28, 33406,
            47,   216,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
