In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed because the Llama checkpoint used later is gated — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
import os
import pathlib

In [None]:
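# Optional: uncomment to expose only GPU 0 to this process.
# It must run before CUDA is initialised (i.e. before the first torch.cuda call) to take effect.
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"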

In [None]:
import torch
from transformers import WhisperProcessor, AutoTokenizer
from datasets import load_dataset
import huggingface_hub as hf_hub

In [None]:
# Hugging Face Hub checkpoint ids. They are used both to load the processor/tokenizer
# and as the `whisper_model_name` / `llama_model_name` arguments of SpeechToTextModel.
WHISPER_MODEL_NAME = "openai/whisper-base"
LLAMA_MODEL_NAME = "meta-llama/Llama-3.2-3B"

In [None]:
# Load the Whisper processor and the Llama tokenizer from the checkpoints named above.
# Both objects are later handed to GPTVoiceAssistantDataCollator.
whisper_processor = WhisperProcessor.from_pretrained(WHISPER_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(LLAMA_MODEL_NAME)

In [None]:
# Hub dataset repo ids.
# NOTE(review): `dataset_name` is not referenced by any visible cell; only `ds_name`
# is passed to snapshot_download — confirm before removing it.
dataset_name = "openslr/librispeech_asr"
ds_name = "gpt-omni/VoiceAssistant-400K"

In [None]:
# Download (or reuse) a snapshot of the dataset repo into a Hub-style cache rooted
# at the parent of the current working directory.
cache_root = pathlib.Path.cwd().parent

# `resume_download` is deprecated in huggingface_hub (interrupted downloads are always
# resumed when possible) and `ignore_patterns=[]` filtered nothing, so both are dropped.
download_folder = hf_hub.snapshot_download(
    repo_id=ds_name,
    repo_type="dataset",
    cache_dir=cache_root,
)

# `location` is the "data" sub-folder of the downloaded snapshot
# (presumably where the repo keeps its data files — verify against the repo layout).
location = pathlib.Path(download_folder) / "data"

In [None]:
# Show the local data directory resolved by the download cell.
print(location)

In [None]:
# Load the train split from the snapshot's data directory computed by the download
# cell (`location`) rather than from a hard-coded absolute path, so the notebook
# also works when it is not run from under /workspace.
dataset = load_dataset(str(location), split='train')

In [None]:
# Take the first ten examples of the dataset as a small batch for smoke testing.
iterator = iter(dataset)
batch = []
while len(batch) < 10:
    batch.append(next(iterator))

In [None]:
# Inspect the 'question_audio' field of the first sampled example.
batch[0]['question_audio']

In [None]:
# GPU memory housekeeping before the model is loaded: release cached allocator
# blocks, reset the peak-memory counters, then print the allocator report.
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
print(torch.cuda.memory_summary())

In [None]:
# Project-local collator (defined in utils, not shown here), built from the
# Whisper processor and the Llama tokenizer loaded earlier.
from utils import GPTVoiceAssistantDataCollator

vadc = GPTVoiceAssistantDataCollator(
    whisper_processor=whisper_processor,
    tokenizer=tokenizer
)

In [None]:
# Collate the sampled examples. The result is indexed later by the keys
# "input_features", "labels", "input_ids" and "attention_mask".
input_parameters = vadc(batch)

In [None]:
# Print the shape of each collated tensor, one per line.
for tensor_name in ("input_features", "labels", "input_ids", "attention_mask"):
    print(input_parameters[tensor_name].shape)

In [None]:
# Move every collated tensor to GPU 0. Only the audio features are additionally
# cast to bfloat16; the other tensors keep their dtype.
for tensor_name in ('labels', 'input_ids', 'attention_mask'):
    input_parameters[tensor_name] = input_parameters[tensor_name].cuda(0)

audio_features = input_parameters['input_features'].cuda(0)
input_parameters['input_features'] = audio_features.to(torch.bfloat16)

In [None]:
from models import SpeechToTextModel

In [None]:
# Build the speech-to-text model with `train_whisper` and `train_llama` both
# disabled, then move it to GPU 0 in bfloat16.
model = SpeechToTextModel(
    whisper_model_name=WHISPER_MODEL_NAME,
    llama_model_name=LLAMA_MODEL_NAME,
    hidden_dims=[2048, 1024, 2048, 1024, 2048],
    train_whisper=False,
    train_llama=False,
).to(torch.device("cuda:0"), dtype=torch.bfloat16)

In [None]:
# Sanity-check where the model lives. Printing device and dtype for every parameter
# tensor produces hundreds of lines, so show only the distinct (device, dtype) pairs.
# After the `.to(cuda:0, bfloat16)` call this is expected to be a single pair.
sorted({(str(param.device), str(param.dtype)) for param in model.parameters()})

In [None]:
# Run one forward pass on the prepared batch, passing exactly the four tensors
# the model call needs as keyword arguments.
forward_keys = ('input_features', 'input_ids', 'attention_mask', 'labels')
outputs = model(**{key: input_parameters[key] for key in forward_keys})

In [None]:
# Open issues (TODO):
# - Whisper padding issue
# - SafeTensor saving issue
#
# - Issues when 2 CUDA devices are present
# - Accelerate issue
# - Dataset download issue