# Speech transformer 

## Environment

### Imports

In [1]:
import os
import sys

# Register the local transformer_wrappers checkout on the module search path.
# sys.path entries are used verbatim (a leading '~' is never expanded by the
# import system), so the home directory must be resolved explicitly.
sys.path.append(os.path.expanduser('~/Projects/transformer_wrappers/src'))

In [1]:
import torch

In [2]:
from transformers import BitsAndBytesConfig
from peft import LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from transformer_wrappers.wrappers import SpeechCausalLMWrapper

### Constants and globals

In [4]:
import os

# Hugging Face access token, passed to both the model and tokenizer settings
# defined in this notebook. It is read from the HF_TOKEN environment variable
# so the secret never has to be pasted into (and saved with) the notebook;
# the value stays None when the variable is not set.
TOKEN = os.environ.get('HF_TOKEN')  # HF Token

In [5]:
# Checkpoint identifier used when building the wrapper.
# Alternative checkpoints (swap in by editing MODEL):
#   'mistralai/Mistral-7B-Instruct-v0.3'
#   'meta-llama/Llama-3.1-8B-Instruct'
#   'google/gemma-2-9b-it'
MODEL = 'gpt2'

# Settings handed to the wrapper as `model_kwargs`.
MODEL_CONFIGS = {
    'torch_dtype': torch.bfloat16,
    # Pinned to CPU; the dynamic alternative is
    # torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    'device_map': 'cpu',
    'token': TOKEN,
}

# Settings handed to the wrapper as `tokenizer_kwargs`; registers the
# '<|audio|>' special token and sets the padding token.
# NOTE(review): '<|endoftext|>' is the GPT-2 end-of-text token -- confirm the
# padding token when switching MODEL to another checkpoint.
TOKENIZER_CONFIGS = {
    'token': TOKEN,
    'pad_token': '<|endoftext|>',
    'additional_special_tokens': ['<|audio|>'],
}

In [6]:
# 4-bit NF4 quantization settings with double quantization and bfloat16 as
# the compute dtype. Not currently applied: the matching argument in the
# model-loading cell is commented out.
QUANTIZATION_CONFIGS = BitsAndBytesConfig(
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [7]:
# LoRA adapter settings for causal language modelling: rank 16, alpha 16,
# 10% dropout, no bias training, adapters targeted at 'all-linear'.
LORA_CONFIGS = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules='all-linear',
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias='none',
)

In [8]:
# Build the speech wrapper from the MODEL checkpoint, passing the model,
# tokenizer and LoRA settings defined above. Quantization is left disabled.
model = SpeechCausalLMWrapper.from_pretrained(
    MODEL,
    tokenizer_kwargs=TOKENIZER_CONFIGS,
    model_kwargs=MODEL_CONFIGS,
    lora_configs=LORA_CONFIGS,
    # quantization_configs=QUANTIZATION_CONFIGS,
)



### Helper functions

In [9]:
# Placeholder cell: no helper functions are defined here yet.
...

Ellipsis

## Speech recognition

In this first example, we show how to run a forward pass of the model on an input composed of text and audio, and how to compute the loss against the corresponding targets.

In [10]:
# Prompt for the example: an instruction, the model's audio placeholder token
# (model.audio_token) on its own line, then the reference transcription.
text = f'Transcribe the following audio clip:\n{model.audio_token}\n\nTranscription:\n"In a hole in the ground there lived a hobbit."'

In [11]:
# Audio clip paired with the prompt; the path is relative, so it resolves
# against the kernel's current working directory.
audio_file_path = '../audio.wav'

In [12]:
# Encode the prompt and the audio file into model inputs. The result is later
# unpacked as keyword arguments to the forward pass and exposes an
# 'input_spectrograms' entry.
input_encoding = model.prepare_input(text, audio_file_path)

In [13]:
# Inspect the size of the first input spectrogram (recorded run: 128 x 402).
input_encoding['input_spectrograms'][0].size()

torch.Size([128, 402])

In [14]:
# Run the forward pass on the encoded inputs; the result is indexed below by
# 'logits' and 'spectrograms'.
# NOTE(review): if the wrapper is a torch.nn.Module, calling `.forward`
# directly bypasses the hooks that `model(...)` would run -- confirm intent.
output = model.forward(**input_encoding)

In [15]:
# Build the targets for the same text/audio pair; the result provides the
# 'token_labels' and 'target_spectrograms' entries used for the loss.
target_output = model.prepare_output(text, audio_file_path)

In [16]:
# Inspect the size of the first target spectrogram (recorded run: 128 x 402,
# matching the input spectrogram).
target_output['target_spectrograms'][0].size()

torch.Size([128, 402])

In [17]:
# Compare predictions against targets using the wrapper's private `_loss`
# method. In the recorded run it returns a tuple: the total loss and a dict
# holding the language-modelling and spectrogram-generation components (the
# total matches the sum of the two).
loss = model._loss(
    predicted_spectrograms=output['spectrograms'],
    target_spectrograms=target_output['target_spectrograms'],
    token_logits=output['logits'],
    token_labels=target_output['token_labels'],
)
loss

(tensor(13674.4873, grad_fn=<AddBackward0>),
 {'language_modelling_loss': tensor(24.9689, grad_fn=<NllLossBackward0>),
  'spectrogram_generation_loss': tensor(13649.5186, grad_fn=<MeanBackward0>)})