In [1]:
             # type: ignore
import torch
import torchaudio
from utils import load_model, prepare_data

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load models
# `load_model` is imported from the project-local `utils` module and its result
# is unpacked into a (model, processor) pair used by the inference cell.
# NOTE(review): `causal=False` is forwarded as-is; presumably it selects a
# non-causal variant of the model — confirm against `utils.load_model`.
print("Loading models...")
model, processor = load_model(causal=False)

Loading models...


In [13]:
# Build the dataset with the project-local `prepare_data` helper (from `utils`).
print("Preparing data...")
ds = prepare_data()     # the inference cell indexes this by integer position (ds[i])
# Bare last expression: the notebook displays the number of items in `ds`.
len(ds)

Preparing data...


17909930

In [12]:
# Transcribe the first few dataset items and print each transcription next to
# its ground-truth text for a quick qualitative comparison.
print("Running inference...")

TARGET_SR = 16000  # sampling rate the processor is called with
NUM_SAMPLES = 3    # number of leading dataset items to transcribe

# The mask does not depend on the sample, so build it once here instead of
# reallocating a 1x1x1500x1500 tensor on every iteration.
seq_len = 1500  # typical for 30s audio
attn_mask = torch.full((1, 1, seq_len, seq_len), float('-inf'))
torch.diagonal(attn_mask[0, 0]).fill_(0.0)  # allow only self-attention
# NOTE(review): whether the encoder actually applies `attention_mask` depends on
# the model returned by `load_model` — confirm against its implementation.

for i in range(NUM_SAMPLES):
    sample = ds[i]                                           # type: ignore[attr-defined]
    audio_wav = sample["mp3"]["array"]                       # type: ignore[attr-defined]
    audio_array = torch.from_numpy(audio_wav).to(torch.float32)
    sr = sample["mp3"]["sampling_rate"]                      # type: ignore[attr-defined]
    ground_truth = sample["json"]['text']                    # type: ignore[attr-defined]

    # Bring every clip to the rate that is declared to the processor below.
    if sr != TARGET_SR:
        audio_array = torchaudio.transforms.Resample(sr, TARGET_SR)(audio_array)

    # Process audio
    inputs = processor(audio_array, sampling_rate=TARGET_SR, return_tensors="pt")  # type: ignore[attr-defined]

    # Generate transcription (no gradients are needed for inference)
    with torch.no_grad():
        latents = model.model.encoder(inputs.input_features, attention_mask=attn_mask)
        predicted_ids = model.model.decoder(latents)

    # Decode transcription; [0] selects the single item of the batch
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]  # type: ignore[attr-defined]

    print(f"Ground truth: {ground_truth}")
    print(f"Transcription: {transcription}")
    print("\n")

Running inference...


Ground truth:  You can help my mother and you- No. You didn't leave a bad situation back home to get caught up in another one here. What happened to you, Los Angeles?
Transcription:  You can help my mother in need. No. You didn't leave a bad situation back home to get caught up in another one here. What happened to you, Los Angeles?


Ground truth:  Honda's gone, 20 squads done. X is gonna split us up and put us on different squads. The team's come and go, but 20 squad, can't believe it's ending.
Transcription:  Honda's gone, the 20 squats done. X is gonna split us up and put us on different squats. The team's coming, go, but 20 squats. Can't believe it's ending.


Ground truth:  Alright. TCB! Sure you don't want some.
Transcription:  All right, TCB. Sure you don't want some.


