### dataset preparation

just do this once, then take the same `manifest.json` file from `/model-v3`

In [1]:
import json
import os
import soundfile as sf
from tqdm import tqdm

input_jsonl = "/home/jupyter/advanced/asr/asr.jsonl"
output_manifest = "model-v5/manifest.json"
audio_dir = "/home/jupyter/advanced/asr"

with open(input_jsonl, 'r') as f:
    data = [json.loads(line) for line in f]

print(len(data))

manifest = []

# track progress with tqdm
for item in tqdm(data, desc='creating manifest'):
    audio_file = item['audio']
    transcript = item['transcript']
    
    full_path = os.path.join(audio_dir, audio_file)
    
    duration = sf.info(full_path).duration
    
    manifest.append({
        'audio_filepath': full_path,
        'duration': duration,
        'text': transcript
    })
    
# write manifest with progress
with open(output_manifest, 'w') as f:
    for entry in tqdm(manifest, desc='writing manifest'):
        f.write(json.dumps(entry) + '\n')
        
print(f'manifest written to {output_manifest}')

4500


creating manifest: 100%|██████████| 4500/4500 [08:10<00:00,  9.18it/s]
writing manifest: 100%|██████████| 4500/4500 [00:00<00:00, 118305.67it/s]

manifest written to model-v5/manifest.json





In [4]:
import os
import shutil
import random

data_dir = "/home/jupyter/advanced/asr"
train_dir = "/home/jupyter/til25-import-torch/asr/data/train"
val_dir = "/home/jupyter/til25-import-torch/asr/data/validation"

os.makedirs(train_dir, exist_ok=True)
os.makedirs(val_dir, exist_ok=True)

sample_indices = list(range(4500))
random.shuffle(sample_indices)

train_indices = sample_indices[:4050]
val_indices = sample_indices[4050:]

for idx in train_indices:
    wav_file = f'sample_{idx}.wav'
    txt_file = f'sample_{idx}.txt'
    shutil.copy(os.path.join(data_dir, wav_file), train_dir)
    shutil.copy(os.path.join(data_dir, txt_file), train_dir)
    
for idx in val_indices:
    wav_file = f'sample_{idx}.wav'
    txt_file = f'sample_{idx}.txt'
    shutil.copy(os.path.join(data_dir, wav_file), val_dir)
    shutil.copy(os.path.join(data_dir, txt_file), val_dir)

In [7]:
import json
import soundfile as sf
import os

def create_manifest(data_split):
    data_dir = f"/home/jupyter/til25-import-torch/asr/data/{data_split}"
    manifest_path = f"/home/jupyter/til25-import-torch/asr/data/{data_split}_manifest.json"
    
    print(f'writing manifest to {manifest_path}')
    
    with open(manifest_path, 'w') as manifest_file:
        for filename in os.listdir(data_dir):
            if not filename.endswith('.wav'):
                continue
            
            wav_path = os.path.join(data_dir, filename)
            txt_path = wav_path.replace('.wav', '.txt')
            
            if not os.path.exists(txt_path):
                continue
                
            with open(txt_path, 'r') as txt_file:
                transcript = txt_file.read().strip()
            duration = sf.info(wav_path).duration
            manifest_entry = {
                'audio_filepath': wav_path,
                'text': transcript,
                'duration': duration
            }
            manifest_file.write(json.dumps(manifest_entry) + '\n')
            
create_manifest('train')
create_manifest('validation')

writing manifest to /home/jupyter/til25-import-torch/asr/data/validation_manifest.json


### preparing config

In [9]:
%pip install omegaconf

Note: you may need to restart the kernel to use updated packages.


In [1]:
from omegaconf import OmegaConf
config = OmegaConf.load("/home/jupyter/til25-import-torch/asr/data/config.yaml")

In [2]:
print(OmegaConf.to_yaml(config))

name: Speech_To_Text_Finetuning
init_from_pretrained_model: null
model:
  sample_rate: 16000
  train_ds:
    manifest_filepath: /home/jupyter/til25-import-torch/asr/data/train_manifest.json
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: true
    num_workers: 8
    pin_memory: true
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    text_key: transcription
    normalize_text: true
  validation_ds:
    manifest_filepath: /home/jupyter/til25-import-torch/asr/data/validation_manifest.json
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false
    num_workers: 8
    pin_memory: true
    text_key: transcription
    normalize_text: true
  test_ds:
    manifest_filepath: null
    sample_rate: ${model.sample_rate}
    batch_size: 16
    shuffle: false
    use_start_end_token: false


### training!

In [3]:
print(OmegaConf.to_yaml(config.model.optim))

name: adamw
lr: 0.0001
betas:
- 0.9
- 0.98
weight_decay: 0.001
sched:
  name: CosineAnnealing
  warmup_steps: 500
  warmup_ratio: null
  min_lr: 5.0e-06



In [4]:
from nemo.collections.asr.models import ASRModel
asr_model = ASRModel.from_pretrained('nvidia/parakeet-rnnt-0.6b')

    


[NeMo I 2025-05-25 02:49:38 mixins:181] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-05-25 02:49:38 modelPT:180] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpeech_NeMo/librivox-train-all.json
    sample_rate: 16000
    batch_size: 16
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 16.7
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-05-25 02:49:38 modelPT:187] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /disk1/NVIDIA/datasets/LibriSpee

[NeMo I 2025-05-25 02:49:38 features:305] PADDING: 0
[NeMo I 2025-05-25 02:49:44 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-05-25 02:49:44 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-05-25 02:49:44 rnnt_models:226] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-05-25 02:49:48 save_restore_connector:275] Model EncDecRNNTBPEModel was successfully restored from /home/jupyter/.cache/huggingface/hub/models--nvidia--parakeet-rnnt-0.6b/snapshots/b8a90ce81e7b6d41486154c7b2d96935077806c8/parakeet-rnnt-0.6b.nemo.


In [5]:
from nemo.utils import logging, model_utils

cfg = model_utils.convert_model_config_to_dict_config(config)
asr_model.setup_training_data(cfg.model.train_ds)
asr_model.setup_validation_data(cfg.model.validation_ds)

[NeMo I 2025-05-25 02:49:49 collections:201] Dataset loaded with 1030 files totalling 4.78 hours
[NeMo I 2025-05-25 02:49:49 collections:202] 3020 files were filtered totalling 24.00 hours
[NeMo I 2025-05-25 02:49:49 collections:201] Dataset loaded with 450 files totalling 3.13 hours
[NeMo I 2025-05-25 02:49:49 collections:202] 0 files were filtered totalling 0.00 hours


In [6]:
asr_model.setup_optimization(cfg.model.optim)

[NeMo W 2025-05-25 02:49:49 modelPT:681] Trainer wasn't specified in model constructor. Make sure that you really wanted it.


[NeMo I 2025-05-25 02:49:49 modelPT:802] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        decoupled_weight_decay: True
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.001
    )


[NeMo W 2025-05-25 02:49:49 lr_scheduler:930] Neither `max_steps` nor `iters_per_batch` were provided to `optim.sched`, cannot compute effective `max_steps` !
    Scheduler will not be instantiated !


(AdamW (
 Parameter Group 0
     amsgrad: False
     betas: [0.9, 0.98]
     capturable: False
     decoupled_weight_decay: True
     differentiable: False
     eps: 1e-08
     foreach: None
     fused: None
     lr: 0.0001
     maximize: False
     weight_decay: 0.001
 ),
 None)

In [8]:
# pytorch lightning trainer
import torch
from lightning.pytorch import Trainer
from lightning.pytorch.accelerators import find_usable_cuda_devices

MAX_STEPS = 6000

trainer = Trainer(accelerator='cpu', devices='auto', max_epochs=-1, max_steps=MAX_STEPS, enable_checkpointing=False, logger=False, log_every_n_steps=100, check_val_every_n_epoch=10, precision='32')

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [36]:
asr_model.decoding.decoding.use_cuda_graph_decoder = False
asr_model.decoding.decoding.loop_labels = False
asr_model.decoding.decoding.disable_cuda_graphs()

In [37]:
trainer.fit(asr_model)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


[NeMo I 2025-05-25 03:18:21 modelPT:802] Optimizer config = AdamW (
    Parameter Group 0
        amsgrad: False
        betas: [0.9, 0.98]
        capturable: False
        decoupled_weight_decay: True
        differentiable: False
        eps: 1e-08
        foreach: None
        fused: None
        lr: 0.0001
        maximize: False
        weight_decay: 0.001
    )
[NeMo I 2025-05-25 03:18:21 lr_scheduler:950] Scheduler "<nemo.core.optim.lr_scheduler.CosineAnnealing object at 0x7f9bf1137940>" 
    will be used during training (effective maximum steps = 6000) - 
    Parameters : 
    (warmup_steps: 500
    warmup_ratio: null
    min_lr: 5.0e-06
    max_steps: 6000
    )



  | Name              | Type                              | Params | Mode
-------------------------------------------------------------------------------
0 | preprocessor      | AudioToMelSpectrogramPreprocessor | 0      | eval
1 | encoder           | ConformerEncoder                  | 607 M  | eval
2 | decoder           | RNNTDecoder                       | 7.2 M  | eval
3 | joint             | RNNTJoint                         | 1.7 M  | eval
4 | loss              | RNNTLoss                          | 0      | eval
5 | spec_augmentation | SpectrogramAugmentation           | 0      | eval
6 | wer               | WER                               | 0      | eval
-------------------------------------------------------------------------------
616 M     Trainable params
0         Non-trainable params
616 M     Total params
2,466.769 Total estimated model params size (MB)
0         Modules in train mode
706       Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

[NeMo I 2025-05-25 03:18:22 optional_cuda_graphs:79] Enabled CUDA graphs for module <class 'nemo.collections.asr.models.rnnt_bpe_models.EncDecRNNTBPEModel'>.decoding.decoding
[NeMo I 2025-05-25 03:18:22 optional_cuda_graphs:79] Enabled CUDA graphs for module <class 'nemo.collections.asr.metrics.wer.WER'>joint._wer.decoding.decoding


RuntimeError: Called CUDAGraph::replay without a preceding successful capture.

In [None]:
asr_model.save_to("240525-finetuned-parakeet-06b")