## Tuning facebook:wav2vec2-large-960h

Here, we finetune the facebook:wav2vec2-large-960h model from huggingface using the `cv-valid-train` common_voice dataset. This notebook follows the finetuning framework from this [hugginface blog](https://huggingface.co/blog/fine-tune-wav2vec2-english) with minor adaptations. First, we import the required libraries.

In [1]:
# Imports
import os
import re
import random
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
import gc
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import Audio as PlayAudio

from accelerate import Accelerator
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor
from transformers import TrainingArguments, Trainer
from datasets import load_dataset, Audio, DatasetDict, load_from_disk, Dataset
import evaluate

import torch
from torch.utils.data import DataLoader
import torchaudio
from transformers import get_linear_schedule_with_warmup
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
from torch.amp import autocast, GradScaler
from tqdm import tqdm

from pydub import AudioSegment
import soundfile as sf
from mutagen import File

from jiwer import wer

HOME_DIR = os.path.expanduser('~')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Helpers
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

### Pre-processing

We first convert all mp3 files to wav files, which the wav2vec2 model assumes. Additionally converting transcript texts to upper case to match the original model. This may take some time.

In [3]:
# File locations. All files assumed placed in asr_proejct folder
audio_or_dir = os.path.join(HOME_DIR,'asr_project/common_voice/cv-valid-train/')
audio_dir = os.path.join(HOME_DIR,'asr_project/common_voice/cv-valid-train/cv-valid-train/')
audioloc_transcript_or_dir = os.path.join(HOME_DIR,'asr_project/common_voice/cv-valid-train.csv')
audioloc_transcript_dir = os.path.join(HOME_DIR,'asr_project/asr-train/selected_transcript.csv')
temp_dir = os.path.join(HOME_DIR,'asr_project/asr-train/temp.csv')

In [4]:
# # Function to convert mp3 to wav
# def convert_mp3_to_wav(mp3_file):
#     # Generate the output wav file path
#     wav_file = mp3_file.replace('.mp3', '.wav')
    
#     # Convert mp3 to wav if wav file does not exist
#     if not os.path.exists(wav_file):
#         waveform, sample_rate = torchaudio.load(mp3_file)
#         torchaudio.save(wav_file, waveform, sample_rate)
    
#     return wav_file


# df = pd.read_csv(audioloc_transcript_or_dir)

# # Convert mp3 to wav. Change mp3 file extension in df accordingly
# df['filename'] = df['filename'].apply(
#     lambda filename: convert_mp3_to_wav(
#         os.path.join(audio_or_dir, filename)))

# # Put texts to uppercase to match pre-finetuned model
# df['text'] = df['text'].str.upper()
# df['filename'] = df['filename'].map(lambda x: os.path.basename(x))

# df_transcript = df

Checking audio file characteristics ...

In [5]:
# def get_audio_info(file_path):
#     # Extract filename and extension
#     file_name, file_ext = os.path.splitext(os.path.basename(file_path))
#     file_size = os.path.getsize(file_path)  # Size in bytes

#     # Try to get audio length with mutagen
#     try:
#         audio = File(file_path)
#         audio_length = audio.info.length if audio and audio.info else None
#     except Exception as e:
#         print(f"Could not process file {file_name}: {e}")
#         audio_length = None

#     return {
#         'filename': file_name,
#         'extension': file_ext,
#         'size_bytes': file_size,
#         'length_seconds': audio_length
#     }

# def process_directory(directory):
#     # List all audio files in directory
#     audio_files = [
#         os.path.join(directory, f) for f in os.listdir(directory) 
#         if os.path.isfile(os.path.join(directory, f))
#     ]

#     # Use tqdm with multiprocessing
#     with Pool(cpu_count()) as pool:
#         # Wrap audio files list with tqdm for progress bar
#         audio_info = list(tqdm(pool.imap(get_audio_info, audio_files), total=len(audio_files), desc="Processing files"))

#     # Create DataFrame from the list of dictionaries
#     df = pd.DataFrame(audio_info)
#     return df

# # Get audio file information
# audio_df = process_directory(audio_dir)
# audio_df_mp3 = audio_df.loc[audio_df['extension']=='.mp3'].copy()
# audio_df_wav = audio_df.loc[audio_df['extension']=='.wav'].copy().drop(columns=['length_seconds'])
# audio_df_wav = audio_df_wav.merge(audio_df_mp3[['filename','length_seconds']], on='filename', how='left')
# audio_df_wav['filename'] = audio_df_wav['filename'].map(lambda x: x+'.wav')
# audio_df_wav.head()

We see that some of them have very high durations, up to 6 minutes long.

In [6]:
# audio_df_wav.describe()

Checking the transcript, we find that the longest line read is only 33 words long, which should not take that long to read.

In [7]:
# df_transcript['len'] = df_transcript['text'].str.len()
# df_transcript = df_transcript[['filename','len','text']]

# filename_longest = df_transcript.loc[df_transcript['len']==df_transcript['len'].max(), 'filename'].item()
# text_longest = df_transcript.loc[df_transcript['len']==df_transcript['len'].max(), 'text'].item()

# print(f'Filename with longest transcript: {filename_longest}')
# print(f'Longest transcript text: {text_longest}')

# longest_clip_duration = audio_df_wav.loc[audio_df_wav['filename']==filename_longest,'length_seconds'].item()
# print(f'Longest transcript duration: {longest_clip_duration}s')

The clip with the longest transcript is 11s long. Considering differences in reading speeds, we assume the longest legitimate script reading to be 15s long. __We discard all samples with durations above 15s__. This will help prevent memory issues during model finetuning. We drop a total of 397 samples, keeping ~195k samples, saving a copy as csv file for later reference.

In [8]:
# df_transcript = df_transcript.merge(audio_df_wav[['filename', 'length_seconds']], on='filename', how='left')
# (df_transcript['length_seconds'] > 15).sum().item(),  (df_transcript['length_seconds'] < 15).sum().item()

In [9]:
# df_transcript = df_transcript.loc[df_transcript['length_seconds']<15].drop(columns=['len','length_seconds'])
# df_transcript.to_csv(audioloc_transcript_dir, index=False)

We create a `DatasetDict` for easy access to train-val splits.

In [10]:
# # Load csv file with wav filenames, complete path and create dataset
# df = pd.read_csv(audioloc_transcript_dir)
# df['filename'] = df['filename'].map(lambda x: os.path.join(audio_dir,x))
# df.to_csv(temp_dir,index=False)

In [11]:
# dataset = load_dataset('csv', data_files=temp_dir, split='train')
# dataset = dataset.cast_column("filename",
#                               Audio(sampling_rate=16000))         # Cast audio files with 16kHz sampling rate

# # train-val 70-30 split
# dataset = dataset.train_test_split(test_size=0.3, seed=42)        # Split to train-val

# # Final, combined dataset
# dataset = DatasetDict({
#     'train': dataset['train'],
#     'val': dataset['test']})

# dataset

We will make use of the tokenizer and processor from `facebook/wav2vec2-large-960h` in the model finetuning below. First, the transcripts are converted to the format expected by the model. The transcript have already been converted into uppercase earlier for this purpose. We insert start, end, and delimited tokens below.

In [12]:
# # Following the style of facebook/wav2ec2-large-960h model
# start_token = "<s>"
# end_token = "</s>"
# word_delimiter_token = "|"

# # Define the preprocessing function
# def preprocess_transcript(example):
#     transcript = example['text']  # Assuming the column with text is named 'text'
    
#     # Step 1: Replace multiple spaces with a single space
#     transcript = re.sub(r'\s+', ' ', transcript)  # Remove extra spaces
    
#     # Step 2: Add start and end tokens, and replace spaces with '|'
#     processed_transcript = start_token + transcript.replace(" ", f"{word_delimiter_token}") + end_token
    
#     return {"processed_text": processed_transcript}  # Return the processed text in a dictionary

# # Apply the preprocessing to both train and validation splits
# dataset = dataset.map(preprocess_transcript, remove_columns=["text"],num_proc=4)

Converting to column names expected by model.

In [13]:
# dataset = dataset.rename_column("filename", "input_values")
# dataset = dataset.rename_column("processed_text", "labels")

Then, we tokenize the transcripts and use the `input_values` and `labels` column names in the datasets.

In [14]:
# # Load processor
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# def prepare_dataset(batch):
#     # Process 'input_values' column for 1D waveform values
#     batch["input_values"] = processor(batch["input_values"]["array"],
#                                       sampling_rate=16000).input_values[0]
    
#     # Process the 'labels' column to create 'labels' (text data)
#     batch["labels"] = processor(text=batch["labels"]).input_ids
    
#     return batch

# # Map the dataset transformation to both 'train' and 'val' splits
# dataset = dataset.map(prepare_dataset, num_proc=2)


In [15]:
# # Save the dataset to a directory
# dataset.save_to_disk("temp_dataset")

For a quick check, play a random audio file below...

In [16]:
# rand_int = random.randint(0, len(dataset["train"]))
# print(dataset["train"]["labels"][rand_int])

# audio_data = dataset["train"][rand_int]["input_values"]
# PlayAudio(data=audio_data, rate=16000)

... and check the data formats, e.g. 1-D waveform.

In [17]:
# rand_int = random.randint(0, len(dataset["train"]))

# print("Target (encoded) text:", dataset["train"][rand_int]["labels"])
# print("Input array shape:", np.asarray(dataset["train"][rand_int]["input_values"]).shape)

### Training

As elaborated [here](https://huggingface.co/blog/fine-tune-wav2vec2-english), a data collator with dynamic padding is more efficient for ASR applications, considering the lengths of the input sequences.

In [18]:
# Load dataset
dataset = load_from_disk("temp_dataset")

In [19]:
# Load processor
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

In [20]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

Using the WER metric.

In [21]:
wer_metric = evaluate.load("wer")

def remove_start_end_tags(texts):
    # Remove the <s> and </s> tags from both ends of each string
    return [re.sub(r"^<s>|</s>$", "", text) for text in texts]

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Replace padding token id with -100
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    # Decode predictions and references
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # Remove the <s> and </s> tags from the decoded strings
    pred_str = remove_start_end_tags(pred_str)
    label_str = remove_start_end_tags(label_str)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

Finally, we load the pre-trained model.

In [22]:
# Load model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

# # Freeze feature extractor layers
# model.freeze_feature_encoder()

# Freeze all layers except the head
for param in model.parameters():
    param.requires_grad = False  # Freeze all parameters

# Assuming the head is the `classifier` in Wav2Vec2ForCTC
for param in model.lm_head.parameters():  # For the head (classifier) layer
    param.requires_grad = True  # Unfreeze the head


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We first get a baseline WER for a quick comparison with the finetuned model's performance later.

In [23]:
# def map_to_result(batch):
#     with torch.no_grad():
#         input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
#         logits = model(input_values).logits

#     pred_ids = torch.argmax(logits, dim=-1)
#     batch["pred_str"] = processor.batch_decode(pred_ids)[0]
#     batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  
#     return batch

# model.to('cuda')
# results = dataset["val"].map(map_to_result, remove_columns=dataset["val"].column_names)

In [24]:
# def remove_start_end_tags(batch):
#     # Remove the <s> and </s> tags from both ends of each string in 'pred_str' and 'text'
#     batch["pred_str"] = re.sub(r"^<s>|</s>$", "", batch["pred_str"])
#     batch["text"] = re.sub(r"^<s>|</s>$", "", batch["text"])
#     return batch

# # Apply the function to the entire dataset
# results = results.map(remove_start_end_tags)


# print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))

We see that the model shows a WER of about 10.5% before finetuning. We complete the setup for the huggingface trainer and begin training below.

In [25]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir=os.path.expanduser('~/asr_project/asr-train/model_outputs'),
    logging_dir=os.path.expanduser('~/asr_project/asr-train/logs'),
    per_device_train_batch_size=16,              # batch size for training
    per_device_eval_batch_size=16,               # batch size for evaluation
    num_train_epochs=1,                           # total number of training epochs
    logging_steps=250,                            # log every 100 steps
    eval_strategy="steps",                  # evaluate during training
    save_steps=500,                               # save checkpoint every 500 steps
    eval_steps=500,                               # evaluate every 500 steps
    warmup_steps=1000,
    load_best_model_at_end=True,                  # load the best model at the end of training
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    fp16=True,
    # optim="adamw_bnb_8bit",                # adamw_apex_fused, adamw_bnb_8bit. did not help.
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
)

# Define huggingface trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    processing_class= processor.feature_extractor
)

In [26]:
# Start training
result = trainer.train()
print_summary(result)

 12%|█▏        | 250/2137 [05:12<38:47,  1.23s/it]

{'loss': 9.9285, 'grad_norm': 4.3315043449401855, 'learning_rate': 1.2450000000000001e-05, 'epoch': 0.12}


 23%|██▎       | 500/2137 [10:20<29:53,  1.10s/it]

{'loss': 9.3718, 'grad_norm': 3.421187162399292, 'learning_rate': 2.495e-05, 'epoch': 0.23}


                                                  
 23%|██▎       | 500/2137 [19:23<29:53,  1.10s/it] 

{'eval_loss': 2.665811061859131, 'eval_wer': 0.1104161030247281, 'eval_runtime': 542.9331, 'eval_samples_per_second': 107.958, 'eval_steps_per_second': 6.749, 'epoch': 0.23}


 35%|███▌      | 750/2137 [24:13<28:57,  1.25s/it]    

{'loss': 8.5016, 'grad_norm': 3.1834044456481934, 'learning_rate': 3.745e-05, 'epoch': 0.35}


 47%|████▋     | 1000/2137 [29:04<22:49,  1.20s/it]

{'loss': 7.5337, 'grad_norm': 2.6247568130493164, 'learning_rate': 4.995e-05, 'epoch': 0.47}


                                                   
 47%|████▋     | 1000/2137 [35:58<22:49,  1.20s/it]

{'eval_loss': 1.6904981136322021, 'eval_wer': 0.11616434897101528, 'eval_runtime': 413.9829, 'eval_samples_per_second': 141.586, 'eval_steps_per_second': 8.851, 'epoch': 0.47}


 58%|█████▊    | 1250/2137 [41:07<16:26,  1.11s/it]    

{'loss': 6.5242, 'grad_norm': 2.016247510910034, 'learning_rate': 3.9050131926121375e-05, 'epoch': 0.58}


 70%|███████   | 1500/2137 [46:22<13:34,  1.28s/it]

{'loss': 5.7639, 'grad_norm': 1.742484450340271, 'learning_rate': 2.805628847845207e-05, 'epoch': 0.7}


                                                   
 70%|███████   | 1500/2137 [53:04<13:34,  1.28s/it]

{'eval_loss': 1.2211542129516602, 'eval_wer': 0.1202514293959562, 'eval_runtime': 401.3354, 'eval_samples_per_second': 146.047, 'eval_steps_per_second': 9.13, 'epoch': 0.7}


 82%|████████▏ | 1750/2137 [58:12<08:27,  1.31s/it]    

{'loss': 5.2705, 'grad_norm': 2.2663869857788086, 'learning_rate': 1.706244503078276e-05, 'epoch': 0.82}


 94%|█████████▎| 2000/2137 [1:03:24<02:56,  1.29s/it]

{'loss': 4.9872, 'grad_norm': 2.0656380653381348, 'learning_rate': 6.068601583113457e-06, 'epoch': 0.94}


                                                     
 94%|█████████▎| 2000/2137 [1:10:00<02:56,  1.29s/it]

{'eval_loss': 1.0519750118255615, 'eval_wer': 0.1210215897408149, 'eval_runtime': 396.6229, 'eval_samples_per_second': 147.783, 'eval_steps_per_second': 9.238, 'epoch': 0.94}


100%|██████████| 2137/2137 [1:12:38<00:00,  2.04s/it]   

{'train_runtime': 4358.677, 'train_samples_per_second': 31.377, 'train_steps_per_second': 0.49, 'train_loss': 7.080849418586585, 'epoch': 1.0}
Time: 4358.68
Samples/second: 31.38
GPU memory occupied: 14071 MB.





In [27]:
# Load audio file
audio_file = "/home/tfc/asr_project/common_voice/cv-valid-train/cv-valid-train/sample-000000.wav"  # Replace with your audio file path
waveform, sample_rate = torchaudio.load(audio_file)


# If the sample rate is not 16kHz, resample it
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Convert to the right format for the model
input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

# Get logits from the model
model.to('cpu')
with torch.no_grad():
    logits = model(input_values).logits

# Get predicted ids
predicted_ids = logits.argmax(dim=-1)

# Decode the predicted ids to text
transcription = processor.batch_decode(predicted_ids)

print(transcription)  # Print the transcription result



['LEARNED T RECOGNIE OMEN AND  FOLLOW THEM THEOLD  KING HAD SAID']


In [16]:
# Start the training 16x1 1024
result = trainer.train()

100%|██████████| 64/64 [00:22<00:00,  2.90it/s]

{'train_runtime': 22.0575, 'train_samples_per_second': 46.424, 'train_steps_per_second': 2.902, 'train_loss': 8.077967643737793, 'epoch': 1.0}





In [17]:
print_summary(result)

Time: 22.06
Samples/second: 46.42
GPU memory occupied: 11883 MB.


In [16]:
# Start the training 4x4
result = trainer.train()

100%|██████████| 64/64 [00:22<00:00,  2.86it/s]

{'train_runtime': 22.4053, 'train_samples_per_second': 45.704, 'train_steps_per_second': 2.856, 'train_loss': 25.687761306762695, 'epoch': 1.0}





In [17]:
print_summary(result)

Time: 22.41
Samples/second: 45.70
GPU memory occupied: 5186 MB.


In [17]:
print_summary(result)

Time: 22.01
Samples/second: 46.53
GPU memory occupied: 11907 MB.


In [17]:
print_summary(result)

Time: 11.87
Samples/second: 43.13
GPU memory occupied: 8737 MB.


In [17]:
print_summary(result)

Time: 11.80
Samples/second: 43.37
GPU memory occupied: 8761 MB.


In [17]:
print_summary(result)

Time: 1.80
Samples/second: 17.80
GPU memory occupied: 3791 MB.


In [17]:
print_summary(result)

Time: 1.97
Samples/second: 16.28
GPU memory occupied: 3495 MB.


In [17]:
print_summary(result)

Time: 1.61
Samples/second: 19.93
GPU memory occupied: 2632 MB.


In [42]:
import os
import pandas as pd

# Define the folder path
folder_path = '/home/tfc/asr_project/common_voice/cv-valid-train/cv-valid-train'

# Create a list to store file information
file_info = []

# Iterate over all files in the directory
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):  # Ensure it is a file
        file_size = os.path.getsize(file_path)  # Get file size in bytes
        file_extension = os.path.splitext(filename)[1]  # Get file extension
        file_info.append({
            "name": filename,
            "size": file_size/1024/1024,
            "ext": file_extension
        })

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(file_info)

# Display the DataFrame
print(df)


                     name      size   ext
0       sample-008219.mp3  0.038301  .mp3
1       sample-183811.wav  0.189039  .wav
2       sample-154156.mp3  0.018159  .mp3
3       sample-004348.wav  0.408766  .wav
4       sample-145859.wav  0.501051  .wav
...                   ...       ...   ...
391547  sample-157721.mp3  0.029511  .mp3
391548  sample-185757.mp3  0.045442  .mp3
391549  sample-116631.mp3  0.036836  .mp3
391550  sample-106544.wav  0.404371  .wav
391551  sample-172862.mp3  0.027864  .mp3

[391552 rows x 3 columns]


In [45]:
df_wav = df.loc[df['ext']=='.wav']
df_wav.describe()

Unnamed: 0,size
count,195776.0
mean,0.408366
std,0.235453
min,0.06819
25%,0.283522
50%,0.375807
75%,0.494459
max,35.993483


In [46]:
0.408366+3*0.235453


1.114725

In [47]:
large = df_wav.loc[df_wav['size']>1]['name'].to_list()
large
len(large)

1568

In [35]:
df_temp = pd.read_csv('temp.csv')
df_temp = df_temp.loc[df_temp['filename'].str.contains('.wav'), :]
df_temp['len'] = df_temp['text'].str.len()

df_temp['name'] = df_temp['filename'].str[65:]

df_temp = df_temp.loc[df_temp['name'].isin(large),:]
df_temp.drop(columns=['name','len'], inplace=True)
df_temp.to_csv('temp2.csv',index=False)





In [20]:
df_temp.loc[df_temp['len']>100]

Unnamed: 0,filename,text,len
31,/home/tfc/asr_project/common_voice/cv-valid-tr...,BEFORE GUNS WERE INVENTED ARMIES HAD TO THROW ...,133
32,/home/tfc/asr_project/common_voice/cv-valid-tr...,BEFORE GUNS WERE INVENTED ARMIES HAD TO THROW ...,133
73,/home/tfc/asr_project/common_voice/cv-valid-tr...,THE GREATEST AUTHORITY ON METEORITES STATED TH...,121
367,/home/tfc/asr_project/common_voice/cv-valid-tr...,THE GREATEST AUTHORITY ON METEORITES STATED TH...,121
582,/home/tfc/asr_project/common_voice/cv-valid-tr...,THIS WILL HELP YOU EFFECTIVELY COLLABORATE WIT...,115
...,...,...,...
195492,/home/tfc/asr_project/common_voice/cv-valid-tr...,BEFORE GUNS WERE INVENTED ARMIES HAD TO THROW ...,133
195606,/home/tfc/asr_project/common_voice/cv-valid-tr...,I REMEMBER HOW I SAT ON THE TABLE THERE IN THE...,114
195645,/home/tfc/asr_project/common_voice/cv-valid-tr...,THE GREATEST AUTHORITY ON METEORITES STATED TH...,121
195710,/home/tfc/asr_project/common_voice/cv-valid-tr...,I REMEMBER HOW I SAT ON THE TABLE THERE IN THE...,114


In [27]:
df_temp['text'].drop_duplicates().count()

np.int64(6994)

In [28]:
df = pd.read_csv('/home/tfc/asr_project/common_voice/cv-valid-train.csv')

In [None]:
# optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.005)

# train_dataloader = DataLoader(dataset["train"].select(range(32)),
#                               batch_size=training_args.per_device_train_batch_size,
#                               shuffle=True,
#                               collate_fn=data_collator)
# val_dataloader = DataLoader(dataset["val"].select(range(16)),
#                             batch_size=training_args.per_device_eval_batch_size,
#                             shuffle=False,
#                             collate_fn=data_collator)

# if training_args.gradient_checkpointing:
#     model.gradient_checkpointing_enable()

# accelerator = Accelerator(fp16=training_args.fp16)
# model, optimizer, dataloader = accelerator.prepare(model, optimizer, train_dataloader)

# model.train()
# for step, batch in enumerate(dataloader, start=1):
#     loss = model(**batch).loss
#     loss = loss / training_args.gradient_accumulation_steps
#     accelerator.backward(loss)
#     if step % training_args.gradient_accumulation_steps == 0:
#         optimizer.step()
#         optimizer.zero_grad()

In [None]:
# from accelerate import Accelerator

# accelerator = Accelerator(gradient_accumulation_steps=1)
# dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

# for input, output in dataloader:
#     with accelerator.accumulate(model):
#         outputs = model(input)
#         loss = loss_func(outputs)
#         loss.backward()
#         optimizer.step()
#         scheduler.step()
#         optimizer.zero_grad()

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  3%|▎         | 50/1500 [00:14<06:28,  3.74it/s]

{'loss': 7.3456, 'grad_norm': 10.738499641418457, 'learning_rate': 4.8433333333333336e-05, 'epoch': 0.2}


  7%|▋         | 100/1500 [00:28<07:05,  3.29it/s]

{'loss': 6.4887, 'grad_norm': 22.676118850708008, 'learning_rate': 4.676666666666667e-05, 'epoch': 0.4}


 10%|█         | 150/1500 [00:43<06:12,  3.62it/s]

{'loss': 5.4989, 'grad_norm': 7.022335529327393, 'learning_rate': 4.5100000000000005e-05, 'epoch': 0.6}


 13%|█▎        | 200/1500 [00:57<06:36,  3.28it/s]

{'loss': 5.6937, 'grad_norm': 10.28407096862793, 'learning_rate': 4.3433333333333336e-05, 'epoch': 0.8}


 17%|█▋        | 250/1500 [01:12<07:49,  2.66it/s]

{'loss': 5.4204, 'grad_norm': 1.9037978649139404, 'learning_rate': 4.176666666666667e-05, 'epoch': 1.0}


                                                  
 17%|█▋        | 250/1500 [01:15<07:49,  2.66it/s]

{'eval_loss': 5.385029315948486, 'eval_wer': 1.0, 'eval_runtime': 3.6403, 'eval_samples_per_second': 54.941, 'eval_steps_per_second': 6.868, 'epoch': 1.0}


 20%|██        | 300/1500 [01:32<05:51,  3.42it/s]

{'loss': 5.4364, 'grad_norm': 9.899394035339355, 'learning_rate': 4.0100000000000006e-05, 'epoch': 1.2}


 23%|██▎       | 350/1500 [01:47<05:06,  3.75it/s]

{'loss': 5.2742, 'grad_norm': 7.543509483337402, 'learning_rate': 3.843333333333334e-05, 'epoch': 1.4}


 27%|██▋       | 400/1500 [02:01<05:16,  3.48it/s]

{'loss': 5.1096, 'grad_norm': 6.498869895935059, 'learning_rate': 3.676666666666667e-05, 'epoch': 1.6}


 30%|███       | 450/1500 [02:15<05:49,  3.00it/s]

{'loss': 5.2477, 'grad_norm': 1.3148185014724731, 'learning_rate': 3.51e-05, 'epoch': 1.8}


 33%|███▎      | 500/1500 [02:29<04:03,  4.11it/s]

{'loss': 4.8424, 'grad_norm': 11.746952056884766, 'learning_rate': 3.343333333333333e-05, 'epoch': 2.0}


                                                  
 33%|███▎      | 500/1500 [02:32<04:03,  4.11it/s]

{'eval_loss': 5.293658256530762, 'eval_wer': 1.0, 'eval_runtime': 3.6049, 'eval_samples_per_second': 55.48, 'eval_steps_per_second': 6.935, 'epoch': 2.0}


 37%|███▋      | 550/1500 [02:49<04:36,  3.43it/s]

{'loss': 5.0363, 'grad_norm': 16.386117935180664, 'learning_rate': 3.176666666666667e-05, 'epoch': 2.2}


 40%|████      | 600/1500 [03:04<04:06,  3.65it/s]

{'loss': 5.1616, 'grad_norm': 2.575950860977173, 'learning_rate': 3.01e-05, 'epoch': 2.4}


 43%|████▎     | 650/1500 [03:18<03:51,  3.67it/s]

{'loss': 5.3157, 'grad_norm': 5.570316314697266, 'learning_rate': 2.8433333333333334e-05, 'epoch': 2.6}


 47%|████▋     | 700/1500 [03:32<03:54,  3.41it/s]

{'loss': 4.717, 'grad_norm': 2.1007020473480225, 'learning_rate': 2.676666666666667e-05, 'epoch': 2.8}


 50%|█████     | 750/1500 [03:45<03:05,  4.05it/s]

{'loss': 5.1545, 'grad_norm': 5.170818328857422, 'learning_rate': 2.51e-05, 'epoch': 3.0}


                                                  
 50%|█████     | 750/1500 [03:48<03:05,  4.05it/s]

{'eval_loss': 5.313230514526367, 'eval_wer': 1.0, 'eval_runtime': 3.5318, 'eval_samples_per_second': 56.629, 'eval_steps_per_second': 7.079, 'epoch': 3.0}


 53%|█████▎    | 800/1500 [04:06<03:11,  3.65it/s]

{'loss': 5.0283, 'grad_norm': 4.516280651092529, 'learning_rate': 2.3433333333333335e-05, 'epoch': 3.2}


 57%|█████▋    | 850/1500 [04:20<03:10,  3.41it/s]

{'loss': 5.2261, 'grad_norm': 4.357248783111572, 'learning_rate': 2.18e-05, 'epoch': 3.4}


 60%|██████    | 900/1500 [04:34<02:31,  3.96it/s]

{'loss': 5.453, 'grad_norm': 19.7100772857666, 'learning_rate': 2.0133333333333336e-05, 'epoch': 3.6}


 63%|██████▎   | 950/1500 [04:48<02:35,  3.53it/s]

{'loss': 5.3564, 'grad_norm': 9.602667808532715, 'learning_rate': 1.8466666666666667e-05, 'epoch': 3.8}


 67%|██████▋   | 1000/1500 [05:02<02:10,  3.84it/s]

{'loss': 4.9006, 'grad_norm': 8.760149955749512, 'learning_rate': 1.6800000000000002e-05, 'epoch': 4.0}


                                                   
 67%|██████▋   | 1000/1500 [05:05<02:10,  3.84it/s]

{'eval_loss': 5.355025768280029, 'eval_wer': 1.0, 'eval_runtime': 3.5516, 'eval_samples_per_second': 56.313, 'eval_steps_per_second': 7.039, 'epoch': 4.0}


 70%|███████   | 1051/1500 [05:22<01:55,  3.88it/s]

{'loss': 5.1488, 'grad_norm': 0.9166431427001953, 'learning_rate': 1.5133333333333333e-05, 'epoch': 4.2}


 73%|███████▎  | 1100/1500 [05:36<01:51,  3.58it/s]

{'loss': 4.9565, 'grad_norm': 1.7678842544555664, 'learning_rate': 1.3466666666666666e-05, 'epoch': 4.4}


 77%|███████▋  | 1150/1500 [05:50<01:49,  3.19it/s]

{'loss': 5.2498, 'grad_norm': 0.9555894136428833, 'learning_rate': 1.18e-05, 'epoch': 4.6}


 80%|████████  | 1200/1500 [06:05<01:23,  3.60it/s]

{'loss': 4.9125, 'grad_norm': 2.0177369117736816, 'learning_rate': 1.0133333333333333e-05, 'epoch': 4.8}


 83%|████████▎ | 1250/1500 [06:18<01:00,  4.10it/s]

{'loss': 4.952, 'grad_norm': 1.9424035549163818, 'learning_rate': 8.466666666666666e-06, 'epoch': 5.0}


                                                   
 83%|████████▎ | 1250/1500 [06:22<01:00,  4.10it/s]

{'eval_loss': 5.36012077331543, 'eval_wer': 1.0, 'eval_runtime': 3.5295, 'eval_samples_per_second': 56.666, 'eval_steps_per_second': 7.083, 'epoch': 5.0}


 87%|████████▋ | 1301/1500 [06:38<00:51,  3.88it/s]

{'loss': 5.0425, 'grad_norm': 1.4594476222991943, 'learning_rate': 6.800000000000001e-06, 'epoch': 5.2}


 90%|█████████ | 1350/1500 [06:52<00:40,  3.75it/s]

{'loss': 5.2433, 'grad_norm': 1.1296777725219727, 'learning_rate': 5.133333333333334e-06, 'epoch': 5.4}


 93%|█████████▎| 1400/1500 [07:06<00:28,  3.48it/s]

{'loss': 5.0715, 'grad_norm': 1.506329894065857, 'learning_rate': 3.466666666666667e-06, 'epoch': 5.6}


 97%|█████████▋| 1450/1500 [07:21<00:12,  3.89it/s]

{'loss': 5.0711, 'grad_norm': 2.138375997543335, 'learning_rate': 1.8e-06, 'epoch': 5.8}


100%|██████████| 1500/1500 [07:35<00:00,  3.46it/s]

{'loss': 5.0311, 'grad_norm': 0.991845428943634, 'learning_rate': 1.3333333333333334e-07, 'epoch': 6.0}


                                                   
100%|██████████| 1500/1500 [07:38<00:00,  3.46it/s]

{'eval_loss': 5.36480712890625, 'eval_wer': 1.0, 'eval_runtime': 3.5382, 'eval_samples_per_second': 56.525, 'eval_steps_per_second': 7.066, 'epoch': 6.0}


100%|██████████| 1500/1500 [07:41<00:00,  3.25it/s]


{'train_runtime': 461.9248, 'train_samples_per_second': 25.978, 'train_steps_per_second': 3.247, 'train_loss': 5.279536122639974, 'epoch': 6.0}


In [None]:

# per_device_train_batch_size=1,
# gradient_accumulation_steps=4,
# fp16=True,


# # Enable gradient checkpointing
# model.gradient_checkpointing_enable()

# # Save the final model
# trainer.save_model(model_dir)



In [20]:
# Load the processor and model
model_dir = os.path.expanduser('~/asr_project/asr-train/model_outputs/wav2vec2-finetuned-smol')
processor = Wav2Vec2Processor.from_pretrained('facebook/wav2vec2-large-960h')
model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-large-960h')

# Load audio file
audio_file = "/home/tfc/asr_project/common_voice/cv-valid-train/cv-valid-train/sample-000000.wav"  # Replace with your audio file path
waveform, sample_rate = torchaudio.load(audio_file)


# If the sample rate is not 16kHz, resample it
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Convert to the right format for the model
input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

# Get logits from the model
with torch.no_grad():
    logits = model(input_values).logits

# Get predicted ids
predicted_ids = logits.argmax(dim=-1)

# Decode the predicted ids to text
transcription = processor.batch_decode(predicted_ids)

print(transcription)  # Print the transcription result


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['LEARNED TO RECOGNIZE OMENS AND FOLLOW THEM THE OLD KING HAD SAID']


In [None]:
# Load the processor and model
model_dir = os.path.expanduser('~/asr_project/asr-train/model_outputs/wav2vec2-finetuned-smol')
processor = Wav2Vec2Processor.from_pretrained(model_dir)
model = Wav2Vec2ForCTC.from_pretrained(model_dir)

# Load audio file
audio_file = "/home/tfc/asr_project/common_voice/cv-valid-train/cv-valid-train/sample-000000.wav"  # Replace with your audio file path
waveform, sample_rate = torchaudio.load(audio_file)


# If the sample rate is not 16kHz, resample it
if sample_rate != 16000:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform = resampler(waveform)

# Convert to the right format for the model
input_values = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_values

# Get logits from the model
with torch.no_grad():
    logits = model(input_values).logits

# Get predicted ids
predicted_ids = logits.argmax(dim=-1)

# Decode the predicted ids to text
transcription = processor.batch_decode(predicted_ids)

print(transcription)  # Print the transcription result


In [None]:
# File locations assumed in parent directory
transcription_file = os.path.expanduser(
    '~/asr_project/common_voice/cv-valid-train.csv')              # Transcription file location
audio_folder = os.path.expanduser(
    '~/asr_project/common_voice/cv-valid-train')   # Audio files directory
df = pd.read_csv(transcription_file)[['filename','text']]         # Read transcription file

# Convert mp3 to wav. Change mp3 file extension in df accordingly
df['filename'] = df['filename'].apply(
    lambda filename: convert_mp3_to_wav(
        os.path.join(audio_folder, filename)))
df.to_csv('temp.csv',index=False)                                 # Save temp copy of csv

In [None]:
# Accessing log history to find train and val losses
train_losses = []
val_losses = []

for log in trainer.state.log_history:
    if 'loss' in log:
        train_losses.append(log['loss'])  # Training loss
    if 'eval_loss' in log:
        val_losses.append(log['eval_loss'])  # Validation loss

# Get the final train and val losses
final_train_loss = train_losses[-1] if train_losses else None
final_val_loss = val_losses[-1] if val_losses else None

print(f"Final Training Loss: {final_train_loss}")
print(f"Final Validation Loss: {final_val_loss}")


250.0

In [22]:
trainer.state.log_history

[{'loss': 16.7967,
  'grad_norm': 6.2722487449646,
  'learning_rate': 4.8433333333333336e-05,
  'epoch': 0.2,
  'step': 50},
 {'loss': 15.7987,
  'grad_norm': 14.26067066192627,
  'learning_rate': 4.676666666666667e-05,
  'epoch': 0.4,
  'step': 100},
 {'loss': 14.7088,
  'grad_norm': 12.909340858459473,
  'learning_rate': 4.5100000000000005e-05,
  'epoch': 0.6,
  'step': 150},
 {'loss': 13.7017,
  'grad_norm': 13.007761001586914,
  'learning_rate': 4.3433333333333336e-05,
  'epoch': 0.8,
  'step': 200},
 {'loss': 13.6914,
  'grad_norm': 4.1002326011657715,
  'learning_rate': 4.176666666666667e-05,
  'epoch': 1.0,
  'step': 250},
 {'eval_loss': 27.835580825805664,
  'eval_wer': 1.0,
  'eval_runtime': 3.6111,
  'eval_samples_per_second': 55.385,
  'eval_steps_per_second': 6.923,
  'epoch': 1.0,
  'step': 250},
 {'loss': 13.462,
  'grad_norm': 4.646790027618408,
  'learning_rate': 4.013333333333333e-05,
  'epoch': 1.2,
  'step': 300},
 {'loss': 11.7557,
  'grad_norm': 4.622196674346924,


In [25]:
processor(text='test')

{'input_ids': [3, 3, 3, 3]}

In [21]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load the processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Access the vocabulary
vocab = processor.tokenizer.get_vocab()

# Print the vocabulary size and a sample of the vocabulary
print(f"Vocabulary size: {len(vocab)}")
print("Sample vocabulary items:")
for token, index in list(vocab.items()):  # Print the first 10 tokens
    print(f"Token: {token}, Index: {index}")


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Vocabulary size: 32
Sample vocabulary items:
Token: <pad>, Index: 0
Token: <s>, Index: 1
Token: </s>, Index: 2
Token: <unk>, Index: 3
Token: |, Index: 4
Token: E, Index: 5
Token: T, Index: 6
Token: A, Index: 7
Token: O, Index: 8
Token: N, Index: 9
Token: I, Index: 10
Token: H, Index: 11
Token: S, Index: 12
Token: R, Index: 13
Token: D, Index: 14
Token: L, Index: 15
Token: U, Index: 16
Token: M, Index: 17
Token: W, Index: 18
Token: C, Index: 19
Token: F, Index: 20
Token: G, Index: 21
Token: Y, Index: 22
Token: P, Index: 23
Token: B, Index: 24
Token: V, Index: 25
Token: K, Index: 26
Token: ', Index: 27
Token: X, Index: 28
Token: J, Index: 29
Token: Q, Index: 30
Token: Z, Index: 31


In [17]:
!rm -rf ~/asr_project/asr-train/logs/*

# Load the processor and model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Freeze feature extractor layers
# model.freeze_feature_encoder()

# Freeze all layers except the head
for param in model.parameters():
    param.requires_grad = False  # Freeze all parameters

# Assuming the head is the `classifier` in Wav2Vec2ForCTC
for param in model.lm_head.parameters():  # For the head (classifier) layer
    param.requires_grad = True  # Unfreeze the head

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

# Define the training arguments
training_args = TrainingArguments(
    output_dir=os.path.expanduser('~/asr_project/asr-train/model_outputs'),
    logging_dir=os.path.expanduser('~/asr_project/asr-train/logs'),
    per_device_train_batch_size=8,              # batch size for training
    per_device_eval_batch_size=8,               # batch size for evaluation
    num_train_epochs=3,                           # total number of training epochs
    logging_steps=50,                            # log every 100 steps
    eval_strategy="steps",                  # evaluate during training
    save_steps=500,                               # save checkpoint every 500 steps
    eval_steps=500,                               # evaluate every 500 steps
    load_best_model_at_end=True,                  # load the best model at the end of training
    fp16=True
)

# # Create the Trainer instance
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=dataset['train'],                  # your training dataset
#     eval_dataset=dataset['val'],                      # your validation dataset
# )


# Define huggingface trainer
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    processing_class= processor.feature_extractor
)

# Start the training
trainer.train()

# Save the final model
trainer.save_model(os.path.expanduser('~/asr_project/asr-train/model_outputs/wav2vec2-finetuned-final'))



Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 50/51393 [00:13<3:40:37,  3.88it/s]

{'loss': 16.6252, 'grad_norm': 6.919604778289795, 'learning_rate': 4.995427392835601e-05, 'epoch': 0.0}


  0%|          | 100/51393 [00:25<3:40:29,  3.88it/s]

{'loss': 15.7953, 'grad_norm': 5.722819805145264, 'learning_rate': 4.990562917128792e-05, 'epoch': 0.01}


  0%|          | 150/51393 [00:38<3:56:36,  3.61it/s]

{'loss': 15.4045, 'grad_norm': 5.995153903961182, 'learning_rate': 4.9856984414219837e-05, 'epoch': 0.01}


  0%|          | 200/51393 [00:51<3:03:57,  4.64it/s]

{'loss': 14.2438, 'grad_norm': 4.1416754722595215, 'learning_rate': 4.980833965715175e-05, 'epoch': 0.01}


  0%|          | 251/51393 [01:03<2:57:46,  4.79it/s]

{'loss': 12.9417, 'grad_norm': 10.482752799987793, 'learning_rate': 4.975969490008367e-05, 'epoch': 0.01}


  1%|          | 300/51393 [01:16<3:45:51,  3.77it/s]

{'loss': 12.661, 'grad_norm': 9.821016311645508, 'learning_rate': 4.9712023038156955e-05, 'epoch': 0.02}


  1%|          | 351/51393 [01:29<3:09:16,  4.49it/s]

{'loss': 12.0799, 'grad_norm': 5.653285980224609, 'learning_rate': 4.966337828108887e-05, 'epoch': 0.02}


  1%|          | 401/51393 [01:42<2:58:17,  4.77it/s]

{'loss': 11.4288, 'grad_norm': 5.822386741638184, 'learning_rate': 4.961473352402078e-05, 'epoch': 0.02}


  1%|          | 451/51393 [01:54<3:54:32,  3.62it/s]

{'loss': 11.6048, 'grad_norm': 8.978076934814453, 'learning_rate': 4.95660887669527e-05, 'epoch': 0.03}


  1%|          | 500/51393 [02:07<4:07:09,  3.43it/s]

{'loss': 11.1998, 'grad_norm': 8.123398780822754, 'learning_rate': 4.951744400988462e-05, 'epoch': 0.03}




OutOfMemoryError: CUDA out of memory. Tried to allocate 2.35 GiB. GPU 0 has a total capacity of 15.69 GiB of which 1.63 GiB is free. Process 1941 has 249.95 MiB memory in use. Including non-PyTorch memory, this process has 13.38 GiB memory in use. Of the allocated memory 10.55 GiB is allocated by PyTorch, and 2.52 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Load the processor and model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Function to count trainable parameters
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Get the number of trainable parameters
num_trainable_params = count_trainable_parameters(model)

print(f'Number of trainable parameters: {num_trainable_params}')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 315461792


In [20]:
# Load the processor and model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

model.freeze_feature_encoder()

# Function to count trainable parameters
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Get the number of trainable parameters
num_trainable_params = count_trainable_parameters(model)

print(f'Number of trainable parameters: {num_trainable_params}')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 311261344


In [None]:
# Load the processor and model
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-960h", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

# Freeze all layers except the head
for param in model.parameters():
    param.requires_grad = False  # Freeze all parameters

# Assuming the head is the `classifier` in Wav2Vec2ForCTC
for param in model.lm_head.parameters():  # For the head (classifier) layer
    param.requires_grad = True  # Unfreeze the head

# Function to count trainable parameters
def count_trainable_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# Get the number of trainable parameters
num_trainable_params = count_trainable_parameters(model)

print(f'Number of trainable parameters: {num_trainable_params}')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Number of trainable parameters: 32800
