In [1]:
from huggingface_hub import notebook_login

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from datasets import load_dataset, DatasetDict

# Set the desired cache directory
cache_dir = "E:\\mp3_processing"

common_voice = DatasetDict()
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_16_1", "lg", split="train+validation", use_auth_token=True, cache_dir=cache_dir)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_16_1", "lg", split="test", use_auth_token=True, cache_dir=cache_dir)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

In [5]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-medium")

In [6]:
from transformers import WhisperTokenizer,WhisperProcessor

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-medium", language="english", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-medium", language="english", task="transcribe")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [8]:
# Normal Data Preparation 
"""
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array 
    input_features = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    labels = tokenizer(batch["sentence"]).input_ids

    # truncate or split the sequences if they exceed the maximum length
    max_length = 1024  # Adjust this value based on your model's maximum sequence length
    if len(input_features) > max_length:
        input_features = input_features[:max_length]
    if len(labels) > max_length:
        labels = labels[:max_length]
    batch["input_features"] = input_features
    batch["labels"] = labels

    return batch
"""

In [8]:
import librosa
import numpy as np

def apply_augmentation(audio, sample_rate):
    # Pitch shifting
    if np.random.rand() < 0.5:
        pitch_shift_range = (-2, 2)
        pitch_shift = np.random.randint(*pitch_shift_range)
        audio = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=pitch_shift)
    
    # Time stretching
    if np.random.rand() < 0.5:
        stretch_rate = np.random.uniform(0.8, 1.2)
        audio = librosa.effects.time_stretch(audio, rate=stretch_rate)
    
    # Gaussian noise injection
    if np.random.rand() < 0.5:
        noise_scale = 0.005
        noise = np.random.normal(0, noise_scale, audio.shape)
        audio = audio + noise
        audio = np.clip(audio, -1, 1)
    
    return audio

In [9]:
def prepare_dataset(batch):
    audio = batch["audio"]
    sample_rate = audio["sampling_rate"]

    # Apply data augmentation with a configurable probability
    if np.random.rand() < 0.5:
        audio_array = audio["array"]
        audio_array = apply_augmentation(audio_array, sample_rate)
        audio["array"] = audio_array

    # Compute log-Mel input features from input audio array
    input_features = feature_extractor(audio["array"], sampling_rate=sample_rate).input_features[0]

    # Encode target text to label ids
    labels = tokenizer(batch["sentence"]).input_ids

    # Truncation
    max_length = 1024  # Adjust this value based on your model's maximum sequence length
    if len(input_features) > max_length:
        input_features = input_features[:max_length]
    if len(labels) > max_length:
        labels = labels[:max_length]

    batch["input_features"] = input_features
    batch["labels"] = labels

    return batch


In [10]:
 #common_voice_processed = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"])

Map:   0%|          | 0/84400 [00:00<?, ? examples/s]

Map:   0%|          | 0/13358 [00:00<?, ? examples/s]

In [11]:
import torch
from datasets import Dataset, DatasetDict

# Load the dataset using torch.load
#torch.save(common_voice_processed,'common_voice_processed_mid_eng_AUG.pt')
common_voice_processed = torch.load('common_voice_processed_mid_eng_AUG.pt')


In [12]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

# Let's initialise the data collator just defined
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [13]:
import evaluate
metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = processor.tokenizer.pad_token_id
    # we do not want to group tokens when computing the metrics
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [14]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")
model.config.forced_decoder_ids = None
model.generation_config.language = "english"  
model.config.suppress_tokens = []

In [15]:
print(torch.cuda.is_available())

True


In [16]:
"""
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medium-Lg_AUG",
    per_device_train_batch_size=16, 
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True, 
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)
"""

'\nfrom transformers import Seq2SeqTrainingArguments\n\ntraining_args = Seq2SeqTrainingArguments(\n    output_dir="./whisper-medium-Lg_AUG",\n    per_device_train_batch_size=16, \n    gradient_accumulation_steps=1,\n    learning_rate=1e-5,\n    warmup_steps=500,\n    max_steps=4000,\n    gradient_checkpointing=True,\n    fp16=True,\n    evaluation_strategy="steps",\n    per_device_eval_batch_size=8,\n    predict_with_generate=True, \n    generation_max_length=225,\n    save_steps=1000,\n    eval_steps=1000,\n    logging_steps=25,\n    report_to=["tensorboard"],\n    load_best_model_at_end=True,\n    metric_for_best_model="wer",\n    greater_is_better=False,\n    push_to_hub=True,\n)\n'

In [17]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-mid-eng_AUG",
    per_device_train_batch_size=32,  # Increase batch size to utilize GPU memory
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,  # Increase eval batch size to utilize GPU memory
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=50,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [18]:
device = "cuda"

In [19]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model.to(device),
    train_dataset=common_voice_processed['train'],
    eval_dataset=common_voice_processed['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [20]:
trainer.train()

  0%|          | 0/4000 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


{'loss': 5.0106, 'grad_norm': 20.7950439453125, 'learning_rate': 9.200000000000001e-07, 'epoch': 0.02}
{'loss': 3.1883, 'grad_norm': 12.457552909851074, 'learning_rate': 1.9200000000000003e-06, 'epoch': 0.04}
{'loss': 2.1165, 'grad_norm': 11.790446281433105, 'learning_rate': 2.92e-06, 'epoch': 0.06}
{'loss': 1.3118, 'grad_norm': 10.1051607131958, 'learning_rate': 3.920000000000001e-06, 'epoch': 0.08}
{'loss': 1.0356, 'grad_norm': 9.566577911376953, 'learning_rate': 4.92e-06, 'epoch': 0.09}
{'loss': 0.8957, 'grad_norm': 8.466584205627441, 'learning_rate': 5.92e-06, 'epoch': 0.11}
{'loss': 0.7944, 'grad_norm': 7.1853790283203125, 'learning_rate': 6.92e-06, 'epoch': 0.13}
{'loss': 0.7543, 'grad_norm': 8.216296195983887, 'learning_rate': 7.92e-06, 'epoch': 0.15}
{'loss': 0.7401, 'grad_norm': 7.857621669769287, 'learning_rate': 8.920000000000001e-06, 'epoch': 0.17}
{'loss': 0.6737, 'grad_norm': 6.249478816986084, 'learning_rate': 9.920000000000002e-06, 'epoch': 0.19}
{'loss': 0.6651, 'grad_

  0%|          | 0/835 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.5857741236686707, 'eval_wer': 49.546397234777984, 'eval_runtime': 9035.7324, 'eval_samples_per_second': 1.478, 'eval_steps_per_second': 0.092, 'epoch': 0.38}




{'loss': 0.4708, 'grad_norm': 6.1445512771606445, 'learning_rate': 8.44e-06, 'epoch': 0.4}
{'loss': 0.4689, 'grad_norm': 5.6822896003723145, 'learning_rate': 8.297142857142859e-06, 'epoch': 0.42}
{'loss': 0.4702, 'grad_norm': 5.341933727264404, 'learning_rate': 8.154285714285715e-06, 'epoch': 0.44}
{'loss': 0.4664, 'grad_norm': 5.366504669189453, 'learning_rate': 8.011428571428573e-06, 'epoch': 0.45}
{'loss': 0.4639, 'grad_norm': 6.1702070236206055, 'learning_rate': 7.86857142857143e-06, 'epoch': 0.47}
{'loss': 0.4475, 'grad_norm': 5.6362504959106445, 'learning_rate': 7.725714285714286e-06, 'epoch': 0.49}
{'loss': 0.434, 'grad_norm': 5.721033096313477, 'learning_rate': 7.5828571428571444e-06, 'epoch': 0.51}
{'loss': 0.423, 'grad_norm': 6.321561336517334, 'learning_rate': 7.440000000000001e-06, 'epoch': 0.53}
{'loss': 0.4372, 'grad_norm': 6.102550983428955, 'learning_rate': 7.297142857142858e-06, 'epoch': 0.55}
{'loss': 0.4223, 'grad_norm': 5.335416793823242, 'learning_rate': 7.15428571

  0%|          | 0/835 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.473019540309906, 'eval_wer': 41.73145440042541, 'eval_runtime': 8911.2303, 'eval_samples_per_second': 1.499, 'eval_steps_per_second': 0.094, 'epoch': 0.76}




{'loss': 0.3958, 'grad_norm': 6.597301006317139, 'learning_rate': 5.582857142857143e-06, 'epoch': 0.78}
{'loss': 0.38, 'grad_norm': 6.393946170806885, 'learning_rate': 5.4400000000000004e-06, 'epoch': 0.8}
{'loss': 0.3871, 'grad_norm': 5.489497661590576, 'learning_rate': 5.297142857142858e-06, 'epoch': 0.82}
{'loss': 0.3719, 'grad_norm': 5.904890060424805, 'learning_rate': 5.154285714285715e-06, 'epoch': 0.83}
{'loss': 0.3953, 'grad_norm': 6.2761616706848145, 'learning_rate': 5.011428571428571e-06, 'epoch': 0.85}
{'loss': 0.375, 'grad_norm': 4.486977577209473, 'learning_rate': 4.868571428571429e-06, 'epoch': 0.87}
{'loss': 0.3528, 'grad_norm': 4.533949375152588, 'learning_rate': 4.725714285714286e-06, 'epoch': 0.89}
{'loss': 0.3626, 'grad_norm': 6.124539852142334, 'learning_rate': 4.5828571428571435e-06, 'epoch': 0.91}
{'loss': 0.3782, 'grad_norm': 5.480850696563721, 'learning_rate': 4.440000000000001e-06, 'epoch': 0.93}
{'loss': 0.3762, 'grad_norm': 5.472536563873291, 'learning_rate':

  0%|          | 0/835 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.4351734519004822, 'eval_wer': 38.252592395639454, 'eval_runtime': 8891.249, 'eval_samples_per_second': 1.502, 'eval_steps_per_second': 0.094, 'epoch': 1.14}




{'loss': 0.2486, 'grad_norm': 4.46259069442749, 'learning_rate': 2.725714285714286e-06, 'epoch': 1.16}
{'loss': 0.2591, 'grad_norm': 3.9345345497131348, 'learning_rate': 2.582857142857143e-06, 'epoch': 1.18}
{'loss': 0.2711, 'grad_norm': 3.9948229789733887, 'learning_rate': 2.4400000000000004e-06, 'epoch': 1.19}
{'loss': 0.2565, 'grad_norm': 4.853420734405518, 'learning_rate': 2.297142857142857e-06, 'epoch': 1.21}
{'loss': 0.261, 'grad_norm': 4.33436393737793, 'learning_rate': 2.1542857142857147e-06, 'epoch': 1.23}
{'loss': 0.2465, 'grad_norm': 4.560461521148682, 'learning_rate': 2.0114285714285715e-06, 'epoch': 1.25}
{'loss': 0.2471, 'grad_norm': 4.5200395584106445, 'learning_rate': 1.8685714285714289e-06, 'epoch': 1.27}
{'loss': 0.2507, 'grad_norm': 4.746490478515625, 'learning_rate': 1.7257142857142858e-06, 'epoch': 1.29}
{'loss': 0.2458, 'grad_norm': 4.873831748962402, 'learning_rate': 1.582857142857143e-06, 'epoch': 1.31}
{'loss': 0.2495, 'grad_norm': 4.331137657165527, 'learning_

  0%|          | 0/835 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


{'eval_loss': 0.4154386520385742, 'eval_wer': 37.69529380483914, 'eval_runtime': 9267.7655, 'eval_samples_per_second': 1.441, 'eval_steps_per_second': 0.09, 'epoch': 1.52}


There were missing keys in the checkpoint model loaded: ['proj_out.weight'].


{'train_runtime': 67990.7509, 'train_samples_per_second': 1.883, 'train_steps_per_second': 0.059, 'train_loss': 0.5294672603607178, 'epoch': 1.52}


TrainOutput(global_step=4000, training_loss=0.5294672603607178, metrics={'train_runtime': 67990.7509, 'train_samples_per_second': 1.883, 'train_steps_per_second': 0.059, 'train_loss': 0.5294672603607178, 'epoch': 1.52})

: 