In [2]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer

[0mCollecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-y6km0xj7
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-y6km0xj7
  Resolved https://github.com/huggingface/transformers to commit 3658488ff77ff8d45101293e749263acf437f4d5
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25ldone
[?25h  Created wheel for transformers: filename=transformers-4.30.0.dev0-py3-none-any.whl size=7120338 sha256=b0a4b3006c9c4c991f78dedb433a1f0599ed0a1a0f09c127e151814cd229004f
  Stored in directory: /tmp/pip-ephem-wheel-cache-d095xovh/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformers
I

In [108]:
from datasets import Dataset, DatasetDict
from transformers import WhisperFeatureExtractor

# Column names found in both CSV manifests (not referenced by the loading code below).
column_names = ['segments_start', 'segments_end', 'sentence', 'chunk_name', 'parent_name', 'paths']

# Locations of the per-split CSV manifests.
train_csv_file_path, test_csv_file_path = "train.csv", "test.csv"

# Read each manifest into its own Dataset, train first and then test.
train_dataset, test_dataset = (
    Dataset.from_csv(csv_path, delimiter=',')
    for csv_path in (train_csv_file_path, test_csv_file_path)
)

# Bundle both splits under the keys used throughout the rest of the notebook.
splits = {'train': train_dataset, 'test': test_dataset}
dataset_dict = DatasetDict(splits)

# Show the split names, columns and row counts.
print(dataset_dict)

Found cached dataset csv (/home/.cache/huggingface/datasets/csv/default-cc963ed38855d479/0.0.0)
Found cached dataset csv (/home/.cache/huggingface/datasets/csv/default-2cd926a5f467cc68/0.0.0)


DatasetDict({
    train: Dataset({
        features: ['segments_start', 'segments_end', 'sentence', 'chunk_name', 'parent_name', 'paths'],
        num_rows: 4918
    })
    test: Dataset({
        features: ['segments_start', 'segments_end', 'sentence', 'chunk_name', 'parent_name', 'paths'],
        num_rows: 869
    })
})


In [24]:
# Display the first training row to sanity-check the column values.
dataset_dict["train"][0]

{'segments_start': 0.0,
 'segments_end': 9.0,
 'sentence': 'दोस्तों bash में nested और multilevel if statement के spoken tutorial में आपका स्वागत है',
 'chunk_name': '100051_GigsEKRBMhvNwolC_0000',
 'parent_name': 'GigsEKRBMhvNwolC',
 'paths': 'data/train_segments/100051_GigsEKRBMhvNwolC_0000.wav'}

In [25]:
# Load the feature extractor published with the "openai/whisper-small" checkpoint.
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [26]:
from transformers import WhisperTokenizer

# Load the tokenizer published with the same checkpoint, configured for the "transcribe" task.
# NOTE(review): no `language=` argument is passed, and the round-trip output further down
# shows no language token in the encoded prefix — confirm this is intended for this data.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")



In [27]:
# Round-trip one training transcript through the tokenizer: encode it to label ids,
# decode those ids with and without special tokens, and check the text survives.
input_str = dataset_dict["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids

decoded_with_special, decoded_str = (
    tokenizer.decode(labels, skip_special_tokens=skip_special)
    for skip_special in (False, True)
)

report_lines = [
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
]
print("\n".join(report_lines))

Input:                 दोस्तों bash में nested और multilevel if statement के spoken tutorial में आपका स्वागत है
Decoded w/ special:    <|startoftranscript|><|transcribe|><|notimestamps|>दोस्तों bash में nested और multilevel if statement के spoken tutorial में आपका स्वागत है<|endoftext|>
Decoded w/out special: दोस्तों bash में nested और multilevel if statement के spoken tutorial में आपका स्वागत है
Are equal:             True


In [28]:
from transformers import WhisperProcessor
import soundfile as sf
# Load the combined processor for the same checkpoint; the collator and the inference
# cells below use its `.feature_extractor` and `.tokenizer` / `.batch_decode`.
processor = WhisperProcessor.from_pretrained("openai/whisper-small", task="transcribe")

In [29]:
def prepare_dataset(batch):
    """Turn one manifest row into model inputs.

    Reads the audio file named by ``batch['paths']``, converts it to mono at the
    feature extractor's sampling rate, and stores:

    * ``batch["input_features"]`` - log-Mel features for the clip
    * ``batch["labels"]`` - token ids of ``batch["sentence"]``

    Args:
        batch: a single example (dict-like) with at least 'paths' and 'sentence'.

    Returns:
        The same ``batch`` object with the two fields above added.
    """
    # soundfile returns the file's native rate; it does not resample.
    audio, sample_rate = sf.read(batch['paths'])

    # Multi-channel files come back as (frames, channels); average down to mono so
    # the feature extractor does not interpret the 2-D array as a batch of clips.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # The feature extractor rejects audio whose rate differs from its own, so
    # resample here instead of relying on every file already matching.
    target_rate = feature_extractor.sampling_rate
    if sample_rate != target_rate:
        from math import gcd
        from scipy.signal import resample_poly

        common = gcd(int(sample_rate), int(target_rate))
        audio = resample_poly(audio, int(target_rate) // common, int(sample_rate) // common)
        sample_rate = target_rate

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio, sampling_rate=sample_rate).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

# Apply prepare_dataset to every row of every split using 4 worker processes, and drop
# the original CSV columns so only the fields prepare_dataset adds are kept.
# NOTE(review): this rebinds `dataset_dict`; afterwards it no longer has 'paths' or
# 'sentence', so re-running this cell (or any cell that reads those columns from
# `dataset_dict`) requires re-running the loading cell first.
dataset_dict = dataset_dict.map(prepare_dataset, remove_columns=dataset_dict.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/4918 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/869 [00:00<?, ? examples/s]

In [30]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate pre-processed speech examples into one padded batch.

    Every feature is a dict with "input_features" (audio features) and "labels"
    (token ids). The two are padded separately because they have different lengths
    and use different padders: ``processor.feature_extractor.pad`` for the audio and
    ``processor.tokenizer.pad`` for the labels. Padded label positions are set to
    -100, and a start token shared by every label sequence is removed because the
    model adds its own start token later.
    """

    # Object exposing `.feature_extractor.pad(...)` and `.tokenizer.pad(...)`.
    processor: Any
    # Id of the start token the model prepends to the decoder inputs. When left as
    # None it is looked up from the tokenizer (see _leading_ids_to_strip).
    decoder_start_token_id: Union[int, None] = None

    def _leading_ids_to_strip(self) -> List[int]:
        """Return the token ids that count as a redundant leading start token."""
        tokenizer = self.processor.tokenizer
        start_id = self.decoder_start_token_id
        if start_id is None:
            # Labels produced by the Whisper tokenizer open with <|startoftranscript|>.
            # Per the WhisperTokenizer documentation its bos token is a different
            # token, so comparing against bos_token_id alone never strips anything.
            lookup = getattr(tokenizer, "convert_tokens_to_ids", None)
            if lookup is not None:
                start_id = lookup("<|startoftranscript|>")
                # A tokenizer without that token answers with its unk id (or None);
                # treat that as "no such start token".
                if start_id == getattr(tokenizer, "unk_token_id", None):
                    start_id = None
        candidates = [tokenizer.bos_token_id, start_id]
        return [token_id for token_id in candidates if token_id is not None]

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad audio features and labels and return them as one batch.

        Args:
            features: list of examples, each with "input_features" and "labels".

        Returns:
            The padded audio batch with an added "labels" tensor in which padding
            positions are -100.
        """
        # Audio inputs: only need to be stacked/padded into torch tensors.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Labels: pad the token id sequences to the longest one in the batch.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so those positions are ignored by the loss.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence starts with the same start token, cut it here since
        # it is prepended again later.
        first_column = labels[:, 0]
        if any(bool((first_column == token_id).all()) for token_id in self._leading_ids_to_strip()):
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch


In [31]:
# Build the batch collator around the processor loaded above; it is handed to the trainer below.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [32]:
import evaluate

# Load the "wer" metric from `evaluate`; compute_metrics below calls metric.compute on it.
metric = evaluate.load("wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

In [33]:
def compute_metrics(pred):
    """Score a set of predictions with the "wer" metric, as a percentage.

    Args:
        pred: object with ``predictions`` (predicted token ids) and ``label_ids``
            (reference token ids, with -100 marking ignored positions).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.
    """
    pred_ids = pred.predictions
    # Work on a copy so the caller's label array keeps its -100 markers.
    label_ids = pred.label_ids.copy()

    # replace -100 with the pad_token_id so the ids can be decoded
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # decode both sides to text, dropping special tokens
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [63]:
from transformers import WhisperForConditionalGeneration

# Load the pretrained "openai/whisper-small" weights as the model to fine-tune.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [64]:
# Clear the forced decoder ids and the suppressed-token list on the loaded model's config.
# NOTE(review): presumably so generation is not constrained by the checkpoint's
# defaults during fine-tuning and evaluation — confirm.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [42]:
from transformers import Seq2SeqTrainingArguments

# Training configuration: 400 optimizer steps, evaluating and checkpointing every 100
# steps, with generation-based evaluation so compute_metrics receives decoded ids.
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-hi",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=400,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=100,
    eval_steps=100,
    logging_steps=25,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    # WER is an error rate: the best checkpoint is the one with the LOWEST value.
    greater_is_better=False,
)

In [43]:
from transformers import Seq2SeqTrainer

# Wire the model, the mapped train/test splits, the collator and the WER callback
# into a Seq2SeqTrainer.
# NOTE(review): no cell visible in this notebook calls `trainer.train()`, yet a later
# cell loads "whisper-small-hi/checkpoint-400" — confirm the training cell was not
# dropped, otherwise a fresh Restart & Run All has no checkpoint to load.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # The feature extractor (not the text tokenizer) is what gets passed in this slot.
    tokenizer=processor.feature_extractor,
)

In [91]:
# Write the processor's files into the training output directory so it can be
# reloaded from that directory later.
processor.save_pretrained(training_args.output_dir)

In [93]:
from transformers import AutoFeatureExtractor

# Reload the processor from the output directory and the model from a saved checkpoint.
# This rebinds the notebook-level `processor` and `model` names used by the cells below.
# NOTE(review): "checkpoint-400" matches max_steps=400 in the training arguments, so it
# is presumably the final checkpoint rather than the best one — confirm which is wanted.
processor = WhisperProcessor.from_pretrained("whisper-small-hi", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("whisper-small-hi/checkpoint-400")


In [97]:
# Read one clip (the first test row) as a quick check that audio loading works.
# `sf.read` needs a file argument; called with none it raises TypeError.
audio, sr = sf.read(test_dataset[0]["paths"])

In [121]:
# Transcribe the first ten test clips and print each prediction next to its reference.
# Rows are read from `test_dataset` (the raw CSV split): `dataset_dict` was mapped
# through prepare_dataset with its original columns removed, so it no longer carries
# the 'paths' and 'sentence' fields this loop needs.
for i in range(min(10, len(test_dataset))):
    example = test_dataset[i]  # fetch the row once instead of once per field

    audio, sr = sf.read(example['paths'])
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    input_features = inputs.input_features

    generated_ids = model.generate(inputs=input_features)

    transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    print("Predicted: ",transcription)
    print("Truth: ", example['sentence'])
    print('\n')

Predicted:  libreoffice impress में एक प्रस्तुति document बनाना और बुन्यादि formatting के इस spoken tutorial में आपका स्वागा
Truth:  लिबर ऑफिस impress में एक प्रस्तुति document बनाना और बुनियादी formatting के इस spoken tutorial में आपका स्वागत है


Predicted:  इस tutorial में हम impress window के भागों के बारे में सीखेंगे और कैसे slide insert करें और copy करें font तथा font को format करना सीखेंगे
Truth:  इस tutorial में हम impress window के भागों के बारे में सीखेंगे और कैसे स्लाइड इन्सर्ट करें और कॉपी करें फॉन्ट तथा फॉन्ट को फॉर्मेट करना सीखेंगे


Predicted:  यहाँ हम अपने operating system के रूप में gnu linux और libreoffice version 334 का उपयोग कर रहे हैं
Truth:  यहाँ हम अपने ऑपरेटिंग सिस्टम के रूप में gnu/linux और लिबरऑफिस वर्जन 334 का उपयोग कर रहे हैं


Predicted:  चलिए अपनी प्रस्तुति sampleimpress open करते हैं जिसे पिछले tutorial में प्रद्शित हैं
Truth:  चलिए अपनी प्रस्तुति प्रेजैटेशन sample impress open करते हैं जिसे पिछले tutorial में बनाया था


Predicted:  चलिए देते हैं कि scree