In [1]:
!pip install transformers gdown soundfile jiwer datasets
!pip install accelerate -U
!mkdir ./dataset

Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting jiwer
  Downloading jiwer-3.0.3-py3-none-any.whl (21 kB)
Installing collected packages: jiwer, gdown
Successfully installed gdown-4.7.1 jiwer-3.0.3


In [3]:
# Copy everything from the attached Kaggle input `slu-utils` into the working
# directory. Presumably this is what makes `import utils` resolvable — confirm.
!cp -r /kaggle/input/slu-utils/* /kaggle/working/

In [11]:
# Fetch the competition files through the project helper. The recorded output
# shows train_data.zip, public_test.zip and train.jsonl being written to
# /kaggle/working/dataset.
import utils
utils.download_data()

Downloading...
From (uriginal): https://drive.google.com/uc?id=1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ
From (redirected): https://drive.google.com/uc?id=1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ&confirm=t&uuid=4d0f47f5-4580-42f7-94d6-371d2d782575
To: /kaggle/working/dataset/train_data.zip
100%|██████████| 733M/733M [00:03<00:00, 226MB/s]  
Downloading...
From (uriginal): https://drive.google.com/uc?id=1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64
From (redirected): https://drive.google.com/uc?id=1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64&confirm=t&uuid=4530ef79-4e57-4cc4-9b47-83aa2c5600e9
To: /kaggle/working/dataset/public_test.zip
100%|██████████| 131M/131M [00:00<00:00, 283MB/s]  
Downloading...
From: https://drive.google.com/uc?id=1K_07kix1OgBGO2FNPh-Lxqr1yLbtqFYt
To: /kaggle/working/dataset/train.jsonl
100%|██████████| 3.30M/3.30M [00:00<00:00, 199MB/s]


In [None]:
!unzip /kaggle/working/dataset/train_data.zip -d /kaggle/working/dataset/train

In [5]:
import os, glob, re, torch, json, utils, numpy as np, soundfile as sf
from functools import partial
from datasets import load_metric
from torch.utils.data import DataLoader, Dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor, TrainingArguments, Trainer



In [7]:
class WhisperDataset(Dataset):
    """Map-style dataset that turns audio files into Whisper processor outputs.

    Args:
        processor: Callable invoked as
            ``processor(speech, text=label, sampling_rate=samplerate)``; its
            result must expose ``input_features`` and, when a transcript was
            given, ``labels``.
        root_path: Directory that contains the audio files.
        files_id: Sequence of file names relative to ``root_path``.
        labels: Optional sequence aligned with ``files_id``; each entry is a
            mapping with a ``"sentence"`` key holding the transcript. ``None``
            for unlabelled (inference) data.
    """

    # Punctuation stripped from transcripts. Compiled once as a raw string; the
    # previous non-raw literal relied on invalid escape sequences such as "\,".
    _PUNCTUATION = re.compile(r'[,?.!\-;:"]')

    def __init__(self, processor, root_path, files_id, labels=None):
        self.processor = processor
        self.root_path = root_path
        self.files_id = files_id
        self.labels = labels

    @classmethod
    def _clean_text(cls, txt):
        """Lower-case ``txt`` and remove the punctuation in ``_PUNCTUATION``."""
        return cls._PUNCTUATION.sub('', txt.lower())

    def _process_sound_file(self, idx):
        """Read the ``idx``-th audio file and run it through the processor."""
        speech, samplerate = sf.read(os.path.join(self.root_path, self.files_id[idx]))
        # soundfile returns a (frames, channels) array for multi-channel files;
        # average the channels so the processor always receives a 1-D signal.
        if getattr(speech, "ndim", 1) > 1:
            speech = speech.mean(axis=1)
        label = None
        if self.labels is not None:
            label = self._clean_text(self.labels[idx]["sentence"])
        return self.processor(speech, text=label, sampling_rate=samplerate)

    def __len__(self):
        return len(self.files_id)

    def __getitem__(self, idx):
        """Return a dict with ``input_features``, ``labels`` (or None) and ``file_id``."""
        data = self._process_sound_file(idx)
        return {
            "input_features": data.input_features,
            "labels": data.labels if "labels" in data else None,
            "file_id": self.files_id[idx],
        }

In [8]:
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate dataset items into one padded batch.

    Audio features are padded with ``processor.feature_extractor.pad`` and
    label token ids with ``processor.tokenizer.pad``. Items whose ``labels``
    are ``None`` (decided from the first item) produce a batch that carries
    ``file_id`` instead of ``labels``.
    """

    def __init__(self, processor=None):
        self.processor = processor

    def __call__(self, features):
        # Each item stores its features wrapped in a length-1 sequence; unwrap it.
        audio_items = []
        for item in features:
            audio_items.append({"input_features": item["input_features"][0]})
        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")

        has_labels = features[0]["labels"] is not None
        if not has_labels:
            batch["file_id"] = [item["file_id"] for item in features]
            return batch

        token_items = [{"input_ids": item["labels"]} for item in features]
        padded = self.processor.tokenizer.pad(token_items, return_tensors="pt")
        # Padding positions become -100.
        padding_positions = padded.attention_mask.ne(1)
        labels = padded["input_ids"].masked_fill(padding_positions, -100)

        # Drop the leading BOS column only when every row starts with it.
        bos_id = self.processor.tokenizer.bos_token_id
        starts_with_bos = (labels[:, 0] == bos_id).all().cpu().item()
        if starts_with_bos:
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

In [9]:
def train_test_split(root_path, notation_file, test_size=0.3, processor=None, seed=None):
    """Load the annotation file and randomly split it into train/validation subsets.

    Args:
        root_path: Directory that contains the audio files.
        notation_file: Path handed to ``utils.load_annotation``; every loaded
            entry must provide a ``"file"`` key (and ``"sentence"``, which
            ``WhisperDataset`` reads for the transcript).
        test_size: Fraction of the samples assigned to the validation subset.
        processor: Processor passed to ``WhisperDataset``. When omitted, the
            notebook-level ``processor`` variable is looked up lazily, the first
            time an item is actually read.
        seed: Optional integer seed that makes the random split reproducible.

    Returns:
        ``(train_set, valid_set)`` as produced by ``torch.utils.data.random_split``.
    """
    notations = utils.load_annotation(notation_file)

    if processor is None:
        # Existing callers do not pass a processor, and the notebook only loads
        # one in a later cell than the split. Resolve the global at call time so
        # those callers keep working once `processor` exists.
        def _notebook_processor(*args, **kwargs):
            try:
                late_bound = globals()["processor"]
            except KeyError:
                raise NameError(
                    "train_test_split was called without `processor` and no global "
                    "`processor` is defined; load the WhisperProcessor first."
                ) from None
            return late_bound(*args, **kwargs)

        processor = _notebook_processor

    # WhisperDataset's first positional argument is the processor. It was
    # previously omitted, which shifted every argument by one: len() still
    # worked, but reading any item failed.
    dataset = WhisperDataset(processor, root_path, [i["file"] for i in notations], notations)
    N = len(dataset)
    print(f"Len dataset: {N}")
    train_size = int(N * (1 - test_size))
    lengths = [train_size, N - train_size]
    if seed is None:
        train_set, valid_set = torch.utils.data.random_split(dataset, lengths)
    else:
        generator = torch.Generator().manual_seed(seed)
        train_set, valid_set = torch.utils.data.random_split(dataset, lengths, generator=generator)
    return train_set, valid_set

In [13]:
# Split the annotated training audio into train/validation subsets, holding out
# 30% of the samples for validation.
train_ds, valid_ds = train_test_split("/kaggle/working/dataset/train/Train/", "/kaggle/working/dataset/train.jsonl", test_size=0.3)
# Last expression of the cell: display both subset sizes.
len(train_ds), len(valid_ds)

Len dataset: 7490


(5243, 2247)

In [14]:
# Load the processor and the seq2seq model from the same Hub repository.
# NOTE(review): the repository name suggests a Dutch ("nl") fine-tune while the
# processor is configured with language="Vietnamese" — confirm this starting
# checkpoint is intended.
processor = WhisperProcessor.from_pretrained("GeoffVdr/whisper-medium-nlcv11", language="Vietnamese", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("GeoffVdr/whisper-medium-nlcv11")

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

In [15]:
# Freeze every parameter of the wrapped base model ...
for weight in model.model.parameters():
    weight.requires_grad_(False)
# ... then re-enable gradients for the output projection only.
# NOTE(review): if `proj_out` shares its weight tensor with another layer, that
# layer becomes trainable again as well — confirm this is intended.
for weight in model.proj_out.parameters():
    weight.requires_grad_(True)

In [16]:
wer_metric = load_metric("wer")


def compute_metrics(pred):
    """Compute the word error rate for one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` and ``label_ids``.
            With ``predict_with_generate=True`` the predictions are generated
            token ids of shape (batch, seq); a 3-D array is treated as logits
            and reduced with argmax for backward compatibility.

    Returns:
        ``{"wer": <float>}``.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    # The trainer is configured to generate, so predictions are already token
    # ids. Taking argmax over them (as if they were logits) collapsed every
    # sequence to a single index and made the metric meaningless.
    if pred_ids.ndim == 3:
        pred_ids = pred_ids.argmax(axis=-1)

    pad_id = processor.tokenizer.pad_token_id
    # -100 marks ignored positions; map it back to the pad token so the ids can
    # be decoded. np.where builds new arrays instead of mutating `pred` in place.
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    label_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # Drop special tokens (pad / start-of-transcript / language / task markers)
    # so they are not counted as words. `group_tokens` is a CTC-tokenizer option
    # and is not a Whisper decoding option.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [18]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # --- output / checkpointing ---
    output_dir="./whisper_v1.0",
    save_steps=1000,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,  # lower WER is better
    # --- optimisation schedule ---
    max_steps=28000,
    learning_rate=5e-5,
    lr_scheduler_type="constant_with_warmup",
    warmup_steps=2000,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    # --- memory / precision ---
    gradient_checkpointing=True,
    fp16=True,
    fp16_full_eval=True,
    # --- evaluation ---
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    generation_max_length=400,
    # --- logging ---
    logging_steps=1000,
)

In [None]:
# Collator that pads audio features and label ids for each training batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [19]:
from transformers import Seq2SeqTrainer

# Wire the model, the train/validation subsets, the padding collator and the
# WER metric into a single trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
    # The whole processor (not only its tokenizer) is passed here.
    tokenizer=processor,
)

In [None]:
# Run fine-tuning. In the recorded run this cell prompted for a Weights & Biases
# API key before training started.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
cat: /sys/module/amdgpu/initstate: No such file or directory


`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Epoch,Training Loss,Validation Loss


In [None]:
# Reload the processor from the Hub and the fine-tuned weights from a local
# checkpoint directory for inference.
# NOTE(review): the step number is hard-coded and training_args sets
# save_total_limit=1, so ./whisper_v1.0/checkpoint-5000 may not exist after a
# full run — confirm the path before executing.
processor = WhisperProcessor.from_pretrained("GeoffVdr/whisper-medium-nlcv11", language="Vietnamese", task="transcribe")
model = WhisperForConditionalGeneration.from_pretrained("./whisper_v1.0/checkpoint-5000")

In [None]:
# Unlabelled dataset over every entry of the public test folder (no `labels`
# argument, so items carry `labels=None`).
test_set = WhisperDataset(
    processor,
    "./dataset/test/public_test",
    os.listdir("./dataset/test/public_test"),
)
len(test_set)

In [None]:
# Rebuild the collator around the reloaded processor and batch the test set in
# its original order (shuffle=False), 16 files per batch.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False, collate_fn=data_collator)

In [None]:
def whisper_inference(model, test_loader, processor, device=None):
    """Transcribe every batch of ``test_loader`` and map file ids to text.

    Args:
        model: Model exposing ``generate(inputs=...)``. It is moved to
            ``device`` and, on CUDA only, converted to half precision in place.
        test_loader: Sized iterable of batches; each batch provides
            ``"input_features"`` (tensor) and ``"file_id"`` (list of names).
        processor: Object whose ``batch_decode`` turns generated ids into text.
        device: ``torch.device`` or device string. Defaults to ``cuda:0`` when
            CUDA is available, otherwise CPU.

    Returns:
        dict mapping each file id to its decoded transcription.
    """
    from contextlib import nullcontext

    if device is None:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)  # also accepts plain strings such as "cuda:0"

    # fp16 weights and CUDA autocast are only valid on a GPU. Applying them
    # unconditionally broke the advertised CPU fallback, because the inputs
    # stay float32 while the model had been cast to half precision.
    use_half = device.type == "cuda"

    model.eval()
    model = model.to(device)
    if use_half:
        model.half()

    pred_sentences = {}
    total = len(test_loader)
    for idx, batch in enumerate(test_loader, 1):
        features = batch["input_features"].to(device)
        autocast_ctx = (
            torch.autocast("cuda", dtype=torch.float16) if use_half else nullcontext()
        )
        with torch.no_grad(), autocast_ctx:
            # generate() returns token ids, not logits.
            token_ids = model.generate(inputs=features)
        transcriptions = processor.batch_decode(token_ids, skip_special_tokens=True)
        pred_sentences.update(zip(batch["file_id"], transcriptions))
        # Single-line progress counter; finish with a newline on the last batch.
        print(f"\r {idx} / {total}", end="" if idx != total else "\n")
    return pred_sentences

In [None]:
# Transcribe the whole test loader on the first GPU; the result maps each file
# name to its transcription.
pred_sentences = whisper_inference(model, test_loader, processor, torch.device("cuda:0"))

In [None]:
# Display the full file-id -> transcription mapping (one entry per test file).
pred_sentences

In [None]:
# Persist the predictions as UTF-8 JSON; ensure_ascii=False keeps non-ASCII
# characters readable instead of \u-escaping them. The `with` block closes the
# file on exit, so no explicit close() is needed.
with open("./whisper_test_sentences.json", "w", encoding="utf-8") as f:
    json.dump(pred_sentences, f, ensure_ascii=False)