#Prepare Environment

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Wed Nov 29 19:07:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install datasets>=2.6.1
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install evaluate>=0.30
!pip install jiwer
!pip install gradio
!pip install kaleido
!pip install cohere
!pip install openai
!pip install tiktoken

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-vk49od7x
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-vk49od7x
  Resolved https://github.com/huggingface/transformers to commit 083e36923a19650fa264c4173db2f63ab124bb27
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#Load Dataset
The following code is to load the dataset and save it to Google Drive. If there is nothing need to be modified, just run it only once then it can be commentted out.

In [None]:
from datasets import load_dataset, DatasetDict
from google.colab import drive
drive.mount('/content/gdrive')

common_voice = DatasetDict()

common_voice["train"] = load_dataset("mozilla-foundation/common_voice_11_0", "zh-TW", split="train+validation", use_auth_token=True)
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_11_0", "zh-TW", split="test", use_auth_token=True)

print(common_voice)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




DatasetDict({
    train: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 11277
    })
    test: Dataset({
        features: ['client_id', 'path', 'audio', 'sentence', 'up_votes', 'down_votes', 'age', 'gender', 'accent', 'locale', 'segment'],
        num_rows: 4709
    })
})


In [None]:
common_voice = common_voice.remove_columns(["accent", "age", "client_id", "down_votes", "gender", "locale", "path", "segment", "up_votes"])

print(common_voice)

DatasetDict({
    train: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 11277
    })
    test: Dataset({
        features: ['audio', 'sentence'],
        num_rows: 4709
    })
})


In [None]:
print(common_voice["train"][0])

{'audio': {'path': '/root/.cache/huggingface/datasets/downloads/extracted/51e978c47798482641ccccdcb8d659ea097ecfb24e17f8cacf723f3d1da59cd5/zh-TW_train_0/common_voice_zh-TW_17626464.mp3', 'array': array([ 0.00000000e+00, -6.27062811e-15, -6.86393445e-15, ...,
        2.13285966e-06,  1.94480162e-06, -4.87215766e-06]), 'sampling_rate': 48000}, 'sentence': '我們一起享用'}


In [None]:
save_path = '/content/gdrive/MyDrive/common_voice_dataset'
common_voice.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/11277 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4709 [00:00<?, ? examples/s]

#Prepare Feature Extractor, Tokenizer and Data
The following code is to read the existing dataset in Google Drive. Then prepare feature extractor, tokenizer and data. Then store the data in Google Drive.If there is nothing need to be modified, just run it only once then it can be commentted out.

In [6]:
from datasets import load_dataset, DatasetDict
from google.colab import drive

drive.mount('/content/gdrive')

load_path_gdrive = '/content/gdrive/MyDrive/common_voice_dataset'
common_voice = DatasetDict.load_from_disk(load_path_gdrive)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [7]:
from transformers import WhisperFeatureExtractor

feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")

In [8]:
from transformers import WhisperTokenizer

tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="mandarin", task="transcribe")

In [9]:
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="mandarin", task="transcribe")

In [None]:
print(common_voice["train"][0])

{'audio': {'path': 'common_voice_zh-TW_17626464.mp3', 'array': array([ 0.00000000e+00, -6.27062811e-15, -6.86393445e-15, ...,
        2.13285966e-06,  1.94480162e-06, -4.87215766e-06]), 'sampling_rate': 48000}, 'sentence': '我們一起享用'}


In [None]:
from datasets import Audio

common_voice = common_voice.cast_column("audio", Audio(sampling_rate=16000))

In [None]:
print(common_voice["train"][0])

{'audio': {'path': 'common_voice_zh-TW_17626464.mp3', 'array': array([-3.18323146e-12,  2.27373675e-12,  3.63797881e-12, ...,
       -2.14449028e-06,  6.71983798e-06,  1.50593405e-06]), 'sampling_rate': 16000}, 'sentence': '我們一起享用'}


In [None]:
def prepare_dataset(batch):
    # load and resample audio data from 48 to 16kHz
    audio = batch["audio"]

    # compute log-Mel input features from input audio array
    batch["input_features"] = feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]

    # encode target text to label ids
    batch["labels"] = tokenizer(batch["sentence"]).input_ids
    return batch

In [None]:
common_voice = common_voice.map(prepare_dataset, remove_columns=common_voice.column_names["train"], num_proc=2)

Map (num_proc=2):   0%|          | 0/11277 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/4709 [00:00<?, ? examples/s]

In [None]:
save_path = '/content/gdrive/MyDrive/common_voice'
common_voice.save_to_disk(save_path)

Saving the dataset (0/22 shards):   0%|          | 0/11277 [00:00<?, ? examples/s]

Saving the dataset (0/10 shards):   0%|          | 0/4709 [00:00<?, ? examples/s]

#Test

In [10]:
from datasets import load_dataset, DatasetDict
from google.colab import drive

drive.mount('/content/gdrive')

load_path_gdrive = '/content/gdrive/MyDrive/common_voice'
common_voice = DatasetDict.load_from_disk(load_path_gdrive)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [11]:
import torch

from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # pad the labels to max length
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [12]:
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

In [13]:
import evaluate

metric = evaluate.load("wer")

In [14]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # replace -100 with the pad_token_id
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # we do not want to group tokens when computing the metrics
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [15]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

In [16]:
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [17]:
!pip install transformers[torch]
!pip install accelerate -U



In [18]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-small-tw",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=True,
)

In [19]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=common_voice["train"],
    eval_dataset=common_voice["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
)

In [20]:
processor.save_pretrained(training_args.output_dir)

In [26]:
function ConnectButton(){
    console.log("Connect pushed");
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click()
}
setInterval(ConnectButton, 60000);

SyntaxError: ignored

In [27]:
trainer.train()

`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss


KeyboardInterrupt: ignored