In [1]:
# # %%capture
# ! pip install git+https://github.com/openai/whisper.git
# ! pip install jiwer
# ! pip install pyopenjtalk==0.3.0
# ! pip install pytorch-lightning==1.7.7
# ! pip install -qqq evaluate==0.2.2
# ! pip install pandas termcolor torchaudio h5py matplotlib ipympl

In [2]:
import IPython.display
from pathlib import Path

import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
from torch import nn
import pandas as pd
import whisper
import torchaudio
import torchaudio.transforms as at

from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

from tqdm.notebook import tqdm
import pyopenjtalk
import evaluate

from transformers import (
    AdamW,
    get_linear_schedule_with_warmup
)

2022-11-07 22:16:12.889140: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:
2022-11-07 22:16:12.889162: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
import json

In [19]:
DATASET_DIR = "/content/jvs/jvs_ver1"
SAMPLE_RATE = 16000
BATCH_SIZE = 1
TRAIN_RATE = 0.8

AUDIO_MAX_LENGTH = 480000
TEXT_MAX_LENGTH = 120
SEED = 3407
DEVICE = "gpu" if torch.cuda.is_available() else "cpu"
seed_everything(SEED, workers=True)

Global seed set to 3407


3407

In [20]:
def load_wave(wave_path, sample_rate:int=16000) -> torch.Tensor:
    waveform, sr = torchaudio.load(wave_path, normalize=True)
    if sample_rate != sr:
        waveform = at.Resample(sr, sample_rate)(waveform)
    return waveform

In [21]:
AUDIO_FOLDER = "data/train/songs"
LABEL_FOLDER = "data/train/labels"

In [22]:
def get_audio_label_paths(audio_folder, label_folder):
    audio_files = os.listdir(audio_folder)
    label_files = os.listdir(label_folder)
    
    audio_paths = []
    label_paths = []
    
    for file in audio_files:
        name, ext = os.path.splitext(file)
        if name + ".json" in label_files:
            audio_paths.append(os.path.join(audio_folder, file))
            label_paths.append(os.path.join(label_folder, name + ".json"))
            
    return audio_paths, label_paths

In [23]:
len(get_audio_label_paths(AUDIO_FOLDER, LABEL_FOLDER)[0])

1057

In [24]:
class LyricDataset(torch.utils.data.Dataset):
    def __init__(self, audio_folder, label_folder, tokenizer, sample_rate):
        super().__init__()

        self.tokenizer = tokenizer
        self.sample_rate = sample_rate
        
        self.audio_paths, self.label_paths = get_audio_label_paths(audio_folder, label_folder)
    
    def __len__(self):
        return len(self.audio_paths)
    
    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label_path = self.label_paths[idx]
        
        # audio
        audio = load_wave(audio_path, sample_rate=self.sample_rate)
        audio = whisper.pad_or_trim(audio.flatten())
        mel = whisper.log_mel_spectrogram(audio)
        
        with open(label_path, "r") as f:
            label = json.load(f)
        
        words = []
        starts = []
        ends = []
        for segment in label:
            for ann in segment["l"]:
                words.append(ann["d"])
                starts.append(ann["s"])
                ends.append(ann["e"])
        
        max_ms = 30000  # or len of audio file

        separated_tokens = []
        separated_starts = []
        separated_ends = []
        for word, s, e in zip(words, starts, ends):
            tokens = wtokenizer.encode(word)

            separated_tokens += tokens
            separated_starts += [s / max_ms] * len(tokens)
            separated_ends += [e / max_ms] * len(tokens)

        separated_tokens = separated_tokens
        starts = separated_starts
        ends = separated_ends
        return {
            "input_ids": mel,
            "dec_input_ids": separated_tokens,
            "starts": starts,
            "ends": ends,
        }

In [25]:
import numpy as np

class DataCollatorWithPadding:
    def __call__(sefl, features):
        input_ids, labels, dec_input_ids = [], [], []

        for f in features:
            input_ids.append(f["input_ids"])
            labels.append(np.array([f["starts"], f["ends"]]).transpose())
            dec_input_ids.append(f["dec_input_ids"])
        
        input_ids = torch.concat([input_id[None, :] for input_id in input_ids])
        
        label_lengths = [len(lab) for lab in labels]
        dec_input_ids_length = [len(e) for e in dec_input_ids]
        max_label_len = max(label_lengths+dec_input_ids_length)

        # labels = [np.pad(lab, (0, max_label_len - lab_len), 'constant', constant_values=-100) for lab, lab_len in zip(labels, label_lengths)]
        labels = [np.concatenate([lab, np.ones((max(max_label_len - lab_len, 0), 2)) * -100]) for lab, lab_len in zip(labels, label_lengths)]
        dec_input_ids = [np.pad(e, (0, max_label_len - e_len), 'constant', constant_values=50257) for e, e_len in zip(dec_input_ids, dec_input_ids_length)] # 50257 is eot token id

        batch = {
            "labels": labels,
            "dec_input_ids": dec_input_ids
        }
        
        batch = {k: torch.tensor(np.array(v), requires_grad=False) for k, v in batch.items()}
        batch["input_ids"] = input_ids

        return batch

In [29]:
class Config:
    learning_rate = 0.0005
    weight_decay = 0.01
    adam_epsilon = 1e-8
    warmup_steps = 2
    batch_size = 16
    num_worker = 2
    num_train_epochs = 10
    gradient_accumulation_steps = 1
    sample_rate = SAMPLE_RATE

In [31]:
class WhisperModelModule(LightningModule):
    def __init__(self, cfg:Config, model_name="base", lang="vi", train_dataset=[], eval_dataset=[]) -> None:
        super().__init__()
        self.options = whisper.DecodingOptions(language=lang, without_timestamps=True)
        self.model = whisper.load_model(model_name)
        self.tokenizer = whisper.tokenizer.get_tokenizer(True, language="vi", task=self.options.task)

        # only decoder training
        for p in self.model.encoder.parameters():
            p.requires_grad = False
        
        self.loss_fn = nn.MSELoss()
        self.metric_fn = lambda x: 0

        self.cfg = cfg
        self.__train_dataset = train_dataset
        self.__eval_dataset = eval_dataset
        
        self.linear = nn.Linear(self.model.dims.n_text_state, 2)
    
    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_id):
        input_ids = batch["input_ids"]
        labels = batch["labels"].long()
        dec_input_ids = batch["dec_input_ids"].long()

        with torch.no_grad():
            audio_features = self.model.encoder(input_ids)

        out = self.model.decoder(dec_input_ids, audio_features)
        out = self.linear(out)
        
        # loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))
        loss = self.loss_fn(out, labels)
        self.log("train/loss", loss, on_step=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_id):
        input_ids = batch["input_ids"]
        labels = batch["labels"].long()
        dec_input_ids = batch["dec_input_ids"].long()


        audio_features = self.model.encoder(input_ids)
        out = self.model.decoder(dec_input_ids, audio_features)
        out = self.linear(out)
        loss = self.loss_fn(out, labels)
        
#         # loss = self.loss_fn(out.view(-1, out.size(-1)), labels.view(-1))

#         out[out == -100] = self.tokenizer.eot
#         labels[labels == -100] = self.tokenizer.eot

#         o_list, l_list = [], []
#         for o, l in zip(out, labels):
#             o = torch.argmax(o, dim=1)
#             o_list.append(self.tokenizer.decode(o, skip_special_tokens=True))
#             l_list.append(self.tokenizer.decode(l, skip_special_tokens=True))
#         cer = self.metrics_cer.compute(references=l_list, predictions=o_list)
#         wer = self.metrics_wer.compute(references=l_list, predictions=o_list)

#         self.log("val/loss", loss, on_step=True, prog_bar=True, logger=True)
#         self.log("val/cer", cer, on_step=True, prog_bar=True, logger=True)
#         self.log("val/wer", wer, on_step=True, prog_bar=True, logger=True)

        return {
            # "cer": cer,
            # "wer": wer,
            "loss": loss
        }

    def configure_optimizers(self):
        """オプティマイザーとスケジューラーを作成する"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() 
                            if not any(nd in n for nd in no_decay)],
                "weight_decay": self.cfg.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() 
                            if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, 
                          lr=self.cfg.learning_rate, 
                          eps=self.cfg.adam_epsilon)
        self.optimizer = optimizer

        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.cfg.warmup_steps, 
            num_training_steps=self.t_total
        )
        self.scheduler = scheduler

        return [optimizer], [{"scheduler": scheduler, "interval": "step", "frequency": 1}]
    
    def setup(self, stage=None):
        """初期設定（データセットの読み込み）"""

        if stage == 'fit' or stage is None:
            self.t_total = (
                (len(self.__train_dataset) // (self.cfg.batch_size))
                // self.cfg.gradient_accumulation_steps
                * float(self.cfg.num_train_epochs)
            )
    
    def train_dataloader(self):
        """訓練データローダーを作成する"""
        return torch.utils.data.DataLoader(self.__train_dataset, 
                          batch_size=self.cfg.batch_size, 
                          drop_last=True, shuffle=True, num_workers=self.cfg.num_worker,
                          collate_fn=DataCollatorWithPadding()
                          )

    def val_dataloader(self):
        """バリデーションデータローダーを作成する"""
        return torch.utils.data.DataLoader(self.__eval_dataset, 
                          batch_size=self.cfg.batch_size, 
                          num_workers=self.cfg.num_worker,
                          collate_fn=DataCollatorWithPadding()
                          )
    
       

In [32]:
log_output_dir = "./logs"
check_output_dir = "./artifacts"

train_name = "whisper"
train_id = "00001"

model_name = "base"
lang = "vi"

In [33]:
cfg = Config()

Path(log_output_dir).mkdir(exist_ok=True)
Path(check_output_dir).mkdir(exist_ok=True)

tflogger = TensorBoardLogger(
    save_dir=log_output_dir,
    name=train_name,
    version=train_id
)

checkpoint_callback = ModelCheckpoint(
    dirpath=f"{check_output_dir}/checkpoint",
    filename="checkpoint-{epoch:04d}",
    save_top_k=-1 # all model save
)

callback_list = [checkpoint_callback, LearningRateMonitor(logging_interval="epoch")]

woptions = whisper.DecodingOptions(language="vi", without_timestamps=False)
wmodel = whisper.load_model("base")
wtokenizer = whisper.tokenizer.get_tokenizer(True, language="vi", task=woptions.task)

train_dataset = LyricDataset(
    audio_folder=AUDIO_FOLDER,
    label_folder=LABEL_FOLDER,
    tokenizer=wtokenizer,
    sample_rate=SAMPLE_RATE
)
val_dataset = LyricDataset(
    audio_folder=AUDIO_FOLDER,
    label_folder=LABEL_FOLDER,
    tokenizer=wtokenizer,
    sample_rate=SAMPLE_RATE
)
model = WhisperModelModule(cfg, model_name, lang, train_dataset, val_dataset)

trainer = Trainer(
    precision=16,
    accelerator=DEVICE,
    max_epochs=cfg.num_train_epochs,
    accumulate_grad_batches=cfg.gradient_accumulation_steps,
    logger=tflogger,
    callbacks=callback_list
)

trainer.fit(model)

Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type    | Params
------------------------------------
0 | model   | Whisper | 71.8 M
1 | loss_fn | MSELoss | 0     
2 | linear  | Linear  | 1.0 K 
------------------------------------
52.0 M    Trainable params
19.8 M    Non-trainable params
71.8 M    Total params
143.654   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.07 GiB (GPU 0; 5.80 GiB total capacity; 3.04 GiB already allocated; 462.38 MiB free; 3.28 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF