In [1]:
# Optional environment overrides, all currently disabled. Uncomment the
# relevant lines before running the rest of the notebook:
#   - LC_ALL / LANG: switch the locale to C.UTF-8
#   - TRANSFORMERS_CACHE / HF_DATASETS_CACHE: point the Hugging Face caches
#     at /content/cache
#   - CUDA_LAUNCH_BLOCKING=1: typically only enabled while debugging CUDA errors
# %env LC_ALL=C.UTF-8
# %env LANG=C.UTF-8
# %env TRANSFORMERS_CACHE=/content/cache
# %env HF_DATASETS_CACHE=/content/cache
# %env CUDA_LAUNCH_BLOCKING=1

In [2]:
import torch
from importlib import reload

import numpy as np
from tqdm.notebook import tqdm

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor

# Fix the global random seed through Lightning so runs are repeatable.
# The call returns the seed it applied, which the cell displays.
seed_everything(seed=42)

Global seed set to 42


42

## Tokenizer

In [3]:
from transformers import Wav2Vec2CTCTokenizer

# CTC tokenizer backed by the local vocabulary file. "|" is the token that
# marks word boundaries (it shows up between words in the tokenized sample
# printed by the next cell).
tokenizer = Wav2Vec2CTCTokenizer(
    vocab_file="./fa-vocab.json",
    unk_token="<unk>",
    pad_token="<pad>",
    bos_token="<s>",
    eos_token="</s>",
    word_delimiter_token="|",
    do_lower_case=False,
)

In [4]:
# Round-trip check on a sample sentence: show the individual tokens, then
# confirm that encode -> decode reproduces the original text.
text = "از مهمونداری کنار بکشم"
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)
print(*tokens)
print(tokenizer.decode(token_ids))

ا ز | م ه م و ن د ا ر ی | ک ن ا ر | ب ک ش م
از مهمونداری کنار بکشم


## Feature Extractor

In [5]:
from transformers import Wav2Vec2FeatureExtractor


# Feature extractor for raw audio: one feature per time step, 16 kHz sampling
# rate, padding with 0.0, normalisation enabled, and an attention mask
# returned alongside the padded inputs.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16_000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

## Processor

In [6]:
from transformers import Wav2Vec2Processor


# Bundle the feature extractor (audio side) and the tokenizer (text side)
# into a single processor object used by the data module and the model.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [7]:
# Sanity check: the vocabulary mapping and the tokenizer must agree on the
# number of tokens. Fail loudly on a mismatch (instead of silently printing
# nothing) so a broken vocabulary is caught before the expensive training run.
vocab_size = len(processor.tokenizer)
vocab_entries = len(processor.tokenizer.get_vocab())
assert vocab_entries == vocab_size, (
    f"tokenizer vocabulary mismatch: get_vocab() has {vocab_entries} entries, "
    f"but len(tokenizer) is {vocab_size}"
)
print(vocab_size)

40


In [8]:
# Persist the processor (feature-extractor config + tokenizer files) so it
# can be reloaded later with `from_pretrained(save_dir)`.
save_dir = "weights/wav2vec2-large-xlsr-persian-cv"
processor.save_pretrained(save_directory=save_dir)

## Dataset

In [9]:
import os

import src.data
reload(src.data)  # pick up edits to the project data module without a kernel restart
from src.data import DataModule

# Root directory holding the ASR datasets. The default is the original
# machine-specific location; set the ASR_DATA_ROOT environment variable to
# run the notebook on another machine without editing this cell.
data_root = os.environ.get('ASR_DATA_ROOT', '/media/data/soroosh/dataset/ASR/')

# Dataset to train on, relative to the root (trailing slash kept on purpose).
data_dir = 'cv-fa-6.1/cv-corpus-6.1-2020-12-11/fa/'
# data_dir = 'shemo-fa/'
csv_dir = os.path.join(data_root, data_dir)

# min_dur / max_dur are presumably clip-duration bounds in seconds (the
# `duration` column shown below starts just above 1.0) -- TODO confirm in src.data.
data_module = DataModule(processor, csv_dir, min_dur=1, max_dur=10, batch_size=8)
data_module.setup()

418 samples | 1 hours, 11 minutes, 2 seconds were filtered.
num train samples: 203425  total duration: 224 hours, 32 minutes, 34 seconds
num val samples: 25166  total duration: 224 hours, 32 minutes, 34 seconds


In [10]:
# Preview the first rows of the table backing the training split
# (sentence, audio path, cached array path and duration columns).
data_module.train.df.head()

Unnamed: 0,sentence,path,np_path,duration
0,تا آخر مه,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,1.08
1,کجا بپیچم,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,1.104
2,آریان,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,1.175563
3,لطفا نگران نباش,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,1.176
4,آیا این تنها رنگی است که دارید,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,/media/data/soroosh/dataset/ASR/cv-fa-6.1/cv-c...,1.176


In [11]:
# Pull one training example and display the shape of its input array
# together with the number of label ids.
x, y = data_module.train[0]
(x.shape, len(y))

((17280,), 10)

In [12]:
# Fetch a single collated batch from the training loader and list the shape
# of every tensor it contains.
train_loader = data_module.train_dataloader()
b = next(iter(train_loader))
for field, tensor in b.items():
    print(field, tensor.shape)

input_values torch.Size([8, 20063])
attention_mask torch.Size([8, 20063])
labels torch.Size([8, 31])


## Model

In [13]:
# Re-import the project's model module so that edits to src.wav2vec are picked
# up without restarting the kernel. The order matters: import the module,
# reload it, and only then bind the (fresh) class name.
import src.wav2vec
reload(src.wav2vec)
from src.wav2vec import Wav2vec

# Build the model from the processor. NOTE(review): max_epochs=10 here matches
# the Trainer's max_epochs further down -- presumably the two must be kept in
# sync (e.g. for a learning-rate schedule); confirm in src.wav2vec.
model = Wav2vec(processor, max_epochs=10)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# model.step(b)

In [15]:
# Display the parameter count reported by the project's helper method.
model.count_parameters()

315479720

## Trainer

In [16]:
# TensorBoard logger used by the trainer; event files go under logs/cv/.
logger = TensorBoardLogger(
    save_dir='logs/',
    name='cv',
)

# Keep only the single best checkpoint as ranked by the monitored `val_loss`.
# NOTE(review): `period` is replaced by another argument in newer Lightning
# releases -- left unchanged for the version this notebook was run with.
checkpoint = ModelCheckpoint(dirpath='weights/cv/',
                             filename='{epoch}-{val_loss:.2f}',
                             monitor='val_loss',
                             save_top_k=1,
                             period=1)

# Log the learning rate at every optimisation step.
lr_logger = LearningRateMonitor(logging_interval='step')

# benchmark=False: batches are padded to a per-batch length, so input sizes
# change from step to step. cuDNN autotuning (benchmark=True) only pays off
# for fixed input sizes and can slow training down otherwise; it also adds
# non-determinism that works against the fixed seed set at the top.
# max_epochs is the same value that is passed to the model constructor.
trainer = Trainer(benchmark=False,
                  gpus=1,
                  logger=logger,
                  max_epochs=10,
                  callbacks=[checkpoint, lr_logger])

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


### Training

If you want to run this notebook in Google Colab to fine-tune your model, make sure that training does not stop due to inactivity. A simple hack to prevent this is to paste the following code into the browser console of this tab (*right-click -> Inspect -> Console tab, then insert the code*).

```javascript
// Clicks Colab's "connect" toolbar button and logs each click to the console.
function ConnectButton(){
    console.log("Connect pushed"); 
    // The button sits inside the shadow DOM of the <colab-connect-button> element.
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click() 
}
// Repeat the click every 60000 ms (once a minute).
setInterval(ConnectButton,60000);
```

In [None]:
# Start fine-tuning: the trainer pulls its dataloaders from `data_module` and
# runs for the number of epochs configured on the Trainer above.
trainer.fit(model, data_module)


  | Name  | Type           | Params
-----------------------------------------
0 | model | Wav2Vec2ForCTC | 315 M 
-----------------------------------------
311 M     Trainable params
4.2 M     Non-trainable params
315 M     Total params
1,261.919 Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

In [None]:
# NOTE(review): the commented-out calls below look like leftovers from the
# Hugging Face `Trainer` API. `trainer` in this notebook is a PyTorch
# Lightning Trainer, so they are kept for reference only.
# trainer.save_model()
# trainer.save_state()

# train_metrics = train_result.metrics
# train_metrics["train_samples"] = len(train_data)
# trainer.log_metrics("train", train_metrics)
# trainer.save_metrics("train", train_metrics)

# `train_result` is never assigned anywhere in this notebook, so evaluating it
# raised a NameError. Display the metrics Lightning recorded during
# `trainer.fit` instead.
trainer.callback_metrics

## Evaluate

In [None]:
# test_metrics = trainer.evaluate()
# test_metrics["eval_samples"] = len(test_data)

# trainer.log_metrics("eval", test_metrics)
# trainer.save_metrics("eval", test_metrics)

In [None]:
# test_metrics

## Load and evaluate

In [None]:
# model = Wav2Vec2ForCTC.from_pretrained('/media/data/soroosh/' + save_dir).to("cuda")
# processor = Wav2Vec2Processor.from_pretrained(save_dir)

In [None]:
# input_dict = processor(test_data["input_values"][0], return_tensors="pt", padding=True)

# logits = model(input_dict.input_values.to("cuda")).logits

# pred_ids = torch.argmax(logits, dim=-1)[0]

In [None]:
# print("Prediction:")
# print(processor.decode(pred_ids))

# print("\nReference:")
# print(processor.tokenizer.decode(test_data["labels"][0]))