In [1]:
from datasets import load_dataset, load_metric, load_from_disk, Audio
from datasets import ClassLabel, DatasetDict, Dataset
import random
import pandas as pd
from IPython.display import display, HTML
import re
import json
from transformers import Wav2Vec2CTCTokenizer, AutoTokenizer
from transformers import Wav2Vec2FeatureExtractor, AutoFeatureExtractor
from transformers import Wav2Vec2Processor
import IPython.display as ipd
import numpy as np
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from transformers import Wav2Vec2ForCTC
from transformers import TrainingArguments
from transformers import Trainer
from evaluate import load
import os
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm
import argparse
import json
import logging
import os
import subprocess
import tarfile
import urllib.request


Dataset

In [3]:
feature_extractor = AutoFeatureExtractor.from_pretrained('kehanlu/mandarin-wav2vec2')

In [4]:
feature_extractor

Wav2Vec2FeatureExtractor {
  "do_normalize": false,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": false,
  "sampling_rate": 16000
}

In [3]:
dataset = load_from_disk('/Users/ujan/speech-processing/data/processed/cv/vectorized_dataset')

In [6]:
dataset['train'][0]['input_length']

Dataset({
    features: ['input_values', 'input_length', 'labels'],
    num_rows: 18541
})

In [2]:
#parser = argparse.ArgumentParser(description="Aishell Data download")
#parser.add_argument("--data_root", required=True, default=None, type=str)
#args = parser.parse_args()


def process_data(data_folder: str, dst_folder: str):
    """
    Generate JSON-lines manifests and a character vocabulary for AISHELL data.

    Reads ``<data_folder>/transcript/aishell_transcript_v0.8.txt``, then walks
    ``<data_folder>/wav/{train,dev,test}``. For every ``.wav`` file whose file
    stem has a transcript, one JSON object (``audio_filepath``, ``duration``,
    ``text``) is written per line to ``<dst_folder>/<split>.json``. Every
    character of the transcripts that were used is written to
    ``<dst_folder>/vocab.txt``, most frequent first.

    Args:
        data_folder: source with wav files
        dst_folder: where manifest files will be stored
    Returns:
        None. Prints "Done" when all files have been written.
    Raises:
        FileNotFoundError: if the transcript file (or the ``soxi`` binary) is missing.
        subprocess.CalledProcessError: if ``soxi`` fails on an audio file.
    """

    # exist_ok avoids the check-then-create race of an explicit os.path.exists test
    os.makedirs(dst_folder, exist_ok=True)

    transcript_file = os.path.join(data_folder, "transcript", "aishell_transcript_v0.8.txt")
    transcript_dict = {}
    with open(transcript_file, "r", encoding="utf-8") as f:
        for line in f:
            # Each line is "<audio_id> <text>"; blank or text-less lines are skipped
            # instead of crashing the tuple unpacking.
            parts = line.strip().split(" ", 1)
            if len(parts) != 2:
                continue
            audio_id, text = parts
            # remove white space
            text = text.replace(" ", "")
            transcript_dict[audio_id] = text

    data_types = ["train", "dev", "test"]
    vocab_count = {}
    for dt in data_types:
        json_lines = []
        audio_dir = os.path.join(data_folder, "wav", dt)
        for sub_folder, _, file_list in os.walk(audio_dir):
            for fname in file_list:
                # splitext removes only the extension; str.strip(".wav") would also eat
                # any leading/trailing '.', 'w', 'a' or 'v' characters of the id itself.
                audio_id, ext = os.path.splitext(fname)
                if ext.lower() != ".wav" or audio_id not in transcript_dict:
                    continue
                audio_path = os.path.join(sub_folder, fname)
                text = transcript_dict[audio_id]
                for li in text:
                    vocab_count[li] = vocab_count.get(li, 0) + 1
                # Argument list (no shell) so paths with spaces or shell
                # metacharacters are passed to soxi verbatim.
                raw_duration = subprocess.check_output(["soxi", "-D", audio_path])
                duration = float(raw_duration.decode().strip())
                json_lines.append(
                    json.dumps(
                        {"audio_filepath": os.path.abspath(audio_path), "duration": duration, "text": text,},
                        ensure_ascii=False,
                    )
                )

        manifest_path = os.path.join(dst_folder, dt + ".json")
        with open(manifest_path, "w", encoding="utf-8") as fout:
            for line in json_lines:
                fout.write(line + "\n")

    # Most frequent characters first; ties keep first-seen order (stable sort)
    vocab = sorted(vocab_count.items(), key=lambda k: k[1], reverse=True)
    vocab_file = os.path.join(dst_folder, "vocab.txt")
    with open(vocab_file, "w", encoding="utf-8") as f:
        for v, _count in vocab:
            f.write(v + "\n")

    print("Done")

In [3]:
src_folder = '/home/ujan/Downloads/aishell1/data/data_aishell'
dst_folder = '/home/ujan/Downloads/aishell1/data/aishell_processed'

process_data(src_folder, dst_folder)

Done


In [28]:
data_files = {'train': '/home/ujan/Downloads/aishell1/data/aishell_processed/train.json',
              'test': '/home/ujan/Downloads/aishell1/data/aishell_processed/test.json',
              'validation': '/home/ujan/Downloads/aishell1/data/aishell_processed/dev.json'}

aishell = load_dataset('json', data_files=data_files)
aishell

Using custom data configuration default-e6d8718f000f6cc3
Found cached dataset json (/home/ujan/.cache/huggingface/datasets/json/default-e6d8718f000f6cc3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 120098
    })
    test: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 7176
    })
    validation: Dataset({
        features: ['audio_filepath', 'duration', 'text'],
        num_rows: 14326
    })
})

In [29]:
dataset = aishell.cast_column("audio_filepath", Audio())
dataset = dataset.rename_column('audio_filepath', 'audio')
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'duration', 'text'],
        num_rows: 120098
    })
    test: Dataset({
        features: ['audio', 'duration', 'text'],
        num_rows: 7176
    })
    validation: Dataset({
        features: ['audio', 'duration', 'text'],
        num_rows: 14326
    })
})

In [30]:
def show_random_elements(dataset, num_examples=10):
    """Render ``num_examples`` distinct, randomly chosen rows of ``dataset`` as an HTML table.

    Args:
        dataset: object supporting ``len()`` and indexing with a list of row
            indices (the indexed result is passed to ``pd.DataFrame``).
        num_examples: how many distinct rows to show; must not exceed ``len(dataset)``.

    Raises:
        AssertionError: if ``num_examples`` is larger than the dataset.
    """
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    # random.sample draws without replacement in one call, replacing the manual
    # retry loop whose `pick in picks` list scan slowed down as picks accumulated.
    picks = random.sample(range(len(dataset)), num_examples)

    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

show_random_elements(dataset["train"].remove_columns(["duration", "audio"]))

Unnamed: 0,text
0,土地供应新房建设与销售量增大幅增长
1,莫斯科在无车日当天会呈现出难得一见的空旷景象
2,妻子出轨上门女婿离异后依然照顾八旬岳母
3,一对男女正在做羞羞的事
4,从而我们的出租率能达到百分之百
5,买房目的主要是落户投资等
6,上海飞莫斯科航班自蒙古折返在浦东机场上空盘旋
7,团体内的成员可以是小孩或者外国人
8,克而瑞研究中心指出
9,刚过完人生中第一个母亲节


In [5]:
#chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"]'  ######## chinese?

#def remove_special_characters(batch):
    #batch["text"] = re.sub(chars_to_ignore_regex, '', batch["text"]).lower()
    #return batch

#dataset = dataset.map(remove_special_characters)

Extract all distinct characters from the training, validation and test data and build our vocabulary from this set of characters.

In [6]:
#def extract_all_chars(batch):
    #all_text = " ".join(batch["text"])
    #vocab = list(set(all_text))
    #return {"vocab": [vocab], "all_text": [all_text]}

#vocabs = dataset.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=dataset.column_names["train"])

In [7]:
#vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]) | set(vocabs["validation"]["vocab"][0]))

#vocab_dict = {v: k for k, v in enumerate(vocab_list)}
#vocab_dict

To make it clearer that " " has its own token class, we give it a more visible character |. In addition, we also add an "unknown" token. We also add a padding token that corresponds to CTC's "blank token"

In [8]:
#vocab_dict["|"] = vocab_dict[" "]
#del vocab_dict[" "]

#vocab_dict["[UNK]"] = len(vocab_dict)
#vocab_dict["[PAD]"] = len(vocab_dict)
#print(len(vocab_dict))

Save the vocabulary as a json file

In [9]:
#with open('vocab.json', 'w') as vocab_file:
    #json.dump(vocab_dict, vocab_file)

Get vocab

In [31]:
processor = Wav2Vec2Processor.from_pretrained('/home/ujan/Downloads/mandarin-wav2vec2-processor')

We use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class.

In [3]:
tokenizer = Wav2Vec2CTCTokenizer("/home/ujan/speech-processing/models/wav2vec2/wav2vec2-xls-r-1b_Datasets//vocab.json")

In [4]:
tokenizer

Wav2Vec2CTCTokenizer(name_or_path='', vocab_size=5623, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

Wav2Vec2 feature extractor

In [5]:
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

To make the usage of Wav2Vec2 as user-friendly as possible, the feature extractor and tokenizer are wrapped into a single Wav2Vec2Processor class so that one only needs a model and processor object

In [6]:
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [10]:
#libri['train.100'][0]

In [11]:
#rand_int = random.randint(0, len(libri["train.100"]))

#print(libri["train.100"][rand_int]["text"])
#ipd.Audio(data=np.asarray(libri["train.100"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

In [12]:
#rand_int = random.randint(0, len(libri["train.100"]))

#print("Target text:", libri["train.100"][rand_int]["text"])
#print("Input array shape:", np.asarray(libri["train.100"][rand_int]["audio"]["array"]).shape)
#print("Sampling rate:", libri["train.100"][rand_int]["audio"]["sampling_rate"])

In [32]:
def prepare_dataset(batch):
    """Turn one example into model inputs.

    Reads ``batch["audio"]`` (its ``array`` and ``sampling_rate`` entries) and
    ``batch["text"]``, and stores:

    * ``input_values`` - first element of the processor output for the waveform
    * ``labels`` - token ids of the transcript, produced in target-processor mode

    Relies on the notebook-level ``processor``. Returns the same ``batch`` object.
    """
    waveform = batch["audio"]["array"]
    rate = batch["audio"]["sampling_rate"]

    # The processor returns a batch of size one; take its single element.
    encoded_audio = processor(waveform, sampling_rate=rate)
    batch["input_values"] = encoded_audio.input_values[0]

    # Inside this context the processor tokenizes text instead of extracting audio features.
    with processor.as_target_processor():
        encoded_text = processor(batch["text"])
        batch["labels"] = encoded_text.input_ids

    return batch

In [33]:
dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names["train"], num_proc=4)

        

#2:   0%|          | 0/30024 [00:00<?, ?ex/s]

#0:   0%|          | 0/30025 [00:00<?, ?ex/s]

#3:   0%|          | 0/30024 [00:00<?, ?ex/s]

#1:   0%|          | 0/30025 [00:00<?, ?ex/s]



       

#1:   0%|          | 0/1794 [00:00<?, ?ex/s]

#0:   0%|          | 0/1794 [00:00<?, ?ex/s]

 

#2:   0%|          | 0/1794 [00:00<?, ?ex/s]

#3:   0%|          | 0/1794 [00:00<?, ?ex/s]



      

#1:   0%|          | 0/3582 [00:00<?, ?ex/s]

#0:   0%|          | 0/3582 [00:00<?, ?ex/s]

  

#2:   0%|          | 0/3581 [00:00<?, ?ex/s]

#3:   0%|          | 0/3581 [00:00<?, ?ex/s]



In [40]:
dataset.save_to_disk('/home/ujan/Downloads/aishell1_finetune/aishell1_vectorized')

Saving the dataset (0/70 shards):   0%|          | 0/120098 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/7176 [00:00<?, ? examples/s]

Saving the dataset (0/9 shards):   0%|          | 0/14326 [00:00<?, ? examples/s]

Training & Evaluation

In [41]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Collate a list of examples into one batch, padding inputs and labels separately.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data; its ``pad`` method does the padding.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Padding strategy forwarded to ``processor.pad`` for both inputs and labels:
            * :obj:`True` or :obj:`'longest'`: pad to the longest sequence in the batch (no padding if only a
              single sequence is provided).
            * :obj:`'max_length'`: pad to the length given by :obj:`max_length`, or to the maximum acceptable
              input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: no padding (the batch may contain sequences of different
              lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set, pad the ``input_values`` to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute
            capability >= 7.5 (Volta).
        pad_to_multiple_of_labels (:obj:`int`, `optional`):
            Same as :obj:`pad_to_multiple_of`, applied to the ``labels``.
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio inputs and label ids have different lengths and go through
        # different padding paths, so they are separated first.
        audio_inputs = []
        label_inputs = []
        for example in features:
            audio_inputs.append({"input_values": example["input_values"]})
            label_inputs.append({"input_ids": example["labels"]})

        batch = self.processor.pad(
            audio_inputs,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Inside this context `pad` operates on the label (tokenizer) side.
        with self.processor.as_target_processor():
            padded_labels = self.processor.pad(
                label_inputs,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",
            )

        # Positions where the attention mask is not 1 are padding; set them to
        # -100 so the loss ignores them.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)

        return batch

In [42]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [43]:
#wer = load("wer")
cer_metric = load("/home/ujan/Downloads/evaluate/metrics/cer/cer.py")

In [7]:
model = Wav2Vec2ForCTC.from_pretrained(
    "/home/ujan/speech-processing/models/wav2vec2/wav2vec2-xls-r-1b_Datasets/checkpoint-33000",
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
)

In [8]:
model

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (1-4): 4 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2LayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
          (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projec

In [13]:
#model = Wav2Vec2ForCTC.from_pretrained(
    #"/home/ujan/Downloads/wav2vec2-aishell1",
    #ctc_loss_reduction="mean", 
    #pad_token_id=processor.tokenizer.pad_token_id,
#)

The first component of Wav2Vec2 consists of a stack of CNN layers that are used to extract acoustically meaningful - but contextually independent - features from the raw speech signal. This part of the model has already been sufficiently trained during pre-training and, as stated in the paper, does not need to be fine-tuned any further.

In [45]:
model.freeze_feature_encoder()

In [46]:
def compute_metrics(pred):
    """Compute the character error rate for one evaluation pass.

    Args:
        pred: object exposing ``predictions`` (logits; argmax is taken over the
            last axis) and ``label_ids`` (label ids where ``-100`` marks
            positions to be ignored).

    Returns:
        ``{"cer": value}`` where ``value`` is whatever ``cer_metric.compute`` returns.

    Relies on the notebook-level ``processor`` and ``cer_metric``.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # Swap the -100 mask for the pad token id so the labels can be decoded.
    # np.where builds a new array: writing into pred.label_ids would silently
    # change the labels the caller still holds.
    label_ids = np.where(pred.label_ids == -100, processor.tokenizer.pad_token_id, pred.label_ids)

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

In [47]:
len(dataset['train'])

120098

In [48]:
len(dataset['validation'])

14326

In [49]:
len(dataset['train'])/32

3753.0625

In [16]:
# Hyperparameters handed to the Trainer for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='/home/ujan/Notebooks/aishell_ctc',
    group_by_length=True,
    # 32 examples per device with no gradient accumulation; the training log below
    # reports a total batch size of 32, i.e. 120098 / 32 ≈ 3753 steps per epoch.
    per_device_train_batch_size=32,
    gradient_accumulation_steps=1,
    eval_accumulation_steps=32,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",  # evaluate on a step schedule (see eval_steps)
    num_train_epochs=30,
    fp16=True,  # the Trainer reports the cuda_amp half-precision backend for this setting
    gradient_checkpointing=True, 
    # Checkpointing, evaluation and logging all share the same 1000-step cadence.
    save_steps=1000,
    eval_steps=1000,
    logging_steps=1000,
    learning_rate=0.0002,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
    dataloader_num_workers=0,
)

In [17]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"], # validation
    tokenizer=processor.feature_extractor,
)

Using cuda_amp half precision backend


In [18]:
trainer.state.log_history

[]

In [None]:
trainer.train()

***** Running training *****
  Num examples = 120098
  Num Epochs = 30
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 112620
  Number of trainable parameters = 93426441


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 14326
  Batch size = 8


In [26]:
len(dataset['train'][10]['input_values'])

84783

Evaluation

In [15]:
device = torch.device("cuda")
model.to(device)

Wav2Vec2ForCTC(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)

In [16]:
def map_to_result(batch):
    """Run greedy CTC decoding on one example and attach prediction and reference text.

    Args:
        batch: a single example with ``input_values`` (the model input sequence)
            and ``labels`` (reference token ids).

    Returns:
        The same ``batch`` with two added keys: ``pred_str`` (decoded argmax
        prediction) and ``text`` (decoded reference, tokens not grouped).

    Relies on the notebook-level ``model`` and ``processor``.
    """
    with torch.no_grad():
        # Put the input on whatever device the model lives on instead of
        # hard-coding "cuda", so the cell also works on CPU or another GPU index.
        input_values = torch.tensor(batch["input_values"], device=model.device).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    # group_tokens=False: reference ids are decoded as-is, repeated ids are not merged
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

results = dataset["test"].map(map_to_result, remove_columns=dataset["test"].column_names)

  0%|          | 0/7176 [00:00<?, ?ex/s]

In [17]:
# The metric object loaded earlier in this notebook is named `cer_metric`; `cer` is never defined at notebook level.
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test CER: 7.120


In [18]:
# Drop the validation split's own columns (the original referenced the test split's column list).
results_val = dataset["validation"].map(map_to_result, remove_columns=dataset["validation"].column_names)

  0%|          | 0/14326 [00:00<?, ?ex/s]

In [24]:
# The metric object loaded earlier in this notebook is named `cer_metric`; `cer` is never defined at notebook level.
print("Val CER: {:.3f}".format(cer_metric.compute(predictions=results_val["pred_str"], references=results_val["text"])))

Val CER: 6.723
