# Finetune XLS-R (0.3B) for Lingala

This notebook is adapted from Fine-tuning Xlsr-Wav2Vec2 for  ASR to train lingala ASR

## installs and imports

This notebook is implemented in jupyter notebook on Amazon ec2 V100 16GB

In [1]:
import sys

In [None]:
# this code worked with the following torch setup
#!{sys.executable} -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio===0.12.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
#!{sys.executable} -m pip install ipywidgets

In [None]:
#!{sys.executable} -m jupyter nbextension enable --py widgetsnbextension

In [None]:
import torch
torch.cuda.empty_cache()

In [None]:
import torch
import torchvision
torch.cuda.get_device_name()

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# Install helper functions.
#!{sys.executable} -m pip install -q git+https://github.com/gmihaila/ml_things.git

In [None]:
#!{sys.executable} -m pip uninstall matplotlib -y

In [None]:
#!{sys.executable} -m pip install matplotlib==3.4.0

In [None]:
!{sys.executable} -m pip install numpy==1.20.3

In [2]:
!{sys.executable} -m pip install datasets==2.1.0
!{sys.executable} -m pip install transformers==4.18.0
#!pip install huggingface_hub==0.1
!{sys.executable} -m pip install torchaudio
!{sys.executable} -m pip install librosa
!{sys.executable} -m pip install jiwer



# Data Preparation

Download or upload the data in a zip file from local machine

In [3]:
from huggingface_hub import notebook_login

notebook_login()


Token is valid.
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/ubuntu/.cache/huggingface/token
Login successful


In [None]:
!{sys.executable} -m pip install gdown -U --no-cache-dir

In [4]:
import os


extract to LingalaAudio

In [5]:
import numpy as np
import zipfile
import csv

In [None]:
import zipfile
zip_ref = zipfile.ZipFile("/home/ubuntu/LingalaAudio-20221111T081505Z-001.zip")
zip_ref.extractall("/home/ubuntu/")
zip_ref.close()

## Load Dataset

We use our own collected dataset which contains 4.3 validated hours of lingala at approximately 578.0 MB. 

In [6]:
with open('/home/ubuntu/LingalaAudio/pyLingala-master/data/train.csv', newline='',encoding='UTF-8') as f:
      reader = csv.reader(f)
      data = list(reader)
      train_data = [data[i] for i in range(len(data)) if i!=0]

with open('/home/ubuntu/LingalaAudio/pyLingala-master/data/test.csv', newline='',encoding='UTF-8') as f:
      reader = csv.reader(f)
      data = list(reader)
      t_data = [data[i] for i in range(len(data)) if i!=0]

This seed is used to help other get the same result as us

In [7]:
#Get valid indices
import random
random.seed(42) #this seed was used specifically to compare with Okwugbe model



v = 300 #200 samples for valid. Change as you want
test_list = [i for i in range(len(t_data))]
valid_indices = random.choices(test_list, k=v)


test_data = [t_data[i] for i in range(len(t_data)) if i not in valid_indices]
valid_data = [t_data[i] for i in range(len(t_data)) if i in valid_indices]

create json files

In [8]:
def create_json_file(d):
  utterance = d[2]
  wav_path =d[0]
  wav_path = wav_path.replace("/home/ubuntu/organised_recording","/home/ubuntu/LingalaAudio/pyLingala-master")
  return {
      "path": wav_path,
      "sentence": utterance
  }

train_json = [create_json_file(i) for i in train_data]
test_json = [create_json_file(i) for i in test_data]
valid_json = [create_json_file(i) for i in valid_data]

In [9]:
import os

In [10]:
#Make folder to store files


train_path = '/home/ubuntu/model_output/lingala_xlsr2/train'
test_path = '/home/ubuntu/model_output/lingala_xlsr2/test'
valid_path = '/home/ubuntu/model_output/lingala_xlsr2/valid'

if not os.path.isdir(train_path):
  print("Creating paths")
  os.makedirs(train_path)
  os.makedirs(test_path)
  os.makedirs(valid_path)

Creating paths


In [11]:
import json
#for train
for i, sample in enumerate(train_json):
  file_path = os.path.join(train_path,'train_lingala_{}.json'.format(i))
  with open(file_path, 'w') as outfile:
    json.dump(sample, outfile)

#for test
for i, sample in enumerate(test_json):
  file_path = os.path.join(test_path,'test_lingala_{}.json'.format(i))
  with open(file_path, 'w') as outfile:
    json.dump(sample, outfile)

#for valid
for i, sample in enumerate(valid_json):
  file_path = os.path.join(valid_path,'valid_lingala_{}.json'.format(i))
  with open(file_path, 'w') as outfile:
    json.dump(sample, outfile)

In [12]:
from ipywidgets import FloatProgress

#run the second time after the error
from datasets import load_dataset, load_metric

#for train
for root, dirs, files in os.walk(train_path):
  lingala_train = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")

#for test
for root, dirs, files in os.walk(test_path):
  lingala_test = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")

#for valid
for root, dirs, files in os.walk(valid_path):
  lingala_valid = load_dataset("json", data_files=[os.path.join(root,i) for i in files],split="train")

Resolving data files:   0%|          | 0/2465 [00:00<?, ?it/s]

Using custom data configuration default-a1f85b37df83fe45


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-a1f85b37df83fe45/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-a1f85b37df83fe45/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


Resolving data files:   0%|          | 0/180 [00:00<?, ?it/s]

Using custom data configuration default-a8fe9a54dce8c43c


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-a8fe9a54dce8c43c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-a8fe9a54dce8c43c/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


Resolving data files:   0%|          | 0/204 [00:00<?, ?it/s]

Using custom data configuration default-738905403159d845


Downloading and preparing dataset json/default to /home/ubuntu/.cache/huggingface/datasets/json/default-738905403159d845/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/json/default-738905403159d845/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


## Exploration of the data

In [13]:
lingala_test

Dataset({
    features: ['path', 'sentence'],
    num_rows: 180
})

In [14]:
#show random sentences
from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    print(len(dataset))
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    display(HTML(df.to_html()))

In [15]:
show_random_elements(lingala_test, num_examples=20)

180


Unnamed: 0,path,sentence
0,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/joe/220716-083749_lin_9f7_elicit_47.wav,amoni elongi ya mama na bango akufa motema na ye ekiti
1,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/garcongina/220719-141705_lin_9f7_elicit_134.wav,soki kaka polele pawu
2,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/yan2/220717-135652_lin_9f7_elicit_94.wav,nyɔnso wana ezosalama tshotsho kodiongo pe azoyeba te
3,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/v/220716-105601_lin_9f7_elicit_90.wav,afandi mwa kimya azongisi mokongo na nsima
4,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/garcongina/220719-141705_lin_9f7_elicit_111.wav,akobi na kolobaka ete soki bobwakisi ngai bokotiya nani
5,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/yan2/220717-135652_lin_9f7_elicit_82.wav,po ndelo oyɔ mwananyoka abongisaki ndako wana oza na likanisi te
6,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/deborah/220718-132354_lin_9f7_elicit_9.wav,mawa esunda pongi eyingela olongwa susi na ngai
7,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/garcongina/220719-141705_lin_9f7_elicit_124.wav,bato bazalaki kokεngεla ye basutuka
8,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/garcongina/220719-141705_lin_9f7_elicit_113.wav,na ndenge ya respect petit tangu akoti na esika bakulutu bazali ezosenga mwa nsamba ekita
9,/home/ubuntu/LingalaAudio/pyLingala-master/data/test/wav/v/220716-105601_lin_9f7_elicit_97.wav,yɔka yɔ masolo ozalaka na yango


In [None]:
train_df = pd.DataFrame({'sentence':lingala_train['sentence']})
train_df['nb_words'] = train_df.sentence.map(lambda x: len(x.split()))
train_df.nb_words.hist(bins=30)

In [None]:
validation_df = pd.DataFrame({'sentence':lingala_valid['sentence']})
validation_df['nb_words'] = validation_df.sentence.map(lambda x: len(x.split()))
validation_df.nb_words.hist(bins=30)

In [None]:
test_df = pd.DataFrame({'sentence':lingala_test['sentence']})
test_df['nb_words'] = test_df.sentence.map(lambda x: len(x.split()))
test_df.nb_words.hist(bins=30)

## Path ; wav files

In [None]:
import torch
import torchaudio

print(torch.__version__)
print(torchaudio.__version__)

In [None]:
from functools import partial
import pandas as pd
import numpy as np

import torchaudio

In [None]:
SAMPLE_WAV = '/home/ubuntu/LingalaAudio/pyLingala-master/data/train/wav/betty/221011-125449_lin_359_elicit_0.wav'

In [None]:
metadata = torchaudio.info(SAMPLE_WAV)
print(metadata)

In [None]:
train_df = pd.DataFrame({'path':lingala_train['path']})
train_df['sample_rate'] = train_df.path.map(lambda x: torchaudio.info(x).sample_rate)
train_df['num_frames'] = train_df.path.map(lambda x: torchaudio.info(x).num_frames)
train_df['seconds'] = train_df.num_frames / train_df.sample_rate
train_df.seconds.hist(bins=30)

In [None]:
validation_df = pd.DataFrame({'path':lingala_valid['path']})
validation_df['sample_rate'] = validation_df.path.map(lambda x: torchaudio.info(x).sample_rate)
validation_df['num_frames'] = validation_df.path.map(lambda x: torchaudio.info(x).num_frames)
validation_df['seconds'] = validation_df.num_frames / validation_df.sample_rate
validation_df.seconds.hist(bins=30)

In [None]:
test_df = pd.DataFrame({'path':lingala_test['path']})
test_df['sample_rate'] = test_df.path.map(lambda x: torchaudio.info(x).sample_rate)
test_df['num_frames'] = test_df.path.map(lambda x: torchaudio.info(x).num_frames)
test_df['seconds'] = test_df.num_frames / test_df.sample_rate
test_df.seconds.hist(bins=30)

In [16]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

def remove_special_characters(batch):
    batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower() + " "
    return batch

In [17]:
lingala_train = lingala_train.map(remove_special_characters)
lingala_test = lingala_test.map(remove_special_characters)
lingala_valid = lingala_valid.map(remove_special_characters)



  0%|          | 0/2465 [00:00<?, ?ex/s]

  0%|          | 0/180 [00:00<?, ?ex/s]

  0%|          | 0/204 [00:00<?, ?ex/s]

In [18]:
show_random_elements(lingala_train.remove_columns(["path"]))

2465


Unnamed: 0,sentence
0,fkm azwaki visa ya ba touristes mpo na sanza moko
1,likofi na libumu
2,akomaki mwa mɔtɔ ye nde mɔtɔ akomaki kosukola basani
3,mbote tata mokonzi
4,djef azalaki mokongo mpe marielouise azala moluba
5,ntango wana miraesa atindi moninga na ye moko akεndε koyebisa baboti ya fkm
6,fkm ayɔki nzutu
7,ngai nde mokonzi ya ekolo
8,na mboka eyindi pe na mbala moko
9,moro azongiseli ye ete ngai natekaka na depot ya mere moko na le marche


# Create Wav2Vec2CTCTokenizer

Connectionist Temporal Classification (CTC) tokenizer is a character-level tokenizer. We uses space (denoted as | token) as word delimiter token and [PAD] as blank token.

In [19]:
def extract_all_chars(batch):
  all_text = " ".join(batch["sentence"])
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [20]:
vocab_train = lingala_train.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lingala_train.column_names)
vocab_test = lingala_test.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lingala_test.column_names)
vocab_valid = lingala_valid.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=lingala_valid.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]) | set(vocab_valid["vocab"][0]))

In [22]:
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{'q': 0,
 'x': 1,
 'ç': 2,
 'n': 3,
 'b': 4,
 'v': 5,
 'd': 6,
 'm': 7,
 'r': 8,
 'y': 9,
 'a': 10,
 't': 11,
 'g': 12,
 'c': 13,
 'u': 14,
 'i': 15,
 'ε': 16,
 'p': 17,
 'ɔ': 18,
 'e': 19,
 'z': 20,
 'h': 21,
 ' ': 22,
 'w': 23,
 'j': 24,
 'l': 25,
 'o': 26,
 'k': 27,
 'f': 28,
 's': 29}

In [23]:
#make space = |
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [24]:
#padding token serves as blank token
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

32

In [25]:
import json
with open('/home/ubuntu/model_output/lingala_xlsr2/vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [26]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer("/home/ubuntu/model_output/lingala_xlsr2/vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

In [None]:
repo_name = 'Xlsr-0-3-lingala'


In [None]:
tokenizer.push_to_hub('output')

# Create Wav2Vec2 Feature Extractor

Wav2Vec2 was pretrained on the audio data of LibriSpeech and LibriVox which both were sampling with 16kHz. my own data has 16kHz sampling rate.

A Wav2Vec2 feature extractor object requires the following parameters to be instantiated:

feature_size: Speech models take a sequence of feature vectors as an input. While the length of this sequence obviously varies, the feature size should not. In the case of Wav2Vec2, the feature size is 1 because the model was trained on the raw speech signal .
sampling_rate: The sampling rate at which the model is trained on.
padding_value: For batched inference, shorter inputs need to be padded with a specific value
do_normalize: Whether the input should be zero-mean-unit-variance normalized or not. Usually, speech models perform better when normalizing the input
return_attention_mask: Whether the model should make use of an attention_mask for batched inference. In general, models should always make use of the attention_mask to mask padded tokens. However, due to a very specific design choice of Wav2Vec2's "base" checkpoint, better results are achieved when using no attention_mask. This is not recommended for other speech models. For more information, one can take a look at this issue. Important If you want to use this notebook to fine-tune large-lv60, this parameter should be set to True.

In [27]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [28]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [29]:
#If you are running this Colab for the first time and have not saved the processor, uncomment code below
processor.save_pretrained("/home/ubuntu/model_output/lingala_xlsr2/Xlsr-0-3-lingala")

#To load trained processor
model_dir='/home/ubuntu/model_output/lingala_xlsr2/Xlsr-0-3-lingala'
#processor = Wav2Vec2Processor.from_pretrained(model_dir)

In [30]:
lingala_train[197]

{'path': '/home/ubuntu/LingalaAudio/pyLingala-master/data/train/wav/rebecca/221011-120830_lin_359_elicit_67.wav',
 'sentence': 'akoti na depot ya mibale '}

## Preprocess Data

We resample to 16kHz with which wav2vec2 was pretrained on.

In [31]:
import torchaudio

def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    batch["speech"] = speech_array[0].numpy()
    batch["sampling_rate"] = sampling_rate
    batch["target_text"] = batch["sentence"]
    return batch

In [32]:
lingala_train = lingala_train.map(speech_file_to_array_fn, remove_columns=lingala_train.column_names)
lingala_test = lingala_test.map(speech_file_to_array_fn, remove_columns=lingala_test.column_names)
lingala_valid = lingala_valid.map(speech_file_to_array_fn, remove_columns=lingala_valid.column_names)

  0%|          | 0/2465 [00:00<?, ?ex/s]

  0%|          | 0/180 [00:00<?, ?ex/s]

  0%|          | 0/204 [00:00<?, ?ex/s]

In [33]:
#sample sounds
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(lingala_train)-1)

ipd.Audio(data=np.asarray(lingala_train[rand_int]["speech"]), autoplay=True, rate=16000)

In [34]:
#rand_int = random.randint(0, len(fon_train)-1)

print("Target text:", lingala_train[rand_int]["target_text"])
print("Input array shape:", np.asarray(lingala_train[rand_int]["speech"]).shape)
print("Sampling rate:", lingala_train[rand_int]["sampling_rate"])

Target text: kofandaka esika moko pe mayεlε na kelasi esangisaki bango mbangu pe malεmbε malεmbε batɔnga kindeko na bango mibale 
Input array shape: (271161,)
Sampling rate: 16000


Then we prepare input_values using processor and labels using target_text

In [35]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

In [36]:
lingala_train = lingala_train.map(prepare_dataset, remove_columns=lingala_train.column_names, batch_size=8, num_proc=4, batched=True)
lingala_test = lingala_test.map(prepare_dataset, remove_columns=lingala_test.column_names, batch_size=8, num_proc=4, batched=True)
lingala_valid = lingala_valid.map(prepare_dataset, remove_columns=lingala_valid.column_names, batch_size=8, num_proc=4, batched=True)

     

#0:   0%|          | 0/78 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/77 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/77 [00:00<?, ?ba/s]

 

  return array(a, dtype, copy=False, order=order)


#3:   0%|          | 0/77 [00:00<?, ?ba/s]

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


      

#0:   0%|          | 0/6 [00:00<?, ?ba/s]

#1:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/6 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/6 [00:00<?, ?ba/s]

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


     

#0:   0%|          | 0/7 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/7 [00:00<?, ?ba/s]

#2:   0%|          | 0/7 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/7 [00:00<?, ?ba/s]

  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)
  return array(a, dtype, copy=False, order=order)


# Training

## Data collator

In [37]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for proccessing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch


In [38]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [39]:
#mkdir cer

In [40]:
#cd cer

In [41]:
#wget -O cer.py https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/raw/main/cer.py

In [42]:
#cd ..

# Metric
We use word error rate with space as word boundary while training. We also use character error rate without word boundaries in testing

In [43]:
from datasets import load_dataset, load_metric

In [44]:
wer_metric = load_metric("wer")
#cer_metric = load_metric("/home/ubuntu/cer")
#cer_metric = load_metric("cer", revision="master")
#cer_metric = load_metric('cer')

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [45]:
#cer_metric.compute(predictions=['aab'],references=['aaac'])

In [46]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}
    #cer= cer_metric.compute(predictions=pred_str, references=label_str)
    #return {"cer": cer}

In [47]:
# testing
#label_str = 'aab'
#pred_str = 'aaac'
#preds = [char for seq in pred_str for char in list(seq)]
#refs = [char for seq in label_str for char in list(seq)]
#cer = wer(refs, preds)
#print(cer)

In [48]:
#from jiwer import wer

In [49]:
#def compute_metrics(pred):
    #pred_logits = pred.predictions
    #pred_ids = np.argmax(pred_logits, axis=-1)

    #pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    #pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    #label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # wer = wer_metric.compute(predictions=pred_str, references=label_str)
    # cer = fastwer.score(pred_str, label_str, char_level=True)
    #cer = word_error_rate(hypotheses=pred_str, references=label_str, use_cer=True)

    # ref: https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/blob/main/cer.py
    #preds = [char for seq in pred_str for char in list(seq)]
    #refs = [char for seq in label_str for char in list(seq)]
    #cer = wer(refs, preds)

    #return {"cer": cer}
    # return {"wer": wer}

In [50]:
!{sys.executable} -m pip install -qqq evaluate==0.2.2

# Model 

We use the pretrained facebook/wav2vec2-large-xlsr-53. The training script is scripts/wav2vec_finetune.py.

In [51]:
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-xls-r-300m", 
    attention_dropout=0.0,
    hidden_dropout=0.0,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.0,
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/wav2vec2-xls-r-300m were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_hid.bias', 'project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.bias', 'quantizer.weight_proj.weight', 'project_q.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-xls-r-300m and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it 

We do not finetune the feature extractor layer.

In [52]:
model.freeze_feature_extractor()



In [None]:
model.gradient_checkpointing_enable()

In [53]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./output",
  # output_dir="/content/drive/MyDrive/lingala_xlsr/wav2vec2-large-xlsr-lingala",
  group_by_length=True,
  per_device_train_batch_size=2,
  gradient_accumulation_steps=16,
  evaluation_strategy="steps",
  num_train_epochs=50,
  gradient_checkpointing=True,
  fp16=True,
  save_steps=400,
  eval_steps=400,
  logging_steps=400,
  learning_rate=3e-4,
    #1e-4,
  warmup_steps=500,
  save_total_limit=2,
  logging_dir='logs',
  push_to_hub=True,
)

In [55]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=lingala_train,
    eval_dataset=lingala_valid,
    tokenizer=processor.feature_extractor,
)

Cloning https://huggingface.co/Ussen/output into local empty directory.
Using amp half precision backend


In [56]:
train_result=trainer.train()

***** Running training *****
  Num examples = 2465
  Num Epochs = 50
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 16
  Total optimization steps = 3850


Step,Training Loss,Validation Loss,Wer
400,3.6517,0.53746,0.527881
800,0.3497,0.433404,0.402549
1200,0.1654,0.449737,0.388741
1600,0.0972,0.469211,0.383962
2000,0.066,0.488909,0.372278
2400,0.0486,0.555223,0.359002
2800,0.032,0.564449,0.354753
3200,0.0244,0.526145,0.335635
3600,0.0185,0.540868,0.330855


***** Running Evaluation *****
  Num examples = 204
  Batch size = 8
Saving model checkpoint to ./output/checkpoint-400
Configuration saved in ./output/checkpoint-400/config.json
Model weights saved in ./output/checkpoint-400/pytorch_model.bin
Feature extractor saved in ./output/checkpoint-400/preprocessor_config.json
Feature extractor saved in ./output/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 204
  Batch size = 8
Saving model checkpoint to ./output/checkpoint-800
Configuration saved in ./output/checkpoint-800/config.json
Model weights saved in ./output/checkpoint-800/pytorch_model.bin
Feature extractor saved in ./output/checkpoint-800/preprocessor_config.json
Feature extractor saved in ./output/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 204
  Batch size = 8
Saving model checkpoint to ./output/checkpoint-1200
Configuration saved in ./output/checkpoint-1200/config.json
Model weights saved in ./output/checkpoint-1200/pytorch_m

In [57]:
trainer.push_to_hub("./output")

Saving model checkpoint to ./output
Configuration saved in ./output/config.json
Model weights saved in ./output/pytorch_model.bin
Feature extractor saved in ./output/preprocessor_config.json


Upload file pytorch_model.bin:   0%|          | 32.0k/1.18G [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/Ussen/output
   6be7c1a..8671a1d  main -> main

Dropping the following result as it does not have all the necessary fields:
{}
To https://huggingface.co/Ussen/output
   8671a1d..0eee5cc  main -> main



'https://huggingface.co/Ussen/output/commit/8671a1dea95656a4d506f173898aedd16de07e82'

# Plot the training loss vs evaluation loss

this is used to check wether the model is overfitting

In [None]:
trainer.state.log_history

In [None]:
import io
import os
import math
import torch
import warnings
from tqdm.notebook import tqdm
from ml_things import plot_dict, fix_text

In [None]:
# Keep track of train and evaluate loss.
loss_history = {'train_loss':[], 'eval_loss':[]}
 
# Keep track of train and evaluate perplexity.
# This is a metric useful to track for language models.
wer_history = {'train_wer':[], 'eval_wer':[]}
 
# Loop through each log history.
for log_history in trainer.state.log_history:
 
  if 'loss' in log_history.keys():
    # Deal with trianing loss.
    loss_history['train_loss'].append(log_history['loss'])
    wer_history['train_wer'].append(math.exp(log_history['loss']))
     
  elif 'eval_loss' in log_history.keys():
    # Deal with eval loss.
    loss_history['eval_loss'].append(log_history['eval_loss'])
    wer_history['eval_wer'].append(math.exp(log_history['eval_loss']))
 
# Plot Losses.
plot_dict(loss_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='Loss', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=0.3)
 
print()
 
# Plot Perplexities.
plot_dict(wer_history, start_step=training_args.logging_steps, 
          step_size=training_args.logging_steps, use_title='wer', 
          use_xlabel='Train Steps', use_ylabel='Values', magnify=0.2)

# Inference and Evaluation

We load the test split, re-splitted from our own data 

In [58]:
import os
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric,Dataset,concatenate_datasets,set_caching_enabled, ClassLabel
import pandas as pd

import random
from IPython.display import display, HTML

import json
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2ForCTC,Wav2Vec2Processor,Trainer,TrainingArguments,Wav2Vec2FeatureExtractor

import re
set_caching_enabled(False)

import soundfile as sf
import torchaudio


import IPython.display as ipd

import numpy as np
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

from tqdm import tqdm
import torch

  set_caching_enabled(False)


Load pretrained model and processor to process the test dataset.

In [59]:
from transformers import Wav2Vec2ForCTC
model = Wav2Vec2ForCTC.from_pretrained("/home/ubuntu/output/checkpoint-3600").to("cuda")
processor = Wav2Vec2Processor.from_pretrained("/home/ubuntu/output")

loading configuration file /home/ubuntu/output/checkpoint-3600/config.json
Model config Wav2Vec2Config {
  "_name_or_path": "facebook/wav2vec2-xls-r-300m",
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForCTC"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "mean",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout"

OSError: Can't load tokenizer for '/home/ubuntu/output'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure '/home/ubuntu/output' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.

In [60]:
lingala_test.shape[0]

180

## Infer a few examples

In [61]:
input_dict = processor(lingala_test["input_values"][1],sampling_rate=16000, return_tensors="pt", padding=True)

logits = model(input_dict.input_values.to("cuda")).logits

pred_ids = torch.argmax(logits, dim=-1)[0]

In [62]:
print("Prediction:")
print(processor.decode(pred_ids))

#print(processor.decode(lingala_test["input_values"][0]))

print("\nReference:")
#print(common_voice_test_transcription["sentence"][0].lower())
print(processor.decode(lingala_test["labels"][1]))
#print("Target text:", lingala_test[0]["target_text"])

Prediction:
yango nde lopango moudi asombaki na mbɔngɔ thotjo ayebiki

Reference:
yango nde lopango muji asombaki na mbɔngɔ tshotsho ayibaki


In [63]:
inputs = processor(lingala_test["input_values"][1], sampling_rate=16_000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values,).logits

pred_ids = torch.argmax(logits, dim=-1)

print("Prediction:", processor.batch_decode(pred_ids))
print("Reference:", lingala_test["labels"][1])

RuntimeError: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor

# Evaluate on test set

We evaluate the test set on WER with word boundaries and CER without spaces

In [64]:
final_pred = []
for i in tqdm(range(lingala_test.shape[0])):    
    input_dict = processor(lingala_test[i]["input_values"], return_tensors="pt", padding=True)

    logits = model(input_dict.input_values.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)[0]
    prediction = processor.decode(pred_ids)
    final_pred.append(prediction)

  0%|          | 0/180 [00:00<?, ?it/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
  1%|          | 2/180 [00:00<00:15, 11.21it/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
  2%|▏         | 4/180 [00:00<00:14, 12.03it/s]It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
It is strongly recommended to pass the ``sampling_rate`` argument to this fun

In [65]:
final_pred

['moyi ezalaki kongala makasi',
 'yango nde lopango moudi asombaki na mbɔngɔ thotjo ayebiki',
 'ezali kala',
 'aboyi kokεndε poto',
 'olobi ozali mwana ya barumbu osilisi okeyi kaka bongo',
 'tshotsho asepeli',
 'heure wana kimba nse qke ezalaki mingi bamboka ya bakongo',
 'mingala mike mike pe epayi basi oyɔ ya mpεmbεni bango basengaka na ndenge ya mokε te',
 'kitoko elɔkɔ te',
 'nyɔnso wana ezosalama tshotsho kodiongo pe azoyebi te',
 'yɔka yɔ masolo ozalaka nango',
 'na nkoto mibale na zomi na sambo lopango ya suka etikalaki yango wana',
 'nakokεndε na ye wapi soki balongoli ye',
 'ata pe ozali mwana balumbo osala nini',
 'afandaka lopango monene kaka ye moko',
 'na kozala te mokolokooi kotelema',
 'palabala na kikwiti batongi te',
 'baye komona ete azalaki kolobela mopa oyɔ azalaki na katiya masedeci ya mwindu',
 'tika koloba boye',
 'tolangola mwa chehr',
 'bakola kobenga yɔ bonkliian',
 'mibali oyɔ nyɔkolaki pe obomaki bazali bandeko na ngai',
 'pakozala kofuta mbɔngɔ na cuntu na

In [66]:
def map_to_result(batch):
  with torch.no_grad():
    input_values = torch.tensor(batch["input_values"], device="cuda").unsqueeze(0)
    logits = model(input_values).logits

  pred_ids = torch.argmax(logits, dim=-1)
  batch["pred_str"] = processor.batch_decode(pred_ids)[0]
  batch["text"] = processor.decode(batch["labels"], group_tokens=False)
  
  return batch


In [67]:
results = lingala_test.map(map_to_result, remove_columns=lingala_test.column_names)

  0%|          | 0/180 [00:00<?, ?ex/s]

In [68]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["text"])))


Test WER: 0.258


In [69]:
show_random_elements(results)


180


Unnamed: 0,pred_str,text
0,alingaka kotala baemission ya francais te,alingaka kotala baemission ya français te
1,bato bayei ebele esika yango mɔtɔ akoki pe kosunga bango ezali te,bato bayei ebele esika yango mɔtɔ akokipe kosunga bango azali te
2,tozalaki kobeta na mpo na kola kisa makasi,tozalaki kobetana po na kolakisa makasi
3,akomi kaka koloba lisolo na litoyi ya makanbgo na ye oyɔ babimaki na ye,akomi kaka koloba lisolo na litoyi ya makangu na ye oyɔ babimaki na ye
4,kimya oyɔ ya ye,kimya oyɔ ya ye
5,na luki butu na moyi babenganga excellence ye oyɔ aleki na malonga,naluki butu na moyi babenga ngai excellence ye oyɔ aleki na malonga
6,soki kaka polele pawau,soki kaka polele pawu
7,pakozala kofuta mbɔngɔ na cuntu na ye ya banki,bakozala kofuta mbɔngɔ na compte na ye na banki
8,yɔ na mwasi na yɔ nabanla na yɔ bolie yango,yɔ na mwasi na yɔ na bana na yɔ bolie yango
9,tshotshoko diongo apusani mpεmbεni na bango,tshotsho kodiongo apusani mpεmbεni na bango


In [None]:
#model.to("cuda")

#with torch.no_grad():
#  logits = model(torch.tensor(lingala_test[:1]["input_values"], device="cuda")).logits

#pred_ids = torch.argmax(logits, dim=-1)

# convert ids to tokens
#" ".join(processor.tokenizer.convert_ids_to_tokens(pred_ids[0].tolist()))


In [None]:
wer_metric = load_metric("wer")
#cer_metric = load_metric("/home/ubuntu/cer")
cer_metric = load_metric("cer", revision="master")
#cer_metric = load_metric('cer')

In [None]:
#def compute_metrics(pred):
    #pred_logits = pred.predictions
    #pred_ids = np.argmax(pred_logits, axis=-1)

    #pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    #pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    #label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # wer = wer_metric.compute(predictions=pred_str, references=label_str)
    # cer = fastwer.score(pred_str, label_str, char_level=True)
    #cer = word_error_rate(hypotheses=pred_str, references=label_str, use_cer=True)

    # ref: https://huggingface.co/ctl/wav2vec2-large-xlsr-cantonese/blob/main/cer.py
    #preds = [char for seq in pred_str for char in list(seq)]
    #refs = [char for seq in label_str for char in list(seq)]
    #cer = wer(refs, preds)

    #return {"cer": cer}
    # return {"wer": wer}

In [70]:
import evaluate
cer_metrics = evaluate.load("cer")

Downloading builder script:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

In [72]:
cer_metrics.compute(predictions=results["pred_str"], references=results["text"])

0.07009506356660176

In [73]:
cer_metric = load_metric("cer", revision="master")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [74]:
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=results["pred_str"], references=results["text"])))

Test CER: 0.070
