In [1]:
%%capture
# %pip (rather than !pip) installs into the environment of the running kernel.
# `datasets` is held below 3.0 because this notebook calls `datasets.load_metric`,
# which is deprecated in the 2.x series and no longer available from 3.0 onwards.
%pip install "datasets<3.0"
%pip install transformers
%pip install torchaudio
%pip install jiwer
%pip install -U accelerate

In [2]:
import IPython.display as ipd
import numpy as np
import random
from datasets import ClassLabel
import pandas as pd
from IPython.display import display, HTML
import torch
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm

In [3]:
%%capture
# -y answers any confirmation prompt: the cell output is captured, so a prompt
# would be invisible and the install could abort without anyone noticing.
# apt-get is used because `apt` does not promise a stable interface for scripts.
!apt-get install -y git-lfs

In [4]:
from datasets import load_dataset, load_metric, Audio

In [5]:
# Fetch the "train" split of the Even speech corpus from the Hugging Face Hub.
dataset = load_dataset(
    path="tbkazakova/even_speech_hse",
    split="train",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/130 [00:00<?, ?it/s]

In [6]:
# Working handle for the evaluation data. Later cells rebind `even_hse` to the
# preprocessed version, while `dataset` keeps pointing at the split as loaded.
even_hse = dataset

In [7]:
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor

# The CTC model and its processor are loaded from the same Hub checkpoint,
# so the checkpoint id is spelled out once and shared by both calls.
checkpoint = "tbkazakova/wav2vec-bert-2.0-even-pakendorf"
model = AutoModelForCTC.from_pretrained(checkpoint)
processor = Wav2Vec2BertProcessor.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.91k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.42G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/231 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
import re
# Characters stripped from transcriptions before scoring: punctuation, quotes,
# dashes, parentheses, '=', square brackets and the backslash.
_CHARS_TO_REMOVE = ',?.!-;:"“%‘”\'»«„‐–()=[]\\'

# Build the character class with re.escape so that '-', '[', ']' and '\' are
# taken literally. Written by hand, the unescaped ']' closed the class early
# (leaving a pattern that only matched "<char>]"), and '!-;' acted as a range
# that also covered the digits 0-9.
chars_to_remove_regex = '[' + re.escape(_CHARS_TO_REMOVE) + ']'


def remove_special_characters(batch):
    """Normalise the ``"transcription"`` entry of one example.

    Every character listed in ``_CHARS_TO_REMOVE`` is deleted, then the text
    is stripped of leading/trailing whitespace and lower-cased.

    Args:
        batch: mapping with a string under the key ``"transcription"``.

    Returns:
        The same mapping, with ``"transcription"`` replaced by the
        normalised text (the input object is modified in place).
    """
    batch["transcription"] = re.sub(chars_to_remove_regex, '', batch["transcription"]).strip().lower()
    return batch

In [9]:
def prepare_dataset(batch):
    """Convert one raw example into model inputs and label ids.

    Reads ``batch["audio"]`` (its ``"array"`` and ``"sampling_rate"`` entries)
    and ``batch["transcription"]``, then stores three new entries on ``batch``:
    ``"input_features"``, ``"input_length"`` and ``"labels"``. Uses the
    notebook-level ``processor`` for both the audio and the text side.

    Returns the same mapping that was passed in.
    """
    waveform = batch["audio"]

    # Audio side: the processor is called on a single clip, so the first
    # (and only) entry of ``input_features`` is kept.
    encoded_audio = processor(waveform["array"], sampling_rate=waveform["sampling_rate"])
    batch["input_features"] = encoded_audio.input_features[0]
    batch["input_length"] = len(batch["input_features"])

    # Text side: token ids of the transcription serve as labels.
    encoded_text = processor(text=batch["transcription"])
    batch["labels"] = encoded_text.input_ids
    return batch

In [10]:
# Step 1-2: normalise the transcriptions, then have the audio column decoded at 16 kHz.
even_hse = (
    even_hse
    .map(remove_special_characters)
    .cast_column("audio", Audio(sampling_rate=16_000))
)
# Step 3: compute model inputs and labels; all pre-existing columns are dropped,
# leaving only what prepare_dataset adds.
even_hse = even_hse.map(prepare_dataset, remove_columns=even_hse.column_names)

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

In [11]:
# Word and character error rate metrics. Loading them runs a metric script, and
# `datasets` warns unless trust_remote_code=True is passed explicitly, so the
# opt-in is stated here instead of relying on the deprecated implicit default.
wer_metric = load_metric("wer", trust_remote_code=True)
cer_metric = load_metric("cer", trust_remote_code=True)

  wer_metric = load_metric("wer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [12]:
# Smoke test on the first example before running the full evaluation.
input_dict = even_hse[0]

# Inference only: without torch.no_grad() the forward pass records an autograd
# graph that is never used and only costs memory.
with torch.no_grad():
    # unsqueeze(0) adds the batch dimension the model expects.
    logits = model(torch.tensor(input_dict["input_features"]).unsqueeze(0)).logits

# Greedy decoding: highest-scoring token id at each position; [0] drops the batch dimension.
pred_ids = torch.argmax(logits, dim=-1)[0]

In [13]:
# Text decoded from the predicted token ids of the first example.
processor.decode(pred_ids)

'би куӈарапчи эрэгэр бисиву этикэнну ми'

In [14]:
# Reference text for the same example, decoded from its label ids and
# lower-cased so it can be compared with the prediction shown above.
processor.decode(input_dict["labels"]).lower()

'би куӈарыпч ырыгыр бишиву этыкэнуми'

In [15]:
# Decode every example and collect hypothesis/reference pairs for scoring.
predictions = []
references = []

# Inference only: torch.no_grad() stops autograd from recording a graph for
# each of the forward passes, which saves memory and some time.
with torch.no_grad():
    for input_dict in tqdm(even_hse):
        # unsqueeze(0) adds the batch dimension; each example is run on its own.
        logits = model(torch.tensor(input_dict["input_features"]).unsqueeze(0)).logits
        # Greedy decoding: highest-scoring token id at each position.
        pred_ids = torch.argmax(logits, dim=-1)[0]
        predictions.append(processor.decode(pred_ids))
        # References are rebuilt from the label ids and lower-cased.
        references.append(processor.decode(input_dict["labels"]).lower())

# Corpus-level word and character error rates over all collected pairs.
wer = wer_metric.compute(predictions=predictions, references=references)
cer = cer_metric.compute(predictions=predictions, references=references)

100%|██████████| 129/129 [09:56<00:00,  4.63s/it]


In [18]:
# Display the word error rate and character error rate as a (wer, cer) pair.
wer, cer

(0.8333333333333334, 0.2971014492753623)

In [19]:
# One row per example: model output in 'pred', reference text in 'real'.
pred_real = pd.DataFrame({'pred': predictions, 'real': references})
# Persist the pairs for offline inspection; the row index is not written.
pred_real.to_csv('pred_real_by_pakendorf.csv', index=False)

In [20]:
# Print each prediction directly above its reference, with a blank line between pairs.
# The two lists are filled in lock-step, so pairing them with zip visits every index.
for predicted_text, reference_text in zip(predictions, references):
    print(predicted_text)
    print(reference_text)
    print()

би куӈарапчи эрэгэр бисиву этикэнну ми
би куӈарыпч ырыгыр бишиву этыкэнуми

тордула биддёттэм
ордула бидётым

нан рыбяткаддёттун гобдиддоттэн
нан рыбалкадёту гобдыдёту

хамгиддётту
камгидёту

нан
нан

тадук таӈнариву
тадук таӈнариву

интэрнатту биддыву
интэрнату бидыву

нан тадук таӈӈни ордыди мудакриди орриву нанда ортаки
нан тадук таӈны одыдыв мудакриди уриву нанда ортэки

эрэгэр орду биддин бичэвутту едэ нон армиятки ипкэн итэндув
эрэр орду бидым бишиву тэдэ нан армиятки кулитын

армиян одянниву тара нан тадук эмнивун нанда ортаки орриву
армиянадяныву тар нан тадук эмниву нанда ортэки уриву

туркурим туркурив бисин биӈэ дюлду
туркуриву туркуриву бишин биӈа дюду

эр грэкму биддёттэн у ноӈан же харавал укунюӈэрэтдн
ырыгыр экму бидётын у ноӈан жэ коровал [unk]кэнюритын

карававканни он
корова укунён

нан
нан

э
так

нан тиеда этыкэм к родыди атыкаӈниву
нан тэдэ этыкэмкыр одыди атыкаӈниву

иланмяр дюранӈанав одакан
иланмэр дюр анӈынын одакан

нан тик дёрмяв илу мин анӈаду сатыкав атыкан