# Import all required packages

In [None]:
!pip install librosa
!pip install pandas
!pip install transformers
!pip install jiwer
!pip install scikit-learn
!pip install torch
!pip install datasets
!pip install dataclasses
!pip install typing
!pip install numpy

In [2]:
import os
import re
from tqdm import tqdm
import librosa
import torch
import pandas as pd
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, TrainingArguments, Trainer
from jiwer import wer, cer
from datasets import ClassLabel, Dataset, load_metric
import random
import json
import numpy as np
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


# Processing source files

Get all files, define new sort function to sort as \[1, 2, 3 ... 100\], because built-in function sorts strings as \[1, 100, 101 ...\].

In [6]:
def atoi(text):
    return int(text) if text.isdigit() else text


def natural_keys(text):
    return [atoi(c) for c in re.split(r'(\d+)', text) ]

In [7]:
def prepare_files(directory, file_with_text, inf):
    with open(file_with_text, encoding='utf-16') as f:
        text = f.readlines()
    files = os.listdir(directory)
    files_full = []
    for filename in files:
        if '.DS_Store' not in filename:
            f = os.path.join(directory, filename)
            files_full.append(f)
    files_full.sort(key=natural_keys)
    j = 0
    dict_for_inf = []
    for filename in tqdm(files_full):
        if not '=' in text[j] and not 'нрзб' in text[j] and not '[' in text[j] and not '<' in text[j]:
            x = text[j].replace('\n', '').lower()
            x = x.replace('.', ' ')
            x = x.replace(',', ' ')
            x = x.replace(':', ' ')
            x = x.replace('?', ' ')
            x = x.replace('!', ' ')
            x = x.replace('–', ' ')
            x = x.replace('-', ' ')
            x = x.replace('ё', 'е')
            x = re.sub('(\s){2,}', ' ', x)
            x = re.sub('\(.*\)', '', x)
            dict_for_inf.append({'respondent':inf, 'path': filename, 'sentence': x})
        j += 1  
    return dict_for_inf

In [8]:
lnt = prepare_files('/content/input/new_mono_lnt20210706', 
                    '/content/input/20210706_lnt1950_1to831.txt', 'LNT1950')
mga_1307 = prepare_files('/content/input/new_mono_mga20210713', 
                         '/content/input/20210713mga1932_1to1159.txt', 'MGA1932')
mga_1607 = prepare_files('/content/input/new_mono_mga20210716', 
                         '/content/input/20210716mga1932_1to856.txt', 'MGA1932')

mga_1007 = prepare_files('/content/input/new_mono_mga20220710', 
                         '/content/input/20220710mga1932_1to304.txt', 'MGA1932')
gip_0707 = prepare_files('/content/input/new_mono_gip20210707', 
                         '/content/input/20210707gip1953_1to1607.txt', 'GIP1953')
gip_1507 = prepare_files('/content/input/new_mono_gip20220715', 
                         '/content/input/20220715gip1953_1to332.txt', 'GIP1953')
gip_2704 = prepare_files('/content/input/new_mono_gip20230427', 
                         '/content/input/20230427gip1953_1to873.txt', 'GIP1953')

apb_0707 = prepare_files('/content/input/new_mono_apb20220707', 
                         '/content/input/20220707apb1940_1to674.txt', 'AB1940')
apb_1007 = prepare_files('/content/input/new_mono_apb20220710', 
                         '/content/input/20220710apb1940EZ_1to659.txt', 'AB1940')
apb_2704 = prepare_files('/content/input/new_mono_apb20230427', 
                         '/content/input/20230427apb1940_1to557.txt', 'AB1940')
zns_1007 = prepare_files('/content/input/new_mono_zns20220710', 
                         '/content/input/20220710zns1939_1to677.txt', 'ZNS1939')
zns_1107 = prepare_files('/content/input/new_mono_zns20220711', 
                         '/content/input/20220711zns1939_1to379.txt', 'ZNS1939')

100%|████████████████████████████████████████████████████████████████████████████| 831/831 [00:00<00:00, 207740.29it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1159/1159 [00:00<00:00, 289667.40it/s]
100%|████████████████████████████████████████████████████████████████████████████| 856/856 [00:00<00:00, 285285.99it/s]
100%|████████████████████████████████████████████████████████████████████████████| 304/304 [00:00<00:00, 303877.12it/s]
100%|██████████████████████████████████████████████████████████████████████████| 1607/1607 [00:00<00:00, 229533.34it/s]
100%|████████████████████████████████████████████████████████████████████████████| 332/332 [00:00<00:00, 165853.85it/s]
100%|████████████████████████████████████████████████████████████████████████████| 873/873 [00:00<00:00, 218213.79it/s]
100%|████████████████████████████████████████████████████████████████████████████| 674/674 [00:00<00:00, 224665.10it/s]
100%|███████████████████████████████████

In [9]:
all_records = lnt + mga_1307 + mga_1607 + mga_1007 + gip_0707 + gip_1507 + gip_2704 + apb_0707 + apb_1007 + apb_2704 + zns_1007 + zns_1107
len(all_records)

7922

In [16]:
df = pd.DataFrame(all_records, columns=['respondent', 'path', 'sentence'])
df.to_excel("/content/all_records_full.xlsx", index=False)

# wav2vec2-large-ru-golos-with-lm

Let's see how this model transcribes without training on dialect data

## Import model and processor

In [12]:
LANG_ID = "ru"
MODEL_ID = "bond005/wav2vec2-large-ru-golos-with-lm"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID, padding=True)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

## Read audio

In [13]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"]
    return batch

test_dataset = []
for l in tqdm(all_records):
    test_dataset.append(speech_file_to_array_fn(l))
data = [d['speech'] for d in test_dataset]

100%|██████████████████████████████████████████████████████████████████████████████| 7922/7922 [04:38<00:00, 28.45it/s]


## Test

In [15]:
j = 0
ready = []
for d in tqdm(data):
    inputs = processor(d, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentences = processor.batch_decode(predicted_ids)

    for i, predicted_sentence in enumerate(predicted_sentences):
        ready.append([test_dataset[j]["path"], test_dataset[j]["sentence"], predicted_sentence, 
                      test_dataset[j]["speech"], test_dataset[j]["respondent"]])
    j += 1

100%|████████████████████████████████████████████████████████████████████████████| 7922/7922 [1:24:58<00:00,  1.55it/s]


In [18]:
wers = []
cers = []
total = []

for path, correct, transcription, speech, respondent in tqdm(ready):
    try:
        error = wer(correct.lower(), transcription)
        wers.append(error)
        c_error = cer(correct.lower(), transcription)
        cers.append(c_error)
        total.append([correct.lower(), transcription, error, c_error, duration, speech, respondent])
    except:
        pass

print('Len WERs: ', len(wers))
print('Mean WER: ', sum(wers)/len(wers))
print('Len CERs: ', len(cers))
print('Mean CER: ', sum(cers)/len(cers))

100%|███████████████████████████████████████████████████████████████████████████| 7922/7922 [00:00<00:00, 14322.52it/s]

Len WERs:  7921
Mean WER:  0.6429166699745915
Len CERs:  7921
Mean CER:  0.35015402881148294





In [20]:
df = pd.DataFrame(total, columns=['original sentence', 'predicted sentence', 'wer', 'cer', 'speech', 'respondent'])
df.to_excel("/content/predicted_sentence_baseline_full_data.xlsx", index=False)

# Fine-tune

In [16]:
df = pd.DataFrame(test_dataset, columns=['respondent', 'path', 'sentence', 'duration', 'speech'])

In [17]:
ds = Dataset.from_pandas(df[['sentence', 'speech']])
ds = ds.train_test_split(test_size=0.3, shuffle=True)

In [19]:
def extract_all_chars(batch):
    all_text = " ".join(batch["sentence"])
    vocab = list(set(all_text))
    return {"vocab": [vocab], "all_text": [all_text]}

vocabs = ds.map(extract_all_chars, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=ds.column_names["train"])


Map:   0%|          | 0/5545 [00:00<?, ? examples/s]

Map:   0%|          | 0/2377 [00:00<?, ? examples/s]

In [20]:
vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))

vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [22]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)

In [25]:
def prepare_dataset(batch, processor):
    audio = batch["speech"]
    batch["input_values"] = processor(audio, sampling_rate=16000).input_values[0]
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["sentence"]).input_ids
    return batch

In [26]:
ds = ds.map(lambda examples: prepare_dataset(examples, processor))

Map:   0%|          | 0/5545 [00:00<?, ? examples/s]



Map:   0%|          | 0/2377 [00:00<?, ? examples/s]

In [27]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",).to('cpu')
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels,
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors="pt",).to('cpu')

        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100).to('cpu')
        batch["labels"] = labels.to('cpu')
        return batch

data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [29]:
wer_metric = load_metric("wer")
cer_metric = load_metric("cer", revision="master")

  wer_metric = load_metric("wer")


In [30]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
    pred_str = [pred_str[i] for i in range(len(pred_str)) if len(label_str[i]) > 0]
    label_str = [label_str[i] for i in range(len(label_str)) if len(label_str[i]) > 0]
    
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    
    return {"wer": wer, 'cer': cer}

In [31]:
model = Wav2Vec2ForCTC.from_pretrained(
    "bond005/wav2vec2-large-ru-golos-with-lm", 
    ctc_loss_reduction="mean", 
    pad_token_id=processor.tokenizer.pad_token_id,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
)

In [32]:
model.freeze_feature_extractor()



In [33]:
device = torch.device("cpu")
model = model.cpu()

In [34]:
training_args = TrainingArguments(
  output_dir='./wav2vec2-large-ru-golos-with-lm-dialect-full',
  per_device_train_batch_size=8,
  learning_rate=1e-4,
  evaluation_strategy="epoch",
  num_train_epochs=15,
  report_to="none",
  save_strategy="epoch"
)


In [35]:
trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    tokenizer=processor.feature_extractor,
)

In [37]:
trainer.train()



Epoch,Training Loss,Validation Loss,Wer,Cer
1,1.2969,0.87099,0.485913,0.210442
2,1.0887,0.766962,0.459668,0.191516
3,0.8881,0.755385,0.433049,0.182242
4,0.8093,0.749115,0.421917,0.178457
5,0.7551,0.785105,0.41918,0.178188
6,0.6667,0.801216,0.417501,0.178261
7,0.6371,0.79538,0.407861,0.171414
8,0.5914,0.795122,0.396045,0.168314
9,0.5382,0.842741,0.394676,0.167616
10,0.5141,0.796864,0.389514,0.166122




TrainOutput(global_step=10410, training_loss=0.6562900070498244, metrics={'train_runtime': 374225.8968, 'train_samples_per_second': 0.222, 'train_steps_per_second': 0.028, 'total_flos': 1.963448772786867e+19, 'train_loss': 0.6562900070498244, 'epoch': 15.0})

In [None]:
model.save_pretrained("/content/wav2vec2-large-ru-golos-with-lm-dialect-lnt-mga-gip-v2")

# Testing

In [38]:
MODEL_ID = "bond005/wav2vec2-large-ru-golos-with-lm"
model = Wav2Vec2ForCTC.from_pretrained('/content/wav2vec2-large-ru-golos-with-lm-dialect-full/checkpoint-10410/', local_files_only=True)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID, padding=True)

In [39]:
def map_to_result(batch):
    with torch.no_grad():
        input_values = torch.tensor(batch["input_values"]).unsqueeze(0)
        logits = model(input_values).logits

    pred_ids = torch.argmax(logits, dim=-1)
    batch["pred_str"] = processor.batch_decode(pred_ids)[0]
    batch["text"] = processor.decode(batch["labels"], group_tokens=False)

    return batch

results = ds["test"].map(map_to_result, remove_columns=ds["test"].column_names)



Map:   0%|          | 0/2377 [00:00<?, ? examples/s]

In [40]:
pred_str = [results["pred_str"][i] for i in range(len(results["pred_str"])) if len(results["text"][i]) > 0]
label_str = [results["text"][i] for i in range(len(results["text"])) if len(results["text"][i]) > 0]

In [41]:
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=pred_str, references=label_str)))

Test WER: 0.374


In [42]:
cer_metric = load_metric("cer", revision="master")
print("Test CER: {:.3f}".format(cer_metric.compute(predictions=pred_str, references=label_str)))

Test CER: 0.160


In [47]:
test_results = results.to_pandas()

In [48]:
path = "/content/finetune_15epoch_full_test.xlsx"
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')

test_results.to_excel(writer) 

writer.save()
writer.close()

  warn("Calling close() on already closed file.")
