# Import all required packages

In [None]:
!pip install librosa
!pip install pandas
!pip install transformers
!pip install jiwer
!pip install scikit-learn
!pip install torch

In [1]:
import os
import re
from tqdm import tqdm
import librosa
import pandas as pd
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from jiwer import wer, cer
from sklearn.model_selection import train_test_split
import torch

# Define the model that was trained on dialect data

In [3]:
MODEL_ID = "bond005/wav2vec2-large-ru-golos-with-lm"
model = Wav2Vec2ForCTC.from_pretrained('/content/wav2vec2-large-ru-golos-with-lm-dialect-full/checkpoint-10410/', local_files_only=True)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID, padding=True)

# Processing source files

Get all files, define new sort function to sort as \[1, 2, 3 ... 100\], because built-in function sorts strings as \[1, 100, 101 ...\].

In [4]:
def atoi(text):
    return int(text) if text.isdigit() else text


def natural_keys(text):
    return [atoi(c) for c in re.split(r'(\d+)', text)]

In [5]:
def prepare_files(directory, file_with_text, inf):
    with open(file_with_text, encoding='utf-16') as f:
        text = f.readlines()
    files = os.listdir(directory)
    files_full = []
    for filename in files:
        if '.DS_Store' not in filename:
            f = os.path.join(directory, filename)
            files_full.append(f)
    files_full.sort(key=natural_keys)
    j = 0
    dict_for_inf = []
    for filename in tqdm(files_full):
        if not '=' in text[j] and not 'нрзб' in text[j] and not '[' in text[j] and not '<' in text[j]:
            x = text[j].replace('\n', '').lower()
            x = x.replace('.', ' ')
            x = x.replace(',', ' ')
            x = x.replace(':', ' ')
            x = x.replace('?', ' ')
            x = x.replace('!', ' ')
            x = x.replace('–', ' ')
            x = x.replace('-', ' ')
            x = x.replace('ё', 'е')
            x = re.sub('(\s){2,}', ' ', x)
            x = re.sub('\(.*\)', '', x)
            x = x.rstrip()
            dict_for_inf.append({'respondent':inf, 'path': filename, 'sentence': x})
        j += 1  
    return dict_for_inf

In [6]:
enm = prepare_files('/content/input_opochka/new_mono_enm20180618', 
                    '/content/input_opochka/20180618_enm1930_1to487.txt', 'ENM1930')
ive = prepare_files('/content/input_opochka/new_mono_ive20190702', 
                    '/content/input_opochka/20190702_ive1949_1to234.txt', 'IVE1949')
onv = prepare_files('/content/input_opochka/new_mono_onv20180622', 
                    '/content/input_opochka/20180622_onv1972_1to529.txt', 'ONV1972')
saf = prepare_files('/content/input_opochka/new_mono_saf20190701', 
                    '/content/input_opochka/20190701_saf1973_1to434.txt', 'IVE1949')
tai = prepare_files('/content/input_opochka/new_mono_tai20190706', 
                    '/content/input_opochka/20190706_tai1955_1to167.txt', 'TAI1955')
tve = prepare_files('/content/input_opochka/new_mono_tve20190702', 
                    '/content/input_opochka/20190702_tve1955_1to709.txt', 'TVE1955')
vav = prepare_files('/content/input_opochka/new_mono_vav20180619', 
                    '/content/input_opochka/20180619_vav1949_1to277.txt', 'VAV1949')

100%|████████████████████████████████████████████████████████████████████████████| 487/487 [00:00<00:00, 162293.50it/s]
100%|████████████████████████████████████████████████████████████████████████████| 234/234 [00:00<00:00, 234072.77it/s]
100%|████████████████████████████████████████████████████████████████████████████| 529/529 [00:00<00:00, 264582.26it/s]
100%|████████████████████████████████████████████████████████████████████████████| 434/434 [00:00<00:00, 216938.14it/s]
0it [00:00, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████| 709/709 [00:00<00:00, 236350.46it/s]
100%|████████████████████████████████████████████████████████████████████████████| 277/277 [00:00<00:00, 277020.08it/s]


In [8]:
all_data = enm + ive + onv + saf + tai + tve + vav
print(len(all_data))
train, test = train_test_split(all_data, test_size=0.3, random_state=22)

2256


# Read audio

In [10]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = librosa.load(batch["path"], sr=16000)
    batch["speech"] = speech_array
    batch["sentence"] = batch["sentence"]
    return batch

test_dataset = []

for l in tqdm(test):
    test_dataset.append(speech_file_to_array_fn(l))
data = [d['speech'] for d in test_dataset]

100%|████████████████████████████████████████████████████████████████████████████████| 677/677 [00:33<00:00, 19.93it/s]


# Testing the model that was pretrained on the data of the Zapadnodvinsk villages

In [11]:
j = 0
ready = []
for d in tqdm(data):
    inputs = processor(d, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentences = processor.batch_decode(predicted_ids)

    for i, predicted_sentence in enumerate(predicted_sentences):
        ready.append([test_dataset[j]["path"], test_dataset[j]["sentence"], predicted_sentence, 
                      test_dataset[j]["speech"], test_dataset[j]["respondent"]])
    j += 1

100%|████████████████████████████████████████████████████████████████████████████████| 677/677 [10:25<00:00,  1.08it/s]


In [14]:
wers = []
cers = []
total = []

for path, correct, transcription, speech, respondent in tqdm(ready):
    if correct != '' and correct != ' ':
        w = wer(correct, transcription)
        wers.append(w)
        c = cer(correct, transcription)
        cers.append(c)

print('Mean WER: ', sum(wers)/len(wers))
print('Mean CER: ', sum(cers)/len(cers))

100%|█████████████████████████████████████████████████████████████████████████████| 677/677 [00:00<00:00, 12086.77it/s]

Mean WER:  0.6153097363516223
Mean CER:  0.3702287137562663





In [17]:
test_results = pd.DataFrame(ready, columns=['path', 'correct', 'transcription', 'speech', 'respondent'])
path = "/content/opochka_wav2vec_test.xlsx"
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')

test_results.to_excel(writer) 

writer.save()
writer.close()

  warn("Calling close() on already closed file.")


# Testing the model that was not trained on dialect data

In [18]:
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
j = 0
ready = []
for d in tqdm(data):
    inputs = processor(d, sampling_rate=16_000, return_tensors="pt", padding=True)

    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentences = processor.batch_decode(predicted_ids)

    for i, predicted_sentence in enumerate(predicted_sentences):
        ready.append([test_dataset[j]["path"], test_dataset[j]["sentence"], predicted_sentence, 
                      test_dataset[j]["speech"], test_dataset[j]["respondent"]])
    j += 1

100%|████████████████████████████████████████████████████████████████████████████████| 677/677 [10:21<00:00,  1.09it/s]


In [19]:
wers = []
cers = []
total = []

for path, correct, transcription, speech, respondent in tqdm(ready):
    if correct != '' and correct != ' ':
        w = wer(correct, transcription)
        wers.append(w)
        c = cer(correct, transcription)
        cers.append(c)

print('Mean WER: ', sum(wers)/len(wers))
print('Mean CER: ', sum(cers)/len(cers))

100%|█████████████████████████████████████████████████████████████████████████████| 677/677 [00:00<00:00, 10530.44it/s]

Mean WER:  0.6594268174819736
Mean CER:  0.40216192678043755





In [20]:
test_results = pd.DataFrame(ready, columns=['path', 'correct', 'transcription', 'speech', 'respondent'])
path = "/content/opochka_wav2vec_baseline.xlsx"
writer = pd.ExcelWriter(path, engine = 'xlsxwriter')

test_results.to_excel(writer) 

writer.save()
writer.close()

  warn("Calling close() on already closed file.")
