In [None]:
import speechbrain as sb
import torch
import sys
import os

import pandas as pd
import re

In [None]:
# Root directory under which all notebook data lives.
DATA_DIR = "data"

# Prepared dataset folder (later passed as --data_folder).
PREP_DATA_FOLDER = DATA_DIR + '/prep_data/'

# Result artefacts. RESULTS_FOLDER already ends with "/", so every path
# derived from it below contains a double slash ("data/results//...").
RESULTS_FOLDER = DATA_DIR + '/results/'
EXP_METADATA_FILE = RESULTS_FOLDER + '/exp_metadata.csv'
PREP_SCORING_RESULTS_FILE = RESULTS_FOLDER + '/results_scoring_prep.csv'
EPOCH_RESULTS_DIR = RESULTS_FOLDER + '/epoch_results'
PARAMS_DIR = RESULTS_FOLDER + '/params'


In [None]:
# Encoder family; selects the hparams sub-folder and the results sub-folder.
MODEL_TYPE = "w2v2"
# Optional suffix inserted after "so762" in file/folder names (empty = none).
SCORING_TYPE = ""

# Hyperparameter YAML for the scoring experiment. It is defined exactly once:
# the original cell assigned the same value twice, which invited the two
# copies drifting apart on a later edit.
SCORING_HPARAM_FILE = f"hparams/scoring/{MODEL_TYPE}/train_{MODEL_TYPE}_so762{SCORING_TYPE}_scoring.yaml"
# Experiment folder (later passed as --exp_folder).
SCORING_MODEL_DIR = f"results/scoring/{MODEL_TYPE}/crdnn_{MODEL_TYPE}_so762{SCORING_TYPE}_scoring_aug_no_round_no_pre_train"
# Folder of the pretrained model (later passed as --pretrained_model_folder).
PRETRAINED_MODEL_DIR = f"results/apr/{MODEL_TYPE}/crdnn_{MODEL_TYPE}_timit_apr/1234"

In [None]:
# Command-line style overrides, keyed by flag name (without the leading "--").
# Dict insertion order is preserved, so the resulting argv matches a
# hand-written list with the flags in this order.
cli_overrides = {
    "data_folder": PREP_DATA_FOLDER,
    "batch_size": "4",
    "pretrained_model_folder": PRETRAINED_MODEL_DIR,
    "use_augmentation": "True",
    "exp_folder": SCORING_MODEL_DIR,
    "exp_metadata_file": EXP_METADATA_FILE,
    "results_file": PREP_SCORING_RESULTS_FILE,
    "epoch_results_dir": EPOCH_RESULTS_DIR,
    "params_dir": PARAMS_DIR,
}

# First positional argument is the hparams file, followed by "--flag value" pairs.
argv = [SCORING_HPARAM_FILE]
for flag_name, flag_value in cli_overrides.items():
    argv.extend(("--" + flag_name, flag_value))

hparams_file, run_opts, overrides = sb.parse_arguments(argv)

In [None]:
from hyperpyyaml import load_hyperpyyaml

# Parse the hyperparameter YAML, applying the overrides produced by
# sb.parse_arguments.
with open(hparams_file) as hparams_stream:
    hparams = load_hyperpyyaml(hparams_stream, overrides)

In [None]:
from models.brain import get_brain_class

# Pick the Brain class for this configuration, then instantiate it with the
# modules and checkpointer declared in the hyperparameters.
brain_class = get_brain_class(hparams)
brain_init_kwargs = {
    "modules": hparams["modules"],
    "hparams": hparams,
    "run_opts": run_opts,
    "checkpointer": hparams["checkpointer"],
}
brain = brain_class(**brain_init_kwargs)


In [None]:
# Checkpoint directory of the trained scoring model.
# NOTE(review): hard-coded to one specific run/timestamp — update when the
# experiment folder or checkpoint changes.
ckpt_path = "results/scoring/w2v2/crdnn_w2v2_so762_scoring_aug_no_round_no_pre_train/1234/save-1/CKPT+2024-01-30+08-10-42+00"

wav2vec2_ckpt_path = f'{ckpt_path}/wav2vec2.ckpt'
model_ckpt_path = f'{ckpt_path}/model.ckpt'
model_scorer_ckpt_path = f'{ckpt_path}/model_scorer.ckpt'

# Deserialize onto the CPU: without map_location, torch.load restores each
# tensor to the device it was saved from, which fails when that device is not
# present and otherwise keeps a second copy of the weights on the GPU.
# load_state_dict copies the values into the modules' own parameters,
# whatever device those live on.
wav2vec2_state_dict = torch.load(wav2vec2_ckpt_path, map_location="cpu")
model_state_dict = torch.load(model_ckpt_path, map_location="cpu")
model_scorer_state_dict = torch.load(model_scorer_ckpt_path, map_location="cpu")

hparams["wav2vec2"].load_state_dict(wav2vec2_state_dict)
hparams["model"].load_state_dict(model_state_dict)
hparams["model_scorer"].load_state_dict(model_scorer_state_dict)

In [None]:
# Override the label-encoder location stored in the hyperparameters.
# (Adjacent string literals are joined into a single path.)
hparams["label_encoder_path"] = (
    "results/scoring/w2v2/crdnn_w2v2_so762_scoring_aug_no_round_no_pre_train"
    "/1234/save/label_encoder.txt"
)

In [None]:
# Restore the CTC text encoder from the file named in the hyperparameters.
encoder_cls = sb.dataio.encoder.CTCTextEncoder
label_encoder_path = hparams["label_encoder_path"]
label_encoder = encoder_cls.from_saved(label_encoder_path)

In [None]:
from data_prep_utils.dataset_preparation.dataio_prep import dataio_prep

# Build the dataset splits. Note that this call also rebinds `label_encoder`
# to the encoder returned by dataio_prep.
prepared = dataio_prep(hparams)
train_data, valid_data, test_data, label_encoder = prepared

In [None]:
def normalize(text):
    """Return a lower-cased, punctuation-free, single-spaced copy of ``text``.

    Each of the characters ``! @ # $ % ^ & * ( ) \\ . " , ? ; : + - _ / | ~ ` ``
    is replaced by a space, runs of whitespace are collapsed to one space, and
    the result is lower-cased and stripped. Apostrophes are kept, so
    contractions such as "don't" survive.

    Args:
        text: The input string.

    Returns:
        The normalized string.
    """
    # Raw strings throughout: the original '\s+' was a regular string literal,
    # where "\s" is an invalid escape sequence (a SyntaxWarning on recent
    # Python versions).
    text = re.sub(r'[!@#$%^&*()\\.",?;:+\-_/|~`]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.lower().strip()


In [None]:
# Pronunciation lexicon: tab-separated file, one "word<TAB>arpabet phones" per line.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable project root.
path = "/home/tuyendv/E2E-R/resources/lexicon"

# Build a {word: "phone phone ..."} mapping:
#   * rows with a missing word or pronunciation are dropped;
#   * words and phones are lower-cased, and digits are stripped from the phones;
#   * duplicate words are removed on the *frame*, keeping the first listed
#     pronunciation. The original called drop_duplicates(inplace=True) on the
#     `word` column only, which leaves the DataFrame untouched, so duplicates
#     survived and the last entry silently won in to_dict().
lexicon = (
    pd.read_csv(path, names=["word", "arpa"], sep="\t")
    .dropna()
    .assign(
        word=lambda frame: frame["word"].str.lower(),
        arpa=lambda frame: frame["arpa"].str.replace(r"\d", "", regex=True).str.lower(),
    )
    .drop_duplicates(subset="word", keep="first")
    .set_index("word")["arpa"]
    .to_dict()
)

In [None]:
# Utterance to score and its reference transcript.
# NOTE(review): absolute, machine-specific path.
audio_path = (
    "/home/tuyendv/E2E-R/wav/"
    "I assured that the sessions all my team worth it. "
    "Two weeks to start right as a language..wav"
)
transcript = (
    "i assured that the sessions all my team worth it "
    "two weeks to start right as a language"
)

In [None]:
# Clean the transcript with normalize() and display the result as the cell output.
transcript = normalize(transcript)
transcript

In [None]:
def convert_word_to_arpa(word):
    """Look up ``word`` in the global ``lexicon`` and return its phones as a list.

    The pronunciation string is lower-cased, every "ax" substring is rewritten
    to "ah", and the result is split on whitespace.

    Raises:
        KeyError: if ``word`` is not present in ``lexicon``.
    """
    pronunciation = lexicon[word].lower()
    return pronunciation.replace("ax", "ah").split()

In [None]:
# One row per word, then one row per phone: the per-word phone lists are
# exploded so each phone gets its own row while keeping its word and word-id.
words = transcript.split()
df = (
    pd.DataFrame({"word": words, "word-id": range(len(words))})
    .assign(phone=lambda frame: frame["word"].apply(convert_word_to_arpa))
    .explode(column="phone")
)
df.head(2)

In [None]:
# Flat list of canonical phones, in the row order of df.
phn_canonical_list = df["phone"].to_list()

In [None]:

# Encode the canonical phones to label indices, plus EOS-/BOS-augmented variants.
phn_encoded_list = label_encoder.encode_sequence(phn_canonical_list)
phn_canonical_encoded = torch.LongTensor(phn_encoded_list)
eos_indices = label_encoder.append_eos_index(phn_encoded_list)
phn_canonical_encoded_eos = torch.LongTensor(eos_indices)
bos_indices = label_encoder.prepend_bos_index(phn_encoded_list)
phn_canonical_encoded_bos = torch.LongTensor(bos_indices)

# Read the audio and add a leading batch dimension of size 1; everything
# handed to the model is moved to the GPU.
wavs = sb.dataio.dataio.read_audio(audio_path).unsqueeze(0).cuda()
# NOTE(review): this is the size of dim 1 of `wavs` (an absolute count).
# SpeechBrain modules conventionally take relative lengths in [0, 1] —
# confirm which one brain.infer expects.
wav_lens = torch.tensor([wavs.shape[1]]).cuda()
phns_canonical_bos = phn_canonical_encoded_bos.unsqueeze(0).cuda()
phns_canonical_eos = phn_canonical_encoded_eos.unsqueeze(0).cuda()

In [None]:
# Run the scorer on the single-utterance batch.
scores_pred, wav_lens = brain.infer(
    wavs,
    wav_lens,
    phns_canonical_bos,
    phns_canonical_eos,
)
# Scale by 100, move to the CPU and round to whole numbers.
scores_pred = scores_pred.mul(100).cpu().round()
print(scores_pred)

In [None]:
# Per-phone scores for the first (only) batch item; the last entry is dropped.
# presumably that entry belongs to the appended EOS token — TODO confirm.
df["phone-score"] = scores_pred[0][:-1].tolist()

# Timing/index fields are filled with placeholder zeros.
for placeholder_column in ("start-time", "end-time", "start-index", "end-index"):
    df[placeholder_column] = 0

df["ipa"] = ""
# The "sound most like" phone is set to the canonical phone itself.
df["sound_most_like"] = df["phone"]

In [None]:
# Sentence-level result record. Durations, scores and IPA fields are
# placeholders; only the text fields and the per-phone scores carry data.
sentence = {
    "utterance": transcript,
    "duration": 0,
    "text": transcript,
    "score": 0,
    "ipa": "",
    # The original was missing the comma after "v1.0", which made the whole
    # dict literal (and therefore the cell) a SyntaxError.
    "version": "v1.0",
    "words": [],
}

# Pre-size the word list so each word can be stored at its own position:
# groupby iterates in sorted key order, not in utterance order.
sentence["words"] = [None] * (df["word-id"].max() + 1)
for (word_text, word_id), group in df.groupby(["word", "word-id"]):
    # Positional 0..n-1 index so the phones of this word can be addressed by row.
    group = group.reset_index(drop=True)

    # A separate name for the record avoids rebinding the loop variable
    # (the original reused `word` for both the key and the dict).
    word_entry = {
        "start_time": 0,
        "end_time": 0,
        "start_index": 0,
        "end_index": 0,
        "text": word_text,
        "arpabet": "",
        "ipa": "",
        "score": 0,
        "phonemes": []
    }
    for phone_index in group.index:
        phone_entry = {
            "start_time": group.at[phone_index, "start-time"],
            "end_time": group.at[phone_index, "end-time"],
            "start_index": group.at[phone_index, "start-index"],
            "end_index": group.at[phone_index, "end-index"],
            "arpabet": group.at[phone_index, "phone"],
            "ipa": group.at[phone_index, "ipa"],
            "sound_most_like": group.at[phone_index, "sound_most_like"],
            "score": group.at[phone_index, "phone-score"]
        }

        word_entry["phonemes"].append(phone_entry)

    sentence["words"][word_id] = word_entry