In [29]:
import speechbrain as sb
import torch
import sys
import os

In [30]:
DATA_DIR = "data"  # Change this path to the path where you keep your data.

PREP_DATA_FOLDER = f'{DATA_DIR}/prep_data/'

RESULTS_FOLDER = f'{DATA_DIR}/results/'
EXP_METADATA_FILE = f'{RESULTS_FOLDER}/exp_metadata.csv'
PREP_SCORING_RESULTS_FILE = f'{RESULTS_FOLDER}/results_scoring_prep.csv'
EPOCH_RESULTS_DIR = f'{RESULTS_FOLDER}/epoch_results'
PARAMS_DIR= f'{RESULTS_FOLDER}/params'


In [31]:
MODEL_TYPE = "w2v2"
SCORING_TYPE=""

SCORING_HPARAM_FILE = f'hparams/scoring/{MODEL_TYPE}/train_{MODEL_TYPE}_so762{SCORING_TYPE}_scoring.yaml'
SCORING_MODEL_DIR = f"results/scoring/{MODEL_TYPE}/crdnn_{MODEL_TYPE}_so762{SCORING_TYPE}_scoring_aug_no_round_no_pre_train"
PRETRAINED_MODEL_DIR = f"results/apr/{MODEL_TYPE}/crdnn_{MODEL_TYPE}_timit_apr/1234"
SCORING_HPARAM_FILE = f"hparams/scoring/{MODEL_TYPE}/train_{MODEL_TYPE}_so762{SCORING_TYPE}_scoring.yaml"

In [32]:
argv = [
    SCORING_HPARAM_FILE,
    "--data_folder", PREP_DATA_FOLDER,
    "--batch_size", "4",
    "--pretrained_model_folder", PRETRAINED_MODEL_DIR,
    "--use_augmentation", "True",
    "--exp_folder", SCORING_MODEL_DIR,
    "--exp_metadata_file", EXP_METADATA_FILE,
    "--results_file", PREP_SCORING_RESULTS_FILE,
    "--epoch_results_dir", EPOCH_RESULTS_DIR,
    "--params_dir", PARAMS_DIR
    ]

hparams_file, run_opts, overrides = sb.parse_arguments(argv)

In [33]:
from hyperpyyaml import load_hyperpyyaml

with open(hparams_file) as fin:
    hparams = load_hyperpyyaml(fin, overrides)

In [34]:
from models.brain import get_brain_class

brain_class = get_brain_class(hparams)
brain = brain_class(
        modules=hparams["modules"],
        hparams=hparams,
        run_opts=run_opts,
        checkpointer=hparams["checkpointer"],
    )


In [35]:
ckpt_path = "results/scoring/w2v2/crdnn_w2v2_so762_scoring_aug_no_round_no_pre_train/1234/save-1/CKPT+2024-01-30+08-10-42+00"

wav2vec2_ckpt_path = f'{ckpt_path}/wav2vec2.ckpt'
model_ckpt_path = f'{ckpt_path}/model.ckpt'
model_scorer_ckpt_path = f'{ckpt_path}/model_scorer.ckpt'

wav2vec2_state_dict = torch.load(wav2vec2_ckpt_path)
model_state_dict = torch.load(model_ckpt_path)
model_scorer_state_dict = torch.load(model_scorer_ckpt_path)

hparams["wav2vec2"].load_state_dict(wav2vec2_state_dict)
hparams["model"].load_state_dict(model_state_dict)
hparams["model_scorer"].load_state_dict(model_scorer_state_dict)

<All keys matched successfully>

In [36]:
hparams["label_encoder_path"] = "results/scoring/w2v2/crdnn_w2v2_so762_scoring_aug_no_round_no_pre_train/1234/save/label_encoder.txt"

In [37]:
label_encoder_path = hparams["label_encoder_path"]
label_encoder = sb.dataio.encoder.CTCTextEncoder.from_saved(label_encoder_path)

In [38]:
from data_prep_utils.dataset_preparation.dataio_prep import dataio_prep

train_data, valid_data, test_data, label_encoder = dataio_prep(hparams)

In [39]:
import pandas as pd
import re

path = "/home/tuyendv/E2E-R/lexicon"
lexicon = pd.read_csv(path, names=["word", "arpa"], sep="\t")

lexicon.dropna(inplace=True)
lexicon["word"] = lexicon.word.apply(lambda x: x.lower())
lexicon["arpa"] = lexicon.arpa.apply(lambda x: re.sub("\d", "", x).lower())

lexicon.word.drop_duplicates(inplace=True)
lexicon.set_index("word", inplace=True)
lexicon = lexicon.to_dict()["arpa"]

In [40]:
audio_path = "/home/tuyendv/E2E-R/wav/mother.wav"
transcript = "mother"

transcript = [lexicon[word] for word in transcript.split()]
transcript = " ".join(transcript)
transcript = transcript.replace("ax", "ah")

phn_canonical_list = transcript.split()
wavs = sb.dataio.dataio.read_audio(audio_path)

phn_encoded_list = label_encoder.encode_sequence(phn_canonical_list)

phn_canonical_encoded = torch.LongTensor(phn_encoded_list)
phn_canonical_encoded_eos = torch.LongTensor(label_encoder.append_eos_index(phn_encoded_list))
phn_canonical_encoded_bos = torch.LongTensor(label_encoder.prepend_bos_index(phn_encoded_list))

wavs = wavs.unsqueeze(0).cuda()
wav_lens = torch.tensor([wavs.shape[1]]).cuda()
phns_canonical_bos = phn_canonical_encoded_bos.unsqueeze(0).cuda()
phns_canonical_eos = phn_canonical_encoded_eos.unsqueeze(0).cuda()

In [41]:
scores_pred, wav_lens = brain.infer(wavs, wav_lens, phns_canonical_bos, phns_canonical_eos)
print(scores_pred)
scores_pred = brain.rescale_scores(scores_pred)
print(scores_pred)

tensor([[0.9290, 0.8414, 0.8655, 0.7649, 0.5825]], device='cuda:0',
       grad_fn=<SumBackward1>)
tensor([[1.8581, 1.6829, 1.7310, 1.5297, 1.1649]], device='cuda:0',
       grad_fn=<MulBackward0>)
