In [1]:
# Single-file inference with a fine-tuned wav2vec 2.0 CTC model (greedy and KenLM decoding)

In [2]:
import math
import torch
import pydub
from omegaconf import OmegaConf
import torchaudio
from IPython.display import Audio, display

In [3]:
from inference.support import load_model, load_data
from inference.support import W2lKenLMDecoder,W2lViterbiDecoder

In [4]:
# Audio clip to transcribe and the sample rate it is resampled to below.
SAMPLE_AUDIO_PATH = "220718-132354_lin_9f7_elicit_2.wav"
TARGET_SAMPLE_RATE = 16000

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE_ID = "cuda"
else:
    DEVICE_ID = "cpu"

# Checkpoint handed to load_model(); the commented line is an alternative checkpoint.
MODEL_PATH = "model_output/monolingual_finetuned_on_lingala/checkpoint_best.pt"
# MODEL_PATH = 'workshop-2022/models/hi_mucs_dc.pt'

In [5]:
# Load the acoustic model and the dictionary that is later handed to the
# decoders and used to turn predicted token ids back into text.
model, char_dict = load_model(MODEL_PATH)
# Move the model to the selected device and put it in evaluation mode so that
# dropout layers are inactive during inference. eval() is idempotent, so this
# is harmless if load_model already did it (not visible from this notebook).
# Both .to() and .eval() return the module, so the cell still displays its repr.
model.to(DEVICE_ID).eval()

Loading model..
Successfully loaded model model_output/monolingual_finetuned_on_lingala/checkpoint_best.pt


Wav2VecCtc(
  (w2v_encoder): Wav2VecEncoder(
    (w2v_model): Wav2Vec2Model(
      (feature_extractor): ConvFeatureExtractionModel(
        (conv_layers): ModuleList(
          (0): Sequential(
            (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
            (1): Dropout(p=0.0, inplace=False)
            (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
            (3): GELU(approximate='none')
          )
          (1-4): 4 x Sequential(
            (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
            (1): Dropout(p=0.0, inplace=False)
            (2): GELU(approximate='none')
          )
          (5-6): 2 x Sequential(
            (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
            (1): Dropout(p=0.0, inplace=False)
            (2): GELU(approximate='none')
          )
        )
      )
      (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
      (dropout_input): Dropout(p=0.0, inplac

In [6]:
ls

[0m[00;36m220718-132354_lin_9f7_elicit_2.wav[0m         [01;34mlm_data[0m/
[01;34mGamayun[0m/                                   [34;42mlm_data_old[0m/
[01;32mLICENSE[0m*                                   [34;42mlm_training[0m/
[01;32mREADME2.md[0m*                                [01;34mmanifest[0m/
[34;42mconfigs[0m/                                   [01;32mmanifest_creation.py[0m*
[34;42mcustom_task[0m/                               [34;42mmedia[0m/
[01;34mdata_prep_scripts[0m/                         [01;34mmodel_output[0m/
[01;34mdataset[0m/                                   [01;34moutputs[0m/
[34;42mfinetune_configs[0m/                          [01;34mpretrained_model[0m/
infer_single_file_on_lingala_models.ipynb  [01;32mreadme.md[0m*
[01;32minfer_single_file_on_swc_models.ipynb[0m*     [00;36msession_86_4085_swc.wav[0m
[34;42minference[0m/                                 [34;42mw2v_inference[0m/
[01;32mlang_wise_manifest_

In [7]:
def load_audio_from_file(file_path):
    """Load a single-channel audio file with torchaudio.

    Args:
        file_path: Path passed straight to ``torchaudio.load``.

    Returns:
        A ``(samples, sample_rate)`` tuple, where ``samples`` is the first
        (and only) channel of the loaded waveform.

    Raises:
        ValueError: If the loaded waveform does not have exactly one channel.
    """
    audio, rate = torchaudio.load(file_path)
    # The first dimension of the loaded tensor is treated as the channel count.
    num_channels, _ = audio.shape
    if num_channels != 1:
        raise ValueError("Waveform with more than 1 channels are not supported.")
    # Drop the channel dimension so callers get a flat sample vector.
    return audio[0], rate


In [8]:
# Read the mono sample clip together with its native sample rate.
waveform, sample_rate = load_audio_from_file(SAMPLE_AUDIO_PATH)
# Convert the samples from the file's rate to TARGET_SAMPLE_RATE.
resampled_audio = torchaudio.functional.resample(
    waveform,
    orig_freq=sample_rate,
    new_freq=TARGET_SAMPLE_RATE,
)
# Render an inline audio player for the resampled clip.
display(Audio(data=resampled_audio.numpy(), rate=TARGET_SAMPLE_RATE))

In [9]:
# Inference with greedy (Viterbi) decoding

In [10]:
# Decoder configuration: request only the single best hypothesis.
viterbi_arg = OmegaConf.create(dict(nbest=1))
# Greedy decoder built on the model's character dictionary.
viterbi_generator = W2lViterbiDecoder(viterbi_arg, char_dict)

In [11]:
def infer(feature, generator, DEVICE):
    """Transcribe one audio tensor with the notebook-level ``model``.

    Relies on the globals ``model`` and ``char_dict`` defined in earlier cells.

    Args:
        feature: Audio sample tensor; a leading batch dimension is added here.
            NOTE(review): assumed to be a 1-D waveform -- confirm with callers.
        generator: Decoder exposing ``generate(models, sample, prefix_tokens=...)``.
        DEVICE: Device string. Inputs are moved to it only when it is not
            ``'cpu'`` and CUDA is available; otherwise they stay where they are.

    Returns:
        The best hypothesis as a string: spaces between tokens removed,
        ``'|'`` turned into spaces, and surrounding whitespace stripped.
    """
    on_gpu = DEVICE != 'cpu' and torch.cuda.is_available()
    if on_gpu:
        feature = feature.to(DEVICE)

    # Add the batch dimension and build an all-False padding mask of shape (1, T).
    source = feature.unsqueeze(0)
    padding_mask = torch.zeros(1, source.size(1), dtype=torch.bool)
    if on_gpu:
        padding_mask = padding_mask.to(DEVICE)
    sample = {"net_input": {"source": source, "padding_mask": padding_mask}}

    with torch.no_grad():
        hypotheses = generator.generate([model], sample, prefix_tokens=None)

    # First hypothesis of the first (only) batch item.
    best_tokens = hypotheses[0][0]["tokens"].int().cpu()
    pieces = char_dict.string(best_tokens)
    return pieces.replace(' ', '').replace('|', ' ').strip()

In [12]:
# Decode the audio that was resampled to TARGET_SAMPLE_RATE rather than the raw
# file samples; otherwise the resampling step above only affects the audio
# player and the model sees the file's native rate whenever the two differ.
output_str = infer(resampled_audio, viterbi_generator, DEVICE_ID)
print(f"Greedy Output: {output_str}")

Greedy Output: liboso motuka ekoma miso ya jeannine ezotala ndenge tata oyɔ akiti na motuka


In [13]:
# Inference with KenLM language-model decoding

In [16]:
# Settings for the lexicon-based KenLM decoder.
# NOTE(review): the two paths are absolute and machine-specific; they must
# exist on the host running this notebook.
lm_details = dict(
    lexicon="/home/ubuntu/lingWav2Vec/lm_data/lingala/lexicon.lst",
    kenlm_model="/home/ubuntu/lingWav2Vec/lm_data/lingala/lm.binary",
    beam=64,
    beam_size_token=1000,
    beam_threshold=250,
    lm_weight=0.5,
    word_score=1.0,
    sil_weight=0.0,
    # presumably -inf rules out unknown words entirely -- confirm against the decoder
    unk_weight=-math.inf,
)
lmarg = OmegaConf.create(lm_details)
# Language-model decoder built on the same character dictionary as the greedy one.
kenlm_generator = W2lKenLMDecoder(lmarg, char_dict)

In [17]:
# Decode the audio that was resampled to TARGET_SAMPLE_RATE rather than the raw
# file samples, so the model input rate does not depend on the source file.
output_str = infer(resampled_audio, kenlm_generator, DEVICE_ID)
print(f"LM Output: {output_str}")

LM Output: liboso motuka ekoma miso ya jeannine ezala ndenge tata oyɔ akiti na motuka
