In [60]:
import os
from pathlib import Path
from argparse import Namespace

import allosaurus
from allosaurus.app import read_recognizer
from allosaurus.audio import read_audio
from allosaurus.pm.factory import read_pm
from allosaurus.lm.inventory import Inventory
from allosaurus.model import resolve_model_name, get_all_models, get_model_path

In [7]:
# Smoke test: load the 'latest' pretrained recognizer and transcribe the sample clip.
# `audio_file` and `model` are reused by later cells.
audio_file =  './sample.wav'
model = read_recognizer('latest')
results = model.recognize(audio_file)
# Last expression: display the recognized phone string.
results

'æ l u s ɔ ɹ s'

In [62]:
# The pretrained models live in the `pretrained` folder of the installed allosaurus package,
# e.g. <site-packages>/allosaurus/pretrained/uni2005 (os.listdir on that folder lists the model files).



# Feature Extraction

In [61]:
# Resolve the 'latest' alias to a model directory and a concrete model name.
model_path = get_model_path("latest")
model_name = resolve_model_name("latest", None)

# Options passed to the allosaurus loaders in the cells below.
# NOTE(review): device_id=-1 presumably selects the CPU — confirm against allosaurus.
inference_config = Namespace(
    model=model_name,
    device_id=-1,
    lang='ipa',
    approximate=False,
    prior=None,
)

# Alternative, presumably equivalent, manual construction of the model path:
# model_path = Path(allosaurus.__file__).parent / 'pretrained' / inference_config.model

In [57]:
# Display the resolved model name and the inference configuration for inspection.
model_name, inference_config

('uni2005',
 Namespace(model='uni2005', device_id=-1, lang='ipa', approximate=False, prior=None))

In [58]:
# Create the object whose compute() is used in the next cell to turn audio into features.
pm = read_pm(model_path, inference_config)

In [59]:
# load wav audio
audio_file = "./sample.wav"
audio = read_audio(audio_file)

# extract feature
feat = pm.compute(audio)

# Run the recognizer once and reuse the transcript for both the text and its
# length, so inference is not repeated within a single expression.
transcript = model.recognize(audio_file)

# NOTE: len(transcript) counts characters of the string (spaces included),
# not the number of phones.
transcript, len(transcript), feat.shape, feat

('æ l u s ɔ ɹ s',
 13,
 (35, 120),
 array([[-1.3275121 , -0.2194957 ,  0.6242982 , ..., -0.23495641,
          0.05200976,  0.85709184],
        [-1.5864275 ,  0.1535516 ,  0.9751952 , ...,  1.5554214 ,
         -2.0703316 ,  0.48564267],
        [-1.6092446 , -0.10560995,  0.7027959 , ...,  1.3302747 ,
          0.13863873, -0.23109473],
        ...,
        [-0.63638103, -0.8867026 ,  1.0978942 , ...,  2.0134444 ,
          0.36944944, -0.5184862 ],
        [-1.4749463 ,  0.06873361,  0.9781027 , ..., -0.52928275,
          0.56808424, -1.5015886 ],
        [-1.432489  , -0.36272946,  0.43866175, ...,  0.23993936,
         -0.42920986,  0.46625167]], dtype=float32))

# List Phones

In [63]:
# Build the Inventory object from the model directory; it is queried for a language mask below.
inventory = Inventory(model_path)

In [65]:
# Language id and approximation flag used to select the phone mask.
lang, approximate = "ipa", False

# Ask the inventory for the mask of that language and keep its target unit.
# NOTE(review): `unit` presumably exposes the id -> phone table (id_to_unit) — confirm.
mask = inventory.get_mask(lang.lower(), approximation=approximate)
unit = mask.target_unit

In [75]:
# Phone inventory as a plain list; it contains no '<blk>' entry.
# NOTE(review): presumably copied from unit.id_to_unit.values() with '<blk>' removed — confirm against the model.
phones = ['I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞', 'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z', 'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː', 'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥', 'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː', 'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ', 'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p', 'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː', 's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ', 'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚', 't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y', 'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ']

In [77]:
# phones

# Utils

In [87]:
import numpy as np

# Recognizer used by recognize() below; loads the 'latest' pretrained model.
# NOTE(review): an earlier cell already binds `model` the same way, so this reloads the recognizer.
model = read_recognizer('latest')
# Phone inventory; the position of a phone in this list is its one-hot column index in encode().
# NOTE(review): this literal duplicates the `phones` list defined in the "List Phones" section — keep the two in sync.
phones = ['I', 'a', 'aː', 'ã', 'ă', 'b', 'bʲ', 'bʲj', 'bʷ', 'bʼ', 'bː', 'b̞', 'b̤', 'b̥', 'c', 'd', 'dʒ', 'dʲ', 'dː', 'd̚', 'd̥', 'd̪', 'd̯', 'd͡z', 'd͡ʑ', 'd͡ʒ', 'd͡ʒː', 'd͡ʒ̤', 'e', 'eː', 'e̞', 'f', 'fʲ', 'fʷ', 'fː', 'g', 'gʲ', 'gʲj', 'gʷ', 'gː', 'h', 'hʷ', 'i', 'ij', 'iː', 'i̞', 'i̥', 'i̯', 'j', 'k', 'kx', 'kʰ', 'kʲ', 'kʲj', 'kʷ', 'kʷʼ', 'kʼ', 'kː', 'k̟ʲ', 'k̟̚', 'k͡p̚', 'l', 'lʲ', 'lː', 'l̪', 'm', 'mʲ', 'mʲj', 'mʷ', 'mː', 'n', 'nj', 'nʲ', 'nː', 'n̪', 'n̺', 'o', 'oː', 'o̞', 'o̥', 'p', 'pf', 'pʰ', 'pʲ', 'pʲj', 'pʷ', 'pʷʼ', 'pʼ', 'pː', 'p̚', 'q', 'r', 'rː', 's', 'sʲ', 'sʼ', 'sː', 's̪', 't', 'ts', 'tsʰ', 'tɕ', 'tɕʰ', 'tʂ', 'tʂʰ', 'tʃ', 'tʰ', 'tʲ', 'tʷʼ', 'tʼ', 'tː', 't̚', 't̪', 't̪ʰ', 't̪̚', 't͡s', 't͡sʼ', 't͡ɕ', 't͡ɬ', 't͡ʃ', 't͡ʃʲ', 't͡ʃʼ', 't͡ʃː', 'u', 'uə', 'uː', 'u͡w', 'v', 'vʲ', 'vʷ', 'vː', 'v̞', 'v̞ʲ', 'w', 'x', 'x̟ʲ', 'y', 'z', 'zj', 'zʲ', 'z̪', 'ä', 'æ', 'ç', 'çj', 'ð', 'ø', 'ŋ', 'ŋ̟', 'ŋ͡m', 'œ', 'œ̃', 'ɐ', 'ɐ̞', 'ɑ', 'ɑ̱', 'ɒ', 'ɓ', 'ɔ', 'ɔ̃', 'ɕ', 'ɕː', 'ɖ̤', 'ɗ', 'ə', 'ɛ', 'ɛ̃', 'ɟ', 'ɡ', 'ɡʲ', 'ɡ̤', 'ɡ̥', 'ɣ', 'ɣj', 'ɤ', 'ɤɐ̞', 'ɤ̆', 'ɥ', 'ɦ', 'ɨ', 'ɪ', 'ɫ', 'ɯ', 'ɯ̟', 'ɯ̥', 'ɰ', 'ɱ', 'ɲ', 'ɳ', 'ɴ', 'ɵ', 'ɸ', 'ɹ', 'ɹ̩', 'ɻ', 'ɻ̩', 'ɽ', 'ɾ', 'ɾj', 'ɾʲ', 'ɾ̠', 'ʀ', 'ʁ', 'ʁ̝', 'ʂ', 'ʃ', 'ʃʲː', 'ʃ͡ɣ', 'ʈ', 'ʉ̞', 'ʊ', 'ʋ', 'ʋʲ', 'ʌ', 'ʎ', 'ʏ', 'ʐ', 'ʑ', 'ʒ', 'ʒ͡ɣ', 'ʔ', 'ʝ', 'ː', 'β', 'β̞', 'θ', 'χ', 'ә', 'ḁ']

def encode(transcript, inventory=None):
    '''
    One-hot encode a whitespace-separated phone transcript.

    Parameters
    ----------
    transcript : str
        Phones separated by whitespace, e.g. 'æ l u s ɔ ɹ s'.
    inventory : sequence of str, optional
        Phone inventory that fixes the column order. Defaults to the
        module-level ``phones`` list.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number of phones in transcript, len(inventory)).
        Row i holds a single 1 in the column of the i-th phone. An empty or
        whitespace-only transcript yields an array with zero rows.

    Raises
    ------
    ValueError
        If the transcript contains a phone that is not in the inventory.
    '''
    if inventory is None:
        inventory = phones

    # split() without a separator collapses runs of whitespace and ignores
    # leading/trailing blanks, so '' or 'a  b' never produce empty tokens.
    phone_list = transcript.split()

    # Build the phone -> column lookup once instead of scanning the list for
    # every phone. setdefault keeps the first position of a repeated entry,
    # which is what list.index() would return.
    phone_to_col = {}
    for col, phone in enumerate(inventory):
        phone_to_col.setdefault(phone, col)

    one_hot = np.zeros((len(phone_list), len(inventory)))

    for idx, phone in enumerate(phone_list):
        phone_loc = phone_to_col.get(phone)
        if phone_loc is None:
            raise ValueError(
                "unknown phone {!r} at position {} of transcript {!r}".format(
                    phone, idx, transcript
                )
            )
        one_hot[idx, phone_loc] = 1

    return one_hot

def recognize(audio_path):
    '''
    thin wrapper around the notebook-level recognizer: forwards the audio
    file path to model.recognize and returns its result (the transcript)
    '''
    transcript = model.recognize(audio_path)
    return transcript

def list_phones():
    '''
    returns the module-level phone inventory list (the same list object,
    not a copy)
    '''
    return phones

In [97]:
# Recognize ./Voice.wav, one-hot encode the transcript and show the matrix shape
# (rows = phones in the transcript, columns = size of the phone inventory).
encode(recognize("./Voice.wav")).shape

(21, 229)