In [None]:
# --- Model files --------------------------------------------------------------
# Change MODEL_DIR to load a different model; every path below is derived from
# it, so the directory only has to be edited in one place. Individual paths can
# still be overridden by assigning to them directly.
MODEL_DIR = "/home/daniel/models"
model_path = f"{MODEL_DIR}/output_graph.pb"
alphabet_path = f"{MODEL_DIR}/alphabet.txt"
lm_path = f"{MODEL_DIR}/lm.binary"
trie_path = f"{MODEL_DIR}/trie"

# --- Beam search decoder settings ---------------------------------------------
# Beam width used in the CTC decoder when building candidate transcriptions.
BEAM_WIDTH = 500
# The alpha hyperparameter of the CTC decoder: language model weight.
LM_WEIGHT = 1.75
# Valid word insertion weight. This is used to lessen the word insertion penalty
# when the inserted word is part of the vocabulary.
VALID_WORD_COUNT_WEIGHT = 1.00

# --- Graph geometry -----------------------------------------------------------
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so they must match the values that were used
# during training.
# Number of MFCC features to use.
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector.
N_CONTEXT = 9

from deepspeech.model import Model
import sys
from timeit import default_timer as timer

def load_model(model_path, alphabet_path, lm_path=None, trie_path=None):
    """Load a pre-trained DeepSpeech model, optionally with a language model.

    Progress and timing information is written to stderr.

    Args:
        model_path: path of the exported model graph file.
        alphabet_path: path of the alphabet file.
        lm_path: optional path of the language model file.
        trie_path: optional path of the trie file.

    Returns:
        The ``Model`` instance. The language-model decoder is enabled on it only
        when both ``lm_path`` and ``trie_path`` are given.
    """
    # The decoder is only enabled when BOTH files are supplied, so the log
    # message has to be derived from exactly the same condition.
    use_lm = bool(lm_path and trie_path)
    if (lm_path or trie_path) and not use_lm:
        # Only one of the two files was given: say so instead of silently
        # falling back to decoding without a language model.
        print('Both lm_path and trie_path are required to enable the language model; '
              'continuing without LM.', file=sys.stderr)

    print(f'Loading model with{"" if use_lm else "out"} LM from file {model_path}', file=sys.stderr)
    model_load_start = timer()
    ds = Model(model_path, N_FEATURES, N_CONTEXT, alphabet_path, BEAM_WIDTH)
    model_load_secs = timer() - model_load_start

    if use_lm:
        print(f'Loading language model from files {lm_path} {trie_path}', file=sys.stderr)
        lm_load_start = timer()
        # NOTE(review): VALID_WORD_COUNT_WEIGHT is passed for both of the last two
        # arguments. Presumably the first of them is meant to be a separate word
        # insertion weight -- confirm against the installed DeepSpeech version.
        ds.enableDecoderWithLM(alphabet_path, lm_path, trie_path, LM_WEIGHT, VALID_WORD_COUNT_WEIGHT, VALID_WORD_COUNT_WEIGHT)
        lm_load_secs = timer() - lm_load_start
        print(f'Loaded language model in {lm_load_secs:.3}s.', file=sys.stderr)
    # Reports the acoustic model load time only (measured before the LM step).
    print(f'Loaded model in {model_load_secs:.3}s.', file=sys.stderr)
    return ds
    
# Load the same model twice so the transcriptions can be compared side by side:
# once without a language model, and once with the language model and trie files.
model_without_lm = load_model(model_path=model_path, alphabet_path=alphabet_path)
model_with_lm = load_model(
    model_path=model_path,
    alphabet_path=alphabet_path,
    lm_path=lm_path,
    trie_path=trie_path,
)

In [None]:
# load corpus
from util.corpus_util import *

# Fetch the corpus registered under the key 'rl', then call it with
# languages='en' (presumably this narrows it to the English entries -- confirm
# in util.corpus_util).
corpus_rl = get_corpus('rl')
corpus = corpus_rl(languages='en')
# Alternative: use the corpus registered under the key 'ls' instead.
# corpus = get_corpus('ls')
corpus.summary()

In [None]:
from IPython.display import HTML, Audio, display
from pattern3.metrics import levenshtein_similarity
import random

def get_random_test_samples(corpus, num=5, seed=None):
    """Pick random speech segments from the test split of a corpus.

    Args:
        corpus: object exposing ``name`` and
            ``train_dev_test_split(include_numeric=True)``; the third element of
            the returned split is used as the test set.
        num: number of segments to select. If the test set holds fewer segments,
            all of them are returned (in random order).
        seed: optional seed for a private random generator, so that the same
            selection can be reproduced. With ``None`` (the default) the global
            ``random`` module state is used.

    Returns:
        A list of at most ``num`` distinct test segments.
    """
    print(f'selecting {num} random speech segments from testset (corpus: {corpus.name})')
    _, _, test_segments = corpus.train_dev_test_split(include_numeric=True)
    # random.sample needs a sequence; materialise whatever iterable the split returned.
    test_segments = list(test_segments)
    if num > len(test_segments):
        # Sampling more items than exist raises ValueError; fall back to the whole set.
        print(f'only {len(test_segments)} speech segments available, selecting all of them')
        num = len(test_segments)
    rng = random if seed is None else random.Random(seed)
    return rng.sample(test_segments, num)

# Transcribe a few random test segments with both models and show, for each one,
# the audio, the reference text, and both inferred transcriptions with their
# similarity to the reference.
import html  # stdlib; escapes text before it is embedded in HTML markup


def _esc(value):
    """Return ``value`` as text that is safe to embed in an HTML fragment."""
    return html.escape(str(value))


segments = get_random_test_samples(corpus)

for i, segment in enumerate(segments):
    print(f'Inferring transcription for speech segment #{i}')
    audio, rate = segment.audio, segment.rate
    # Run both models first so that all output for one segment is shown together.
    transcriptions = [
        ('without LM', model_without_lm.stt(audio, rate)),
        ('with LM', model_with_lm.stt(audio, rate)),
    ]

    entry = segment.corpus_entry
    display(HTML(f'<strong>From corpus entry</strong>: {_esc(entry.name)} ({_esc(entry.id)})'))
    display(Audio(data=audio, rate=rate))
    display(HTML(f'<strong>actual transcription</strong>:<br/>{_esc(segment.text)}'))
    for label, transcription in transcriptions:
        # A similarity is the complement of an error rate (higher is better), so
        # it must not be labelled as the LER itself.
        similarity = levenshtein_similarity(transcription, segment.text)
        display(HTML(f'<strong>inferred transcription ({label})</strong>:<br/>{_esc(transcription)}'))
        display(HTML(f'<strong>Levenshtein similarity (1 - normalized edit distance, higher is better)</strong>: {similarity}'))

    display(HTML('<hr/>'))