In [None]:
# change these parameters to load a different model
model = "/home/daniel/models/output_graph.pb"
alphabet = "/home/daniel/models/alphabet.txt"
lm = "/home/daniel/models/lm.binary"
trie = "/home/daniel/models/trie"
# NOTE(review): `lm`, `trie`, LM_WEIGHT and VALID_WORD_COUNT_WEIGHT are defined
# here but not referenced by any of the visible code -- the Model below is
# created from the graph, alphabet and beam width only. If decoding with the
# language model is intended, it presumably has to be enabled explicitly on
# `ds`; check the API of the installed deepspeech version -- TODO confirm.

# These constants control the beam search decoder
BEAM_WIDTH = 500  # Beam width used in the CTC decoder when building candidate transcriptions
LM_WEIGHT = 1.75  # The alpha hyperparameter of the CTC decoder. Language Model weight
# Valid word insertion weight. This is used to lessen the word insertion penalty
# when the inserted word is part of the vocabulary
VALID_WORD_COUNT_WEIGHT = 1.00
# These constants are tied to the shape of the graph used (changing them changes
# the geometry of the first layer), so make sure you use the same constants that
# were used during training
# Number of MFCC features to use
N_FEATURES = 26
# Size of the context window used for producing timesteps in the input vector
N_CONTEXT = 9

from deepspeech.model import Model
import sys
from timeit import default_timer as timer

# load pre-trained DeepSpeech model from file
# (progress messages go to stderr so they stay separate from regular output)
print(f'Loading model from file {model}', file=sys.stderr)
model_load_start = timer()
ds = Model(model, N_FEATURES, N_CONTEXT, alphabet, BEAM_WIDTH)
# despite its name this holds the elapsed load time, not an end timestamp
model_load_end = timer() - model_load_start
# `.3f` = three decimals; a bare `.3` would mean three significant digits and
# switch to exponent notation for larger values (e.g. '1.23e+02s')
print(f'Loaded model in {model_load_end:.3f}s.', file=sys.stderr)

In [None]:
# load corpus
# The star import brings `get_corpus` (used below) into the notebook namespace,
# along with whatever else util.corpus_util exports.
from util.corpus_util import *

# Alternative corpus, kept for quick switching (id 'rl', restricted to English):
# corpus = get_corpus('rl')(languages='en')
# NOTE(review): 'ls' / 'rl' are corpus ids understood by get_corpus; presumably
# LibriSpeech / ReadyLingua -- confirm against util.corpus_util.
corpus = get_corpus('ls')
# Call the corpus object's own summary method for a quick sanity check of what was loaded.
corpus.summary()

In [None]:
from IPython.display import HTML, Audio, display
from pattern3.metrics import levenshtein_similarity
import random

def get_random_test_samples(corpus, num=5):
    """Pick random speech segments from the test subset(s) of *corpus*.

    Up to *num* distinct corpus entries whose ``subset`` starts with ``'test'``
    are drawn without replacement, and one speech segment is chosen at random
    from each of them.

    Args:
        corpus: Iterable of corpus entries that also has a ``name`` attribute.
            Each entry must provide ``subset`` (str) and ``speech_segments``
            (a sequence).
        num: Number of segments wanted.

    Returns:
        A list of speech segments, each from a different entry. The list is
        shorter than *num* (possibly empty) when the corpus has fewer than
        *num* test entries that contain at least one speech segment.
    """
    print(f'selecting {num} random speech segments from testset (corpus: {corpus.name})')
    # An entry without segments cannot contribute a sample; random.choice would
    # raise IndexError on it, so such entries are excluded up front.
    test_entries = [
        entry for entry in corpus
        if entry.subset.startswith('test') and len(entry.speech_segments) > 0
    ]
    # random.sample raises ValueError when asked for more items than exist.
    sample_size = min(num, len(test_entries))
    return [random.choice(entry.speech_segments) for entry in random.sample(test_entries, sample_size)]

from html import escape  # stdlib; keeps corpus text from being parsed as markup

# Draw a few random test segments, transcribe each with the loaded model and
# show the audio, inferred vs. actual transcription, and a similarity score.
segments = get_random_test_samples(corpus)

for i, segment in enumerate(segments):
    print(f'Inferring transcription for speech segment #{i}')
    audio, rate = segment.audio, segment.rate
    transcription = ds.stt(audio, rate)

    # Everything interpolated into the HTML snippets is escaped so that
    # characters such as '<' or '&' in names/texts are displayed literally.
    entry = segment.corpus_entry
    display(HTML(f'<strong>From corpus entry</strong>: {escape(str(entry.name))} ({escape(str(entry.id))})'))
    display(Audio(data=audio, rate=rate))
    display(HTML(f'<strong>inferred transcription</strong>:<br/>{escape(str(transcription))}'))
    display(HTML(f'<strong>actual transcription</strong>:<br/>{escape(str(segment.text))}'))
    # levenshtein_similarity is a similarity score (higher = closer match),
    # i.e. the opposite direction of an error rate such as the LER.
    similarity = levenshtein_similarity(transcription, segment.text)
    display(HTML(f'<strong>Levenshtein similarity (1.0 = identical; higher is better)</strong>: {similarity}'))
    display(HTML('<hr/>'))