In [None]:
import kenlm
from random import shuffle
from lm.lm_util import process_sentence

def load_lm(lm_path):
    """Load a KenLM language model from disk and announce its order.

    :param lm_path: path of the model file handed to ``kenlm.LanguageModel``
    :return: the loaded ``kenlm.LanguageModel`` instance
    """
    language_model = kenlm.LanguageModel(lm_path)
    # Echo the n-gram order so the reader can confirm the expected model was loaded.
    print(f'loaded {language_model.order}-gram model from {lm_path}')
    return language_model

def create_test_pair(sentence):
    """Build a (normalized, shuffled) pair of sentences for comparing LM scores.

    The sentence is lower-cased and split on whitespace; the words are joined
    with single spaces to form the normalized sentence. The shuffled sentence
    contains the same words in a random order that differs from the original.

    :param sentence: input sentence
    :return: tuple ``(sentence_original, sentence_shuffled)``; if the sentence
        has fewer than two distinct words, no different ordering exists and
        both elements are the normalized sentence
    """
    words = sentence.lower().split()
    sentence_original = ' '.join(words)

    # With fewer than two distinct words every permutation is identical to the
    # original, so the retry loop below would never terminate.
    if len(set(words)) < 2:
        return sentence_original, sentence_original

    sentence_shuffled = sentence_original
    # shuffle() may reproduce the original order by chance: retry until it differs.
    while sentence_shuffled == sentence_original:
        shuffle(words)
        sentence_shuffled = ' '.join(words)
    return sentence_original, sentence_shuffled

def score_sentence(model, sentence):
    """Print the overall score, the per-token details and the perplexity of a sentence.

    All values are printed exactly as returned by the model; nothing is returned.

    :param model: object providing ``score``, ``full_scores`` and ``perplexity``
        methods that each accept the sentence (e.g. the model returned by ``load_lm``)
    :param sentence: sentence to evaluate
    """
    print(f'score for \'{sentence}\': ', model.score(sentence))
    # full_scores yields one (probability, n-gram length, OOV flag) triple per entry.
    for prob, ngram_length, oov in model.full_scores(sentence):
        print({'probability': prob, "n-gram length": ngram_length, "oov?": oov})
    print("perplexity:", model.perplexity(sentence))
    # Blank line to separate the output of consecutive sentences.
    print()
    
def check_lm(model, sentences):
    """Score every sentence and a shuffled variant of it with the given model.

    Each sentence is printed, normalized with ``process_sentence``, printed
    again and turned into an (original, shuffled) pair by ``create_test_pair``.
    Both variants are then passed to ``score_sentence`` so their scores can be
    compared side by side.

    :param model: language model forwarded to ``score_sentence``
    :param sentences: iterable of sentences to check
    """
    for raw_sentence in sentences:
        print('original sentence:', raw_sentence)
        normalized = process_sentence(raw_sentence)
        print('normalized sentence:', normalized)
        original, shuffled = create_test_pair(normalized)
        print()
        # Same reporting for both variants: header line first, then the scores.
        for header, variant in (
            ('scoring original sentence: ', original),
            ('scoring shuffled sentence: ', shuffled),
        ):
            print(header)
            score_sentence(model, variant)
        
# Test sentences for the English model.
english_sentences = [
    "Language modelling is fun",
    "New York",
]

# Test sentences for the German models.
german_sentences = [
    "Seine Pressebeauftragte ist ratlos.",
    "Fünf Minuten später steht er im Eingang des Kulturcafés an der Zürcher Europaallee.",
    "Den Leuten wird bewusst, dass das System des Neoliberalismus nicht länger tragfähig ist.",
    "Doch daneben gibt es die beeindruckende Zahl von 30'000 Bienenarten, die man unter dem Begriff «Wildbienen» zusammenfasst.",
    "Bereits 1964 plante die US-Airline Pan American touristische Weltraumflüge für das Jahr 2000.",
]

# Short German sayings (plus one English phrase); defined here for ad-hoc checks.
german_sayings = [
    "Ich bin ein Berliner",
    "Man soll den Tag nicht vor dem Abend loben",
    "Was ich nicht weiss macht mich nicht heiss",
    "Ein Unglück kommt selten allein",
    "New York",
]

# 4-gram English LM

The following model was trained on the TIMIT corpus and downloaded from the following location:

https://www.dropbox.com/s/2n897gu5p3o2391/libri-timit-lm.klm

In [None]:
# Load the English model and score each English test sentence and its shuffled variant.
# NOTE(review): this is an absolute, machine-specific path, whereas the other cells
# load their models relative to '../lm/' — consider using a relative path here too.
model = load_lm('/home/daniel/KerasDeepSpeech/lm/libri-timit-lm.klm')
check_lm(model, english_sentences)

# 3-gram German LM
The following is a 3-gram LM that has been trained with CMUSphinx. The ARPA file was downloaded from https://cmusphinx.github.io/wiki/download/ and converted to a binary KenLM model.

In [None]:
# Load the converted 3-gram German model and score each German test sentence
# together with its shuffled variant.
model = load_lm('../lm/wiki_de/srilm-voxforge-de-r20171217.klm')
check_lm(model, german_sentences)

# 2-gram German LM (KenLM, probing)

The following 2-gram model was trained on sentences from articles and pages in a Wikipedia dump. The dump was downloaded on 2018-09-21 and contains the state from 2018-09-01. The current dump of the German Wikipedia can be downloaded at http://download.wikimedia.org/dewiki/latest/dewiki-latest-pages-articles.xml.bz2.

The model was not pruned. Probing was used as the data structure. The following command was used to create the model:

```bash
lmplz -o 2 -T /home/daniel/tmp --skip_symbols -S 40% <../lm_data/wiki_de/wiki_de.txt.bz2 | build_binary /dev/stdin ../lm/wiki_de/wiki_de_2_gram.klm
```

In [None]:
# Load the 2-gram German Wikipedia model and score each German test sentence
# together with its shuffled variant.
model = load_lm('../lm/wiki_de/wiki_de_2_gram.klm')
check_lm(model, german_sentences)

# 4-gram German LM (KenLM, probing)

The following 4-gram model was trained on the same dump as the 2-gram model above. Except for the order, all other parameters were identical to those of the 2-gram model, i.e. in the bash command used for creating the model, all parameters except the `-o` parameter were the same.

In [None]:
# Load the 4-gram German Wikipedia model and score each German test sentence
# together with its shuffled variant.
model = load_lm('../lm/wiki_de/wiki_de_4_gram.klm')
check_lm(model, german_sentences)