**Installing dependencies:**

In [None]:
# Install the kenlm Python bindings. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install https://github.com/kpu/kenlm/archive/master.zip
# Fetch the sources (needed to build lmplz and for the sample test.arpa) into the
# absolute path used by the later cells. The clone is skipped when the checkout
# already exists, so re-running this cell does not fail.
!test -d /content/kenlm || git clone https://github.com/kpu/kenlm.git /content/kenlm

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip
[2K     [32m\[0m [32m553.6 kB[0m [31m9.9 MB/s[0m [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184306 sha256=7ca14f01ed9ca02e40429615d1595b6339635ae5c2be41f983f34b5a6084216d
  Stored in directory: /tmp/pip-ephem-wheel-cache-lm75u3_8/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0
Cloning into 'kenlm'...
remote: Enumerating objects: 14165, done.[K
remote: Counting objects: 100% (478/478), done.[K
remote: Compr

**Testing usage:**

In [None]:
import kenlm
# Smoke test: load the sample ARPA file from the kenlm checkout and score one
# sentence with both the begin- and end-of-sentence markers enabled.
sample_arpa_path = '/content/kenlm/lm/test.arpa'
model = kenlm.Model(sample_arpa_path)
sentence_score = model.score('this is a sentence .', bos=True, eos=True)
print(sentence_score)

-49.579345703125


**Compiling:**

In [None]:
# Configure an out-of-source build of the cloned kenlm tree.
!mkdir -p /content/kenlm/build
!cmake -S /content/kenlm -B /content/kenlm/build
# %cd changes the kernel's working directory, so `make` below runs inside the
# build directory (and later cells start from there until they %cd elsewhere).
%cd /content/kenlm/build
# Build all targets with four parallel jobs.
!make -j4

  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- Could NOT find Eigen3 (missing: Eigen3_DIR)
-- Configuring done (0.1s)
-- Generating done (0.0s)
-- Build files have been written to: /content/kenlm/build
/content/kenlm/build
[ 38%] Built target kenlm_util
[ 41%] Built target probing_hash_table_benchmark
[ 46%] Built target kenlm_filter
[ 71%] Built target kenlm
[ 73%] Built target query
[ 76%] Built target build_binary
[ 78%] Built target fragment
[ 81%] Built target kenlm_benchmark
[ 83%] Built target filter
[ 86%] Built target phrase_table_vocab
[ 95%] Built target kenlm_builder
[ 97%] Built target lmplz
[100%] Built target count_ngrams


In [None]:
# Switch the kernel's working directory back to /content (the build cell left it
# in /content/kenlm/build) and list what is there.
%cd /content
!ls

/content
kenlm  sample_data


**Preprocessing (optional):**

Upload the training dataset to the session. It can be a plain-text file or a bzip2-compressed file. To compress a text file before passing it to the training function, run:


```
bzip2 input-file.txt   # replaces input-file.txt with input-file.txt.bz2 (add -k to keep the original)
```



**Training the model:**

In addition to the path of the `lmplz` executable, the training function takes the following parameters:


1.   Order
2.   Threshold
3.   Input file path



In [None]:
import os
import subprocess

def train_language_model(lmplz_path, order, threshold, input_file):
    """
    Train an n-gram language model with KenLM's lmplz tool.

    The corpus is fed to lmplz on standard input and the ARPA model is read from
    its standard output, using file handles rather than a shell command line, so
    paths containing spaces or shell metacharacters are handled safely.

    Args:
    - lmplz_path (str): Path to the lmplz executable.
    - order (int): Order of the n-gram model.
    - threshold (int): Count threshold for pruning low-frequency n-grams. It is
      passed as ``--prune 0 <threshold>``: lmplz requires the unigram threshold
      to be 0 and extends the last value to all higher orders. Per the lmplz
      documentation, n-grams whose count is less than or equal to the threshold
      are pruned. ``0`` or ``None`` disables pruning; pruning is also skipped
      for ``order == 1`` because only unigrams exist and they cannot be pruned.
    - input_file (str): Path to the input text file.

    Returns:
    - output_file (str): Path to the ARPA format output file.

    Raises:
    - RuntimeError: If lmplz exits with a non-zero status.
    """
    # Generate output file name based on input file name
    output_file = os.path.splitext(input_file)[0] + "_model.arpa"

    command = [lmplz_path, '-o', str(order)]
    # NOTE: lmplz's -T flag is the temporary-file prefix, not a pruning
    # threshold; pruning is controlled with --prune.
    if threshold is not None and int(threshold) > 0 and int(order) > 1:
        command += ['--prune', '0', str(int(threshold))]

    # Redirect stdin/stdout through file handles instead of "<" / ">" in a shell
    # string, which avoids shell=True and any quoting problems.
    with open(input_file, 'rb') as corpus, open(output_file, 'wb') as arpa:
        result = subprocess.run(command, stdin=corpus, stdout=arpa, stderr=subprocess.PIPE)

    # lmplz writes its progress report to stderr even on success, so it is only
    # labelled as an error when the exit status says so.
    log = result.stderr.decode(errors='replace')
    if result.returncode != 0:
        if log:
            print("Error:\n", log)
        raise RuntimeError(
            "lmplz failed with exit status {0}".format(result.returncode)
        )
    if log:
        print("lmplz log (stderr):\n", log)

    return output_file

# Example usage:
# Enter path of input file:
input_file = "/content/Shah.txt"
# Set n-gram order
order = 3
# Value forwarded as `threshold`; train_language_model documents it as the
# count threshold for pruning low-frequency n-grams from the model.
threshold = 5
# lmplz binary produced by the build cell
lmplz_path = "/content/kenlm/build/bin/lmplz"

output_file = train_language_model(
    lmplz_path=lmplz_path,
    order=order,
    threshold=threshold,
    input_file=input_file,
)
print("Output file:", output_file)

Error:
 === 1/5 Counting and sorting n-grams ===
Reading /content/Shah.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 143982 types 15207
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:182484 2:3786914560 3:7100464640
Statistics:
1 15207 D1=0.725766 D2=0.990402 D3+=1.43806
2 61176 D1=0.825489 D2=1.24297 D3+=1.42936
3 86798 D1=0.888037 D2=1.08122 D3+=0.915233
Memory estimate for binary LM:
type      kB
probing 3345 assuming -p 1.5
probing 3763 assuming -r models -p 1.5
trie    1535 without quantization
trie     943 assuming -q 8 -b 8 quantization 
trie    1463 assuming -a 22 array pointer compression
trie     872 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:182484 2:978816 3:1735960
=== 4/5 Calculating 

**Testing generated model:**

In [None]:
#!/usr/bin/env python
import os
import kenlm

# LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
#model = kenlm.LanguageModel(LM)

# Load the model that was just trained and report its order.
model = kenlm.Model(output_file)
print('{0}-gram model'.format(model.order))

sentence = 'language modeling is fun .'
print(sentence)
print(model.score(sentence))


def score(s):
    """Add up the first field of every (prob, length, oov) tuple from model.full_scores(s)."""
    total = 0
    for prob, _, _ in model.full_scores(s):
        total += prob
    return total


# The per-token scores must add up to the direct sentence score.
assert abs(score(sentence) - model.score(sentence)) < 1e-3

# Show scores and n-gram matches.
# `end` is the exclusive slice bound into `words` for the token being scored;
# it starts at 2 because words[0] is the '<s>' marker.
words = ['<s>'] + sentence.split() + ['</s>']
for end, (prob, length, oov) in enumerate(model.full_scores(sentence), start=2):
    matched_ngram = ' '.join(words[end - length:end])
    print('{0} {1}: {2}'.format(prob, length, matched_ngram))
    if oov:
        print('\t"{0}" is an OOV'.format(words[end - 1]))

# Find out-of-vocabulary words
for token in words:
    if token not in model:
        print('"{0}" is an OOV'.format(token))

# Stateful query: score word by word, carrying the context in State objects.
state, state2 = kenlm.State(), kenlm.State()
# Use <s> as context.  If you don't want <s>, use model.NullContextWrite(state).
model.BeginSentenceWrite(state)
accum = 0.0
# Ping-pong between the two states: each call reads the context from `state`
# and writes the new context to `state2`, after which the roles are swapped.
# After two words the names refer to their original objects again.
for word in ("a", "sentence"):
    accum += model.BaseScore(state, word, state2)
    state, state2 = state2, state
# score defaults to bos = True and eos = True.  Here we'll check without the end
# of sentence marker.
score_without_eos = model.score("a sentence", eos = False)
assert abs(accum - score_without_eos) < 1e-3
# Adding the end-of-sentence marker must reproduce the default score.
accum += model.BaseScore(state, "</s>", state2)
assert abs(accum - model.score("a sentence")) < 1e-3

3-gram model
language modeling is fun .
-19.566287994384766
-5.348799705505371 1: language
-4.914098262786865 1: modeling
	"modeling" is an OOV
-2.1186938285827637 1: is
-3.5201516151428223 2: is fun
-3.2292842864990234 1: .
-0.43526071310043335 2: . </s>
"modeling" is an OOV
