**Installing dependencies:**

In [1]:
!pip install https://github.com/kpu/kenlm/archive/master.zip
!git clone https://github.com/kpu/kenlm.git

Collecting https://github.com/kpu/kenlm/archive/master.zip
  Downloading https://github.com/kpu/kenlm/archive/master.zip (553 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m553.6/553.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: kenlm
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone
  Created wheel for kenlm: filename=kenlm-0.2.0-cp310-cp310-linux_x86_64.whl size=3184348 sha256=10198c15443936cb4facb6fcf4a045a70764239d7699c390e404b641cf701743
  Stored in directory: /tmp/pip-ephem-wheel-cache-g5b4yvue/wheels/a5/73/ee/670fbd0cee8f6f0b21d10987cb042291e662e26e1a07026462
Successfully built kenlm
Installing collected packages: kenlm
Successfully installed kenlm-0.2.0
Cloning into 'kenlm'...
remote: Enumerating objects: 14165, done.[K
remote

**Testing usage:**

In [2]:
import kenlm
model = kenlm.Model('/content/kenlm/lm/test.arpa')
print(model.score('this is a sentence .', bos = True, eos = True))

-49.579345703125


**Compiling:**

In [3]:
!mkdir -p /content/kenlm/build
!cmake /content/kenlm -B /content/kenlm/build
%cd /content/kenlm/build
!make -j 4

  Compatibility with CMake < 3.5 will be removed from a future version of
  CMake.

  Update the VERSION argument <min> value or use a ...<max> suffix to tell
  CMake that the project does not need compatibility with older versions.

[0m
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Could NOT find Eigen3 (missing: Eigen3_DIR)
-- Found Boost: /usr/lib/x86_64-linux-gnu/cmake/Boost-1.74.0/BoostConfig.cmake (found suitable version "1.74.0", minimum required is "1.41.0") found components: program_options system thread unit_test_framewor

In [4]:
%cd /content
!ls

/content
kenlm  sample_data


**Preprocessing (optional):**

Upload the training dataset. It can be a text file or a compressed file (bzip2). In case you would like to compress a text file and then pass it to the training function, here's how you do it:


```
bzip2 <input-file.txt>
```



**Training the model:**

Parameters can be provided, which include:


1.   Order
2.   Threshold
3.   Input file path



In [9]:
!pip install transformers



In [26]:
!pip install joblib



In [27]:
import os
from joblib import Parallel, delayed

In [28]:
# Utilities from https://github.com/NVIDIA/NeMo/blob/stable/scripts/asr_language_modeling/ngram_lm/kenlm_utils.py
def tokenize_str(texts, tokenizer, offset):
    tokenized_text = []
    for text in texts:
        tok_text = tokenizer.encode(text)
        tok_text = [chr(token + offset) for token in tok_text]
        tokenized_text.append(tok_text)
    return tokenized_text

def tokenize_text(data, tokenizer, path, chunk_size=8192, buffer_size=32, token_offset=100):
    dataset_len = len(data)
    print(
        f"Chunking {dataset_len} rows into {dataset_len / float(chunk_size):0.4f} tasks (each chunk contains {chunk_size} elements)"
    )

    current_step = 0
    if os.path.exists(path):
        print(f"Deleting previous file : {path}")
        os.remove(path)

    with Parallel(n_jobs=-2, verbose=10) as parallel:
        while True:
            start = current_step * chunk_size
            end = min((current_step + buffer_size) * chunk_size, dataset_len)

            tokenized_data = parallel(
                delayed(tokenize_str)(data[start : start + chunk_size], tokenizer, token_offset)
                for start in range(start, end, chunk_size)
            )

            # Write dataset
            write_dataset(tokenized_data, path)
            current_step += len(tokenized_data)
            print(f"Finished writing {len(tokenized_data)} chunks to {path}. Current chunk index = {current_step}")
            del tokenized_data
            if end >= dataset_len:
                break


def write_dataset(chunks, path):
    # basedir = os.path.dirname(path)

    # if not os.path.exists(basedir):
    #     os.makedirs(basedir, exist_ok=True)

    with open(path, 'a+', encoding='utf-8') as f:
        for chunk_idx in tqdm(range(len(chunks)), desc='Chunk ', total=len(chunks), unit=' chunks'):
            for text in chunks[chunk_idx]:
                line = ' '.join(text)
                f.write(f"{line}\n")

**Preprocessing: Add input text file that would be cleaned and tokenized.**

In [29]:
with open("Shah.txt", 'r', encoding='utf-8') as f:
  dataset = f.readlines()

In [30]:
dataset[0]

'\ufeff[LINE] Chat with Shah sb\n'

In [31]:
chars_to_ignore_regex = '[,?.!\-\;\:"“%‘”�—’…–]'
import re

def clean_text(text):
  text = re.sub(chars_to_ignore_regex, "", text.lower())
  return text

In [32]:
from tqdm import tqdm
dataset_clean = []
for text in tqdm(dataset[0:100000]):
    dataset_clean.append(clean_text(text))

100%|██████████| 16247/16247 [00:00<00:00, 429987.55it/s]


In [33]:
print(dataset_clean[4])

1443	khan sb	hey



In [16]:
!pip install git+https://github.com/HKAB/whisper.git

Collecting git+https://github.com/HKAB/whisper.git
  Cloning https://github.com/HKAB/whisper.git to /tmp/pip-req-build-qrbho9m6
  Running command git clone --filter=blob:none --quiet https://github.com/HKAB/whisper.git /tmp/pip-req-build-qrbho9m6
  Resolved https://github.com/HKAB/whisper.git to commit 6700260b0f4f43092adbaffeaa5322516fbebc19
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ffmpeg-python==0.2.0 (from whisper==1.0)
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->whisper==1.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->whisper==1.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [38]:
import whisper
tokenizer = whisper.tokenizer.get_tokenizer('en').tokenizer

In [39]:
tokenize_text(dataset_clean[0:10000], tokenizer, "dataset_tokenized.txt")

Chunking 10000 rows into 1.2207 tasks (each chunk contains 8192 elements)
Deleting previous file : dataset_tokenized.txt


[Parallel(n_jobs=-2)]: Done   1 tasks      | elapsed:    0.5s
Chunk : 100%|██████████| 2/2 [00:00<00:00, 139.14 chunks/s]

Finished writing 2 chunks to dataset_tokenized.txt. Current chunk index = 2





In [40]:
import os
import subprocess

def train_language_model(lmplz_path, order, threshold, input_file):
    """
    Train a language model using lmplz tool.

    Args:
    - lmplz_path (str): Path to the lmplz executable.
    - order (int): Order of the n-gram model.
    - threshold (int): Threshold count for pruning low-frequency n-grams.
    - input_file (str): Path to the input text file.

    Returns:
    - output_file (str): Path to the ARPA format output file.
    """
    # Generate output file name based on input file name
    output_file = os.path.splitext(input_file)[0] + "_model.arpa"

    command = [lmplz_path, '-o', str(order), '-T', str(threshold), '<', input_file, '>', output_file]
    command_str = ' '.join(command)

    # Run the command and capture the output and error
    process = subprocess.Popen(command_str, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()

    # Decode and print the output
    if stdout:
        print("Output:\n", stdout.decode())
    if stderr:
        print("Error:\n", stderr.decode())

    return output_file

# Example usage:
lmplz_path = "/content/kenlm/build/bin/lmplz"
# Set n-gram order
order = 3
#  Specifies a threshold count for pruning low-frequency n-grams from the model.
# N-grams occurring fewer times than the specified threshold will be pruned.
threshold = 5
# Enter path of input file:
input_file = "/content/dataset_tokenized.txt"

output_file = train_language_model(lmplz_path, order, threshold, input_file)
print("Output file:", output_file)

Error:
 === 1/5 Counting and sorting n-grams ===
Reading /content/dataset_tokenized.txt
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
Unigram tokens 171757 types 5081
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:60972 2:3786956544 3:7100543488
Statistics:
1 5081 D1=0.571604 D2=1.16226 D3+=1.6421
2 33493 D1=0.796666 D2=1.23628 D3+=1.50328
3 57638 D1=0.828935 D2=1.13094 D3+=1.30665
Memory estimate for binary LM:
type      kB
probing 1927 assuming -p 1.5
probing 2143 assuming -r models -p 1.5
trie     804 without quantization
trie     453 assuming -q 8 -b 8 quantization 
trie     771 assuming -a 22 array pointer compression
trie     420 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:60972 2:535888 3:1152760
=== 4/5 Calcul

**Testing generated model:**

In [37]:
#!/usr/bin/env python
import os
import kenlm

# LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
#model = kenlm.LanguageModel(LM)

model = kenlm.Model(output_file)
print('{0}-gram model'.format(model.order))

sentence = 'language modeling is fun .'
print(sentence)
print(model.score(sentence))

# Check that total full score = direct score
def score(s):
    return sum(prob for prob, _, _ in model.full_scores(s))

assert (abs(score(sentence) - model.score(sentence)) < 1e-3)

# Show scores and n-gram matches
words = ['<s>'] + sentence.split() + ['</s>']
for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
    if oov:
        print('\t"{0}" is an OOV'.format(words[i+1]))

# Find out-of-vocabulary words
for w in words:
    if not w in model:
        print('"{0}" is an OOV'.format(w))

#Stateful query
state = kenlm.State()
state2 = kenlm.State()
#Use <s> as context.  If you don't want <s>, use model.NullContextWrite(state).
model.BeginSentenceWrite(state)
accum = 0.0
accum += model.BaseScore(state, "a", state2)
accum += model.BaseScore(state2, "sentence", state)
#score defaults to bos = True and eos = True.  Here we'll check without the end
#of sentence marker.
assert (abs(accum - model.score("a sentence", eos = False)) < 1e-3)
accum += model.BaseScore(state, "</s>", state2)
assert (abs(accum - model.score("a sentence")) < 1e-3)

3-gram model
language modeling is fun .
-28.570974349975586
-6.237544059753418 1: language
	"language" is an OOV
-4.4959564208984375 1: modeling
	"modeling" is an OOV
-4.4959564208984375 1: is
	"is" is an OOV
-4.4959564208984375 1: fun
	"fun" is an OOV
-4.4959564208984375 1: .
	"." is an OOV
-4.349603176116943 1: </s>
"language" is an OOV
"modeling" is an OOV
"is" is an OOV
"fun" is an OOV
"." is an OOV
