In [None]:
from datasets import load_dataset
from kenlm import Model as KenlmModel
from pyctcdecode.language_model import load_unigram_set_from_arpa, _prepare_unigram_set
import os
import sys
sys.path.append('../')
from utils import remove_unwanted_chars_and_uppercase

In [None]:
# --- Configuration: model order, dataset identifiers and derived paths ---
n = 3  # order of the n-gram model
dataset_name = 'gw_14_socialmedia_NER_RANDOM'
target_lang = 'sv'
text_file_name = f'../text_data/{dataset_name}.txt'
path_name = f'../language_models/{n}gram_{dataset_name}'
lm_name = 'ngram'

# makedirs(exist_ok=True) also creates a missing '../language_models' parent
# and is safe to re-run, unlike an exists()-check followed by os.mkdir.
os.makedirs(path_name, exist_ok=True)

# Raw ARPA output, the ARPA file patched with the extra </s> entry, and the
# binary model all live in the model directory.
lm_name_arpa = f'{path_name}/{lm_name}.arpa'
lm_name_correct = f'{path_name}/{lm_name}_correct.arpa'
lm_name_bin = f'{path_name}/{lm_name}.bin'

In [None]:
# Read the raw corpus with the 'text' loader and keep its 'train' split.
dataset = load_dataset(
    'text',
    data_files='../korp/gigaword-2014-socialmedia.txt',
)['train']

# Show how many rows were loaded.
dataset.num_rows

In [None]:
def extract_text(item):
    """Normalise the 'text' field of a single dataset record.

    The field is overwritten in place with the result of
    ``remove_unwanted_chars_and_uppercase`` (imported from ``utils``), and the
    same record object is returned so the function can be used with
    ``dataset.map``.
    """
    cleaned_text = remove_unwanted_chars_and_uppercase(item['text'])
    item['text'] = cleaned_text
    return item

# Apply the text normalisation to every record; `dataset` is rebound to the result.
dataset = dataset.map(function=extract_text)

In [None]:
# Upload the preprocessed records to a private Hub repository as its 'train' split.
dataset.push_to_hub(
    'swedish_culturomics_gigaword_corpus_2010_to_2015_preprocessed',
    split='train',
    private=True,
)

In [None]:
# Save the data to a text file (one dataset row per line)

# Make sure the target directory exists; open() would otherwise raise
# FileNotFoundError on a fresh checkout.
text_dir = os.path.dirname(text_file_name)
if text_dir:
    os.makedirs(text_dir, exist_ok=True)

# Write explicitly as UTF-8 so non-ASCII characters do not depend on the
# platform's default locale encoding.
with open(text_file_name, 'w', encoding='utf-8') as file:
    file.write('\n'.join(dataset['text']))

In [None]:
# Construct an n-gram model from .txt file
n = n
text_file_name = text_file_name
lm_name_arpa = lm_name_arpa

!../kenlm/build/bin/lmplz -o {n} <{text_file_name} > {lm_name_arpa}

# Inspect the model
!head -20 {lm_name_arpa}

In [None]:
# Add the final </s> token to the n-gram
#
# Copies the ARPA file to `lm_name_correct`, making two changes on the way:
#   1. the unigram count in the header ("ngram 1=<count>") is incremented by one;
#   2. the first line containing "<s>" is duplicated with "<s>" replaced by "</s>".
# Every other line is copied verbatim.

# Both files are opened explicitly as UTF-8 so that non-ASCII vocabulary does
# not depend on the platform's default locale encoding.
with open(lm_name_arpa, 'r', encoding='utf-8') as read_file, \
        open(lm_name_correct, 'w', encoding='utf-8') as write_file:
    has_added_eos = False
    for line in read_file:
        if not has_added_eos and 'ngram 1=' in line:
            # Rewrite only the value after the last '='. A plain
            # str.replace(count, count + 1) would also touch the "1" in
            # "ngram 1=" whenever the count string occurs there as well.
            prefix, _, count = line.rstrip('\n').rpartition('=')
            write_file.write(f'{prefix}={int(count) + 1}\n')
        elif not has_added_eos and '<s>' in line:
            # Keep the <s> entry and add a matching </s> entry right after it.
            write_file.write(line)
            write_file.write(line.replace('<s>', '</s>'))
            has_added_eos = True
        else:
            write_file.write(line)

In [None]:
# Export the model's unigram vocabulary to a text file, one entry per line.
kenlm_model = KenlmModel(lm_name_arpa)

# Read the unigrams from the ARPA file, then pass them through pyctcdecode's
# `_prepare_unigram_set` together with the loaded model.
# NOTE(review): `_prepare_unigram_set` is a private pyctcdecode helper (leading
# underscore) and may change between releases.
unigrams = load_unigram_set_from_arpa(lm_name_arpa)
unigram_set = _prepare_unigram_set(unigrams, kenlm_model)
unigrams_path = f'{path_name}/unigrams.txt'

# Explicit UTF-8 so non-ASCII vocabulary is written identically on every platform.
with open(unigrams_path, "w", encoding="utf-8") as fi:
    fi.writelines(f"{unigram}\n" for unigram in sorted(unigram_set))

In [None]:
# Convert the n-gram to binary file to reduce the size
lm_name_arpa = lm_name_arpa
lm_name_bin = lm_name_bin

!../kenlm/build/bin/build_binary {lm_name_arpa} {lm_name_bin}

In [None]:
# Remove the arpa files and print the binary file to see their sizes
lm_name_arpa = lm_name_arpa
lm_name_correct = lm_name_correct

!rm {lm_name_arpa}
!rm {lm_name_correct}
!tree -h language_models