In [8]:
import os

from tokenizers import Tokenizer

tokenizer_uni = Tokenizer.from_file(
    os.path.join('../dags/src/spellcheck/data/', 'tokenizer_unigram_5k.json')
)

In [9]:
with open('../data/tesstrain/kbd/configs/kbd.wordlist', 'r') as f:
    words = f.read().split('\n')

In [11]:
from tqdm import tqdm
import csv
import pandas as pd
from collections import defaultdict
import nltk


def create_token_ng_distribution(words, n=5):
    fd = nltk.FreqDist()
    ngrams_tokens = defaultdict(list)

    for word in sorted(words):
        tokens = tokenizer_uni.encode(word).tokens
        token_ids = [tokenizer_uni.token_to_id(token) for token in tokens]

        ngrams = tuple(nltk.ngrams(tokens, n=n))
        fd.update(ngrams)
        for ng in ngrams:
            ngrams_tokens[ng].append((tokens, token_ids))

    os.makedirs('../data/tesstrain/kbd/token_dist', exist_ok=True)
    os.makedirs(f'../data/tesstrain/kbd/token_dist/{n}', exist_ok=True)
    
    data = []
    
    for ng, freq in tqdm(sorted(fd.items(), key=lambda x: x[1], reverse=True)):
        if freq < 10:
            break

        ng_name = '_'.join(ng)
        f_name = f'({freq}){ng_name}'

        df_data = []
        for _tokens, _token_ids in ngrams_tokens[ng]:
            df_data.append({
                'ng_name': ng_name,
                'q_ng_len': n,
                'q': ''.join(ng),
                'word_ng_len': len(_tokens),
                'word': ''.join(_tokens),
                'tokens': '|'.join(_tokens),
                'token_ids': '|'.join([str(_id) for _id in _token_ids])
            })
        data.extend(df_data)

        df = pd.DataFrame(df_data)
        df.to_csv(f'../data/tesstrain/kbd/token_dist/{n}/{f_name}.csv', index=False, sep=',', quoting=csv.QUOTE_NONE, header=True)
    
    return data

In [12]:
data = []
for n in range(1, 7):
    data_i = create_token_ng_distribution(words, n=n)
    data.extend(data_i)

 68%|██████▊   | 759/1113 [00:07<00:03, 97.83it/s] 
 30%|███       | 14978/49331 [00:11<00:27, 1267.02it/s]
 12%|█▏        | 26218/218569 [00:13<01:37, 1967.41it/s]
  5%|▍         | 18067/366593 [00:08<02:36, 2227.57it/s]
  2%|▏         | 6945/378005 [00:03<02:48, 2203.63it/s]
  1%|          | 1779/280121 [00:00<01:57, 2361.33it/s]


In [13]:
df = pd.DataFrame(data)
df.to_csv(f'../data/tesstrain/kbd/token_dist/all.csv', index=False, sep=',', quoting=csv.QUOTE_ALL, header=True)