In [12]:
import nltk, re
from icecream import ic
from nltk.util import ngrams, everygrams
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends, flatten
from nltk.lm import MLE
import dill as pickle
from tqdm import tqdm
import os

In [13]:
def preprocess(dir):
    '''
    Cleans and tokenizes a string
    :param texts:
    :return: list of tokens
    '''
    tokenize = nltk.tokenize.word_tokenize
    tokens = []
    for dirpath, dirname, filenames  in os.walk(dir):
        for filename in filenames:
            path = dirpath + '/' + filename

            with open(path, 'r', encoding='utf-8') as fp:
                text = fp.read()
            text = text.lower().strip()
            text = text.split('.')
            text = [re.sub(r'[^\w\s]', ' ', t) for t in text]
            print('Tokenizing', filename)
            tokens += [tokenize(t) for t in tqdm(text)]
    return tokens

# files = ['HPSS.txt', 'GoT.txt', 'TheStand.txt', 'TWoK.txt', 'ADwD.txt', 'SpthTra.txt', 'Bees.txt', 'Cat.txt']
tokens = preprocess('books')

Tokenizing ADwD.txt



  0%|          | 0/33838 [00:00<?, ?it/s][A
  3%|▎         | 1025/33838 [00:00<00:03, 10199.12it/s][A
  7%|▋         | 2266/33838 [00:00<00:02, 11496.83it/s][A
 10%|█         | 3416/33838 [00:00<00:02, 11472.11it/s][A
 13%|█▎        | 4564/33838 [00:00<00:02, 10715.30it/s][A
 17%|█▋        | 5791/33838 [00:00<00:02, 11231.71it/s][A
 20%|██        | 6921/33838 [00:00<00:02, 11036.34it/s][A
 24%|██▎       | 8030/33838 [00:00<00:02, 10796.53it/s][A
 27%|██▋       | 9287/33838 [00:00<00:02, 11337.22it/s][A
 31%|███       | 10426/33838 [00:00<00:02, 10766.55it/s][A
 34%|███▍      | 11511/33838 [00:01<00:02, 10697.13it/s][A
 37%|███▋      | 12587/33838 [00:01<00:01, 10700.05it/s][A
 40%|████      | 13690/33838 [00:01<00:01, 10781.22it/s][A
 44%|████▍     | 14806/33838 [00:01<00:01, 10877.32it/s][A
 47%|████▋     | 15896/33838 [00:01<00:01, 10662.22it/s][A
 50%|█████     | 16965/33838 [00:01<00:01, 10484.24it/s][A
 53%|█████▎    | 18016/33838 [00:01<00:01, 10208.87it/s][A
 5

Tokenizing Bees.txt



  0%|          | 0/3773 [00:00<?, ?it/s][A
 21%|██        | 790/3773 [00:00<00:00, 7899.42it/s][A
 42%|████▏     | 1580/3773 [00:00<00:00, 7482.15it/s][A
 67%|██████▋   | 2520/3773 [00:00<00:00, 8331.93it/s][A
100%|██████████| 3773/3773 [00:00<00:00, 8236.94it/s][A


Tokenizing Cat.txt



  0%|          | 0/5556 [00:00<?, ?it/s][A
 15%|█▍        | 815/5556 [00:00<00:00, 8149.95it/s][A
 34%|███▍      | 1891/5556 [00:00<00:00, 9685.03it/s][A
 51%|█████▏    | 2860/5556 [00:00<00:00, 9066.66it/s][A
 68%|██████▊   | 3772/5556 [00:00<00:00, 8819.49it/s][A
100%|██████████| 5556/5556 [00:00<00:00, 8882.54it/s][A


Tokenizing GoT.txt



  0%|          | 0/25467 [00:00<?, ?it/s][A
  5%|▍         | 1223/25467 [00:00<00:01, 12170.64it/s][A
 10%|█         | 2606/25467 [00:00<00:01, 13144.29it/s][A
 15%|█▌        | 3921/25467 [00:00<00:01, 12820.55it/s][A
 20%|██        | 5205/25467 [00:00<00:01, 11650.29it/s][A
 25%|██▌       | 6439/25467 [00:00<00:01, 11864.66it/s][A
 30%|██▉       | 7636/25467 [00:00<00:01, 11384.29it/s][A
 34%|███▍      | 8783/25467 [00:00<00:01, 11116.94it/s][A
 39%|███▉      | 9901/25467 [00:00<00:01, 10924.73it/s][A
 44%|████▎     | 11127/25467 [00:00<00:01, 11304.36it/s][A
 49%|████▉     | 12453/25467 [00:01<00:01, 11867.15it/s][A
 54%|█████▍    | 13743/25467 [00:01<00:00, 12083.79it/s][A
 59%|█████▊    | 14956/25467 [00:02<00:03, 3193.27it/s] [A
 64%|██████▎   | 16194/25467 [00:02<00:02, 4116.98it/s][A
 68%|██████▊   | 17380/25467 [00:02<00:01, 5086.54it/s][A
 72%|███████▏  | 18426/25467 [00:02<00:01, 5850.27it/s][A
 77%|███████▋  | 19640/25467 [00:02<00:00, 6964.40it/s][A
 81%|█

Tokenizing HPSS.txt



  0%|          | 0/6025 [00:00<?, ?it/s][A
 18%|█▊        | 1064/6025 [00:00<00:00, 10587.30it/s][A
 35%|███▌      | 2123/6025 [00:00<00:00, 9304.73it/s] [A
 53%|█████▎    | 3219/6025 [00:00<00:00, 10002.24it/s][A
 73%|███████▎  | 4423/6025 [00:00<00:00, 10752.05it/s][A
100%|██████████| 6025/6025 [00:00<00:00, 10504.49it/s][A


Tokenizing SMiF.txt



  0%|          | 0/3497 [00:00<?, ?it/s][A
 21%|██        | 734/3497 [00:00<00:00, 7336.56it/s][A
 49%|████▉     | 1721/3497 [00:00<00:00, 8803.37it/s][A
100%|██████████| 3497/3497 [00:00<00:00, 9286.72it/s][A


Tokenizing SpthTra.txt



  0%|          | 0/1985 [00:00<?, ?it/s][A
 47%|████▋     | 938/1985 [00:00<00:00, 9337.47it/s][A
100%|██████████| 1985/1985 [00:00<00:00, 8941.93it/s][A


Tokenizing TheStand.txt



  0%|          | 0/38925 [00:00<?, ?it/s][A
  3%|▎         | 1005/38925 [00:00<00:03, 10049.79it/s][A
  5%|▌         | 2010/38925 [00:00<00:03, 9818.62it/s] [A
  8%|▊         | 3054/38925 [00:00<00:03, 10095.81it/s][A
 20%|█▉        | 1201/6025 [00:11<00:00, 12009.75it/s]][A
 14%|█▎        | 5310/38925 [00:00<00:03, 10817.95it/s][A
 16%|█▋        | 6393/38925 [00:00<00:03, 10452.13it/s][A
 19%|█▉        | 7505/38925 [00:00<00:02, 10663.14it/s][A
 22%|██▏       | 8574/38925 [00:00<00:02, 10558.07it/s][A
 25%|██▍       | 9632/38925 [00:00<00:02, 10041.76it/s][A
 28%|██▊       | 10820/38925 [00:01<00:02, 10581.34it/s][A
 31%|███       | 11885/38925 [00:01<00:02, 10137.21it/s][A
 33%|███▎      | 12956/38925 [00:01<00:02, 10287.55it/s][A
 36%|███▌      | 13991/38925 [00:01<00:02, 9972.05it/s] [A
 39%|███▊      | 15025/38925 [00:01<00:02, 10062.93it/s][A
 41%|████▏     | 16108/38925 [00:01<00:02, 10284.99it/s][A
 44%|████▍     | 17177/38925 [00:01<00:02, 10389.13it/s][A
 47

Tokenizing TWoK.txt



  0%|          | 0/34899 [00:00<?, ?it/s][A
  3%|▎         | 1088/34899 [00:00<00:03, 10874.72it/s][A
  6%|▋         | 2199/34899 [00:00<00:02, 11011.76it/s][A
  9%|▉         | 3301/34899 [00:00<00:02, 11014.67it/s][A
 13%|█▎        | 4489/34899 [00:00<00:02, 11333.72it/s][A
 16%|█▋        | 5697/34899 [00:00<00:02, 11586.11it/s][A
 20%|█▉        | 6856/34899 [00:00<00:02, 11391.59it/s][A
 23%|██▎       | 8168/34899 [00:00<00:02, 11929.74it/s][A
 27%|██▋       | 9362/34899 [00:00<00:02, 11045.06it/s][A
 30%|███       | 10479/34899 [00:00<00:02, 10494.69it/s][A
 33%|███▎      | 11657/34899 [00:01<00:02, 10847.88it/s][A
 37%|███▋      | 12753/34899 [00:01<00:02, 10769.14it/s][A
 40%|███▉      | 13838/34899 [00:01<00:01, 10653.09it/s][A
 43%|████▎     | 14977/34899 [00:01<00:01, 10850.00it/s][A
 46%|████▌     | 16094/34899 [00:01<00:01, 10930.11it/s][A
 49%|████▉     | 17191/34899 [00:01<00:01, 10816.06it/s][A
 52%|█████▏    | 18290/34899 [00:01<00:01, 10851.30it/s][A
 5

In [14]:
inputs = padded_everygram_pipeline(4, tokens)
model = MLE(3)
model.fit(*inputs)

In [15]:
# with open('model.pk', 'wb') as fout:
#     pickle.dump(model, fout)

In [16]:
[x for x in model.generate(10) if x != '</s>' and x != '<s>']

[]