In [1]:
import torchtext
import re
import numpy as np
from torchtext.data import get_tokenizer

In [2]:
def flatten(container):
    """Lazily yield each element of each iterable found in ``container``.

    Exactly one level of nesting is removed; inner elements are yielded
    unchanged and in their original order.
    """
    for inner in container:
        yield from inner

In [3]:
from typing import List, Callable
def get_raw_data(path_to_ka_data: str, max_chars: int = 10_000_000 * 10,
                 encoding: str = 'utf-8') -> List[str]:
    """Read the beginning of a text file and return it split into lines.

    Args:
        path_to_ka_data: Path of the text file to load.
        max_chars: Maximum number of characters read from the start of the
            file. The default keeps the previous hard-coded budget of
            10,000,000 words times 10 (presumably ~10 characters per word).
        encoding: Text encoding used to decode the file. Set explicitly so
            that non-ASCII text does not depend on the platform default.

    Returns:
        The lines of the text that was read, without line terminators. If the
        character budget ends in the middle of a line, the last entry is that
        partial line.
    """
    # The context manager closes the file even when read() raises.
    with open(path_to_ka_data, encoding=encoding) as f:
        data = f.read(max_chars)
    return data.splitlines()

In [4]:
# TO CHANGE DATA SOURCE CHANGE THIS
def get_data() -> List[str]:
    """Return the raw corpus lines from the currently selected data file.

    Takes no arguments; the data source is selected by editing the path
    below. The file is read each time this function is called.
    """
    return get_raw_data('./data/en_part_670.txt')

In [5]:
# TODO: should we take functions out of notebook?
from geotok import _basic_georgian_normalize

In [12]:
# TO CHANGE TOKENIZATION METHOD CHANGE THIS
# `tokenizer` is called with one string and returns a list of token strings.
# It is built by passing the project's `_basic_georgian_normalize` function
# (imported from geotok) to torchtext's `get_tokenizer`.
# NOTE(review): torchtext documents that a callable passed to `get_tokenizer`
# is returned as-is, so this is presumably just `_basic_georgian_normalize`
# itself -- confirm against the installed torchtext version.
tokenizer = get_tokenizer(_basic_georgian_normalize)

In [7]:
# Sanity check on text containing a dotted date and a dotted abbreviation.
# In the recorded output the dots inside the date come back as '/' and the
# abbreviation dots as ';', leaving '.' tokens only at sentence ends.
tokenizer('20.20.2020 მაგარი თარიღია. ძვ.წ. 2020 წელიც მაგარი იყო.')

['20/20/2020',
 'მაგარი',
 'თარიღია',
 '.',
 'ძვ',
 ';',
 'წ',
 ';',
 '2020',
 'წელიც',
 'მაგარი',
 'იყო',
 '.']

In [8]:
# Load the raw corpus: a list of text lines from the configured data source.
original_data = get_data()

In [9]:
# Tokenize every raw line; `data` holds one token list per input line.
data = [tokenizer(raw_line) for raw_line in original_data]

In [11]:
# Number of tokenized lines (one entry per line of the raw input).
len(data)

294365

Bad pipe message: %s [b'Mh\xbe\x82\x03\x0fb\xed\xc3P\xd23\x8fS\xc5\x02;% 3\x0f}d\xddIk$|\xafa\x1c\x9cU\xeend&,\xe6\xa3\x85t\x86C#\xcc\xbd+\x15\xfd\xea\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00']
Bad pipe message: %s [b'\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18']
Bad pipe message: %s [b'DB\xe0\xb8\xb4\xa8\xb2\xff\xa7\xbfq\xe7\x95\xc7\xa87\xc1\xc1\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2']
Bad pipe message: %s [b"\x08\x8b\x1e\xb5p\xbeN\xb5\xc6@r\xf1Q\x14\x02\xd5\xa3`\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x00

In [12]:
# Take in tokens, return sentences
def basic_sentence_splitter(tokens: List[str]) -> List[List[str]]:
    """Split a flat token list into sentences at every '.', '!' or '?'.

    Args:
        tokens: Tokens of a text, in order.

    Returns:
        A list of sentences, each a list of tokens. The terminating
        punctuation token stays at the end of its sentence. Tokens after the
        last terminator form a final sentence instead of being discarded.
    """
    sentences = []
    cur = []
    for tok in tokens:
        cur.append(tok)
        if tok in ('.', '!', '?'):
            sentences.append(cur)
            cur = []
    # Keep a trailing fragment that has no closing punctuation; dropping it
    # would silently lose input tokens.
    if cur:
        sentences.append(cur)
    return sentences

In [13]:
# Take in tokens, return sentences
def sentence_splitter_bigger_than_one(tokens: List[str]) -> List[List[str]]:
    """Split tokens into sentences, ending one only after two or more words.

    A '.', '!' or '?' closes the current sentence only when at least two
    other tokens precede it since the previous terminator. Otherwise the
    terminator is kept and the sentence continues. Tokens left over after
    the last split are returned as a final sentence.
    """
    terminators = ('.', '!', '?')
    result = []
    pending = []
    words_since_terminator = 0  # counts non-terminator tokens only
    for token in tokens:
        pending.append(token)
        if token not in terminators:
            words_since_terminator += 1
            continue
        # Terminator reached: split only if the stretch had more than one word.
        if words_since_terminator >= 2:
            result.append(pending)
            pending = []
        words_since_terminator = 0
    if pending:
        result.append(pending)
    return result

In [14]:
# CHANGE THIS TO CHANGE SENTENCE SPLITTER
# Alias used by the cells below; it is called with one token list and must
# return a list of token lists (one per sentence).
sentence_splitter = sentence_splitter_bigger_than_one

In [15]:
# Sanity check: tokenize a two-sentence example and split it. The recorded
# output shows two sentences, with the abbreviation kept inside the second.
sentence_splitter(tokenizer('20.20.2020 მაგარი თარიღია. ძვ.წ. 2020 წელიც მაგარი იყო.'))

[['20/20/2020', 'მაგარი', 'თარიღია', '.'],
 ['ძვ', ';', 'წ', ';', '2020', 'წელიც', 'მაგარი', 'იყო', '.']]

In [16]:
# Split every tokenized line into sentences and pool them into one flat list.
sentences = [
    sentence
    for line_tokens in data
    for sentence in sentence_splitter(line_tokens)
]

In [21]:
# Write the sentences to disk, one per line, with tokens joined by single
# spaces. Each sentence is cut to its first MAX_TOKENS_PER_SENTENCE tokens.
MAX_TOKENS_PER_SENTENCE = 40

# `with` closes (and flushes) the file even if a write fails; the explicit
# UTF-8 encoding keeps non-ASCII tokens from depending on the platform default.
with open('./data/nose.txt', 'w', encoding='utf-8') as f:
    for s in sentences:
        f.write(' '.join(s[:MAX_TOKENS_PER_SENTENCE]))
        f.write('\n')

In [22]:
# Total number of sentences produced across all input lines.
len(sentences)

1346603