In [4]:
import torchtext
import re
import numpy as np
from torchtext.data import get_tokenizer

In [5]:
def flatten(container):
    """Lazily yield every item of every iterable in ``container``, one level deep."""
    for inner in container:
        yield from inner

In [6]:
from typing import List, Callable
def get_data(path_to_ka_data: str) -> List[str]:
    """Read the beginning of a text corpus and return it as a list of lines.

    Args:
        path_to_ka_data: Path of the text file to read.

    Returns:
        The lines of the portion that was read, without line terminators.
        At most ``N_DATA_WORDS * 10`` characters are read, so for a larger
        file the last returned line may be cut off mid-line.
    """
    N_DATA_WORDS = 10_000_000
    # The encoding is explicit so decoding does not depend on the platform's
    # locale default; the context manager closes the file even if read() raises.
    with open(path_to_ka_data, encoding="utf-8") as f:
        # read(n) counts characters, not words; the factor of 10 is presumably
        # a rough characters-per-word budget -- TODO confirm.
        data = f.read(N_DATA_WORDS * 10)
    return data.splitlines()

In [5]:
import re
# Regex fragments used by split_into_sentences. They are raw strings so that
# \w, \d and \s reach the regex engine unchanged instead of being parsed as
# (invalid) Python string escapes.
alphabets = r"(\w)"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"(\w[.]\w[.](?:\w[.])?)"
websites = r"[.](com|net|org|io|gov)"
digits = r"(\d)"  # no longer used by split_into_sentences; kept for any other cell that refers to it
geo_bc = r"ძვ[.]წ[.]"  # the Georgian abbreviation "ძვ.წ."

# <stop> marks an actual sentence boundary (the text is split on it)
# <prd> is a placeholder for a dot that does NOT end a sentence; it is turned
# back into "." just before splitting
def split_into_sentences(text):
    """Split ``text`` into sentences with a rule-based heuristic.

    Dots that belong to titles, web domains, numbers, abbreviations and
    initials are first rewritten to ``<prd>``. Every remaining ".", "?" and
    "!" is then followed by ``<stop>``, the placeholders are restored, and the
    text is split on ``<stop>``.

    Args:
        text: The string to split. Newlines are treated as spaces.

    Returns:
        A list of sentences with surrounding whitespace stripped. Text after
        the last terminator is returned as a final element; an input with no
        non-whitespace content yields an empty list.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, r"\1<prd>", text)  # titles such as "Dr."
    text = re.sub(websites, r"<prd>\1", text)  # ".com", ".org", ...
    # Protect every dot that sits between two digits (decimals, dates,
    # version-like numbers). Lookarounds do not consume the digits, so
    # chains such as "1.2.3" are handled in full.
    text = re.sub(r"(?<=\d)[.](?=\d)", "<prd>", text)
    text = re.sub(geo_bc, "ძვ<prd>წ<prd>", text)
    text = text.replace("...", "<prd><prd><prd>")
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # A single character followed by ". " (an initial) does not end a sentence.
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    # Dotted abbreviations of three, then two, single characters.
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Company/name suffixes: a boundary only when a sentence starter follows.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move a terminator that sits inside a closing quote to just after the
    # quote, so the quote stays attached to its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # The last chunk is whatever follows the final terminator. Drop it only
    # when it is empty, so trailing text without a terminator is not lost.
    if sentences and not sentences[-1]:
        sentences.pop()
    return sentences

In [6]:
# Dots inside the number "20.20.2020" and inside "ძვ.წ." must not split the text;
# the only boundaries are after "2020." and after the final dot.
assert split_into_sentences('20.20.2020.  დღეს არის 2020 წელი ძვ.წ. ჰმჰმ.') == ['20.20.2020.', 'დღეს არის 2020 წელი ძვ.წ. ჰმჰმ.']

In [7]:
# A dotted three-character abbreviation ("შ.პ.ს.") must stay inside its sentence.
assert split_into_sentences('შ.პ.ს. მაგარი რამეა.') == ['შ.პ.ს. მაგარი რამეა.']

In [8]:
# Text with a single final terminator comes back as one sentence.
split_into_sentences('hi this is so cool.')

['hi this is so cool.']

In [56]:
#original_data = get_data('./data/ka.txt')

In [9]:
# Sentence-split each line of the input file and write one sentence per line,
# followed by a blank line after every input line. Both files are opened with
# an explicit UTF-8 encoding so the Georgian text is read and written the same
# way regardless of the platform's locale default.
with open('./data/უცხოელი_მწერლები_234.txt', 'r', encoding='utf-8') as input_file, \
        open('./data/ka_nse_test.txt', 'w', encoding='utf-8') as f:
    for doc in input_file:
        for s in split_into_sentences(doc):
            f.write(s)
            f.write('\n')
        f.write('\n')