In [448]:
# Imports
import os
import re

from resources.logger import log

# usage of counters : https://docs.python.org/2/library/collections.html
from ftfy import fix_encoding


In [449]:
# Init stuff
# emit a warning to the puny Humans
log.info('Welcome to the Georgian NLP toolset demo')

2021-07-11 20:56:46,601	INFO -- MainProcess <ipython-input-449-8d1b7e0d6f71>:3 -- Welcome to the Georgian NLP toolset demo


In [450]:
# functions
def file_path(file_name):
    """generates abs path relative to the Package and modules"""
    data_dir = os.path.abspath('')
    f_path = os.path.join(data_dir, file_name)
    # check if the path exists
    if os.path.exists(f_path):
        return f_path
    else:
        raise FileNotFoundError

def is_not_printable(word, letters_only = True):
    """
    Checks if the string contains the printable symbols
    :param word:
    :param letters_only:
    :return True or False:
    """

    for char in word:
        if char not in letters:
            return True


def sizeof_fmt(file_size, suffix='B'):
    """
    Returns the Human Readable file volume unit from num
    :param suffix: default
    :param file_size: file size in integer
    :return:
    ref: https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(file_size) < 1024.0:
            return "%3.1f%s%s" % (file_size, unit, suffix)
        file_size /= 1024.0
    return "%.1f%s%s" % (file_size, 'Yi', suffix)

In [451]:
numbers = set("0123456789")
symbols = set("!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c")
letters = set('ქწერტყუიოპჭღთასდფგჰჯკლშჟ₾ზხცვბნმძჩ')
printable = set().union(*[numbers, symbols, letters])

In [452]:
class FileToProcess():
    def __init__(self, file_name=None, stop_words="data/stops.txt"):

        if not file_name:
            raise ValueError("Init FileToProcess class with path to the file as file_name argument ")

        self.file_name = file_name
        self.__path = file_path(file_name)
        self.__status = os.stat(self.__path)
        self.__file_object = None

        self.stop_words = set(line.strip() for line in open(file_path(stop_words), encoding='utf-8'))
        self.sequence = []
        self.file_size = sizeof_fmt(self.__status.st_size)

    def load_file(self, max_num_of_lines = -1):
        with open(self.__path, mode='r', encoding='utf-8') as text_file:
            for n, line in enumerate(text_file):

                if n == max_num_of_lines:
                    break

                try:
                    fix_encoding(line)
                    line = line.strip('\n')
                    self.sequence.append(line)

                except Exception as error:
                    print(f"During handling the sentence \"{line}\", following error has occured: {error}")
        # Finally:
        log.info(f'Number of lines in sequence: {len(self.sequence)}')

    def __preprocess_text(self, text, min_sentence_size=3, min_word_size=2, max_word_size=25):
        """preprocess text for NLP tasks"""
        tokens = text.split()
        if len(tokens) <= min_sentence_size:
            return None
        # Removing prefixed 'b'
        text = re.sub(r'^b\s+', '', text)
        # Url Removal
        text = re.sub(r'http\S+', '',text)
        # Number Removal
        text = re.sub(r"$\d+\W+|\b\d+\b|\W+\d+$", '', text)
        # mail removal
        text = re.sub(r'\S*@\S*\s?', '', text)
        # Remove all the special characters
        text = re.sub(r'\W', ' ', text)
        # remove all single characters
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
        # Remove single characters from the start
        text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
        # Substituting multiple spaces with single space
        text = re.sub(r'\s+', ' ', text, flags=re.I)
        # Converting to Lowercase
        text = text.lower()
        # Lemmatization - missing
        tokens = text.split()

        index = len(tokens) - 1
        while index >= 0:
            word = tokens[index]
            if is_not_printable(word) or word in self.stop_words or len(word) < min_word_size or len(word) > max_word_size:
                tokens.pop(index)
            index -= 1
        if len(tokens) < min_sentence_size:
            return None

        preprocessed_text = ' '.join(tokens)

        return preprocessed_text

    def pre_process(self):
        index = len(self.sequence) - 1
        log.info(f'Number of lines before pre-pro: {index}')
        while index >= 0:
            sentence = self.sequence[index]
            if pre_processed_sentence := self.__preprocess_text(sentence):
                # update sentence with preprocessed version
                # log.info(f"replaceing: {self.sequence[index]} > {pre_processed_sentence}")
                self.sequence[index] = pre_processed_sentence
            else:
                # pop sentence from sequence
                #log.info(f"popping: {self.sequence[index]} ")
                self.sequence.pop(index)
            index -= 1
        log.info(f'Number of lines after pre-pro: {len(self.sequence)}')

    def save_to_file(self, filename=None):
        if not filename:
            file_name = f"{self.file_name.split('.')[0]}_pre_processed.txt"
            with open(file_name, 'w', encoding="utf-8") as output:
                for sentence in self.sequence:
                    output.write(str(sentence) + '\n')

    def __len__(self):
        return len(self.sequence)

    def __repr__(self):
        return f"Obj of file at: {self.__path}"

In [453]:
# load the file
text_file = FileToProcess('data/corpuses/kawiki-latest-pages-articles_preprocessed.txt')

In [454]:
text_file.load_file(max_num_of_lines=50)
len(text_file)

2021-07-11 20:56:46,682	INFO -- MainProcess <ipython-input-452-cc3fc16935ac>:31 -- Number of lines in sequence: 50


50

In [455]:
# text_file.sequence.append("რედაქციის ელ-ფოსტა: resonancenewspaper@yahoo.com")
# text_file.sequence.append("მე ვარ ცა იმ ქალაქების თავზე")
# print(text_file.sequence)

In [456]:
text_file.pre_process()
# print(text_file.sequence)

2021-07-11 20:56:46,711	INFO -- MainProcess <ipython-input-452-cc3fc16935ac>:74 -- Number of lines before pre-pro: 49
2021-07-11 20:56:46,714	INFO -- MainProcess <ipython-input-452-cc3fc16935ac>:86 -- Number of lines after pre-pro: 49


In [457]:
text_file.save_to_file()

In [458]:
len("იმ")

2