In [7]:
import os
import re
import json
import py_vncorenlp
import unicodedata

In [8]:
# Current working directory with every backslash turned into a forward slash.
current_folder = "/".join(os.getcwd().split("\\"))

In [10]:
def read_tokenize_dictionary(dictionary_path="utils/tokenize_dictionary.json"):
    """Load the tokenization dictionary from a UTF-8 encoded JSON file.

    Args:
        dictionary_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    with open(dictionary_path, 'r', encoding="utf-8") as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

def read_stop_word_dictionary(dictionary_path="utils/vietnamese-stopwords.txt"):
    """Load the stop-word list from a UTF-8 text file, one entry per line.

    Surrounding whitespace is stripped from every entry and blank lines are
    dropped, so the returned set never contains an empty string (which would
    otherwise come from a trailing newline at the end of the file).

    Args:
        dictionary_path: Path of the text file to read.

    Returns:
        A set of non-empty stop-word strings.
    """
    with open(dictionary_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file.read().split("\n"))
        return {stopword for stopword in stripped_lines if stopword}

def lowercase_text(text: str):
    """Return a lowercased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_diacritic(text: str):
    """Strip diacritical marks from ``text``.

    The text is decomposed with NFKD normalization and every combining
    character is dropped. The Vietnamese letter d-with-stroke is then mapped
    to a plain "d"/"D" in both cases.

    Args:
        text: Input string.

    Returns:
        The string with combining marks removed and "đ"/"Đ" replaced.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # "đ" and "Đ" are standalone letters without a Unicode decomposition, so
    # normalization leaves them untouched and they must be mapped by hand.
    return base_chars.replace("đ", "d").replace("Đ", "D")

# def tokenize(text: str, tokenize_dictionary: dict):
#     tokenized_sentence = text
#     for original, token in tokenize_dictionary.items():
#         pattern = re.compile(re.escape(original), re.IGNORECASE)
#         tokenized_sentence = pattern.sub(token, tokenized_sentence)
#     return tokenized_sentence

def tokenize(text: str, tokenize_dictionary: dict):
    """Replace every dictionary phrase found in ``text`` with its token.

    Phrases are matched case-insensitively on word boundaries. Longer phrases
    are substituted before shorter ones so that a short key cannot break up a
    longer phrase that contains it.

    Args:
        text: Input string.
        tokenize_dictionary: Mapping of phrase -> replacement token.

    Returns:
        The text with all matching phrases replaced.
    """
    sorted_items = sorted(tokenize_dictionary.items(), key=lambda x: len(x[0]), reverse=True)
    for original, token in sorted_items:
        if not original:
            # An empty key would compile to r'\b\b' and insert the token at
            # every word boundary of the text.
            continue
        pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.IGNORECASE)
        # Use a function as replacement so the token is inserted literally;
        # a plain string would be parsed as a template and a backslash in the
        # token would be treated as an escape or group reference.
        text = pattern.sub(lambda _match, token=token: token, text)
    return text

def combined_tokenize(text: str, tokenize_dictionary: dict):
    """Tokenize ``text`` with the dictionary, then retry on accent-free forms.

    First the dictionary phrases are replaced via ``tokenize``. Afterwards the
    result is split on whitespace and every run of words whose accent-stripped,
    lowercased form equals the accent-stripped form of a dictionary key is
    replaced by that key's token. Runs are matched greedily, longest first, so
    multi-word keys are honoured as well as single-word ones.

    Only keys that change when their diacritics are removed take part in the
    second pass. Whitespace in the result is collapsed to single spaces.

    NOTE(review): as in the per-word lookup this replaces, a word carrying
    *different* diacritics than the key also matches, because only the
    stripped forms are compared - confirm this is intended.

    Args:
        text: Input string.
        tokenize_dictionary: Mapping of phrase -> replacement token.

    Returns:
        The tokenized text joined with single spaces.
    """
    tokenized_original = tokenize(text, tokenize_dictionary)

    # Accent-free word sequence -> token. Keys are tuples of words so that
    # multi-word phrases can be compared against the whitespace-split text.
    replace_map = {}
    for key, token in tokenize_dictionary.items():
        no_accents_key = remove_diacritic(key)
        if no_accents_key != key:
            key_words = tuple(no_accents_key.lower().split())
            if key_words:
                replace_map[key_words] = token

    words = tokenized_original.split()
    if not replace_map:
        return ' '.join(words)

    normalized_words = [remove_diacritic(word).lower() for word in words]
    longest_key = max(len(key_words) for key_words in replace_map)

    output = []
    index = 0
    while index < len(words):
        # Try the longest possible phrase starting here, then shorter ones.
        for size in range(min(longest_key, len(words) - index), 0, -1):
            candidate = tuple(normalized_words[index:index + size])
            if candidate in replace_map:
                output.append(replace_map[candidate])
                index += size
                break
        else:
            # No dictionary phrase starts at this word: keep it unchanged.
            output.append(words[index])
            index += 1

    return ' '.join(output)

# def remove_stopwords(text: str, stopwords_dictionary: set):
#     for stopword in stopwords_dictionary:
#         text = re.sub(r'\b' + re.escape(stopword) + r'\b', '', text)
#     text = re.sub(r'[^\w\s]', '', text)
#     return re.sub(r'\s+', ' ', text).strip()

def remove_stopwords(text: str, stopwords_dictionary: set):
    """Remove stop words from ``text`` and normalize its whitespace.

    Each stop word is matched at a word boundary and must be followed by a
    non-word character or the end of the text; the stop word together with
    that following character is replaced by a single space. Longer stop words
    are tried first so multi-word entries win over their prefixes.

    Blank entries are ignored. When no usable stop word remains, only the
    whitespace normalization is applied.

    Args:
        text: Input string.
        stopwords_dictionary: Collection of stop-word strings.

    Returns:
        The text without stop words, stripped and with runs of whitespace
        collapsed to a single space.
    """
    # A blank entry would become an empty regex alternative that matches at
    # every word boundary and swallows the character after every word.
    stopwords = sorted(
        (stopword for stopword in stopwords_dictionary if stopword.strip()),
        key=len,
        reverse=True,
    )
    if stopwords:
        stopwords_regex = '|'.join(re.escape(stopword) for stopword in stopwords)
        text = re.sub(r'\b(?:' + stopwords_regex + r')(?:\W|$)', ' ', text)
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text

def ner_preprocessing(text: str, tokenize_dictionary: dict, stopwords_dictionary: set):
    """Run the full preprocessing chain on one piece of text.

    The steps are applied in this order: lowercasing, dictionary-based
    tokenization (``combined_tokenize``) and stop-word removal.

    Args:
        text: Raw input string.
        tokenize_dictionary: Mapping passed to ``combined_tokenize``.
        stopwords_dictionary: Stop words passed to ``remove_stopwords``.

    Returns:
        The preprocessed string.
    """
    lowered = lowercase_text(text)
    tokenized = combined_tokenize(lowered, tokenize_dictionary)
    return remove_stopwords(tokenized, stopwords_dictionary)

In [12]:
tokenize_dictionary = read_tokenize_dictionary()
stopwords_dictionary = read_stop_word_dictionary()

INPUT_PATH = "../data/raw/entity/order/processed_ideal_order.txt"
OUTPUT_PATH = "../data/processed/entity/order/processed_ideal_order.txt"

# Create the destination folder up front; open(..., 'w') does not create
# missing parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

# Preprocess the input line by line and write one output line per input line.
with open(INPUT_PATH, 'r', encoding='utf-8') as input_file, \
        open(OUTPUT_PATH, 'w', encoding='utf-8') as output_file:
    for line in input_file:
        processed_line = ner_preprocessing(line.strip(), tokenize_dictionary, stopwords_dictionary)
        output_file.write(processed_line + '\n')