In [50]:
import json
import re
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from joblib import dump
from itertools import chain
import unicodedata

In [55]:
class PizzaOrderCRFModel:
    """CRF sequence tagger that extracts pizza-order entities from tokenised text.

    The class wraps an ``sklearn_crfsuite.CRF`` estimator and adds feature
    extraction, JSON data loading, evaluation, prediction helpers and a step
    that folds B-/I- tagged tokens into ``{entity_type: [phrases]}``.
    """

    # Labels reported by ``evaluate`` when the caller does not supply its own list.
    DEFAULT_LABELS = (
        'B-Quantity', 'B-Pizza', 'I-Pizza', 'B-Topping', 'B-Size',
        'I-Size', 'O', 'B-Crust', 'I-Crust',
    )

    def __init__(self):
        """Create the underlying CRF estimator with fixed hyper-parameters."""
        self.model = CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )

    def word2features(self, sentence, i):
        """Build the feature dict for the token at position ``i`` of ``sentence``.

        Features cover the token itself (lower-cased form, last three and two
        characters, upper-case and digit flags) plus the lower-cased form and
        the same flags for the previous and next tokens. ``BOS`` / ``EOS``
        mark the first / last token of the sentence.

        Args:
            sentence: sequence of token strings.
            i: index of the token to describe.

        Returns:
            dict mapping feature name to value.
        """
        word = sentence[i]
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.isdigit()': word.isdigit(),
        }

        if i > 0:
            previous_word = sentence[i - 1]
            features.update({
                '-1:word.lower()': previous_word.lower(),
                '-1:word.isupper()': previous_word.isupper(),
                '-1:word.isdigit()': previous_word.isdigit(),
            })
        else:
            features['BOS'] = True

        if i < len(sentence) - 1:
            next_word = sentence[i + 1]
            features.update({
                '+1:word.lower()': next_word.lower(),
                '+1:word.isupper()': next_word.isupper(),
                '+1:word.isdigit()': next_word.isdigit(),
            })
        else:
            features['EOS'] = True

        return features

    def sentence_features(self, words):
        """Return one feature dict per token of ``words``."""
        return [self.word2features(words, i) for i in range(len(words))]

    def sentence_labels(self, labels):
        """Return the label sequence unchanged (kept as a hook next to ``sentence_features``)."""
        return labels

    def load_data_from_json(self, file_path):
        """Load labelled sentences from a JSON file.

        The file must decode to an iterable of objects that each have a
        ``"words"`` and a ``"label"`` entry.

        Returns:
            list of ``(words, label)`` tuples, one per item.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return [(item['words'], item['label']) for item in data]

    def train(self, data):
        """Fit the CRF on ``data``, a sequence of ``(words, labels)`` pairs."""
        X = [self.sentence_features(s[0]) for s in data]
        y = [self.sentence_labels(s[1]) for s in data]

        self.model.fit(X, y)

    def evaluate(self, data, labels=None):
        """Print a token-level classification report for ``data``.

        Args:
            data: sequence of ``(words, labels)`` pairs.
            labels: labels to include in the report; defaults to
                ``DEFAULT_LABELS`` so existing callers see the same report.
        """
        if labels is None:
            labels = list(self.DEFAULT_LABELS)

        X_test = [self.sentence_features(s[0]) for s in data]
        y_test = [self.sentence_labels(s[1]) for s in data]
        y_pred = self.model.predict(X_test)

        # The report works on flat token lists, not on per-sentence lists.
        y_test_flat = list(chain.from_iterable(y_test))
        y_pred_flat = list(chain.from_iterable(y_pred))

        print(classification_report(y_test_flat, y_pred_flat, labels=labels, target_names=labels))

    def process_sentence(self, sentence):
        """Split raw text into tokens and pair them with placeholder ``"O"`` labels.

        A token is either a run of word characters / apostrophes or one of the
        punctuation marks ``. , ! ? ;``.
        """
        tokens = re.findall(r"[\w']+|[.,!?;]", sentence)
        return {
            "words": tokens,
            "label": ["O"] * len(tokens),
        }

    def predict(self, text):
        """Tag ``text`` and return the entities grouped by type.

        Returns:
            dict mapping entity type to the list of recognised phrases; an
            empty dict when ``text`` yields no tokens.
        """
        words = self.process_sentence(text)["words"]
        if not words:
            # Nothing to tag; avoid handing an empty sequence to the model.
            return {}

        labels = self.model.predict([self.sentence_features(words)])[0]
        return self.aggregate_entities({"words": words, "label": labels})

    def predict_with_confidence(self, text):
        """Tag ``text`` and return ``(word, label, marginal)`` for every token.

        For each token the label with the highest marginal probability is
        reported together with that probability. Returns an empty list when
        ``text`` yields no tokens.
        """
        words = self.process_sentence(text)["words"]
        if not words:
            return []

        marginals = self.model.predict_marginals([self.sentence_features(words)])[0]

        result = []
        for word, label_probabilities in zip(words, marginals):
            best_label, best_confidence = max(label_probabilities.items(), key=lambda item: item[1])
            result.append((word, best_label, best_confidence))

        return result

    @staticmethod
    def _store_entity(aggregated_entities, entity_type, entity_words):
        """Append the phrase made of ``entity_words`` under ``entity_type``; no-op when nothing is open."""
        if entity_words and entity_type is not None:
            aggregated_entities.setdefault(entity_type, []).append(" ".join(entity_words))

    def aggregate_entities(self, predicted_result):
        """Fold B-/I- tagged tokens into phrases grouped by entity type.

        Args:
            predicted_result: dict with parallel ``"words"`` and ``"label"``
                sequences.

        Returns:
            dict mapping entity type (the label without its ``B-``/``I-``
            prefix) to the list of phrases, tokens joined by single spaces.

        An ``I-X`` tag that does not continue an open entity of type ``X``
        starts a new ``X`` entity, so it is filed under ``"X"`` like every
        other entity rather than under the raw ``"I-X"`` label. Labels that
        are neither ``O`` nor ``B-``/``I-`` prefixed are stored under the
        label itself.
        """
        aggregated_entities = {}
        current_entity = None
        current_label = None

        for word, label in zip(predicted_result["words"], predicted_result["label"]):
            is_begin = label.startswith("B-")
            is_inside = label.startswith("I-")

            if is_inside and current_entity is not None and label[2:] == current_label:
                current_entity.append(word)
                continue

            # Every other label ends whatever entity is currently open.
            self._store_entity(aggregated_entities, current_label, current_entity)
            current_entity = None
            current_label = None

            if is_begin or is_inside:
                current_entity = [word]
                current_label = label[2:]
            elif label != "O":
                aggregated_entities.setdefault(label, []).append(word)

        self._store_entity(aggregated_entities, current_label, current_entity)
        return aggregated_entities

    def save_model(self, file_path):
        """Serialise the fitted CRF to ``file_path`` with ``joblib.dump``."""
        dump(self.model, file_path)

# Train the entity tagger on the labelled order data, report its quality, then persist it.
model = PizzaOrderCRFModel()
data = model.load_data_from_json('../data/labeled/entity/order/data_20240331.json')
# Hold out 20% of the sentences for evaluation; the fixed random_state keeps the split repeatable.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
model.train(train_data)
model.evaluate(test_data)
# NOTE(review): save_model writes with joblib.dump, so the ".h5" suffix is only a file name,
# not an indication of the on-disk format — confirm what downstream loaders expect.
model.save_model("homie.h5")

              precision    recall  f1-score   support

  B-Quantity       1.00      1.00      1.00       107
     B-Pizza       0.98      0.99      0.99       102
     I-Pizza       0.95      0.98      0.97        85
   B-Topping       0.99      0.97      0.98       157
      B-Size       1.00      0.99      0.99        90
      I-Size       0.99      1.00      0.99        86
           O       0.99      1.00      0.99       298
     B-Crust       0.99      0.96      0.97        73
     I-Crust       0.97      0.99      0.98        71

   micro avg       0.99      0.99      0.99      1069
   macro avg       0.98      0.99      0.98      1069
weighted avg       0.99      0.99      0.99      1069



In [167]:
def read_tokenize_dictionary(dictionary_path="utils/tokenize_dictionary.json"):
    """Load the tokenisation dictionary stored as UTF-8 JSON at ``dictionary_path``.

    Returns:
        The object the JSON document decodes to.
    """
    with open(dictionary_path, mode='r', encoding="utf-8") as dictionary_file:
        return json.load(dictionary_file)

def read_stop_word_dictionary(dictionary_path="utils/vietnamese-stopwords.txt"):
    """Load stopwords from a UTF-8 text file that lists one stopword per line.

    Surrounding whitespace is trimmed and blank lines are dropped, so a
    trailing newline in the file no longer produces an empty-string stopword
    (which would otherwise become an empty regex alternative downstream).

    Returns:
        set of non-empty stopword strings.
    """
    with open(dictionary_path, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return {stopword for stopword in stripped_lines if stopword}

def lowercase_text(text: str):
    """Return ``text`` with every cased character converted to lower case."""
    lowered = text.lower()
    return lowered

def remove_diacritic(text: str):
    """Strip diacritics from ``text``.

    The text is NFKD-normalised and every combining character is dropped.
    The letters "đ" and "Đ" are not split into a base letter plus a combining
    mark by NFKD, so they are mapped to "d" and "D" explicitly.

    Returns:
        The text without combining marks and with đ/Đ replaced.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_characters = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return base_characters.replace("đ", "d").replace("Đ", "D")

def tokenize(text: str, tokenize_dictionary: dict):
    """Replace every dictionary phrase found in ``text`` with its token.

    Phrases are tried longest first so a long phrase wins over a shorter one
    it contains. Matching is case-insensitive and anchored on word boundaries.

    Args:
        text: input text.
        tokenize_dictionary: maps a phrase to the token that replaces it.

    Returns:
        ``text`` with all matches substituted.
    """
    longest_first = sorted(tokenize_dictionary.items(), key=lambda item: len(item[0]), reverse=True)
    for original, token in longest_first:
        if not original:
            # An empty phrase would compile to r'\b\b' and match at every word boundary.
            continue
        pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.IGNORECASE)
        # A callable replacement inserts the token verbatim; a plain string would be
        # parsed as a template, so backslashes in the token would raise or be rewritten.
        text = pattern.sub(lambda _match, replacement=token: replacement, text)
    return text

def combined_tokenize(text: str, tokenize_dictionary: dict):
    """Tokenise ``text`` twice: once as written, once with diacritics removed.

    The first pass merges dictionary phrases that appear as written. Words
    left unmerged (no underscore) are stripped of diacritics and tokenised
    again, so phrases typed with missing or wrong diacritics can still match.
    Each underscore token from the second pass is mapped back onto the run of
    original words it came from, and that run is replaced by the token.

    A second-pass token is applied only when its diacritic-free parts are
    found, in order, as a run of unmerged words that are also adjacent in the
    original token list; otherwise the original words are left untouched.

    Args:
        text: input text.
        tokenize_dictionary: phrase -> token mapping handed to ``tokenize``.

    Returns:
        The tokens joined by single spaces.
    """
    tokens = tokenize(text, tokenize_dictionary).split()

    # Positions and diacritic-free forms of the words the first pass left unmerged.
    plain_positions = []
    plain_words = []
    for index, token in enumerate(tokens):
        if "_" in token:
            continue
        plain_word = remove_diacritic(token)
        if plain_word:
            plain_positions.append(index)
            plain_words.append(plain_word)

    retokenized = tokenize(" ".join(plain_words), tokenize_dictionary).split()

    # Second-pass tokens appear in text order, so each search resumes after the
    # previous match; this keeps repeated words from resolving to the wrong place.
    replacements = []
    cursor = 0
    for candidate in retokenized:
        if "_" not in candidate:
            continue
        parts = [remove_diacritic(part) for part in candidate.split("_")]
        width = len(parts)
        match_at = next(
            (
                position
                for position in range(cursor, len(plain_words) - width + 1)
                if plain_words[position:position + width] == parts
            ),
            None,
        )
        if match_at is None:
            # The token's parts do not correspond to the source words; leave them as they are.
            continue
        cursor = match_at + width

        first_index = plain_positions[match_at]
        last_index = plain_positions[match_at + width - 1]
        # Only merge when no first-pass token sits between the matched words.
        if last_index - first_index == width - 1:
            replacements.append((first_index, last_index + 1, candidate))

    # Apply right to left so earlier indices stay valid while the list shrinks.
    for start, end, merged_token in reversed(replacements):
        tokens[start:end] = [merged_token]

    return " ".join(tokens)

def remove_stopwords(text: str, stopwords_dictionary: set):
    """Remove stopwords from ``text`` and normalise whitespace.

    A stopword is removed when it starts at a word boundary and is followed
    by a non-word character or the end of the text; that following character
    is removed with it. Longer stopwords are tried first. Empty or
    whitespace-only entries are ignored, and with no usable stopwords the
    text is only whitespace-normalised. Without that guard the pattern would
    contain an empty alternative and delete the character after every word.

    Returns:
        The text stripped, with runs of whitespace collapsed to one space.
    """
    usable_stopwords = [stopword for stopword in stopwords_dictionary if stopword.strip()]
    if usable_stopwords:
        longest_first = sorted(usable_stopwords, key=len, reverse=True)
        stopwords_regex = '|'.join(re.escape(stopword) for stopword in longest_first)
        text = re.sub(r'\b(?:' + stopwords_regex + r')(?:\W|$)', ' ', text)
    text = text.strip()
    return re.sub(r'\s+', ' ', text)

# Lazily loaded default resources for ner_preprocessing, keyed by resource name.
_NER_DEFAULT_RESOURCES = {}


def _default_ner_resource(name, loader):
    """Return the cached default resource ``name``, calling ``loader`` on first use."""
    if name not in _NER_DEFAULT_RESOURCES:
        _NER_DEFAULT_RESOURCES[name] = loader()
    return _NER_DEFAULT_RESOURCES[name]


def ner_preprocessing(text: str, tokenize_dictionary: dict = None, stopwords_dictionary: set = None):
    """Prepare raw text for the entity tagger.

    Steps, in order: lower-case the text, merge dictionary phrases into
    underscore tokens, remove stopwords.

    Args:
        text: raw input text.
        tokenize_dictionary: phrase -> token mapping; when omitted it is read
            with ``read_tokenize_dictionary()`` on first use and cached.
        stopwords_dictionary: set of stopwords; when omitted it is read with
            ``read_stop_word_dictionary()`` on first use and cached.

    Returns:
        The preprocessed text.

    The defaults are resolved at call time rather than in the signature, so
    defining this function does not read any file and callers do not share a
    mutable default argument.
    """
    if tokenize_dictionary is None:
        tokenize_dictionary = _default_ner_resource("tokenize_dictionary", read_tokenize_dictionary)
    if stopwords_dictionary is None:
        stopwords_dictionary = _default_ner_resource("stopwords_dictionary", read_stop_word_dictionary)

    text = lowercase_text(text)
    text = combined_tokenize(text, tokenize_dictionary)
    text = remove_stopwords(text, stopwords_dictionary)
    return text

In [168]:
# Sample order used to exercise the full pipeline: preprocessing followed by tagging.
text = "đặt 2 cái pizza pepperoni, một cái L, thịt hun khói, nấm rơm ớt xanh và một cái size XL thêm bả chì bố nướng"
# Show, for every token, the most probable label and its marginal probability.
model.predict_with_confidence(ner_preprocessing(text))

[('2', 'B-Quantity', 0.9370998094622134),
 ('cái_pizza', 'B-Pizza', 0.8716720002742258),
 ('pepperoni', 'I-Pizza', 0.7060395886143229),
 (',', 'O', 0.9989631037075085),
 ('một', 'B-Quantity', 0.9985210497087567),
 ('cái', 'O', 0.9820262970032756),
 ('l', 'B-Size', 0.4532072267317428),
 (',', 'O', 0.9951493906272788),
 ('thịt_hun_khói', 'B-Topping', 0.9474141000952374),
 (',', 'O', 0.9959466172695189),
 ('nấm_rơm', 'B-Topping', 0.9941752417474755),
 ('ớt_xanh', 'B-Topping', 0.9947987666980466),
 ('một', 'B-Quantity', 0.9935378491490322),
 ('cái', 'O', 0.9959443061005397),
 ('size', 'B-Size', 0.9785916245302014),
 ('xl', 'I-Size', 0.9969574829506358),
 ('ba_chỉ_bò_nướng', 'B-Topping', 0.7799210896855255)]

In [141]:
# Group the predicted tags of the sample order into entities by type.
# NOTE(review): this cell's execution count (141) is lower than that of the cells before it,
# and its saved output lists 'size m', which does not occur in `text` as defined in this
# notebook — the output looks stale; re-run after the cells above.
model.predict(ner_preprocessing(text))

{'Quantity': ['2', 'một', 'một'],
 'Pizza': ['cái_pizza pepperoni'],
 'Size': ['size l', 'size m'],
 'Topping': ['thịt_hun_khói']}