## Import libraries

In [5]:
import json
import re
from joblib import dump
from itertools import chain
from sklearn_crfsuite import CRF
from utils.preprocessing import preprocessing
from sklearn.model_selection import train_test_split

## Implement CRF model

In [6]:
class CustomerInfoCRFModel:
    """Token-level sequence tagger built on ``sklearn_crfsuite.CRF``.

    The class turns a tokenized sentence into per-token feature dicts,
    fits the CRF on ``(words, labels)`` pairs, and predicts one label per
    token for raw text.
    """

    # Tokenizer for raw text: a run of word characters/apostrophes, or a
    # single one of the punctuation marks . , ! ? ;  Anything else
    # (whitespace, other symbols) is dropped.
    _TOKEN_PATTERN = re.compile(r"[\w']+|[.,!?;]")

    def __init__(self, *, algorithm='lbfgs', c1=0.1, c2=0.1,
                 max_iterations=100, all_possible_transitions=True):
        """Create an unfitted CRF.

        All arguments are keyword-only and are forwarded unchanged to
        ``sklearn_crfsuite.CRF``; the defaults reproduce the original
        hard-coded configuration, so ``CustomerInfoCRFModel()`` behaves as
        before.

        Args:
            algorithm: Training algorithm name passed to the CRF.
            c1: L1 regularization coefficient passed to the CRF.
            c2: L2 regularization coefficient passed to the CRF.
            max_iterations: Iteration cap passed to the CRF.
            all_possible_transitions: Forwarded to the CRF as-is.
        """
        self.model = CRF(
            algorithm=algorithm,
            c1=c1,
            c2=c2,
            max_iterations=max_iterations,
            all_possible_transitions=all_possible_transitions
        )
        # Tag names for this task. No method of this class reads the list.
        self.labels = ['Quantity', 'Pizza', 'Topping', 'Size', 'Crust', 'O']

    def word2features(self, sentence, i):
        """Build the feature dict for the token at position ``i``.

        Features cover the token itself (lowercased form, last three and
        last two characters, ``isupper``/``isdigit`` flags) plus the
        lowercased form and the same flags for the previous and next
        tokens. ``BOS``/``EOS`` mark the first/last token instead of the
        missing neighbour.

        Args:
            sentence: Sequence of token strings.
            i: Index of the token to describe.

        Returns:
            A dict mapping feature names to values.
        """
        word = sentence[i]
        features = {
            'bias': 1.0,
            'word.lower()': word.lower(),
            'word[-3:]': word[-3:],
            'word[-2:]': word[-2:],
            'word.isupper()': word.isupper(),
            'word.isdigit()': word.isdigit(),
        }

        if i > 0:
            previous = sentence[i - 1]
            features.update({
                '-1:word.lower()': previous.lower(),
                '-1:word.isupper()': previous.isupper(),
                '-1:word.isdigit()': previous.isdigit(),
            })
        else:
            features['BOS'] = True

        if i < len(sentence) - 1:
            following = sentence[i + 1]
            features.update({
                '+1:word.lower()': following.lower(),
                '+1:word.isupper()': following.isupper(),
                '+1:word.isdigit()': following.isdigit(),
            })
        else:
            features['EOS'] = True

        return features

    def sentence_features(self, words):
        """Return the list of feature dicts for every token in ``words``."""
        return [self.word2features(words, i) for i in range(len(words))]

    def sentence_labels(self, labels):
        """Return ``labels`` unchanged (kept as the label-side hook)."""
        return labels

    def load_data_from_json(self, file_path):
        """Load training samples from a UTF-8 JSON file.

        The file must contain a list of objects, each with a ``words`` and
        a ``label`` entry.

        Args:
            file_path: Path of the JSON file.

        Returns:
            A list of ``(words, label)`` tuples, one per object.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return [(item['words'], item['label']) for item in data]

    def train(self, data):
        """Fit the CRF on an iterable of ``(words, labels)`` pairs.

        Args:
            data: Iterable whose items expose the token list at index 0
                and the label list at index 1.
        """
        # Single pass so that one-shot iterables (e.g. generators) work;
        # iterating ``data`` twice would leave the label list empty.
        X, y = [], []
        for sample in data:
            X.append(self.sentence_features(sample[0]))
            y.append(self.sentence_labels(sample[1]))

        self.model.fit(X, y)

    def process_sentence(self, sentence):
        """Tokenize raw text and pair every token with a placeholder label.

        Args:
            sentence: Raw input string.

        Returns:
            A dict with ``"words"`` (the tokens) and ``"label"`` (a list of
            ``"O"`` of the same length).
        """
        tokens = self._TOKEN_PATTERN.findall(sentence)
        return {
            "words": tokens,
            "label": ["O"] * len(tokens),
        }

    def predict(self, text):
        """Tokenize ``text`` and predict one label per token.

        Args:
            text: Raw input string.

        Returns:
            A dict with ``"words"`` (the tokens) and ``"label"`` (the
            labels returned by the model for those tokens). When the text
            yields no tokens, both lists are empty and the model is not
            called.
        """
        words = self.process_sentence(text)["words"]
        if not words:
            # Nothing to tag; skip the model call for an empty sequence.
            return {
                "words": [],
                "label": [],
            }

        features = self.sentence_features(words)
        labels = self.model.predict([features])[0]

        return {
            "words": words,
            "label": labels,
        }

    def save_model(self, file_path):
        """Persist the underlying CRF object to ``file_path`` with ``dump``."""
        dump(self.model, file_path)

In [38]:
# Train the tagger on the labeled data set and persist the fitted CRF.
TRAIN_DATA_PATH = '../data/labeled/entity/order/case_3_train.json'
# NOTE: the file is written through save_model (joblib's dump), whatever the
# ".h5" suffix suggests.
MODEL_OUTPUT_PATH = "order_entity_case_3.h5"

model = CustomerInfoCRFModel()
data = model.load_data_from_json(TRAIN_DATA_PATH)
model.train(data)
model.save_model(MODEL_OUTPUT_PATH)