In [1]:
import pickle 
import pandas as pd
import spacy
import random
from os import chdir
from tqdm import tqdm
from nltk.corpus import wordnet as wn

# The data directory defaults to the original author's machine-specific path;
# set the RESTAURANT_CHATBOT_DATA environment variable to run the notebook on
# another machine without editing this cell.
from os import environ

chdir(environ.get("RESTAURANT_CHATBOT_DATA", r"C:\Users\chest\Desktop\Projects\Restaurant Chatbot\data"))

In [2]:
# NOTE: pickle.load can execute arbitrary code; only open a sem2014.pkl that
# comes from a trusted source.
with open("sem2014.pkl", "rb") as file:
    sem_dict = pickle.load(file)

# Split the records by whether they carry any aspect annotation.
# Only the records in `aspect` are used to update the NER model.
aspect, non_aspect = [], []
for record in sem_dict.values():
    bucket = aspect if len(record["aspect"]) > 0 else non_aspect
    bucket.append(record)

In [3]:
# Convert every annotated record into the (text, {"entities": [...]}) pairs
# consumed by the training loop.
TRAIN_DATA = []
for item in aspect:
    text = item["text"]

    # Build the label from this record's own categories only. The label is the
    # category names joined with "_" (e.g. "food_price"). Computing it fresh
    # for every record prevents a record with an empty category list from
    # silently inheriting the label of the previous record.
    category_names = [entry["category"] for entry in item["category"]]
    if not category_names:
        # No category means there is no label to attach to the spans.
        continue
    NER_LABEL = "_".join(category_names)

    # One (from, to, label) tuple per aspect term; the offsets are stored in
    # the source data under "from"/"to" and are cast to int here.
    # presumably these are character offsets into `text` — TODO confirm
    aspects_result = [
        (int(ner["from"]), int(ner["to"]), NER_LABEL) for ner in item["aspect"]
    ]
    TRAIN_DATA.append((text, {"entities": aspects_result}))


In [16]:
# function to train NER
def train_spacy(data,iterations):
    """Train a blank English spaCy pipeline whose only component is NER.

    NOTE(review): the calls used here (``create_pipe``, ``begin_training``,
    ``update`` with parallel text/annotation lists) match the spaCy v2
    training API — confirm against the installed spaCy version.

    Parameters
    ----------
    data : iterable
        Training samples as ``(text, annotations)`` pairs. ``annotations`` is
        a dict whose ``'entities'`` entry is a list of tuples carrying the
        entity label at index 2, e.g. ``(start, end, label)``.
    iterations : int
        Number of full passes over the training samples.

    Returns
    -------
    The ``nlp`` object after training.
    """
    # Work on a shallow copy: the per-iteration shuffle below must not reorder
    # the list owned by the caller.
    train_data = list(data)

    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        # Keep `ner` bound even when the pipeline already has the component.
        ner = nlp.get_pipe('ner')

    # add labels: register every label that occurs in the training samples
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in tqdm(range(iterations)):
            print("Statring iteration " + str(itn))
            random.shuffle(train_data)
            losses = {}  # filled in by nlp.update; reset every iteration
            for text, annotations in train_data:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [17]:
#Train the model
# Runs 20 passes over TRAIN_DATA. In the saved output below the 20 iterations
# took roughly 36 minutes in total, so re-running this cell is expensive.
prdnlp = train_spacy(TRAIN_DATA, 20) 
print("Done training")


  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

Statring iteration 0
{'ner': 5797.143104725427}


  5%|████▏                                                                              | 1/20 [01:12<22:52, 72.23s/it]

Statring iteration 1
{'ner': 4938.267881444179}


 10%|████████▎                                                                          | 2/20 [02:52<24:09, 80.55s/it]

Statring iteration 2
{'ner': 4523.334645488919}


 15%|████████████▍                                                                      | 3/20 [04:53<26:15, 92.69s/it]

Statring iteration 3
{'ner': 4445.153308670598}


 20%|████████████████▌                                                                  | 4/20 [06:49<26:34, 99.66s/it]

Statring iteration 4
{'ner': 4250.480587656978}


 25%|████████████████████▌                                                             | 5/20 [08:38<25:38, 102.56s/it]

Statring iteration 5
{'ner': 4239.560315974639}


 30%|████████████████████████▌                                                         | 6/20 [10:27<24:23, 104.51s/it]

Statring iteration 6
{'ner': 4070.705395425421}


 35%|████████████████████████████▋                                                     | 7/20 [12:16<22:55, 105.82s/it]

Statring iteration 7
{'ner': 3904.0341578231273}


 40%|████████████████████████████████▊                                                 | 8/20 [14:05<21:21, 106.78s/it]

Statring iteration 8
{'ner': 3730.5337340617216}


 45%|████████████████████████████████████▉                                             | 9/20 [15:54<19:41, 107.42s/it]

Statring iteration 9
{'ner': 3656.4170205765586}


 50%|████████████████████████████████████████▌                                        | 10/20 [17:43<17:59, 107.93s/it]

Statring iteration 10
{'ner': 3507.818124413876}


 55%|████████████████████████████████████████████▌                                    | 11/20 [19:32<16:13, 108.20s/it]

Statring iteration 11
{'ner': 3449.6012607359244}


 60%|████████████████████████████████████████████████▌                                | 12/20 [21:20<14:26, 108.33s/it]

Statring iteration 12
{'ner': 3303.093043976465}


 65%|████████████████████████████████████████████████████▋                            | 13/20 [23:09<12:39, 108.51s/it]

Statring iteration 13
{'ner': 3144.081947871398}


 70%|████████████████████████████████████████████████████████▋                        | 14/20 [24:58<10:50, 108.46s/it]

Statring iteration 14
{'ner': 3114.4620326091454}


 75%|████████████████████████████████████████████████████████████▊                    | 15/20 [26:46<09:02, 108.49s/it]

Statring iteration 15
{'ner': 3277.0223231915634}


 80%|████████████████████████████████████████████████████████████████▊                | 16/20 [28:34<07:13, 108.32s/it]

Statring iteration 16
{'ner': 3057.0406417960744}


 85%|████████████████████████████████████████████████████████████████████▊            | 17/20 [30:22<05:24, 108.13s/it]

Statring iteration 17
{'ner': 2861.493234400216}


 90%|████████████████████████████████████████████████████████████████████████▉        | 18/20 [32:11<03:36, 108.29s/it]

Statring iteration 18
{'ner': 2801.913867141313}


 95%|████████████████████████████████████████████████████████████████████████████▉    | 19/20 [34:00<01:48, 108.68s/it]

Statring iteration 19
{'ner': 2850.7691661938507}


100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [35:50<00:00, 108.93s/it]


Done training


In [18]:
# Save our trained Model
# The target directory name is typed in interactively, so this cell blocks
# until input is given. A later cell loads the model with
# spacy.load("spacy_nerV2"), so that is the name to enter here (as in the
# saved run below).
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)


Enter your Model Name: spacy_nerV2


In [22]:
# Smoke-test the freshly trained, in-memory model on a single sentence.
test_text = "The chicken rice is so bad."
doc = prdnlp(test_text)
# One line per recognised entity: text, start_char, end_char, label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

chicken rice 4 16 food


In [39]:
# Hand-written reference sample in the (text, {"entities": [(start, end, label)]})
# layout, printed next to the first real training annotation for comparison.
sample = ('what is the price of polo?', {'entities': [(21, 25, 'PrdName')]})
print(sample[1])

# NOTE(review): the saved output under this cell shows the offsets as strings
# ('14', '4', ...), whereas the cell that builds TRAIN_DATA casts them with
# int(). The output appears to come from an earlier run — re-run to refresh.
print(TRAIN_DATA[0][1])

{'entities': [(21, 25, 'PrdName')]}
{'entities': [('14', '4', 'food'), ('25', '19', 'food')]}


In [49]:
# Print the label (index 2 of each entity tuple) for every entity in the
# first two training samples.
for _, annotations in TRAIN_DATA[:2]:
    for ent in annotations.get('entities'):
        print(ent[2])

food
food_price
food_price
food_price


In [10]:
# Reload the model saved to disk and check it on a sentence that mentions two
# different aspects. The path is relative to the current working directory.
custom_ner = spacy.load("spacy_nerV2")
test_text = "The staff is so nice and the chicken rice so tasty."
doc = custom_ner(test_text)
# One line per recognised entity: text, start_char, end_char, label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

staff 4 9 service
chicken rice 29 41 food
