# Train Customer NER model in sapcy 

In [None]:
# Load a spacy model and chekc if it has ner
import spacy
# nlp=spacy.load('en_core_web_sm')
nlp=spacy.load('xx_ent_wiki_sm')

nlp.pipe_names


In [None]:
# Getting the pipeline component
ner=nlp.get_pipe("ner")

In [None]:
article_text='AUTOMATIC DRAWING, 2018004228000489 AMERICAN EXPRESS SOL PRODUCTS PTY and INTER-BANK CREDIT, TonicLane0722 TONIC LANE PTY L SOL Products Pty'
doc=nlp(article_text)
for ent in doc.ents:
    print(ent.text,ent.label_)

In [None]:
# training data
TRAIN_DATA = [
              ("Walmart is a leading e-commerce company", {"entities": [(0, 7, "ORG")]}),
              ("I reached Chennai yesterday.", {"entities": [(19, 28, "GPE")]}),
              ("I recently ordered a book from Amazon", {"entities": [(24,32, "ORG")]}),
              ("I was driving a BMW", {"entities": [(16,19, "PRODUCT")]}),
              ("I ordered this from ShopClues", {"entities": [(20,29, "ORG")]}),
              ("Fridge can be ordered in Amazon ", {"entities": [(0,6, "PRODUCT")]}),
              ("I bought a new Washer", {"entities": [(16,22, "PRODUCT")]}),
              ("I bought a old table", {"entities": [(16,21, "PRODUCT")]}),
              ("I bought a fancy dress", {"entities": [(18,23, "PRODUCT")]}),
              ("I rented a camera", {"entities": [(12,18, "PRODUCT")]}),
              ("I rented a tent for our trip", {"entities": [(12,16, "PRODUCT")]}),
              ("I rented a screwdriver from our neighbour", {"entities": [(12,22, "PRODUCT")]}),
              ("I repaired my computer", {"entities": [(15,23, "PRODUCT")]}),
              ("I got my clock fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("I got my truck fixed", {"entities": [(16,21, "PRODUCT")]}),
              ("Flipkart started it's journey from zero", {"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Max", {"entities": [(24,27, "ORG")]}),
              ("Flipkart is recognized as leader in market",{"entities": [(0,8, "ORG")]}),
              ("I recently ordered from Swiggy", {"entities": [(24,29, "ORG")]})
              ]

In [None]:
TRAIN_DATA

In [None]:
# Adding labels to the `ner`

for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

In [None]:
# Disable pipeline components you dont need to change
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

## Training the NER model

First, let’s understand the ideas involved before going to the code.

(a) To train an ner model, the model has to be looped over the example for sufficient number of iterations. If you train it for like just 5 or 6 iterations, it may not be effective.

(b) Before every iteration it’s a good practice to shuffle the examples randomly throughrandom.shuffle() function .

This will ensure the model does not make generalizations based on the order of the examples.

(c) The training data is usually passed in batches.

In [None]:
# Import requirements
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):

  # Training for 30 iterations
  for iteration in range(30):

    # shuufling examples  before every iteration
    random.shuffle(TRAIN_DATA)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        texts, annotations = zip(*batch)
        nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
        print("Losses", losses)

In [None]:
# Testing the model
doc = nlp("I was driving a Alto")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

In [None]:
# Save the  model to directory
output_dir = Path('/content/')
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Load the saved model and predict
print("Loading from", output_dir)
nlp_updated = spacy.load(output_dir)
doc = nlp_updated("Fridge can be ordered in FlipKart" )
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

### Train NER from a blank spacy model

In [None]:
import spacy

nlp=spacy.blank("en")

nlp.add_pipe(nlp.create_pipe('ner'))

nlp.begin_training()

### Training completely new entity type in spaCy

In [1]:
# Import and load the spacy model
import spacy
nlp=spacy.load("en_core_web_lg") 

# Getting the ner component
ner=nlp.get_pipe('ner')

In [2]:
# New label to add
LABEL = "FOOD"

# Training examples in the required format
TRAIN_DATA =[ ("Pizza is a common fast food.", {"entities": [(0, 5, "FOOD")]}),
              ("Pasta is an italian recipe", {"entities": [(0, 5, "FOOD")]}),
              ("China's noodles are very famous", {"entities": [(8,14, "FOOD")]}),
              ("Shrimps are famous in China too", {"entities": [(0,7, "FOOD")]}),
              ("Lasagna is another classic of Italy", {"entities": [(0,7, "FOOD")]}),
              ("Sushi is extemely famous and expensive Japanese dish", {"entities": [(0,5, "FOOD")]}),
              ("Unagi is a famous seafood of Japan", {"entities": [(0,5, "FOOD")]}),
              ("Tempura , Soba are other famous dishes of Japan", {"entities": [(0,7, "FOOD")]}),
              ("Udon is a healthy type of noodles", {"entities": [(0,4, "ORG")]}),
              ("Chocolate soufflé is extremely famous french cuisine", {"entities": [(0,17, "FOOD")]}),
              ("Flamiche is french pastry", {"entities": [(0,8, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Burgers are the most commonly consumed fastfood", {"entities": [(0,7, "FOOD")]}),
              ("Frenchfries are considered too oily", {"entities": [(0,11, "FOOD")]})
           ]

In [3]:
# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

In [4]:
# Importing requirements
from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :

    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations     
    for itn in range(30):
        # shuffle examples before training
        random.shuffle(TRAIN_DATA)
  
      # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # ictionary to store losses
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
              # Calling update() over the iteration
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

Losses {'ner': 8.30617344158236}
Losses {'ner': 12.530081017935686}
Losses {'ner': 17.72781226175987}
Losses {'ner': 23.5134511220994}
Losses {'ner': 30.142971201249953}
Losses {'ner': 34.29515378874374}
Losses {'ner': 39.704931962831104}
Losses {'ner': 45.641172330547995}


  gold = GoldParse(doc, **gold)


Losses {'ner': 49.80742558260477}
Losses {'ner': 56.57269441419681}
Losses {'ner': 60.712227677850144}
Losses {'ner': 65.41609042998618}
Losses {'ner': 70.69118044734216}
Losses {'ner': 75.31106345738965}
Losses {'ner': 4.8581863339059055}
Losses {'ner': 12.068656334942233}
Losses {'ner': 17.667504258829894}
Losses {'ner': 20.892584238111766}
Losses {'ner': 25.709184050470384}
Losses {'ner': 31.60164972176196}
Losses {'ner': 34.98258298408243}
Losses {'ner': 37.87771972886421}
Losses {'ner': 39.91545630158291}
Losses {'ner': 40.750114909988866}
Losses {'ner': 45.74044731879704}
Losses {'ner': 53.69873697185986}
Losses {'ner': 59.002971100305245}
Losses {'ner': 62.689515257422414}
Losses {'ner': 5.08629464515252}
Losses {'ner': 9.649530124937883}
Losses {'ner': 18.829942357813707}
Losses {'ner': 22.669617636565818}
Losses {'ner': 29.574781550531043}
Losses {'ner': 30.35448184303823}
Losses {'ner': 34.72214235176216}
Losses {'ner': 43.98967243541847}
Losses {'ner': 47.33879455385977}
Los

Losses {'ner': 47.81120741679797}
Losses {'ner': 49.998184506238886}
Losses {'ner': 56.03143452705581}
Losses {'ner': 56.05134708199648}
Losses {'ner': 3.552627828706136}
Losses {'ner': 9.815679528203873}
Losses {'ner': 12.44717051031239}
Losses {'ner': 15.961388393660854}
Losses {'ner': 24.025820686837505}
Losses {'ner': 27.032415095141005}
Losses {'ner': 32.42840884195812}
Losses {'ner': 36.719276268390786}
Losses {'ner': 38.70634375400459}
Losses {'ner': 43.904861564557905}
Losses {'ner': 48.70887808017051}
Losses {'ner': 53.27907414994752}
Losses {'ner': 53.525644633030424}
Losses {'ner': 55.563655925375315}
Losses {'ner': 6.286850154399872}
Losses {'ner': 9.474317088606767}
Losses {'ner': 15.380708949523978}
Losses {'ner': 21.003805724089034}
Losses {'ner': 22.761416374965847}
Losses {'ner': 24.787672455682696}
Losses {'ner': 26.205480833797992}
Losses {'ner': 30.017266762703002}
Losses {'ner': 33.17392391402791}
Losses {'ner': 35.171592302256045}
Losses {'ner': 39.90604280271452}

In [5]:
# Testing the NER

test_text = "I ate Sushi yesterday. Maggi is a common fast food "
doc = nlp(test_text)
print("Entities in '%s'" % test_text)
for ent in doc.ents:
    print(ent)

Entities in 'I ate Sushi yesterday. Maggi is a common fast food '
I
Maggi


In [None]:
# Save the model in a Output directory
from pathlib import Path
output_dir=Path('/content/')

# Saving the model to the output directory
if not output_dir.exists():
    output_dir.mkdir()
nlp.meta['name'] = 'my_ner'  # rename model
nlp.to_disk(output_dir)
print("Saved model to", output_dir)

# Loading the model from the directory
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
assert nlp2.get_pipe("ner").move_names == move_names

In [7]:
doc2 = nlp(' Dosa is an extremely famous south Indian dish')
for ent in doc2.ents:
    print(ent.label_, ent.text)