<a href="https://colab.research.google.com/github/valievav/ML-projects/blob/main/NER_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install spacy



In [2]:
# download model
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
!pip install spacy-lookups-data



In [4]:
# JIz-hiRrZ2g&t=18s

import spacy

In [5]:
# use existing model
nlp = spacy.load('en_core_web_md')

# get entities that can be recognized
ner_labels = nlp.get_pipe('ner').labels
[(x, spacy.explain(x)) for x in ner_labels]

[('CARDINAL', 'Numerals that do not fall under another type'),
 ('DATE', 'Absolute or relative dates or periods'),
 ('EVENT', 'Named hurricanes, battles, wars, sports events, etc.'),
 ('FAC', 'Buildings, airports, highways, bridges, etc.'),
 ('GPE', 'Countries, cities, states'),
 ('LANGUAGE', 'Any named language'),
 ('LAW', 'Named documents made into laws.'),
 ('LOC', 'Non-GPE locations, mountain ranges, bodies of water'),
 ('MONEY', 'Monetary values, including unit'),
 ('NORP', 'Nationalities or religious or political groups'),
 ('ORDINAL', '"first", "second", etc.'),
 ('ORG', 'Companies, agencies, institutions, etc.'),
 ('PERCENT', 'Percentage, including "%"'),
 ('PERSON', 'People, including fictional'),
 ('PRODUCT', 'Objects, vehicles, foods, etc. (not services)'),
 ('QUANTITY', 'Measurements, as of weight or distance'),
 ('TIME', 'Times smaller than a day'),
 ('WORK_OF_ART', 'Titles of books, songs, etc.')]

# Check NER model results on 1 sentence


In [6]:
# recognizing entitites for 1 sentence
doc = nlp('Brandon lives in New York from winter 2005, but 30% of time he spends in California.')

for ent in doc.ents:
  print(ent.text, '|', ent.label_)

Brandon | PERSON
New York | GPE
winter 2005 | DATE
30% | PERCENT
California | GPE


In [7]:
# better visual representation
from spacy import displacy

displacy.render(doc, style='ent')

# Check NER model results on test data

In [8]:
text = [
    'Nessa lives in Kyiv',
    'Chris Cornell was part of the Soundgarden before Audioslave',
    'Sakura blossoms in Japan in spring',
    'OpenAI released CharGPT Pro version in December 2024',
    'Lia speaks both Ukrainian and English',
    'R2D2 can only communicate in Binary',
]

# prepare sentences
docs = [nlp(x) for x in text]
docs

[Nessa lives in Kyiv,
 Chris Cornell was part of the Soundgarden before Audioslave,
 Sakura blossoms in Japan in spring,
 OpenAI released CharGPT Pro version in December 2024,
 Lia speaks both Ukrainian and English,
 R2D2 can only communicate in Binary]

In [9]:
docs[0].ents

(Nessa, Kyiv)

In [10]:
# go over sentences to detect entities
for doc in docs:
  entities = []
  for ent in doc.ents:
    entities.append([ent.text, ent.label_])

  print(entities)


[['Nessa', 'PERSON'], ['Kyiv', 'GPE']]
[['Chris Cornell', 'PERSON'], ['the Soundgarden before Audioslave', 'GPE']]
[['Sakura', 'PERSON'], ['Japan', 'GPE'], ['spring', 'DATE']]
[['CharGPT Pro', 'PRODUCT'], ['December 2024', 'DATE']]
[['Lia', 'PERSON'], ['Ukrainian', 'NORP'], ['English', 'LANGUAGE']]
[]


# Set entities manually for model

In [29]:
text = 'Tesla produces cars'
doc = nlp(text)

for entity in doc.ents:
  print(f'{entity.text} | {entity.label_}')

# as we see, Tesla is NOT recognized as company

In [31]:
from spacy.tokens import Span

s = Span(doc, 0, 1, label='ORG')

doc.set_ents([s], default='unmodified')

for entity in doc.ents:
  print(f'{entity.text} | {entity.label_}')

# now Tesla is detected!

Tesla | ORG


# Fine-tune existing NER model on custom data

In [11]:
import random
from spacy.util import minibatch
from spacy.training.example import Example

train_data = [
    ('What is the price of 10 bananas?', {'entities': [(21, 23, 'QUANTITY'), (24, 31, 'PRODUCT')]}),
    ('How much does 5 apples cost?', {'entities': [(15, 16, 'QUANTITY'), (17, 23, 'PRODUCT')]}),
    ('Can you tell me the price of 2 oranges?', {'entities': [(29, 30, 'QUANTITY'), (31, 38, 'PRODUCT')]}),
    ('I want to buy 1 watermelon. How much is it?', {'entities': [(14, 15, 'QUANTITY'), (16, 26, 'PRODUCT')]}),
    ('What would 3 pineapples cost?', {'entities': [(11, 12, 'QUANTITY'), (13, 23, 'PRODUCT')]}),
    ('Give me the price for 12 mangoes.', {'entities': [(21, 23, 'QUANTITY'), (24, 31, 'PRODUCT')]}),
    ('How much for 8 avocados?', {'entities': [(13, 14, 'QUANTITY'), (15, 23, 'PRODUCT')]}),
    ('I need the cost of 6 lemons.', {'entities': [(19, 20, 'QUANTITY'), (21, 27, 'PRODUCT')]}),
    ('Could you tell me how much 4 pears are?', {'entities': [(26, 27, 'QUANTITY'), (28, 33, 'PRODUCT')]}),
    ('Price check on 7 strawberries please.', {'entities': [(14, 15, 'QUANTITY'), (16, 28, 'PRODUCT')]}),
    ('How much would 9 peaches be?', {'entities': [(15, 16, 'QUANTITY'), (17, 24, 'PRODUCT')]}),
    ('What’s the rate of 11 grapes?', {'entities': [(19, 21, 'QUANTITY'), (22, 28, 'PRODUCT')]}),
    ('Can I know the price of 3 cherries?', {'entities': [(25, 26, 'QUANTITY'), (27, 35, 'PRODUCT')]}),
    ('How expensive are 13 blueberries?', {'entities': [(18, 20, 'QUANTITY'), (21, 32, 'PRODUCT')]}),
    ('Do you know the cost of 2 melons?', {'entities': [(24, 25, 'QUANTITY'), (26, 32, 'PRODUCT')]}),
    ('Please tell me the price of 5 plums.', {'entities': [(28, 29, 'QUANTITY'), (30, 35, 'PRODUCT')]}),
    ('How much is 14 kiwis?', {'entities': [(12, 14, 'QUANTITY'), (15, 20, 'PRODUCT')]}),
    ('What’s the total for 6 papayas?', {'entities': [(18, 19, 'QUANTITY'), (20, 27, 'PRODUCT')]}),
    ('I’d like the price of 1 coconut.', {'entities': [(22, 23, 'QUANTITY'), (24, 31, 'PRODUCT')]}),
    ('Tell me how much 10 apricots cost.', {'entities': [(18, 20, 'QUANTITY'), (21, 29, 'PRODUCT')]}),
]

nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [15]:
# use existing model
nlp = spacy.load('en_core_web_md')

# make sure ner pipeline is in the model
if 'ner' not in nlp.pipe_names:
  ner = nlp.add_pipe('ner')
else:
  ner = nlp.get_pipe('ner')

# add unknown labels to the model if we have custom labels in train_data
for _, ent_data in train_data:
  for ent in ent_data['entities']:
    label = ent[2]
    if label not in ner.labels:
      ner.add_label[label]

# disable all pipelines but ner and train it
epochs = 10
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

with nlp.disable_pipes(*other_pipes):
  # optimizer = nlp.begin_training()  # training a new blank model
  optimizer = nlp.create_optimizer()  # fine-tuning already trained model

  for epoch in range(epochs):
    random.shuffle(train_data)
    losses = {}
    batches = minibatch(train_data, size=2)
    for batch in batches:
      examples = []
      for text, ent_data in batch:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, ent_data)
        examples.append(example)
      nlp.update(examples, drop=0.5, losses=losses)
    print(f'Epoch {epoch + 1}, losses {losses}')

# save model
nlp.to_disk('custom_ner_model')


Epoch 1, losses {'ner': 41.80614490384065}
Epoch 2, losses {'ner': 32.538554708986034}
Epoch 3, losses {'ner': 18.27130108387208}
Epoch 4, losses {'ner': 9.793648728328542}
Epoch 5, losses {'ner': 11.861971547995141}
Epoch 6, losses {'ner': 3.3624804931924586}
Epoch 7, losses {'ner': 3.302883366902693}
Epoch 8, losses {'ner': 2.712555937008553}
Epoch 9, losses {'ner': 3.1505966742694804}
Epoch 10, losses {'ner': 5.2233721887111075}


In [23]:
trained_nlp = spacy.load('custom_ner_model')

test_text = [
    'How much for five avocados?',
    'Ten candies please, what''s the price?',
    'I''m intereseted in 5 granola bars, how much is it?',
]

for text in test_text:
  doc = trained_nlp(text)
  print(f'TEXT: {text}')
  print(f'ENTITIES: {[(ent.text, ent.label_) for ent in doc.ents]}\n')

TEXT: How much for five avocados?
ENTITIES: [('five', 'QUANTITY'), ('avocados', 'PRODUCT')]

TEXT: Ten candies please, whats the price?
ENTITIES: [('Ten', 'QUANTITY'), ('candies', 'PRODUCT')]

TEXT: Im intereseted in 5 granola bars, how much is it?
ENTITIES: [('5', 'QUANTITY'), ('granola', 'PRODUCT')]

