In [3]:
import pandas as pd
import numpy as np
import re
#from util import *
desired_width=320
pd.set_option('display.width', desired_width)
np.set_printoptions(linewidth=desired_width)
pd.set_option('display.max_columns',20)
pd.set_option('display.max_rows', 500)

In [66]:
# Load language model
import spacy
nlp = spacy.load("en_core_web_lg")

In [5]:
# Our example text
text="Sold in packets of 20 small pieces of 100g each"

In [6]:
# Convert to parsed NLP object (apply NLP pipeline)
doc=nlp(text)

In [7]:
# List tokens and POS
for token in doc:
    print(token.text, token.pos_)

Sold VERB
in ADP
packets NOUN
of ADP
20 NUM
small ADJ
pieces NOUN
of ADP
100 NUM
g NOUN
each DET


In [8]:
# List dependencies
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Sold sell VERB VBN ROOT Xxxx True False
in in ADP IN prep xx True True
packets packet NOUN NNS pobj xxxx True False
of of ADP IN prep xx True True
20 20 NUM CD nummod dd False False
small small ADJ JJ amod xxxx True False
pieces piece NOUN NNS pobj xxxx True False
of of ADP IN prep xx True True
100 100 NUM CD nummod ddd False False
g g NOUN NN pobj x True False
each each DET DT dobj xxxx True True


In [9]:
# Display dependencies
from spacy import displacy
displacy.render(doc, style="dep",options = {"compact": False, "distance": 90})

In [10]:
for token in doc:
    if token.is_digit:
        print(token, token.head)

20 pieces
100 g


In [11]:
#text='NiQuitin Clear Patch Step 2, 14mg. 7 Patches Stop Smoking Aid.'
text='PG tips Original 80 Tea Bags 232g'
doc=nlp(text)
for token in doc:
    if token.is_digit:
        print(token, token.head)

80 Bags
232 g


In [13]:
text='Tomatoes 5 per pack, 23.8p each. Baby spinach 160g, 62.5p per 100g. '
doc=nlp(text)
for token in doc:
    if token.is_digit:
        print(token, token.head)

5 Tomatoes
160 g
100 g.


In [None]:

# Rule-based Matcher


In [14]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

pattern = [{"LIKE_NUM": True},{"LOWER": "per"}, {"IS_ASCII": True}]
matcher.add("units_per_item", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(string_id+':   "'+span.text+'"')

units_per_item:   "5 per pack"


In [15]:
doc=nlp("Pack size: 170G , pack Size 100mg")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "pack"},{"LOWER": "size"}, 
           {"IS_PUNCT": True, "OP": "?"},
           {"LIKE_NUM": True},
           {"LOWER": {"IN": ["g", "mg"]}}]
matcher.add("pack_size", [pattern])

matches = matcher(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(string_id+':   "'+span.text+'"')
    
displacy.render(doc, style="dep",options = {"compact": False, "distance": 90})

pack_size:   "Pack size: 170G"
pack_size:   "pack Size 100mg"


In [None]:

# Dependency Matcher


In [16]:
from spacy.matcher import DependencyMatcher
matcher = DependencyMatcher(nlp.vocab)

pattern = [
    {
        "RIGHT_ID": "anchor_founded",
        "RIGHT_ATTRS": {"ORTH": "founded"}
    },
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
    {
        "LEFT_ID": "founded_object",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object_modifier",
        "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
    }
]
matcher.add("FOUNDED", [pattern])

doc=nlp("John, a friend of mine, founded two AI startups.")
displacy.render(doc, style="dep",options = {"compact": False, "distance": 90})

matches = matcher(doc)
match_id, token_ids = matches[0]
for i in range(len(token_ids)):
    print(pattern[i]["RIGHT_ID"] + ":", doc[token_ids[i]].text)

anchor_founded: founded
founded_subject: John
founded_object: startups
founded_object_modifier: AI


In [17]:
import spacy
from spacy.matcher import DependencyMatcher

nlp.add_pipe("merge_entities")# merge entities to simplify this


pattern = [
        {
            "RIGHT_ID": "person",
            "RIGHT_ATTRS": {"ENT_TYPE": "PERSON", "DEP": "nsubj"},
        },
        {
            "LEFT_ID": "person",
            "REL_OP": "<",
            "RIGHT_ID": "verb",
            "RIGHT_ATTRS": {"POS": "VERB"},
        }]

matcher = DependencyMatcher(nlp.vocab)
matcher.add("WHO_DID_WHAT", [pattern])

texts = [
        "Steve Jobs founded Apple when he was young.",
        "Mike, a friend of mine, wrote a book."
        '"Hello!", says Mary.',
        ]

for text in texts:
    doc = nlp(text)
    matches = matcher(doc)

    for match in matches:
        match_id, (start, end) = match
        print(doc[start], "::", doc[end])

Steve Jobs :: founded
Mike :: wrote
Mary :: says


In [None]:

# NER - Named Entity Recognition


In [74]:
doc=nlp("Everlearn Ltd founded by John McAfee partenered with Apple to provide education "
    "in the Middle East, North Africa, Asia Pacific, South Asia, and the Americas."
    "It believes that education is purpose-driven. That’s why it empowers children "
    "and adults to be creative, innovative, entrepreneurial, and disruptive.")
for entity in doc.ents:
      print(entity.label_, ' | ', entity.text)
displacy.render(doc, style="ent")

ORG  |  Everlearn Ltd
PERSON  |  John McAfee
ORG  |  Apple
LOC  |  the Middle East
GPE  |  North Africa
LOC  |  Asia Pacific
LOC  |  South Asia
LOC  |  Americas


In [75]:
doc=nlp('''The Slim Fit MVP is a slim fitting, ultra comfortable classic Jeans. The Extreme Motion series are the ultimate combination between extreme comfort, freedom to move and authentic denim, coming here in a slim fit execution.
Featuring the amazing comfort, movement and unbeatable stretch of Lee’s Extreme Motion collection, these slim fit jeans are made with motion in mind. Look for the grey elastic in the super comfy athletic waistband and the signature donut button and brown leather back patch. Made using recycled fabric and constructed using a cleverly tapered slim fit.''')
for entity in doc.ents:
      print(entity.label_, ' | ', entity.text)
displacy.render(doc, style="ent")

ORG  |  Slim Fit MVP
NORP  |  Jeans
PERSON  |  Lee


In [50]:
brands=pd.read_csv("brands.csv")
jeans=pd.read_csv("jeans.csv")["name"]

In [None]:
# Customize NER Pipeline

In [134]:
# training data
TRAIN_DATA = [( jeans[x] , {"entities": [(0,len(jeans[x].split(" ")[0] ), "BRAND")]}) 
              for x in jeans.index]
TRAIN_DATA 

[('Wrangler Regular Stretch Jeans - Navy', {'entities': [(0, 8, 'BRAND')]}),
 ('Wrangler Regular Stretch Jeans - Camel', {'entities': [(0, 8, 'BRAND')]}),
 ('Wrangler Regular Stretch Jeans - Stonewash',
  {'entities': [(0, 8, 'BRAND')]}),
 ('Wrangler Regular Stretch Jeans - Darkstone',
  {'entities': [(0, 8, 'BRAND')]}),
 ('Wrangler Regular Stretch Jeans - Black', {'entities': [(0, 8, 'BRAND')]}),
 ('Wrangler Regular Stretch Jeans - Rinsewash',
  {'entities': [(0, 8, 'BRAND')]}),
 ('Lee Slim Fit Jeans - Lenny', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Slim Chino Trousers - Taupe', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Slim Chino Trousers - Navy', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Slim Chino Trousers - Dark Grey', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Slim Fit Jeans - Rinse', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Slim Fit Jeans - Aristocrat', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Regular Straight Fit Jeans - Black', {'entities': [(0, 3, 'BRAND')]}),
 ('Lee Regular Str

In [139]:
# Load pre-existing spacy model
import spacy
nlp=spacy.load('en_core_web_sm')

# Getting the pipeline component
ner=nlp.get_pipe("ner")

# New label to add
LABEL = "BRAND"

# Add the new label to ner
ner.add_label(LABEL)

# Resume training
optimizer = nlp.resume_training()
move_names = list(ner.move_names)

# List of pipes you want to train
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# List of pipes which should remain unaffected in training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

from spacy.util import minibatch, compounding
import random

# Begin training by disabling other pipeline components
with nlp.disable_pipes(*other_pipes) :
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for 30 iterations     
    for itn in range(30):
    # shuffle examples before training
        random.shuffle(TRAIN_DATA)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # dictionary to store losses
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            example = []
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example.append( Example.from_dict(doc, annotations))
                nlp.update(
                            example,
                            drop=0.5,  # dropout - make it harder to memorise data
                            losses=losses,
                        )
            print("Losses", losses)
    

Losses {'ner': 5.11392423446523}
Losses {'ner': 11.284166703544088}
Losses {'ner': 15.577562182482964}
Losses {'ner': 20.985475509729277}
Losses {'ner': 24.775912763713112}
Losses {'ner': 27.037826578638832}
Losses {'ner': 29.88866926760596}
Losses {'ner': 31.197716305214083}
Losses {'ner': 33.907528067655036}
Losses {'ner': 38.96058855682331}
Losses {'ner': 42.21106432715919}
Losses {'ner': 44.226306199782925}
Losses {'ner': 46.230815022510406}
Losses {'ner': 48.28720327172375}
Losses {'ner': 51.58097155998036}
Losses {'ner': 53.906150667593074}
Losses {'ner': 57.15503844348452}
Losses {'ner': 59.14233595885899}
Losses {'ner': 60.77487931126403}
Losses {'ner': 62.05314719549111}
Losses {'ner': 65.58124440559382}
Losses {'ner': 67.47993823359755}
Losses {'ner': 69.06064645000177}
Losses {'ner': 71.84102051232841}
Losses {'ner': 73.52141319237273}
Losses {'ner': 76.41343262914017}
Losses {'ner': 78.50669938305788}
Losses {'ner': 79.89221081086822}
Losses {'ner': 81.35532910706382}
Losse

Losses {'ner': 5.979880355342997}
Losses {'ner': 5.979880355556712}
Losses {'ner': 5.98044567393596}
Losses {'ner': 5.980445675336233}
Losses {'ner': 5.983938235336498}
Losses {'ner': 5.9839398667025465}
Losses {'ner': 5.983939867165603}
Losses {'ner': 5.986154897461626}
Losses {'ner': 5.986176233880633}
Losses {'ner': 5.986180789775023}
Losses {'ner': 5.9861829265566735}
Losses {'ner': 5.986183230000613}
Losses {'ner': 6.002320695950955}
Losses {'ner': 6.002320731830611}
Losses {'ner': 6.002320737674327}
Losses {'ner': 6.002320737951711}
Losses {'ner': 6.00232073832039}
Losses {'ner': 6.002357678638633}
Losses {'ner': 6.002357777217415}
Losses {'ner': 6.002357939291211}
Losses {'ner': 6.0023586821446875}
Losses {'ner': 6.00236904602452}
Losses {'ner': 6.002377450153758}
Losses {'ner': 6.002377452559527}
Losses {'ner': 6.614007802923502}
Losses {'ner': 6.6142188607443355}
Losses {'ner': 6.61421886075104}
Losses {'ner': 6.614218861643423}
Losses {'ner': 6.6142207995410285}
Losses {'ner'

Losses {'ner': 0.0005361853601228935}
Losses {'ner': 0.0005361854876572829}
Losses {'ner': 0.0005374017303318245}
Losses {'ner': 0.0005374101588497334}
Losses {'ner': 0.0005374106904432634}
Losses {'ner': 0.18557264358966888}
Losses {'ner': 0.18557264363410686}
Losses {'ner': 0.18557274011420302}
Losses {'ner': 0.1967508066136284}
Losses {'ner': 0.19675080663977892}
Losses {'ner': 0.19675080677702372}
Losses {'ner': 0.1968489587020786}
Losses {'ner': 0.19685837117592428}
Losses {'ner': 0.1968583711829573}
Losses {'ner': 0.1968584479590213}
Losses {'ner': 0.1968584498406084}
Losses {'ner': 0.19685844986072393}
Losses {'ner': 0.19685844986267714}
Losses {'ner': 0.19685844986279508}
Losses {'ner': 0.19685856979541852}
Losses {'ner': 0.19685856979707017}
Losses {'ner': 0.19685856979716687}
Losses {'ner': 0.19685856981610428}
Losses {'ner': 0.20057438816843046}
Losses {'ner': 0.20057438875499498}
Losses {'ner': 0.20057441491586117}
Losses {'ner': 1.9438658345839056e-08}
Losses {'ner': 8.526

Losses {'ner': 0.02153562947698226}
Losses {'ner': 0.021535895340635002}
Losses {'ner': 0.025036983585630203}
Losses {'ner': 0.030645406602297247}
Losses {'ner': 0.030670331577714764}
Losses {'ner': 0.03067033371581997}
Losses {'ner': 0.12377186542772727}
Losses {'ner': 0.12377186542794047}
Losses {'ner': 0.12377186598849149}
Losses {'ner': 0.12377186793554239}
Losses {'ner': 0.12377186980170818}
Losses {'ner': 0.12377203243784964}
Losses {'ner': 0.12377458431868818}
Losses {'ner': 0.14980436484579743}
Losses {'ner': 0.1498043656836789}
Losses {'ner': 0.1498043778677567}
Losses {'ner': 0.14980442483490752}
Losses {'ner': 1.0807173738937008}
Losses {'ner': 1.0807173738961817}
Losses {'ner': 1.0807173740264169}
Losses {'ner': 1.0807173741921468}
Losses {'ner': 1.0815996944667052}
Losses {'ner': 1.0815996944668924}
Losses {'ner': 1.0815996992382781}
Losses {'ner': 1.081599713162147}
Losses {'ner': 1.081599716513903}
Losses {'ner': 1.0815997171568406}
Losses {'ner': 1.0816100195365057}
Los

Losses {'ner': 0.04226724387345232}
Losses {'ner': 0.04226724387351708}
Losses {'ner': 0.0422672462492129}
Losses {'ner': 0.04226724624922085}
Losses {'ner': 0.042267246249383654}
Losses {'ner': 0.042267246263562396}
Losses {'ner': 0.04226724626429333}
Losses {'ner': 0.04226724626734235}
Losses {'ner': 0.042267247443506954}
Losses {'ner': 0.04226894200348827}
Losses {'ner': 0.04226894200351847}
Losses {'ner': 0.04226894200352647}
Losses {'ner': 0.04226895787240079}
Losses {'ner': 0.10895250358105849}
Losses {'ner': 6.107139958199689e-16}
Losses {'ner': 1.0060607362267257e-13}
Losses {'ner': 1.0342501268876458e-13}
Losses {'ner': 1.601441598167863e-12}
Losses {'ner': 1.354600440029873e-08}
Losses {'ner': 0.12190835042382729}
Losses {'ner': 0.12190835053825563}
Losses {'ner': 0.12190835054049469}
Losses {'ner': 0.12190835054584127}
Losses {'ner': 0.12190835065666003}
Losses {'ner': 0.12190849252963058}
Losses {'ner': 0.12190849253144621}
Losses {'ner': 0.12190849253144881}
Losses {'ner':

Losses {'ner': 1.9925059360579416}
Losses {'ner': 1.992505936156251}
Losses {'ner': 1.9925059361565265}
Losses {'ner': 1.9925087410500182}
Losses {'ner': 1.9925088113331306}
Losses {'ner': 1.9925088113331308}
Losses {'ner': 1.9925088113331308}
Losses {'ner': 1.9925088113335134}
Losses {'ner': 4.5343307314511174e-10}
Losses {'ner': 4.534424352776212e-10}
Losses {'ner': 3.0808457941140297e-09}
Losses {'ner': 3.440812023391845e-09}
Losses {'ner': 3.4408543937099175e-09}
Losses {'ner': 3.4975172810847113e-09}
Losses {'ner': 8.04639690548397e-05}
Losses {'ner': 8.114392760329496e-05}
Losses {'ner': 0.012323436912930594}
Losses {'ner': 0.012323436912933576}
Losses {'ner': 0.012323436912939183}
Losses {'ner': 0.012323436912939202}
Losses {'ner': 0.01232343691316474}
Losses {'ner': 0.012323436913164848}
Losses {'ner': 0.014802629015595559}
Losses {'ner': 0.014802629015911884}
Losses {'ner': 0.01480262901843492}
Losses {'ner': 0.01480328747382751}
Losses {'ner': 0.014803287481053072}
Losses {'n

Losses {'ner': 1.0206122398557667}
Losses {'ner': 1.020612239855816}
Losses {'ner': 1.020612239887696}
Losses {'ner': 1.0206122399085982}
Losses {'ner': 1.020612239910513}
Losses {'ner': 1.020612239912416}
Losses {'ner': 1.0206662987586372}
Losses {'ner': 1.020666298761998}
Losses {'ner': 1.0206663122468236}
Losses {'ner': 1.0206663122477393}
Losses {'ner': 1.0206737306658513}
Losses {'ner': 1.020673730780748}
Losses {'ner': 1.020673765840423}
Losses {'ner': 1.0206737658404235}
Losses {'ner': 1.0206737682555092}
Losses {'ner': 1.0206739111371796}
Losses {'ner': 1.0206739113709737}
Losses {'ner': 1.020673911372301}
Losses {'ner': 1.0206739113725294}
Losses {'ner': 1.979644775390647}
Losses {'ner': 1.9796447754265047}
Losses {'ner': 1.979644775426507}
Losses {'ner': 1.9796447754510873}
Losses {'ner': 1.9796447927476404}
Losses {'ner': 1.979644797172514}
Losses {'ner': 1.9796447974011009}
Losses {'ner': 1.9796447975380718}
Losses {'ner': 1.9796447975381304}
Losses {'ner': 1.97964479753821

In [142]:
doc=nlp('''The Slim Fit MVP is a slim fitting, ultra comfortable classic Jeans. 
The Extreme Motion series are the ultimate combination between extreme comfort, freedom to move 
and authentic denim, coming here in a slim fit execution, very much like Guess.
Featuring the amazing comfort, movement and unbeatable stretch of Lee’s Extreme Motion collection,
these slim fit jeans are made with motion in mind. Look for the grey elastic in the super comfy athletic
waistband and the signature donut button and brown leather back patch. Lee jeans are sold in the same
stores as Wrangler. Made using recycled fabric and constructed using a cleverly tapered slim fit.''')

for entity in doc.ents:
      print(entity.label_, ' | ', entity.text)
displacy.render(doc, style="ent")

BRAND  |  The
BRAND  |  Guess
BRAND  |  Lee
BRAND  |  Lee
BRAND  |  Wrangler
