In [1]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
# Run the pipeline on a sample headline and list every entity span with its label.
headline = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'
doc = nlp(headline)
print([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'), ('Google', 'ORG'), ('$5.1 billion', 'MONEY'), ('Wednesday', 'DATE')]


In [4]:
# Token-level view of the same doc: each token with its IOB tag and entity type
# (tokens outside any entity show 'O' and an empty type).
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(European, 'B', 'NORP'), (authorities, 'O', ''), (fined, 'O', ''), (Google, 'B', 'ORG'), (a, 'O', ''), (record, 'O', ''), ($, 'B', 'MONEY'), (5.1, 'I', 'MONEY'), (billion, 'I', 'MONEY'), (on, 'O', ''), (Wednesday, 'B', 'DATE'), (for, 'O', ''), (abusing, 'O', ''), (its, 'O', ''), (power, 'O', ''), (in, 'O', ''), (the, 'O', ''), (mobile, 'O', ''), (phone, 'O', ''), (market, 'O', ''), (and, 'O', ''), (ordered, 'O', ''), (the, 'O', ''), (company, 'O', ''), (to, 'O', ''), (alter, 'O', ''), (its, 'O', ''), (practices, 'O', '')]


In [13]:
# Look up the human-readable description of the 'NORP' entity label.
spacy.explain("NORP")

'Nationalities or religious or political groups'

In [5]:
#################### Extracting named entities from an article ##########################################

In [19]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one string.

    The page is parsed with BeautifulSoup (html5lib parser); ``<script>``,
    ``<style>`` and ``<aside>`` elements are removed before the text is
    extracted, and every run of newlines/tabs is replaced by a single space.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The page text with newline/tab runs collapsed to single spaces.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error page is never mistaken for the article text.
    """
    res = requests.get(url, timeout=timeout)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose text is not part of the readable article.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Prompt for a text file and read its stripped contents.
# NOTE(review): `content` is only referenced by the commented-out `nlp(content)`
# line below, so as written the file is read and then ignored.
filename=input("enter the file name: ")
with open(filename) as fh:
    content=fh.read().strip()
# NOTE(review): rebinds `nlp`, which the first cell already created via
# en_core_web_sm.load() — presumably the same model; confirm the reload is needed.
nlp=spacy.load("en_core_web_sm")
#ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
# Parse a short hard-coded text.
# NOTE(review): this text is only a few sentences long, while later cells index
# `sentences[20]`; their saved outputs appear to come from a run on the
# commented-out article instead — confirm which input is intended.
article = nlp(u"When learning data science, you shouldn't get discouraged! Challenges and setbacks aren't failures, they're just part of the journey. You've got this!")
#article = nlp(content)
len(article.ents) # number of named entities found in `article`

enter the file name: data.txt


0

In [21]:
# Label of every entity in the article (one item per entity, duplicates kept),
# then a tally of how often each label occurs.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter()

In [9]:
# Three most frequent entity strings in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13)]

In [10]:
# Materialise the article's sentences and print the one at index 20
# (a fixed position, reused by the cells below).
sentences = list(article.sents)
print(sentences[20])

A spokeswoman for the F.B.I. did not respond to a message seeking comment about why Mr. Strzok was dismissed rather than demoted.


In [11]:
# Re-parse the chosen sentence and show its entities with displaCy in the notebook.
displacy.render(nlp(str(sentences[20])), style='ent', jupyter=True)

In [12]:
# Draw the dependency parse of the same sentence with spaCy's built-in displaCy visualizer.
dep_options = {'distance': 120}
displacy.render(nlp(str(sentences[20])), jupyter=True, style='dep', options=dep_options)

In [13]:
# Surface form, part-of-speech tag and lemma of every token in the sentence
# that is neither a stop word nor punctuation.
sentence_doc = nlp(str(sentences[20]))
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in sentence_doc
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[('spokeswoman', 'NOUN', 'spokeswoman'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('respond', 'VERB', 'respond'),
 ('message', 'NOUN', 'message'),
 ('seeking', 'VERB', 'seek'),
 ('comment', 'NOUN', 'comment'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('dismissed', 'VERB', 'dismiss'),
 ('demoted', 'VERB', 'demote')]

In [14]:
# Map each entity found in the sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'F.B.I.': 'ORG', 'Strzok': 'PERSON'}

In [16]:
# Token-level IOB tag and entity type for the chosen sentence.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(A, 'O', ''), (spokeswoman, 'O', ''), (for, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (did, 'O', ''), (not, 'O', ''), (respond, 'O', ''), (to, 'O', ''), (a, 'O', ''), (message, 'O', ''), (seeking, 'O', ''), (comment, 'O', ''), (about, 'O', ''), (why, 'O', ''), (Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (was, 'O', ''), (dismissed, 'O', ''), (rather, 'O', ''), (than, 'O', ''), (demoted, 'O', ''), (., 'O', '')]


In [17]:
# Highlight the entities across the whole article.
displacy.render(article, style='ent', jupyter=True)

In [18]:
################################ train NER with custom training data ###############################

In [11]:
import json 
  
  
 #the file to be converted to json format 
filename = 'data.txt'

# Mapping of command -> description built from the lines of `filename`.
dict1 = {}

# Each non-empty line has the form "<command> <description ...>": the first
# whitespace-delimited word becomes the key, the rest of the line the value.
with open(filename) as fh:
    for line in fh:
        parts = line.split(None, 1)
        if not parts:
            # Blank or whitespace-only line: nothing to record.
            continue
        command = parts[0]
        # A line holding only a command gets an empty description instead of
        # raising ValueError on tuple unpacking.
        description = parts[1].strip() if len(parts) > 1 else ''
        dict1[command] = description

# Write the mapping to test2.json; the context manager closes the file even
# if json.dump raises.
with open("test2.json", "w") as out_file:
    json.dump(dict1, out_file, indent=4, sort_keys=False)

In [8]:
import spacy
import random


# Every training sentence is the same question with a different product name,
# so the examples are generated from the names: the entity span starts right
# after the shared prefix and covers exactly the product name.
_QUESTION_PREFIX = 'what is the price of '
# NOTE(review): 'moniter' and 'keyboad' look like misspellings of
# 'monitor'/'keyboard'; they are kept verbatim so the training data is unchanged.
_PRODUCT_NAMES = [
    'polo', 'ball', 'jegging', 't-shirt', 'jeans',
    'bat', 'shirt', 'bag', 'cup', 'jug',
    'plate', 'glass', 'moniter', 'desktop', 'bottle',
    'mouse', 'keyboad', 'chair', 'table', 'watch',
]

TRAIN_DATA = [
    (
        _QUESTION_PREFIX + product + '?',
        {'entities': [(len(_QUESTION_PREFIX),
                       len(_QUESTION_PREFIX) + len(product),
                       'PrdName')]},
    )
    for product in _PRODUCT_NAMES
]


def train_spacy(data,iterations):
    """Train a blank English pipeline that contains only an NER component.

    Args:
        data: list of ``(text, {'entities': [(start, end, label), ...]})``
            training examples. The list is not modified.
        iterations: number of passes to make over ``data``.

    Returns:
        The trained ``nlp`` object.

    NOTE(review): ``create_pipe`` / ``add_pipe(component)`` /
    ``begin_training`` / ``update(texts, annotations)`` look like the spaCy v2
    training API — confirm the installed spaCy version before upgrading.
    """
    # Work on a copy: the per-iteration shuffle below must not reorder the
    # caller's list as a side effect.
    TRAIN_DATA = list(data)
    nlp = spacy.blank('en')  # create blank Language class

    # Make sure the pipeline has an NER component and keep a handle on it in
    # either case, so `ner` is always bound when labels are added.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every label that occurs in the training annotations.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}  # losses accumulated over this iteration
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    return nlp

In [9]:
# Train the product-name NER model for a fixed number of passes over TRAIN_DATA.
N_ITERATIONS = 20
prdnlp = train_spacy(TRAIN_DATA, N_ITERATIONS)

# Ask where to store the trained model, then write it to that path.
modelfile = input("Enter your Model Name: ")
prdnlp.to_disk(modelfile)

Statring iteration 0
{'ner': 41.54640170704988}
Statring iteration 1
{'ner': 3.187142567800978}
Statring iteration 2
{'ner': 1.9999111539509422}
Statring iteration 3
{'ner': 1.910794114183301}
Statring iteration 4
{'ner': 3.4589070959070374}
Statring iteration 5
{'ner': 0.9724526122431391}
Statring iteration 6
{'ner': 1.8313767267491683}
Statring iteration 7
{'ner': 0.8737454216559152}
Statring iteration 8
{'ner': 6.217978175634302}
Statring iteration 9
{'ner': 3.187889712864469}
Statring iteration 10
{'ner': 1.7309143049072058}
Statring iteration 11
{'ner': 2.1213168003986125}
Statring iteration 12
{'ner': 0.7236799942758422}
Statring iteration 13
{'ner': 2.3236687477022535}
Statring iteration 14
{'ner': 4.155866196448457}
Statring iteration 15
{'ner': 2.8806144369917943}
Statring iteration 16
{'ner': 5.239077094012914}
Statring iteration 17
{'ner': 2.186578147208376}
Statring iteration 18
{'ner': 1.9063381998049214}
Statring iteration 19
{'ner': 1.8783274639371879}
Enter your Model N

In [10]:
# Try the trained model on a sentence typed by the user and print each
# recognised entity with its character offsets and label.
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)
for span in doc.ents:
    print(span.text, span.start_char, span.end_char, span.label_)

Enter your testing text: hi
