In [2]:
from spacy.lang.en import English

In [3]:
# Create the nlp object
nlp = English()

## Tokens

In [8]:
#Create a document object & tokenize
doc = nlp('India is my country.')
# Iterate over tokens in a Doc
for token in doc:
    print(token.text)
#Access token with index on doc object
print(doc[1])
#Slice the doc like a python list
print(doc[1:3])

India
is
my
country
.
is
is my


## Lexical Attributes
- is_alpha determines if the token is alphabets
- is_punct determines if it is a punctuation
- like_num tells if the token is numeric
- is_currency tells if it is a currency

In [14]:
doc = nlp("A pizza costs $10.5.")
#Print index
print('Index:   ', [token.i for token in doc])
#Print text tokens
print('Text:    ', [token.text for token in doc])
#Check if token is alpha
print('is_alpha:', [token.is_alpha for token in doc])
#Check if token is punctuation
print('is_punct:', [token.is_punct for token in doc])
#Check of token is like a number
print('like_num:', [token.like_num for token in doc])
#Check if token is a currency
print('is_currency:', [token.is_currency for token in doc])

Index:    [0, 1, 2, 3, 4, 5]
Text:     ['A', 'pizza', 'costs', '$', '10.5', '.']
is_alpha: [True, True, True, False, False, False]
is_punct: [False, False, False, False, False, True]
like_num: [False, False, False, False, True, False]
like_num: [False, False, False, True, False, False]


## Statistical Model for Language
- token.pos_: will give the part of speech tags
- token.dep_:  Dependency relation connecting the root to its head
- token.head.text: Gives the main word to which the token is related
- doc.ents: has the identified named entities
- explain function can be used to get the meaning of tags

In [1]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
#Part of Speech
doc = nlp("A pizza costs $10.5.")
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_)

A DET
pizza NOUN
costs VERB
$ SYM
10.5 NUM
. PUNCT


In [15]:
#Syntactic Dependencies
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

Apple PROPN nsubj looking
is VERB aux looking
looking VERB ROOT looking
at ADP prep looking
buying VERB pcomp at
U.K. PROPN compound startup
startup NOUN dobj buying
for ADP prep buying
$ SYM quantmod billion
1 NUM compound billion
billion NUM pobj for


In [16]:
#Named Entities
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for ent in doc.ents:
    print(ent.text, ent.label_)

Apple ORG
U.K. GPE
$1 billion MONEY


In [20]:
#explanation about entities & pos tags 
spacy.explain('MONEY')

'Monetary values, including unit'

## Rule based matching
- Used for searching text, their lexical attributes, models prediction of the word
- Match Patterns: list of dictionaries. Each dictionary defines a token attribute and its value to match
- Matcher will return match_id, start index, end index of match

In [25]:
from spacy.matcher import Matcher
#initialize the matcher with a vocabulary
matcher = Matcher(nlp.vocab)
#Create a pattern dictionary list. The below pattern searches for Tokens 'TEXT' attribute with value 'iPhone' & 'X'
pattern = [{'TEXT':'iPhone'},{'TEXT':'X'}]
#Add this pattern to our matcher. add function has #Name of Pattern, #Callback #actual pattern
matcher.add('PATTERN_IPHONE',None,pattern)
#Use the pattern for matching
doc = nlp("New iPhone X release date leaked")
matches = matcher(doc)
#Iterate over the matcher
for match_id, start, end in matches:
    matched = doc[start:end]
    print(matched.text)

In [29]:
#Pattern with digit and punctuation
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]
matcher.add('PATTERN_IPHONE',None,pattern)
doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)
for match_id, start, end in matches:
    matched = doc[start:end]
    print(matched.text)

2018 FIFA World Cup:


In [42]:
#Pattern using LEMMA, POS
pattern = [
    {'LEMMA': 'love'},
    {'POS': 'NOUN'}
]
matcher.add('PATTERN_IPHONE',None,pattern)
doc = nlp("I loved dogs but now I love cats more. I love pizza. I love to travel.")
matches = matcher(doc)
for match_id, start, end in matches:
    matched = doc[start:end]
    print(matched.text)

loved dogs
love cats
love pizza


In [44]:
#Pattern ADJ followed by a NOUN and an optional next NOUN
pattern = [
    {'POS':'ADJ'},
    {'POS':'NOUN'},
    {'POS':'NOUN','OP':'?'}
]
matcher.add('PATTERN_IPHONE',None,pattern)
doc = nlp(
    "Features of the app include a beautiful design, smart search, automatic "
    "labels and optional voice responses."
)
matches = matcher(doc)
for match_id, start, end in matches:
    matched = doc[start:end]
    print(matched.text)

beautiful design
smart search
automatic labels
optional voice responses


## Similarity
- Compares 2 items and gives the similarity score(0 to 1). The items can be doc, slice or a token
- This uses word vectors which are included in core medium or large models (models ending with 'md' or 'lg')
- Spacy uses cosine similarity between two vectors give the results

In [1]:
import spacy
nlp = spacy.load('en_core_web_md')

In [2]:
#Compare doc to doc
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

0.8627204117787385


In [15]:
#Compare token to token
doc = nlp("I like France & England")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

0.6446904


In [17]:
#Compare doc to token
doc = nlp('I like pizza.')
token =nlp('pasta')
print(doc.similarity(token))

0.5286024505739466


## Word Vectors
- Multidimensional representation of meanings of word
- Word2Vec is Neural network model to build word Vectors. It predicts the context given a word. This will help us get similar words in same context.
- If the model is trained using 300 neuron hidden layer then we get a 300 element vector for each word.
- the doc.vector attribute gives us the vector representation

In [18]:
doc = nlp("I have a banana")
#The 300 vector representation of word banana
print(doc[3].vector)

[ 2.0228e-01 -7.6618e-02  3.7032e-01  3.2845e-02 -4.1957e-01  7.2069e-02
 -3.7476e-01  5.7460e-02 -1.2401e-02  5.2949e-01 -5.2380e-01 -1.9771e-01
 -3.4147e-01  5.3317e-01 -2.5331e-02  1.7380e-01  1.6772e-01  8.3984e-01
  5.5107e-02  1.0547e-01  3.7872e-01  2.4275e-01  1.4745e-02  5.5951e-01
  1.2521e-01 -6.7596e-01  3.5842e-01 -4.0028e-02  9.5949e-02 -5.0690e-01
 -8.5318e-02  1.7980e-01  3.3867e-01  1.3230e-01  3.1021e-01  2.1878e-01
  1.6853e-01  1.9874e-01 -5.7385e-01 -1.0649e-01  2.6669e-01  1.2838e-01
 -1.2803e-01 -1.3284e-01  1.2657e-01  8.6723e-01  9.6721e-02  4.8306e-01
  2.1271e-01 -5.4990e-02 -8.2425e-02  2.2408e-01  2.3975e-01 -6.2260e-02
  6.2194e-01 -5.9900e-01  4.3201e-01  2.8143e-01  3.3842e-02 -4.8815e-01
 -2.1359e-01  2.7401e-01  2.4095e-01  4.5950e-01 -1.8605e-01 -1.0497e+00
 -9.7305e-02 -1.8908e-01 -7.0929e-01  4.0195e-01 -1.8768e-01  5.1687e-01
  1.2520e-01  8.4150e-01  1.2097e-01  8.8239e-02 -2.9196e-02  1.2151e-03
  5.6825e-02 -2.7421e-01  2.5564e-01  6.9793e-02 -2

In [19]:
#Problem of using default similarity measure.
doc1 = nlp('I like cats.')
doc2 = nlp('I hate cats.')
doc1.similarity(doc2)
#The score says they are very similar because both talks about feeling towards cat. But the sentiment is negative here.

0.9632563384120956

## Spacy Pipeline
- when we instantiate en_core_web_md and pass a text it executes a pipeline and create a doc
- the pipeline consists of:
    - tokenizer
    - tagger
    - parser
    - ner
- we can add custom components to this pipeline. Define a function to do some action on doc object. Then add it to the pipeline using add_pipe 
- position of the custom component is determined by keyword last, first, after, before of add_pipe method

In [1]:
import spacy
nlp = spacy.load('en_core_web_md')
#Define component action using function
def myComponent(doc):
    print('Doc Length:', len(doc))
    return doc

#Add component to pipeline
nlp.add_pipe(myComponent,first=True)

#Print pipeline components
print(nlp.pipe_names)

['myComponent', 'tagger', 'parser', 'ner']


In [2]:
#create the doc item and see our component in action
doc = nlp("Hello world!")

Doc Length: 3


## Training Model for own data
- This is needed if the model need to perform better on our domain
- The pretrained models are very generic
- This is more critical for Named Entity Recognition
### Steps in training
    - Initialize the model weights randomly with nlp.begin_training
    - Predict a few examples with the current weights by calling nlp.update
    - Compare prediction with true labels
    - Calculate how to change weights to improve predictions
    - Update weights slightly
    - Go back to 2

In [3]:
#Creating training data for NER. Spacy needs data as a tuple
#We need the text and the entity start position, end position and the name of entity
("iPhone X is coming", {'entities': [(0, 8, 'GADGET')]})
#Also we need some cases where there are no entities to extract
("I need a new phone! Any tips?", {'entities': []})

('I need a new phone! Any tips?', {'entities': []})

In [4]:
#The above training data can be automatically created using pattern mathcer
from spacy.matcher import Matcher
from spacy.lang.en import English
nlp = English()
matcher = Matcher(nlp.vocab)
# Two tokens whose lowercase forms match 'iphone' and 'x'
pattern1 = [{"LOWER": "iphone"}, {"LOWER": "x"}]
# Token whose lowercase form matches 'iphone' and an optional digit
pattern2 = [{"LOWER": "iphone"}, {"IS_DIGIT": True, "OP": "?"}]
# Add patterns to the matcher
matcher.add("GADGET", None, pattern1, pattern2)

In [11]:
#Create training data using matchers created above
TRAINING_DATA = []
TEXTS = ['How to preorder the iPhone X', 
         'iPhone X is coming', 
         'Should I pay $1,000 for the iPhone X?', 
         'The iPhone 8 reviews are here', 
         'Your iPhone goes up to 11 today', 
         'I need a new phone! Any tips?']
# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, "GADGET") for span in spans]
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {"entities": entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)
print(*TRAINING_DATA, sep="\n")

[]
[]
('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET'), (20, 26, 'GADGET')]})
('iPhone X is coming', {'entities': [(0, 8, 'GADGET'), (0, 6, 'GADGET')]})
('Should I pay $1,000 for the iPhone X?', {'entities': [(28, 36, 'GADGET'), (28, 34, 'GADGET')]})
('The iPhone 8 reviews are here', {'entities': [(4, 12, 'GADGET')]})
('Your iPhone goes up to 11 today', {'entities': [(5, 11, 'GADGET')]})
('I need a new phone! Any tips?', {'entities': []})


In [17]:
# Create a blank 'en' model
nlp = spacy.blank("en")

# Create a new entity recognizer and add it to the pipeline
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)

# Add the label 'GADGET' to the entity recognizer
ner.add_label("GADGET")

In [32]:
import random
# Start the training
nlp.begin_training()
# Loop for 10 iterations
for itn in range(30):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}
    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]
        # Update the model
        nlp.update(texts, annotations, losses=losses)

In [35]:
doc = nlp('iPhone is going on sale in London.')
for ent in doc.ents:
    print(ent.text, ent.label_)

iPhone GADGET
