In [None]:
import spacy

# Advanced NLP with spaCy

## Finding words, phrases, names and concepts

### Introduction

In [6]:
# import the english language class
from spacy.lang.en import English

In [7]:
# Create the nlp object from the English language class
nlp = English()

# Create a Doc by processing a string of text with the nlp object
doc = nlp("Hello world!")
# Iterate over the tokens in the Doc and print each token's text
for token in doc:
    print(token.text)

Hello
world
!


In [8]:
# Index into the Doc to get a single Token
token = doc[1]
# Get the token text via the .text attribute
print(token.text)

world


In [9]:
# A slice from the Doc is a Span object.
# NOTE(review): the end index 4 is past the last token of the 3-token Doc
# ("Hello", "world", "!"); per the recorded output the slice still yields "world!".
span = doc[1:4]
# Get the span text via the .text attribute
print(span.text)

world!


In [12]:
# Lexical attributes: collect one row per attribute, then print the rows
doc = nlp("It costs $5.")
lexical_rows = [
    ('Index:    ', [token.i for token in doc]),
    ('Text:     ', [token.text for token in doc]),
    ('is_alpha: ', [token.is_alpha for token in doc]),
    ('is_punct: ', [token.is_punct for token in doc]),
    ('like_num: ', [token.like_num for token in doc]),
]
# Each row prints as "<label> <list of per-token values>"
for label, values in lexical_rows:
    print(label, values)

Index:     [0, 1, 2, 3, 4]
Text:      ['It', 'costs', '$', '5', '.']
is_alpha:  [True, True, False, False, False]
is_punct:  [False, False, False, False, True]
like_num:  [False, False, False, True, False]


### statistical models

In [None]:
'''
Enable spaCy to predict linguistic attributes in context:
    part-of-speech tags
    syntactic dependencies
    named entities
Trained on labeled example texts.
Can be updated with more examples to fine-tune predictions.
'''

In [None]:
#pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')
# What the model package provides:
# - binary weights
# - vocabulary
# - meta information (language, pipeline)

In [4]:
# Predict part-of-speech tags
doc = nlp('She ate the pizza')
for token in doc:
    print(token.text,token.pos_) # token text and its predicted part-of-speech tag

She PRON
ate VERB
the DET
pizza NOUN


In [5]:
# Predict syntactic dependencies: for each token print its text, POS tag,
# dependency label and the text of its syntactic head
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

She PRON nsubj ate
ate VERB ROOT ate
the DET det pizza
pizza NOUN dobj ate


In [6]:
# Predict named entities
doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
# doc.ents holds the entity spans predicted for the Doc
for ent in doc.ents:
    print(ent.text, ent.label_) # entity text and its label

Apple ORG
U.K. GPE
$1 billion MONEY


In [7]:
spacy.explain('GPE') # look up a human-readable description of a label

'Countries, cities, states'

### rule-based matching

In [2]:
# Using the Matcher: load the small English model first
import spacy
nlp = spacy.load('en_core_web_sm')

In [4]:
from spacy.matcher import Matcher
# Initialize the matcher with the vocab of the nlp object
matcher = Matcher(nlp.vocab)

In [7]:
# Add the pattern to the matcher: two consecutive tokens, "iPhone" followed by "X"
pattern = [{'ORTH': 'iPhone'}, {'ORTH':'X'}]
matcher.add('IPHONE_PATTERN',[pattern])

# Process some text
doc = nlp("New iPhone X release date leaked")
# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches; each one unpacks to (match_id, start, end)
for match_id, start, end in matches:
    # start and end are used to slice the matched span out of the doc
    matched_span = doc[start:end]
    print(matched_span.text)
    # Get the span's root token and root head token
    print('Root token:', matched_span.root.text)
    print('Root head token:', matched_span.root.head.text)
    # Get the previous token and its POS tag
    # NOTE(review): for a match starting at token 0, start-1 would be -1 — confirm that case cannot occur
    print('Previous token:', doc[start-1].text, doc[start-1].pos_)

iPhone X
Root token: X
Root head token: release
Previous token: New PROPN


In [22]:
# Matching lexical attributes
# Each dict describes one token; the keys are the token attributes to match on.
# The attribute for all-digit tokens is IS_DIGIT (the original key 'IS_DIGI'
# is not a valid token attribute).
pattern = [{'IS_DIGIT': True},{'LOWER':'fifa'},{'LOWER':'world'},
          {'LOWER':'cup'},{'IS_PUNCT':True}]
doc = nlp("2018 FIFA World Cup: France won!")
# Expected match:
#   2018 FIFA World Cup:

# Matching other token attributes (lemma and part-of-speech tag)
pattern = [{'LEMMA':'love','POS':'VERB'},{'POS':'NOUN'}]
doc = nlp("I loved dogs but now I love cats more")
# Expected matches:
#   loved dogs
#   love cats

In [None]:
# Using operators and quantifiers
pattern = [{'LEMMA': 'buy'},
           {'POS':'DET','OP':'?'}, # optional: match 0 or 1 times
          {'POS':'NOUN'}]
doc = nlp("I bought a smartphone. Now I'm buying apps")
# Expected matches:
#   bought a smartphone
#   buying apps

# Values for the 'OP' key:
#   !  match 0 times
#   ?  match 0 or 1 times
#   +  match 1 or more times
#   *  match 0 or more times

## Large-scale data analysis with spaCy

### Data structures: vocab, lexemes and StringStore

In [29]:
# Shared vocab and string store: look up the hash for a string, then the string for that hash
coffee_hash = nlp.vocab.strings['coffee']
coffee_string = nlp.vocab.strings[coffee_hash]
print ('way 1:', coffee_hash, coffee_string)

# The same lookup through the Doc's vocab gives the same hash (see recorded output)
doc = nlp('I love coffee')
print ('way 2:', doc.vocab.strings['coffee'])

way 1: 3197928453018144401 coffee
way 2: 3197928453018144401


In [30]:
# Lexemes: entries in the vocabulary
doc = nlp('I love coffee')
# Look up a lexeme by its string in the vocab
lexeme = nlp.vocab['coffee']
# Print the lexeme's text, its orth value (the hash shown in the output) and is_alpha flag
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


In [5]:
# Doc object: imports and a fresh English nlp object for building Docs manually
from spacy.lang.en import English
nlp = English()
from spacy.tokens import Doc, Span

In [6]:
# The words and spaces to create the doc from.
# With these values the resulting doc.text is 'Hello world!' (see the next cell's output),
# i.e. only the first word is followed by a space.
words = ['Hello', 'world', '!']
spaces = [True, False, False]
# Create a doc manually, passing in the shared vocab
doc = Doc(nlp.vocab, words=words, spaces=spaces)
# Create a span manually from the doc and a start and end index
span = Span(doc, 0, 2)
# Create a span with a label
span_with_label = Span(doc, 0, 2, label='GREETING')
# Add the span to doc.ents
doc.ents = [span_with_label]

In [11]:
# Show the text of the manually created doc
doc.text

'Hello world!'

In [None]:
'''
Doc and Span are very powerful and hold references and relationships of words and sentences
- convert results to strings as late as possible
- use token attributes if available - for example, token.i for the token index
Don't forget to pass in the shared vocab
'''

### word vectors and semantic similarity

In [None]:
'''
spaCy can compare two objects and predict similarity:
Doc.similarity(), Span.similarity() and Token.similarity()

* needs a model that has word vectors included,
  e.g. en_core_web_md or en_core_web_lg
'''

In [None]:
# Similarity examples (uses the medium model, listed in the notes as having word vectors)
nlp = spacy.load('en_core_web_md')

# Compare two documents
doc1 = nlp('I like fast food')
doc2 = nlp('I like pizza')
print(doc1.similarity(doc2))

# Compare two tokens ("pizza" and "pasta")
doc = nlp('I like pizza and pasta')
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

# Compare a document with a token
doc = nlp('I like pizza')
token = nlp('soap')[0]
print(doc.similarity(token))

# Compare a span with a document
span = nlp('I like pizza and pasta')[2:5]
doc = nlp('McDonalds sells burgers')
print(span.similarity(doc))

In [None]:
'''
Similarity is determined using word vectors (e.g. Word2Vec).
Default: cosine similarity.
Short phrases are better than long documents with many irrelevant words.
'''

In [None]:
# Word vectors in spaCy
# NOTE(review): this reloads 'en_core_web_md', which the similarity cell already loaded
nlp = spacy.load('en_core_web_md')
doc = nlp('I have a banana')
print(doc[3].vector) # access the vector via the token.vector attribute

### combining models and rules

In [None]:
# rule-based matching
# efficient phrase matching
'''
PhraseMatcher: like regular expressions or keyword search,
- but with access to the tokens
- takes Doc objects as patterns
- more efficient and faster than the Matcher
- great for matching large word lists
'''

In [9]:
from spacy.matcher import PhraseMatcher
# The PhraseMatcher takes Doc objects as patterns
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp('Golden Retriever')
# Pass the patterns as a list, consistent with the Matcher.add call used
# earlier in this notebook (the legacy form was add(key, None, pattern)).
matcher.add('DOG', [pattern])

doc = nlp('I have a Golden Retriever')

# Iterate over the matches; each one unpacks to (match_id, start, end)
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print('Matched span: ', span.text)

Matched span:  Golden Retriever


## processing pipelines

### built in pipeline components

In [None]:
'''
Name    | Description             | Creates
tagger  | part-of-speech tagger   | Token.tag
parser  | dependency parser       | Token.dep, Token.head, Doc.sents, Doc.noun_chunks
ner     | named entity recognizer | Doc.ents, Token.ent_iob, Token.ent_type
textcat | text classifier         | Doc.cats
'''

#nlp.pipe_names: list of pipeline component names
#nlp.pipeline: list of (name, component) tuples

### Custom pipeline components

In [None]:
from spacy.language import Language

nlp = spacy.load('en_core_web_sm')


# In spaCy v3 a function component must be registered under a string name
# before it can be added to a pipeline.
@Language.component('custom_component')
def custom_component(doc):
    """Print the number of tokens in the Doc and return the Doc unchanged.

    Args:
        doc: the Doc being processed by the pipeline.

    Returns:
        The same Doc, so the next pipeline component can process it.
    """
    print('Doc length: ', len(doc))
    return doc


# Add the component by its registered name; first=True puts it at the start of the pipeline
nlp.add_pipe('custom_component', first=True)
print('Pipeline:', nlp.pipe_names)
doc = nlp('Hello world!')

### extension attributes

In [9]:
# Add custom metadata to documents, tokens and spans.
# Extensions are registered on the global Doc, Token or Span classes using set_extension.
from spacy.tokens import Doc, Token, Span
Doc.set_extension('title', default=None)
Token.set_extension('is_color', default=False)
Span.set_extension('has_color', default=False)

In [None]:
# 1) Attribute extensions
from spacy.tokens import Token
# 'is_color' is already registered by an earlier cell of this notebook;
# registering the same name again raises a ValueError unless force=True is passed.
Token.set_extension('is_color', default=False, force=True)
doc = nlp('The sky is blue.')

# Overwrite the default value for a single token ("blue")
doc[3]._.is_color = True

In [None]:
# 2)property extensions

##define a getter and an optional setter function
##getter only called when you retrieve the attribute value
from spacy.tokens import Token
def get_is_color(token):
    """Getter: report whether the token's text is one of the known color words.

    Args:
        token: an object exposing a ``text`` attribute.

    Returns:
        True if ``token.text`` is 'red', 'yellow' or 'blue', otherwise False.
    """
    known_colors = ('red', 'yellow', 'blue')
    return any(token.text == color for color in known_colors)

# 'is_color' is already registered by earlier cells of this notebook; force=True
# replaces that registration with the getter instead of raising a ValueError.
Token.set_extension('is_color', getter=get_is_color, force=True)
doc=nlp('The sky is blue')
# The getter is evaluated when the attribute is read
print(doc[3]._.is_color,'-',doc[3].text)

In [None]:
##span extension should almost always use a getter
from spacy.tokens import Span
def get_has_color(span):
    """Getter: report whether any token in the span is a known color word.

    Args:
        span: an iterable of objects exposing a ``text`` attribute.

    Returns:
        True if at least one token's text is 'red', 'yellow' or 'blue', otherwise False.
    """
    known_colors = ('red', 'yellow', 'blue')
    for token in span:
        if token.text in known_colors:
            return True
    return False

# 'has_color' is already registered by an earlier cell of this notebook; force=True
# replaces that registration with the getter instead of raising a ValueError.
Span.set_extension('has_color', getter=get_has_color, force=True)
doc = nlp('The sky is blue')
# Print the extension value and the text for two different spans of the doc
print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

In [None]:
# 3)method extensions

##assign a function that becomes available as an object method
#lets you pass arguments to the extension function
from spacy.tokens import Doc
def has_token(doc, token_text):
    """Report whether any token in the doc has exactly the given text.

    Args:
        doc: an iterable of tokens, each exposing a ``text`` attribute.
        token_text: the text to look for.

    Returns:
        True if some token's text equals ``token_text``, otherwise False.
    """
    # The original computed this value but never returned it, so callers got None.
    return any(token.text == token_text for token in doc)

Doc.set_extension('has_token', method=has_token)
doc = nlp('The sky is blue.')
# has_token accepts only the token text; the '-blue' / '-cloud' labels are
# separate print arguments (passing them to has_token raises a TypeError).
print(doc._.has_token('blue'), '-blue')
print(doc._.has_token('cloud'), '-cloud')

### Scaling and performance

In [None]:
# Processing large volumes of text --> use nlp.pipe
# NOTE(review): lots_of_text is not defined in the visible notebook; presumably an iterable of strings
## bad: calls nlp on each text separately
docs = [nlp(text) for text in lots_of_text]
## good: passes the whole collection to nlp.pipe and collects the resulting Docs
docs = list(nlp.pipe(lots_of_text))

In [15]:
# Passing in context: each item is a (text, context dict) tuple
data = [('This is a text',{'id':1,'page_num':15}),
       ('This is another text', {'id':2,'page_num':30})]

# With as_tuples=True, nlp.pipe yields (doc, context) pairs
for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_num'])

This is a text 15
This is another text 30


In [None]:
# Use nlp.make_doc to turn a text into a Doc object
# NOTE(review): presumably meant for cases where only the tokenized Doc is needed — confirm
## bad
doc = nlp('Hello world')
## good
doc = nlp.make_doc('Hello world')

In [None]:
# Use nlp.disable_pipes to temporarily disable one or more pipes
# NOTE(review): text is not defined in the visible notebook
with nlp.disable_pipes('tagger', 'parser'): # disable tagger and parser inside the block
    doc=nlp(text) # process the text and print the entities
    print(doc.ents)

## Training a neural network model

### Training and updating models

In [None]:
'''
Why update the model?
- better results on your specific domain
- learn classification schemes specifically for your problem
- essential for text classification
- very useful for named entity recognition
- less critical for part-of-speech tagging and dependency parsing
'''

In [None]:
'''
How training works
1. initialize the model weights randomly with nlp.begin_training
2. predict a few examples with the current weights by calling nlp.update
3. compare prediction with true labels
4. calculate how to change weights to improve predictions
5. update weights slightly
6. go back to 2
'''

In [None]:
# Build training examples from matcher hits.
# NOTE(review): TEXTS is not defined in the visible notebook, and `matcher` is whichever
# matcher an earlier cell left in the kernel — confirm both before running.
TRAINING_DATA = []

# Create a Doc object for each text in TEXTS
for doc in nlp.pipe(TEXTS):
    # Match on the doc and create a list of matched spans
    spans = [doc[start:end] for match_id, start, end in matcher(doc)]
    # Get (start character, end character, label) tuples of matches
    entities = [(span.start_char, span.end_char, 'GADGET') for span in spans]
    
    # Format the matches as a (doc.text, entities) tuple
    training_example = (doc.text, {'entities': entities})
    # Append the example to the training data
    TRAINING_DATA.append(training_example)
    
# Print one training example per line
print(*TRAINING_DATA, sep='\n')   

### Training loop

In [None]:
# Example training loop
import random  # used for shuffling; not imported anywhere else in the notebook

from spacy.training import Example

TRAINING_DATA = [
    # The offsets (20, 28) are the character span of 'iPhone X' in this text.
    # (The original text had the typo "preoreder", which shifted 'iPhone X'
    # to (21, 29) and left the annotation misaligned.)
    ('how to preorder the iPhone X',{'entities':[(20,28,'GADGET')]})
    #And many more examples...
]
for i in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    # Create batches and iterate over them; spaCy v3's minibatch needs an explicit
    # size (2 is the batch size used by the from-scratch loop in this notebook)
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        # spaCy v3's nlp.update takes Example objects instead of separate
        # lists of texts and annotations
        examples = [
            Example.from_dict(nlp.make_doc(text), annotation)
            for text, annotation in batch
        ]
        # Update the model
        nlp.update(examples)
# Save the model
# NOTE(review): path_to_model is not defined in the visible notebook — set it before running
nlp.to_disk(path_to_model)

In [None]:
# Setting up a new pipeline from scratch
import random  # used for shuffling; not imported anywhere else in the notebook

from spacy.training import Example

# Create a blank 'en' model
nlp = spacy.blank('en')

# Create a new entity recognizer and add it to the pipeline.
# In spaCy v3, add_pipe takes the component's string name and returns the component
# (create_pipe followed by add_pipe(component) is the v2 form).
ner = nlp.add_pipe('ner')

# Add the label 'GADGET' to the entity recognizer
ner.add_label('GADGET')

# Start the training: initialize the model weights
nlp.initialize()
# Loop for 10 iterations
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Batch the examples and iterate over them
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        # spaCy v3's nlp.update takes Example objects built from an unprocessed Doc
        # and the annotation dict, instead of separate text and annotation lists
        examples = [
            Example.from_dict(nlp.make_doc(text), annotations)
            for text, annotations in batch
        ]

        # Update the model; losses accumulates across the batches of this iteration
        nlp.update(examples, losses=losses)
        print(losses)

# Process each text in TEST_DATA
# NOTE(review): TEST_DATA is not defined in the visible notebook
for doc in nlp.pipe(TEST_DATA):
    # Print the document text and entities
    print(doc.text)
    print(doc.ents, '\n\n')

### training best practices

In [None]:
'''
problem 1: models can forget things
 -'catastrophic forgetting' problem. existing model can overfit on new data
solution 1: mix in previously correct predictions

problem 2: models can't learn everything
 -spaCy's models make predictions based on local context
 -a model can struggle to learn if the decision is difficult to make based on context
 -the label scheme needs to be consistent and not too specific
solution 2: plan your label scheme carefully
 -pick categories that are reflected in local context
 -more generic is better than too specific
 -use rules to go from generic labels to specific categories