# NICF – Natural Language Processing (NLP) with Python for Beginners

## Topic 1 Overview of NLP and Deep Learning

## Topic 2 Language Modeling



### Install spaCy

In [None]:
# Uncomment exactly ONE of the install commands below.
# pip pins an exact version with a double equals sign ("==").

#Install spaCy v2
#!pip3 install spacy==2.2.4

#Install spaCy v3
#!pip3 install spacy==3.0.3

In [None]:
import spacy
print(spacy.__version__)

### The nlp object

In [None]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

In [None]:
nlp

### The Doc object

In [None]:
# Created by processing a string of text with the nlp object
doc = nlp("Hello world!")
doc

In [None]:
# Read the sample file inside a context manager so the file handle is
# closed as soon as the text has been loaded.
with open('sample.txt') as sample_file:
    text = sample_file.read()

# Process the whole file content into a Doc and display it
doc = nlp(text)
doc

### The Token object

In [None]:
doc = nlp("Hello world!")
doc

# Iterate over tokens in a Doc
for token in doc:
    print(token.text)

In [None]:
# Index into the Doc to get a single Token
token = doc[1]

# Get the token text via the .text attribute
print(token.text)

#### Activity: The Doc and Token Object

In [None]:
# Activity template: replace every ____ placeholder before running the cell.

# Import the English language class
from spacy.lang.____ import ____

# Create the nlp object
nlp = ____

# Process a text
doc = nlp("This is a sentence.")

# Print the document text
print(____.text)

#### Solution: The Doc and Token Object

In [None]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

# Process the text
doc = nlp("This is a sentence.")

# Print the document text (the blank the activity asks you to fill in)
print(doc.text)

# Print each token's text to show how the Doc was split into tokens
for token in doc:
    print(token.text)

#### Activity: The Token Object

In [None]:
# Activity template: replace every ____ placeholder before running the cell.

# Import the English language class and create the nlp object
from ____ import ____

nlp = ____

# Process the text
doc = ____("I like tree kangaroos and narwhals.")

# Select the first token
first_token = doc[____]

# Print the first token's text
print(first_token.____)

#### Solution: The Token Object

In [None]:
# Import the English language class and create the nlp object
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# Select the first token
first_token = doc[0]

# Print the first token's text
print(first_token.text)

### Stopwords

In [None]:
stopwords = nlp.Defaults.stop_words
stopwords

In [None]:
import string

# A set makes `in` an exact-match test. Against the raw string.punctuation
# string, `in` is a substring test, so "" or "()" would count as punctuation.
punctuations = set(string.punctuation)

# Read the file inside a context manager so the handle is closed promptly
with open('sample.txt') as sample_file:
    text = sample_file.read()
doc = nlp(text)

# Compare the lower-cased token so capitalised stop words ("The", "She") are
# filtered too; spaCy's English stop-word list is written in lower case.
tokens = [
    token.text
    for token in doc
    if token.lower_ not in stopwords and token.text not in punctuations
]
cleaned_doc = ' '.join(tokens)
cleaned_doc


#### Activity: Stopwords

In [None]:
text= "Clutching the coin, Maria ran to the shops. She went straight to the counter and bought the sweets"
doc = nlp(text)

# Match stop words on the lower-cased token so the sentence-initial "She" is
# removed as well; `punctuations` comes from the previous cell.
tokens = [tok.text for tok in doc if tok.lower_ not in stopwords and tok.text not in punctuations]
cleaned_doc = ' '.join(tokens)
cleaned_doc

### The Span object

In [None]:
doc = nlp("Hello world!")

# A slice from the Doc is a Span object
span = doc[1:3]

# Get the span text via the .text attribute
print(span.text)

#### Activity: The Span Object

In [None]:
# Import the English language class and create the nlp object
from ____ import ____

nlp = ____

# Process the text
doc = ____("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos = ____
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = ____
print(tree_kangaroos_and_narwhals.text)

#### Solution: The Span Object

In [None]:
# Import the English language class and create the nlp object
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp("I like tree kangaroos and narwhals.")

# A slice of the Doc for "tree kangaroos"
tree_kangaroos = doc[2:4]
print(tree_kangaroos.text)

# A slice of the Doc for "tree kangaroos and narwhals" (without the ".")
tree_kangaroos_and_narwhals = doc[2:6]
print(tree_kangaroos_and_narwhals.text)

### Lexical Attributes

In [None]:
doc = nlp("It costs $5.")

In [None]:
print("Index:   ", [token.i for token in doc])
print("Text:    ", [token.text for token in doc])

print("is_alpha:", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

#### Activity: Lexical Attributes

In [None]:
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)

# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if ____.____:
        # Get the next token in the document
        next_token = ____[____]
        # Check if the next token's text equals "%"
        if next_token.____ == "%":
            print("Percentage found:", token.text)

#### Solution: Lexical Attributes

In [None]:
from spacy.lang.en import English

nlp = English()

# Process the text
doc = nlp(
    "In 1990, more than 60% of people in East Asia were in extreme poverty. "
    "Now less than 4% are."
)
# Iterate over the tokens in the doc
for token in doc:
    # Check if the token resembles a number
    if token.like_num:
        # Get the next token in the document
        next_token = doc[token.i+1]
        # Check if the next token's text equals "%"
        if next_token.text == "%":
            print("Percentage found:", token.text)

### Statistical Model Packages

In [None]:
!python3 -m spacy download en_core_web_sm

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

#### Activity: Model Packages

In [None]:
import spacy

# Load the "en_core_web_sm" model
nlp = ____

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = ____

# Print the document text
print(____.____)

#### Solution: Model Packages

In [None]:
import spacy

# Load the "en_core_web_sm" model
nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Print the document text
print(doc.text)


### Predicting Part-of-Speech Tags

In [None]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Process a text
doc = nlp("She ate the pizza")

# Iterate over the tokens
for token in doc:
    # Print the text and the predicted part-of-speech tag
    print(token.text, token.pos_)

### Predicting Syntactic Dependencies

In [None]:
for token in doc:
    print(token.text, token.pos_, token.dep_, token.head.text)

#### Activity: Predicting POS and Syntactic Dependency

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = ____

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = ____.____
    token_pos = ____.____
    token_dep = ____.____
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

#### Solution: Predicting POS and Syntactic Dependency

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

### Visualize Syntactic Dependency

In [None]:
from spacy import displacy

doc = nlp('She ate the pizza')

# With jupyter=True displacy draws directly into the cell output and returns
# None, so the result is not assigned to a variable.
displacy.render(doc, style='dep', jupyter=True)

### Predicting Named Entities

In [None]:
# Process a text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

#### Activity: Predict Named Entities

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = ____

# Iterate over the predicted entities
for ent in ____.____:
    # Print the entity text and its label
    print(ent.____, ____.____)

#### Solution: Predict Named Entities

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "It’s official: Apple is the first U.S. public company to reach a $1 trillion market value"

# Process the text
doc = nlp(text)

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

### Visualize Named Entities

In [None]:
from spacy import displacy

doc = nlp('Apple is looking at buying U.K. startup for $1 billion')

# With jupyter=True displacy draws directly into the cell output and returns
# None, so the result is not assigned to a variable.
displacy.render(doc, style='ent', jupyter=True)

### Wrong Prediction

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

text = "Upcoming iPhone X release date leaked as Apple reveals pre-orders"

# Process the text
doc = nlp(text)

# Iterate over the entities
for ent in doc.ents:
    # Print the entity text and label
    print(ent.text, ent.label_)

# Get the span for "iPhone X"
iphone_x = doc[1:3]

# Print the span text
print("Missing entity:", iphone_x.text)

### Lemmatization

In [None]:
doc = nlp('I ran to the clinic with running nose')

# Print each token next to its lemma (base form).
# The loop body is indented with four spaces, like the rest of the notebook.
for token in doc:
    print(token.text, token.lemma_)

### Rule-Based Matching

In [None]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

# Load a model and create the nlp object
nlp = spacy.load("en_core_web_sm")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

In [None]:
# Call the matcher on the doc
doc = nlp("Upcoming iPhone X release date leaked")
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

In [None]:
pattern = [
    {'LEMMA': 'love', 'POS': 'VERB'},
    {'POS': 'NOUN'}
]

doc = nlp("I loved dogs but now I love cats more.")

# Initialize the matcher with the shared vocab
matcher = Matcher(nlp.vocab)

# Add the pattern to the matcher
matcher.add('PETS', [pattern])

# Call the matcher on the doc
matches = matcher(doc)

# Iterate over the matches
for match_id, start, end in matches:
    # Get the matched span
    matched_span = doc[start:end]
    print(matched_span.text)

#### Activity: Rule Based Matching

In [None]:
import spacy

# Import the Matcher
from spacy.____ import ____

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = ____(____.____)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [____]

# Add the pattern to the matcher
____.____("IPHONE_X_PATTERN",  ____)

# Use the matcher on the doc
matches = ____
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

#### Solution: Rule Based Matching

In [None]:
import spacy

# Import the Matcher
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
doc = nlp("Upcoming iPhone X release date leaked as Apple reveals pre-orders")

# Initialize the Matcher with the shared vocabulary
matcher = Matcher(nlp.vocab)

# Create a pattern matching two tokens: "iPhone" and "X"
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]

# Add the pattern to the matcher
matcher.add("IPHONE_X_PATTERN", [pattern])

# Use the matcher on the doc
matches = matcher(doc)
print("Matches:", [doc[start:end].text for match_id, start, end in matches])

# --- Extra example: combine exact text with a lexical attribute ---
# A separate matcher is used so the iPhone pattern above does not interfere.
ios_doc = nlp(
    "After making the iOS update you won't notice a radical system-wide "
    "redesign: nothing like the aesthetic upheaval we got with iOS 7. Most of "
    "iOS 11's furniture remains the same as in iOS 10. But you will discover "
    "some tweaks once you delve a little deeper."
)
ios_matcher = Matcher(nlp.vocab)

# Write a pattern for full iOS versions ("iOS 7", "iOS 11", "iOS 10")
ios_pattern = [{"TEXT": "iOS"}, {"IS_DIGIT": True}]

# Add the pattern to the matcher and apply the matcher to the doc
ios_matcher.add("IOS_VERSION_PATTERN", [ios_pattern])
ios_matches = ios_matcher(ios_doc)
print("Total matches found:", len(ios_matches))

# Iterate over the matches and print the span text
for match_id, start, end in ios_matches:
    print("Match found:", ios_doc[start:end].text)

## Topic 3 Word Embedding

### Vocab

In [None]:
# Import the English language class
from spacy.lang.en import English

# Create the nlp object
nlp = English()

In [None]:
# Processing the text adds its words to the vocab's string store
doc = nlp("I love coffee")

# Look the hash up once and reuse it for the reverse lookup, instead of
# hard-coding the 64-bit number.
coffee_hash = nlp.vocab.strings["coffee"]
print("hash value:", coffee_hash)
print("string value:", nlp.vocab.strings[coffee_hash])

#### Activity: Vocab

In [None]:
# Activity template: replace every ____ placeholder before running the cell.
from spacy.lang.en import English

nlp = English()
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = ____.____.____[____]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = ____.____.____[____]
print(cat_string)

#### Solution: Vocab

In [None]:
from spacy.lang.en import English

nlp = English()
doc = nlp("I have a cat")

# Look up the hash for the word "cat"
cat_hash = nlp.vocab.strings["cat"]
print(cat_hash)

# Look up the cat_hash to get the string
cat_string = nlp.vocab.strings[cat_hash]
print(cat_string)

#### Activity: Vocab

In [None]:
from spacy.lang.en import English

nlp = English()
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = ____.____.____[____]
print(person_hash)

# Look up the person_hash to get the string
person_string = ____.____.____[____]
print(person_string)

#### Solution: Vocab

In [None]:
# Use the same blank English pipeline as the activity; processing the text
# is what adds the string "PERSON" to the vocab's string store.
from spacy.lang.en import English

nlp = English()
doc = nlp("David Bowie is a PERSON")

# Look up the hash for the string label "PERSON"
person_hash = nlp.vocab.strings["PERSON"]
print(person_hash)

# Look up the person_hash to get the string
person_string = nlp.vocab.strings[person_hash]
print(person_string)

### Lexeme

In [None]:
doc = nlp("I love coffee")
lexeme = nlp.vocab["coffee"]

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

### Install Medium English Model

In [None]:
!python3 -m spacy download en_core_web_md

### Word Vectors

In [None]:
# Load a larger model with vectors
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("I have a banana")
# Access the vector via the token.vector attribute
print(doc[3].vector)

#### Activity: Word Vectors

In [None]:
import spacy

# Load the en_core_web_md model
nlp = ____

# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas"
bananas_vector = ____.____
print(bananas_vector)

#### Solution: Word Vectors

In [None]:
import spacy

# Load the en_core_web_md model
nlp = spacy.load("en_core_web_md")

# Process a text
doc = nlp("Two bananas in pyjamas")

# Get the vector for the token "bananas"
bananas_vector = doc[1].vector
print(bananas_vector)

### Comparing Similarity

In [None]:
# Load a larger model with vectors
nlp = spacy.load("en_core_web_md")

# Compare two documents
doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))

In [None]:
# Compare two tokens
doc = nlp("I like pizza and pasta")
token1 = doc[2]
token2 = doc[4]
print(token1.similarity(token2))

In [None]:
# Compare two vocabulary entries (lexemes) directly.
# The variables are named after the words they actually hold.
mother = nlp.vocab['mother']
father = nlp.vocab['father']

print(mother.similarity(father))


In [None]:
doc1 = nlp("Dogs are awesome.")
doc2 = nlp("Some gorgeous creatures are felines.")

print(doc1.similarity(doc2))


#### Activity: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Get the similarity of doc1 and doc2
similarity = ____.____(____)
print(similarity)

#### Solution: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc1 = nlp("It's a warm summer day")
doc2 = nlp("It's sunny outside")

# Get the similarity of doc1 and doc2
similarity = doc1.similarity(doc2)
print(similarity)

In [None]:
# Compare a document with a token
doc = nlp("I like pizza")
token = nlp("soap")[0]

print(doc.similarity(token))

In [None]:
# Compare a span with a document
span = nlp("I like pizza and pasta")[2:5]
doc = nlp("McDonalds sells burgers")

print(span.similarity(doc))

#### Activity: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

# Get the similarity of the tokens "TV" and "books"
similarity = ____.____(____)
print(similarity)

#### Solution: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("TV and books")
token1, token2 = doc[0], doc[2]

# Get the similarity of the tokens "TV" and "books"
similarity = token1.similarity(token2)
print(similarity)

### Similarity Depends on the Application Context

In [None]:
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

#### Activity: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = ____
span2 = ____

# Get the similarity of the spans
similarity = ____.____(____)
print(similarity)

#### Solution: Comparing Similarity

In [None]:
import spacy

nlp = spacy.load("en_core_web_md")

doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")

# Create spans for "great restaurant" and "really nice bar"
span1 = doc[3:5]
span2 = doc[12:15]

# Get the similarity of the spans
similarity = span1.similarity(span2)
print(similarity)

## Topic 4 Text Classification


### Default Pipeline

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

print(nlp.pipe_names)

In [None]:
# Process the sample sentence with the full default pipeline and show the
# predicted part-of-speech tag of every token.
text = "I ate apple"
doc = nlp(text)
for token in doc:
    print(token.text, token.pos_)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm",exclude=['tagger'])

print(nlp.pipe_names)

In [None]:
# Same sentence, now processed by the pipeline that was loaded without the
# tagger, to compare the part-of-speech output with the previous run.
text = "I ate apple"
doc = nlp(text)
for token in doc:
    print(token.text, token.pos_)

In [None]:
nlp.pipe_names

#### Activity: Inspecting the Pipeline

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = ____

# Print the names of the pipeline components
print(____.____)

# Print the full pipeline of (name, component) tuples
print(____.____)

#### Solution: Inspecting the Pipeline

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load("en_core_web_sm")

# Print the names of the pipeline components
print(nlp.pipe_names)

# Print the full pipeline of (name, component) tuples
print(nlp.pipeline)

### Disable Components

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load("en_core_web_sm",disable=['tagger'])

nlp.pipeline

In [None]:
doc = nlp('The cat sit on the mat')
for token in doc:
    print(token.text,token.pos_)

### Exclude Components

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm",exclude=['tagger'])

print(nlp.pipe_names)

In [None]:
doc = nlp('The cat sit on the mat')
for token in doc:
    print(token.text, token.pos_)

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")
nlp.remove_pipe('tagger')

print(nlp.pipe_names)

In [None]:
doc = nlp('The cat sit on the mat')
for token in doc:
    print(token.text, token.pos_)

### Custom Component

### Example 1

#### SpaCy v2.x

In [None]:
import spacy
from spacy.language import Language

# spaCy v2.x style: a component is a plain function that receives the Doc
# and must return it.
def custom_component(doc):
    print("Doc length:", len(doc))
    return doc

nlp = spacy.load("en_core_web_sm")
# v2.x passes the function object itself to add_pipe; the v3.x cell below
# registers the function with @Language.component and passes its string name.
# NOTE(review): this call form is expected to fail on spaCy v3 -- run on v2.x only.
nlp.add_pipe(custom_component, first=True)

print("Pipeline:", nlp.pipe_names)
doc = nlp("Hello World!")

#### SpaCy v3.x

In [None]:
import spacy
from spacy.language import Language

@Language.component("custom_component")
def custom_component(doc):
    print("Doc length:", len(doc))
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("custom_component", name="custom_component", first=True)

print("Pipeline:", nlp.pipe_names)
doc = nlp("Hello World!")

### Example 2

#### SpaCy v2.x

In [None]:
import spacy
from spacy.language import Language

# spaCy v2.x style component: prints a short report about the Doc and returns it.
def my_component(doc):
    print(f"After tokenization, this doc has {len(doc)} tokens.")
    print("The part-of-speech tags are:", [token.pos_ for token in doc])
    if len(doc) < 10:
        print("This is a pretty short document.")
    return doc

nlp = spacy.load("en_core_web_sm")
# Added last so it runs after every other component in the pipeline.
# NOTE(review): passing the function object is the v2.x call form; the v3.x
# cell below passes a registered string name instead -- run on v2.x only.
nlp.add_pipe(my_component, last=True)
print(nlp.pipe_names)  
doc = nlp("This is a sentence.")

#### SpaCy v3.x

In [None]:
import spacy
from spacy.language import Language

@Language.component("info_component")
def my_component(doc):
    print(f"After tokenization, this doc has {len(doc)} tokens.")
    print("The part-of-speech tags are:", [token.pos_ for token in doc])
    if len(doc) < 10:
        print("This is a pretty short document.")
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("info_component", name="print_info", last=True)
print(nlp.pipe_names)  
doc = nlp("The fox jump over a lazy dog and a cat chase after a slow rat.")

#### Activity: Custom Component

#### SpaCy v2.x

In [None]:
import spacy
from spacy.language import Language

# Define the custom component
def length_component(doc):
    # Get the doc's length
    doc_length = ____
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    ____

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
____.____(____)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")


#### SpaCy v3.x

In [None]:
import spacy
from spacy.language import Language

# Define the custom component
@Language.component("length_component")
def length_component(doc):
    # Get the doc's length
    doc_length = ____
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    ____

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
____.____(____)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")


#### Solution: Custom Component

#### SpaCy v2.x

In [None]:
import spacy
from spacy.language import Language

# Define the custom component (spaCy v2.x style: a plain function)
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
# NOTE(review): passing the function object is the v2.x call form; the v3.x
# solution below passes a registered string name instead -- run on v2.x only.
nlp.add_pipe(length_component,first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")

#### SpaCy v3.x

In [None]:
import spacy
from spacy.language import Language

# Define the custom component
@Language.component("length_component")
def length_component(doc):
    # Get the doc's length
    doc_length = len(doc)
    print(f"This document is {doc_length} tokens long.")
    # Return the doc
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component first in the pipeline and print the pipe names
nlp.add_pipe("length_component", name="length_component",first=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")

#### Activity 2: Custom Component

In [None]:
# Activity template: write the component body and replace every ____ placeholder.
import spacy
from spacy.language import Language

# Define the custom component
@Language.component("my_component2")
def my_component2(doc):
    # [Add your code here]
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component last in the pipeline and print the pipe names
nlp.add_pipe("____", name="____", last=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")

#### Solution 2: Custom Component

In [None]:
import spacy
from spacy.language import Language

# Define the custom component: print every token with its part-of-speech tag
@Language.component("my_component2")
def my_component2(doc):
    for tok in doc:
        print(tok.text, tok.pos_)
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component last in the pipeline and print the pipe names
nlp.add_pipe("my_component2", name="my_component2",last=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("This is a sentence.")

#### Activity 3: Custom Component

In [None]:
# Activity template: fill in the pattern and replace every ____ placeholder.
import spacy
from spacy.language import Language
from spacy.matcher import Matcher

# Define the custom component
@Language.component("my_component3")
def my_component3(doc):
    pattern = [
    ____
    ]
    # Use the vocab of the Doc being processed, so the component does not
    # depend on a global `nlp` variable.
    matcher = Matcher(doc.vocab)
    matcher.add('PETS', [pattern])
    matches = matcher(doc)
    for match_id, start, end in matches:
        matched_span = doc[start:end]
        print(matched_span.text)
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component last in the pipeline and print the pipe names
nlp.add_pipe("___", name="_____",last=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("I loved dogs but now I love cats more.")

#### Solution 3: Custom Component

In [None]:
import spacy
from spacy.language import Language
from spacy.matcher import Matcher

# Define the custom component
@Language.component("my_component3")
def my_component3(doc):
    """Print every "love <noun>" span found in the Doc, then return the Doc."""
    # A verb with the lemma "love" followed by a noun
    pattern = [
        {'LEMMA': 'love', 'POS': 'VERB'},
        {'POS': 'NOUN'}
    ]
    # Use the vocab of the Doc being processed, so the component does not
    # depend on a global `nlp` variable.
    matcher = Matcher(doc.vocab)
    matcher.add('PETS', [pattern])
    for match_id, start, end in matcher(doc):
        matched_span = doc[start:end]
        print(matched_span.text)
    return doc

# Load the small English model
nlp = spacy.load("en_core_web_sm")

# Add the component last in the pipeline (it needs the lemmas and POS tags
# set by the earlier components) and print the pipe names
nlp.add_pipe("my_component3", name="my_component3", last=True)
print(nlp.pipe_names)

# Process a text
doc = nlp("I loved dogs but now I love cats more.")

### Model Training

#### Prepare the Data

In [None]:
import spacy
import random
import json
from spacy.training import Example

# Load the annotated training texts
with open("gadgets.json", encoding="utf8") as f:
    TRAINING_DATA = json.load(f)

nlp = spacy.blank("en")

# add_pipe creates the component, attaches it to the pipeline and returns it.
# The label must be added to THIS instance: a component built separately with
# create_pipe is never part of the pipeline, so labels added to it are lost.
ner = nlp.add_pipe("ner")
ner.add_label("GADGET")

# Pair each tokenized text with its gold annotations
examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in TRAINING_DATA
]

#print(examples)

#### Train the Model

In [None]:
# Initialise the model weights from the training examples so the component
# can infer its labels and shapes from real data. nlp.initialize is the v3
# name for begin_training and returns the optimizer.
optimizer = nlp.initialize(lambda: examples)

for i in range(10):
    random.shuffle(examples)
    losses = {}
    for batch in spacy.util.minibatch(examples, size=3):
        nlp.update(batch, sgd=optimizer, losses=losses)
    # Report the accumulated loss so training progress is visible
    print(f"Epoch {i + 1}: {losses}")

#### Evaluate the Model

In [None]:
doc = nlp("I want to buy a new iPhone 12 and iPhone X this Christmas")
for ent in doc.ents:
    print(ent.text, ent.label_)

In [None]:
from spacy import displacy

doc = nlp('I want to buy a new iPhone 12 and iPhone X this Christmas')

# With jupyter=True displacy draws directly into the cell output and returns
# None, so the result is not assigned to a variable.
displacy.render(doc, style='ent', jupyter=True)

### Text Classification (Example 1)
####  The example below only works for SpaCy v3.x (Error due to compatibility issue)

In [None]:
!python3 -m spacy download en_core_web_md

In [None]:
import spacy
# tqdm is a great progress bar for python
# tqdm.auto automatically selects a text based progress 
# for the console 
# and html based output in jupyter notebooks
from tqdm.auto import tqdm
# DocBin is spacys new way to store Docs in a 
# binary format for training later
from spacy.tokens import DocBin
# We want to classify movie reviews as positive or negative
# http://ai.stanford.edu/~amaas/data/sentiment/
from ml_datasets import imdb
# load movie reviews as a tuple (text, label)
train_data, valid_data = imdb()
# load a medium sized english language model in spacy
nlp = spacy.load("en_core_web_md")

In [None]:
def make_docs(data):
    """
    Turn (text, label) pairs into spaCy Doc objects that carry a text category.

    data: list of (text, label) tuples

    returns: list of Doc objects, each with doc.cats["positive"] set to its label
    """
    # nlp.pipe processes the texts as a stream, which is much faster than
    # calling nlp(text) once per text. With as_tuples=True the first element
    # of each input tuple is treated as the text and the second is handed
    # back unchanged next to the resulting Doc.
    annotated_pairs = nlp.pipe(data, as_tuples=True)

    labelled_docs = []
    for parsed, category in tqdm(annotated_pairs, total=len(data)):
        # store the label as the document's (text)cat(egory)
        # NOTE(review): the label is stored as-is -- confirm the dataset's
        # labels have the form the text classifier expects.
        parsed.cats["positive"] = category
        labelled_docs.append(parsed)

    return labelled_docs

In [None]:
from pathlib import Path

# we are so far only interested in the first 5000 reviews
# this will keep the training time short.
# In practice take as much data as you can get.
# you can always reduce it to make the script even faster.
num_texts = 5000

# make sure the output directory exists before writing into it;
# otherwise saving fails on a fresh checkout
Path("./data").mkdir(parents=True, exist_ok=True)

# first we need to transform all the training data
train_docs = make_docs(train_data[:num_texts])
# then we save it in a binary file to disk
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./data/train.spacy")
# repeat for validation data
valid_docs = make_docs(valid_data[:num_texts])
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./data/valid.spacy")

In [None]:
!python3 -m spacy init fill-config ./base_config.cfg ./config.cfg

In [None]:
!python3 -m spacy train config.cfg --output ./output

In [None]:
import spacy
# load the best model from training
nlp = spacy.load("output/model-best")
print("type : ‘quit’ to exit")
# predict the sentiment until someone writes quit
while True:
    text = input("Please enter example input: ")
    if text == "quit":
        # leave before the sentinel word itself gets classified
        break
    doc = nlp(text)
    # scores above 0.5 for the "positive" category count as positive
    if doc.cats['positive'] >.5:
        print("the sentiment is positive")
    else:
        print("the sentiment is negative")

### Text Classification (Example 2)
#### The example below only works for SpaCy v3.x

#### Download dataset

In [None]:
# Import pandas & read csv file
import pandas as pd
reviews=pd.read_csv("https://raw.githubusercontent.com/hanzhang0420/Women-Clothing-E-commerce/master/Womens%20Clothing%20E-Commerce%20Reviews.csv")

In [None]:
# Extract desired columns and view the dataframe 
reviews = reviews[['Review Text','Recommended IND']].dropna()
reviews.head(10)

#### Add Textcat Pipeline

In [None]:
import spacy

nlp = spacy.blank('en')
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

nlp.pipe_names

#### Preparing Training Data

In [None]:
# Converting the dataframe into a list of tuples
reviews['tuples'] = reviews.apply(lambda row: (row['Review Text'],row['Recommended IND']), axis=1)
train =reviews['tuples'].tolist()
train[:10]

In [None]:
import random

def load_data(limit=0, split=0.8, data=None):
    """Shuffle labelled texts and split them into training and evaluation parts.

    limit: maximum number of shuffled examples to keep; 0 keeps all of them.
    split: fraction of the kept examples that goes into the training part.
    data:  list of (text, label) tuples; defaults to the notebook-level
           `train` list when omitted.

    returns: ((train_texts, train_cats), (dev_texts, dev_cats)) where the
             texts are tuples of strings and the cats are lists of
             {"POSITIVE": bool, "NEGATIVE": bool} dictionaries.
    """
    if data is None:
        data = train
    # Shuffle a copy so the caller's list keeps its original order
    shuffled = list(data)
    random.shuffle(shuffled)
    if limit:
        shuffled = shuffled[:limit]
    if not shuffled:
        # zip(*[]) cannot be unpacked into two names; return empty parts instead
        return ((), []), ((), [])
    texts, labels = zip(*shuffled)
    # get the categories for each review
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]

    # Splitting the training and evaluation data
    cut = int(len(shuffled) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])

n_texts=23486

# Calling the load_data() function 
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)

# Processing the final format of training data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:10]

#### Train the Model

In [None]:
from spacy.training import Example

# Pair each tokenized text with its category annotations
examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in train_data
]

# Initialise only after the examples exist, so the model is set up from real
# data. nlp.initialize is the v3 name for begin_training and returns the optimizer.
optimizer = nlp.initialize(lambda: examples)

for i in range(10):
    random.shuffle(examples)
    losses = {}
    for batch in spacy.util.minibatch(examples, size=3):
        nlp.update(batch, sgd=optimizer, losses=losses)
    # Report the accumulated loss so training progress is visible
    print(f"Epoch {i + 1}: {losses}")


#### Evaluate the Model

In [None]:
test_text="I hate this dress"
doc=nlp(test_text)
doc.cats 

## Topic 5 Overview of Attention Mechanism

In [None]:
!pip3 install spacy-transformers

In [None]:
!python3 -m spacy download en_core_web_trf

In [None]:
import spacy

nlp = spacy.load("en_core_web_trf")
print(nlp.pipe_names) 

In [None]:
math_sent_a = """The most common approach to turn this intuitive idea into a """\
              """precise definition is to define the derivative as a limit of """\
              """difference quotients of real numbers"""
math_sent_b = """The fundamental theorem of calculus relates antidifferentiation """\
              """with integration"""
history_sent_c = """He went on to lead the Conservatives to a record fourth """\
                 """consecutive electoral victory, winning the most votes in """\
                 """British electoral history with over 14 million votes at """\
                 """the 1992 general election, albeit with a reduced majority """\
                 """in the House of Commons"""

In [None]:
import spacy

nlp = spacy.load("en_core_web_trf")

math_nlp_a = nlp(math_sent_a)
math_nlp_b = nlp(math_sent_b)
history_nlp_c = nlp(history_sent_c)

In [None]:
# NOTE(review): index 0 selects the first token of each sentence, and both
# math sentences begin with "The" -- so this compares two tokens with the
# same text, not the two sentences. Confirm this is the intended demonstration.
print(math_nlp_a[0].similarity(math_nlp_b[0]))

In [None]:
print(math_nlp_a[0].similarity(history_nlp_c[0]))

In [None]:
import spacy

TRAIN_DATA = [
    ("He told us a very exciting adventure story.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("She wrote him a long letter, but he didn't read it.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ("I am never at home on Sundays.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
    ("He ran out of money, so he had to stop playing poker.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

nlp = spacy.load("en_core_web_trf")

textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")


In [None]:
import random

import spacy
from spacy.training import Example

# Pair each tokenized text with its category annotations
examples = [
    Example.from_dict(nlp.make_doc(text), annots)
    for text, annots in TRAIN_DATA
]

# The loaded pipeline is already trained; only the newly added textcat
# component still needs its weights initialised.
textcat = nlp.get_pipe("textcat")
textcat.initialize(lambda: examples, nlp=nlp)

optimizer = nlp.create_optimizer()

# Update only the text classifier and leave the pretrained components alone
with nlp.select_pipes(enable=["textcat"]):
    for i in range(10):
        random.shuffle(examples)
        losses = {}
        for batch in spacy.util.minibatch(examples, size=3):
            # pass the optimizer explicitly so the one created above is used
            nlp.update(batch, sgd=optimizer, losses=losses)
        print(f"Epoch {i + 1}: {losses}")