In [8]:
"""Chapter 1: Finding words, phrases, names and concepts"""
#importing the english language class
from spacy.lang.en import English

#the nlp object/processor which is a pipeline
nlp = English()

In [17]:
"""
Tokenzing in spacy
when using spacy nlp objects to process texts it will create a doc object
"""
doc = nlp("Mrs. Smith you look nice. How are you today?")
doc1 = doc[1]
#getting the token text via .text attribute
doc1_text = doc1.text
for i in doc:
    print(i)
print(doc1,doc1_text)

Mrs.
Smith
you
look
nice
.
How
are
you
today
?
Smith Smith


In [10]:
from nltk.tokenize import word_tokenize

txt = "Mrs. Smith you look nice. How are you today?"
print(word_tokenize(doc))

['Mrs.', 'Smith', 'you', 'look', 'nice', '.', 'How', 'are', 'you', 'today', '?']


In [18]:
"""
Span in spaCy
span is a slice of a obejcts that consits of 1<=
it's only a view of the object and doesn't provide any data
"""
#a span can be created via a python slice
txt = doc[1:3]

#we can directly get the index of token
print([token.text for token in doc])
print([token.i for token in doc])

['Mrs.', 'Smith', 'you', 'look', 'nice', '.', 'How', 'are', 'you', 'today', '?']
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [23]:
"""
Returns a boolean value:
is_alpha: whther the char is an english alpahbet
is_punct: whether it's punctutation
like_num: is the similar to a number(10 or ten would be return True)
"""

doc = nlp("Mrs. Smith you look nice. How are you today? one")

print("is_alpha", [token.is_alpha for token in doc])
print("is_punct:", [token.is_punct for token in doc])
print("like_num:", [token.like_num for token in doc])

is_alpha [False, True, True, True, True, False, True, True, True, True, False, True]
is_punct: [False, False, False, False, False, True, False, False, False, False, True, False]
like_num: [False, False, False, False, False, False, False, False, False, False, False, True]


In [4]:
""" 
Statistical models
it allows spacy to make predictions based on contexts
ex: POS, Syntactic dependencies, Named Enities
spaCy comes with a pre-trained model packages for use
it can be downloaded via (python -m spacy download en_core_web_sm)
"""
import spacy
nlp = spacy.load("en_core_web_sm") #trained on web texts

In [36]:
doc = nlp("She likes apple")
for token in doc:
    #in spacy attributes that return string usually end in an underscore, otherwise it returns an int
    print(token.pos_)

PRON
VERB
NOUN


In [37]:
"""
A dependency parser analyzes the grammatical structure of a sentence, 
establishing relationships between "head" words and words which modify those heads.
"""
for token in doc:
    #dep_: dependency
    #token.head:the parent token the word is attached to 
    print(token.text, token.pos_,token.dep_,token.head.text)

She PRON nsubj likes
likes VERB ROOT likes
apple NOUN dobj likes


In [25]:
"""Named Entity"""

#seems like captilization is important
doc = nlp("Apple is looking at Twitter")

for ne in doc.ents:
    print(ne.text, ne.label_)

Apple ORG
Twitter PRODUCT


In [31]:
""" Getting help from spacy"""
print(spacy.explain("dobj"))
print(spacy.explain("det"))
print(spacy.explain("PRODUCT"))

None
determiner
Objects, vehicles, foods, etc. (not services)


In [2]:
"""spaCy Match"""
"""
instead of re, spacy Match on Doc objects, token and str,whereas re only works on str
it can also search for other lexical attributes
rules can be written for the model's prediction
ex: find duck if only it is a verb
Match patterns are list of dicts:
token texts:[{"TEXT":"iPhone"},{"TEXT":"X"}]
lexical attributes:[{"LOWER":"iphone"},{"LOWER":"x"}]
any token attributes: [{"LEMMA":"buy","POS":"NOUN"}]
each dict contains the key-value pair of the names of token attribute and their expected values
"""

import spacy
#import the Matcher
from spacy.matcher import Matcher

#load the model
nlp = spacy.load("en_core_web_sm")

# #intialize the matcher with the shared vocab
#will be explained, rn just remember to pass it in
matcher = Matcher(nlp.vocab)
#creating a pattern and adding it to Matcher
pattern=[{"TEXT":"iPhone"},{"TEXT":"X"}]
#1st arg:unique id to identify which pattern was matched
#2nd arg: optional callback
#3rd arg: the pattern
#https://stackoverflow.com/questions/66164156/problem-with-using-spacy-matcher-matcher-matcher-add-method
matcher.add("IPHONE_PATTRN",[pattern])

doc = nlp("upcoming iPhone X release date")
#this will return a list of tuples(match-id,start index,end index)
matched=matcher(doc)

for matchID,start,end in matched:
    #matchID: hash id 
    print(doc[start:end].text)

iPhone X


In [41]:
pattern = [{"LOWER":"mlb"},{"LOWER":"the"},{"LOWER":"show"},{"IS_DIGIT":True}]
matcher.add("MLB",[pattern])
doc=nlp("MLB The Show 2018 > FIFA 2020")
matched = matcher(doc)
for x,y,z in matched:
    print(doc[y:z].text)

MLB The Show 2018


In [52]:
#when the love word has verb speech-tag followed by a noun
pattern = [{"LEMMA":"love","POS":"VERB"},{"POS":"NOUN"}]
matcher.add("MLB",[pattern])
doc=nlp("i love pizza and i love ball")
matched=matcher(doc)
for x,y,z in matched:
    print(doc[y:z].text)

love pizza
love ball


In [15]:
"""using identifiers and modifiers"""
pattern = [{"LEMMA":"buy"},{"POS":"DET","OP":"+"},{"POS":"NOUN"}]
pattern2 = [{"LOWER":"mlb"},{"LOWER":"the"},{"LOWER":"show"},{"IS_DIGIT":True}]
# pattern = [{"LEMMA":"buy"},{"POS":"DET","OP":"?"},{"POS":"NOUN"}]
#{"POS":"DET","OP":"?"}: OP: operator, ?: a quantifier(similar to modifier for re) and in this 0 or 1 times
matcher.add("MLB",[pattern,pattern2])
doc=nlp("I bought a smartphone. Now I'm buying apps. I also like MLB The Show 2018")
matched=matcher(doc)
for x,y,z in matched:
    print(doc[y:z].text)
# for i in doc:
#     print(i,i.pos_)

"""
OP can have these values:
!: match 0 times
?: 0 or 1 times
+: 1 or more times
*: 0 or more times
"""

bought a smartphone
MLB The Show 2018


'\nOP can have these values:\n!: match 0 times\n?: 0 or 1 times\n+: 1 or more times\n*: 0 or more times\n'

In [42]:
"""Chapter 2: Large-scale data analysis with spaCy"""

"""
Vocab: stores all shared data in vocab, it contains the word and its tags
spaCy encodes all strs to hash codes
str are only stored once in the StringStore via nlp.vocab.strings
String Sotre:look up values in both directions(hash->str||str->hash)
"""

word_hash=nlp.vocab.strings["coffee"]
word_str=nlp.vocab.strings[word_hash]
print(word_str,word_hash)

#we can also use/do this via doc
doc = nlp("coffee is overrated")
print(doc.vocab.strings["coffee"])

coffee 3197928453018144401
3197928453018144401


In [43]:
#lexmes objects are an context-independent entry in vocab
lexme = nlp.vocab["coffee"]
print(lexme.text,lexme.orth,lexme.is_alpha)

coffee 3197928453018144401 True


In [5]:
"""Creating a Doc manually"""
"""
although a Doc object is automaticlly created when creating a nlp
a Doc object can also be created manually
"""

from spacy.lang.en import English
from spacy.tokens import Doc, Span
nlp = English()
words = ["Hello","World","!"]
spaces = [True,False,False]
#spaces=spaces: whether the word will be followed by a space, this is also includes the word
doc = Doc(nlp.vocab,words=words,spaces=spaces)

#spans can also be created manually
span = Span(doc,0,2)
#labeling the span
span_labeled=Span(doc ,0,2)
#adding a ent to doc
doc.ents = [span_labeled]
for i in doc.ents:
    print(i.text)

In [4]:
"""Word Vector and semantic similairty"""

"""
Note:
comparing words and their similaities requires a larger model than "en_core_web_sm"(small)
valid ones are: "en_core_web_md"(medium) & "en_core_web_lg"(large)
"""

import spacy
# download the large version later
nlp = spacy.load("en_core_web_md")

#what is depended by? subject? meaning? target?

"DOC"
doc1 = nlp("I like chicken")
doc2 = nlp("I like beef")
doc1.similarity(doc2)

0.9420460095864367

In [5]:
"TOKEN"
t1,t2=doc1[2],doc2[2]
t1.similarity(t2)

0.795485

In [6]:
"DOC AND TOKEN"
t2 = doc2[2]
doc1.similarity(t2)

0.5900396443523004

In [8]:
"SPAN AND DOC"
doc = nlp("I like fruits and potatoes")
span = doc[2:5]
span.similarity(doc1)

0.608555753616397

In [13]:
doc = nlp("This was a great restaurant. Afterwards, we went to a really nice bar.")
print(doc[3:5])

great restaurant.


In [33]:
from ideas import p1,p2
import spacy
import en_core_web_md

nlp = en_core_web_md.load()
doc1 = nlp(p1)
doc2 = nlp(p2)

doc1.similarity(doc2)



hello intj
how advmod
are ROOT
you nsubj


In [3]:
"""Combining Rules and models"""
from spacy.matcher import PhraseMatcher
import en_core_web_md as md
#PhraseMatcher is faster than Matcher which is better for a large text size

nlp = md.load()

pattern = nlp("Mango Juice")
pattern2 = nlp("mango juice")
matcher = PhraseMatcher(nlp.vocab)
#instead of taking list of Dicts as args it instead takes Doc objects
matcher.add("MANGO",[pattern,pattern2])

text = "I love apple juice and mango juice"
text=nlp(text)
for matchId,start,end in matcher(text):
    print(text[start:end])

mango juice


In [6]:
"""Chpater 3: Pipelines"""
"""
Custom Pipeline:
we can add our own function that will be executed when the nlp object is called
"""
from spacy.language import Language
import spacy

# @Language.component("custom_work")
def custom_work(doc):
    print("worked")
    return doc

#last=True: add its as the last 
#first=True: add its as the first
# before: before a certain pipeline
# after: after a certain pipeline 
# nlp.add_pipe("custom_work")
nlp.pipe_names
txt = nlp("This is Harry Potter")

txt.ents


worked


(Harry Potter,)

In [2]:
""" Custom Attributes"""
"""
Features: add custom metatdata, to docs, tokens, and spans
used via the "._" property for custom attributes
this makes it is to differenitate whether it was made by the user or not
"""

#to set attributes it needs to be set on global classes
from spacy.tokens import Doc,Token,Span
import en_core_web_md
nlp = en_core_web_md.load()

try:
    Doc.set_extension("title",default=None)
    Span.set_extension("has_color",default=False)
    Token.set_extension("is_color",default=False)
except:
    print(0)

doc = nlp("This is my doc. This is blue")

doc._.title = "My doc"
for token in doc:
    if token.text == "blue":
        token._.is_color=True
        

"""
Types of extensions:
Attribute, Property, Method
"""

doc._.title
doc[7]._.is_color


True

In [15]:
"""Getter"""

#Property extensions
def get_rgb(token):
    rgb = ["red","green","blue"]
    return token.text.lower() in rgb

try:
    Token.set_extension("is_rgb",getter=get_rgb)
except:
    pass
#TRY A SETTER NEXT
doc = nlp("This is blue. BLUE")
doc[2]._.is_rgb



True

In [19]:
"""Span Extenstions should always use a getter"""
#Property extensions
def get_if_rgb(span):
    rgb = ["red", "green", "blue"]
    return any(token.text.lower() in rgb for token in span)
try:
    Span.set_extension("has_color",getter=get_if_rgb,force=True)
except Exception as e:
    print(e)

doc[0:4]._.has_color


True

In [24]:
"""Method extension"""
"""
Features:
assign functions that can be called as an object method
allows passing of args
"""

from spacy.tokens import Doc

def has_token(doc,txt):
    return txt.lower() in [token.text.lower() for token in doc]

# Doc.set_extension("has_token",method=has_token)

doc=nlp("I like the color red.")

doc._.has_token("RED")


True

In [29]:
import spacy
from spacy.tokens import Span

nlp = spacy.load("en_core_web_md")


def get_wikipedia_url(span):
    # Get a Wikipedia URL if the span has one of the labels
    if span.label_ in ("PERSON", "ORG", "GPE", "LOCATION"):
        entity_text = span.text.replace(" ", "_")
        return "https://en.wikipedia.org/w/index.php?search=" + entity_text


# Set the Span extension wikipedia_url using the getter get_wikipedia_url
# Span.set_extension("wikipedia_url", getter=get_wikipedia_url)

doc = nlp(
    "In over fifty years from his very first recordings right through to his "
    "last album, David Bowie was at the vanguard of contemporary culture."
)
for ent in doc.ents:
    # Print the text and Wikipedia URL of the entity
    print(ent.text, ent._.wikipedia_url)
    # print(ent)
nlp.pipe_names

over fifty years None
first None
David Bowie https://en.wikipedia.org/w/index.php?search=David_Bowie


['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [None]:
import json
from spacy.lang.en import English
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())

with open("exercises/en/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [Span(doc, start, end, label="GPE") for match_id, start, end in matches]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

# Register the Span extension attribute "capital" with the getter get_capital
Span.set_extension("capital", getter=get_capital)

# Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protect its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])

In [1]:
"""Processing and performance"""
import spacy,en_core_web_md

nlp = en_core_web_md.load()
"""
"SLOW"
docs = [nlp(text) for texts in MANY_TEXTS]
"FAST"
nlp.pipe will make faster since it is a generator and yields objects
docs = list(nlp.pipe(MANY_TEXTS))
"""


'\n"SLOW"\ndocs = [nlp(text) for texts in MANY_TEXTS]\n"FAST"\nnlp.pipe will make faster since it yields objects\ndocs = list(nlp.pipe(MANY_TEXTS))\n'

In [2]:
"""
Setting as_tuples=True on nlp.pipe lets you pass in (text, context) tuples
Yields (doc, context) tuples
Useful for associating metadata with the doc
"""

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

This is a text 15
And another text 16


In [3]:
"""Extensions and pipe"""
from spacy.tokens import Doc

Doc.set_extension("id", default=None)
Doc.set_extension("page_number", default=None)

data = [
    ("This is a text", {"id": 1, "page_number": 15}),
    ("And another text", {"id": 2, "page_number": 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context["id"]
    doc._.page_number = context["page_number"]

In [9]:
"""Tokenizing only"""
"slow"
doc = nlp("The sky is blue.")
doc[0].like_num

False

In [10]:
"this returns a Doc object and only the words tokenized"
"so this will be a lot faster"
doc = nlp.make_doc("The sky is blue.")

In [33]:
"""Disabling pipelines"""
import spacy
nlp = spacy.load("en_core_web_md")

text="Joe Biden"
with nlp.disable_pipes("parser","ner"):
    doc = nlp(text)
    print(doc.ents)

()


In [21]:
from spacy.lang.en import English

nlp = English()

people = ["David Bowie", "Angela Merkel", "Lady Gaga"]

patterns = [nlp(person) for person in people]

for i in patterns:
    print(i)

David Bowie
Angela Merkel
Lady Gaga


In [22]:
patterns = list(nlp.pipe(people))

for i in patterns:
    print(i)

David Bowie
Angela Merkel
Lady Gaga


In [None]:
"""Chapter 4"""

"""Training Loops"""

"""
Steps of traning:
1)Loop for a number of times.
2)Shuffle the training data.
3)Divide the data into batches.
4)Update the model for each batch.
5)Save the updated model.
"""

""" The
numbers printed to the shell represent the loss on each iteration, the amount of
work left for the optimizer. The lower the number, the better.
"""
"Do both sentiment analysis using both spacy and nltk to see which gets better results"
