In [1]:
# !python -m spacy download en_core_web_lg
# from spacy.lang.en import English
# nlp = English()
import spacy
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_lg")

In [2]:
# Tagger (part of speech tagger) -> Token.tag, Token,pos
# Parser (Dependency parser) -> Token.dep, Token,head, Doc.sents, Doc.noun_chunks
# ner (Named entity recognizer) -> Doc.ents, Token.ent_iob, Token.ent_type
# textcat (Text classifier) -> Doc.cats

doc = nlp("1. Let's learn Spacy for $0 billion")
# doc = nlp("An awesome tutorial")
print(doc.text)
list(doc.sents), list(doc.noun_chunks)
doc.ents, doc.cats

1. Let's learn Spacy for $0 billion


((1, $0 billion), {})

In [3]:
# token.pos_: returns string, token.pos: returns: ID value
for token in doc:
    print(f"Text({token.i}): {token.text}, Attributr: {token.pos_}, TAG: {token.tag_}")
    print(f"is_alpha: {token.is_alpha}, is_punctuation: {token.is_punct}, like_number: {token.like_num}, is_currency:{token.is_currency}")
    print(f"{token.dep_}")
    print()


Text(0): 1, Attributr: X, TAG: LS
is_alpha: False, is_punctuation: False, like_number: True, is_currency:False
ROOT

Text(1): ., Attributr: PUNCT, TAG: .
is_alpha: False, is_punctuation: True, like_number: False, is_currency:False
punct

Text(2): Let, Attributr: VERB, TAG: VB
is_alpha: True, is_punctuation: False, like_number: False, is_currency:False
ROOT

Text(3): 's, Attributr: PRON, TAG: PRP
is_alpha: False, is_punctuation: False, like_number: False, is_currency:False
nsubj

Text(4): learn, Attributr: VERB, TAG: VB
is_alpha: True, is_punctuation: False, like_number: False, is_currency:False
ccomp

Text(5): Spacy, Attributr: PROPN, TAG: NNP
is_alpha: True, is_punctuation: False, like_number: False, is_currency:False
dobj

Text(6): for, Attributr: ADP, TAG: IN
is_alpha: True, is_punctuation: False, like_number: False, is_currency:False
prep

Text(7): $, Attributr: SYM, TAG: $
is_alpha: False, is_punctuation: False, like_number: False, is_currency:True
quantmod

Text(8): 0, Attributr:

In [4]:
doc = nlp("""
A match is a tool for starting a fire. 
1. Typically, modern matches are made of small wooden sticks or stiff paper. 
2. It don't cost $1 million !
3. One end is coated with a material that can be ignited by frictional heat generated by striking the match 
against a suitable surface. 
4. Wooden matches are packaged in matchboxes, and paper matches are partially cut into rows and stapled 
into matchbooks.""")

### Operator
`{"OP": "!"}` -> Negation: match 0 times  
`{"OP": "?"}` -> Negation: match 0 or 1 times  
`{"OP": "+"}` -> Negation: match 1 or more times  
`{"OP": "*"}` -> Negation: match 0 or more times

In [5]:
matcher = Matcher(nlp.vocab)
pattern = [{"TEXT": "match"}, {"TEXT": "is"}]     # ["match is"]
pattern = [{"TEXT": "matches"}, {"TEXT": "is"}]   # None

pattern = [{"LOWER": "match"}, {"LOWER": "is"}]   # ["match is"]
pattern = [{"LOWER": "matches"}, {"LOWER": "is"}] # None

pattern = [{"LEMMA": "match"}, {"LOWER": "are"}] # ["matches are", "matches are", "matches are"]
matcher.add("1", [pattern])
pattern = [{"LEMMA": "match"}, {"TEXT": "are"}]  # same as above
pattern = [{"LEMMA": "matches"}, {"LOWER": "are"}] # None

for match_id, start, end in matcher(doc):
    print(f"string repr: {nlp.vocab.strings[match_id]}")
    print(doc[start:end].text)


string repr: 1
matches are
string repr: 1
matches are
string repr: 1
matches are


In [6]:
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)
pattern = nlp("ignited by frictional")
matcher.add("2", [pattern])
for match_id, start, end in matcher(doc):
    print(f"string repr: {nlp.vocab.strings[match_id]}")
    print(nlp.vocab.strings[match_id])

string repr: 2
2


In [7]:
spacy_id = nlp.vocab.strings["Spacy"]
print(f"ID: {spacy_id}")
print(f"string repr: {nlp.vocab.strings[spacy_id]}")

ID: 6947170135922310690
string repr: Spacy


In [8]:
# Similarity measure

doc1 = nlp("I like fast food")
doc2 = nlp("I like pizza")
print(doc1.similarity(doc2))
print(f"sim({doc1[2:]}, {doc2[2]}) {doc1[2:].similarity(doc2[2])}")
print(f"sim({doc1[3], doc2[2]}) {doc1[3].similarity(doc2[2])}")
print(f"sim(food, dog) {nlp('food').similarity(nlp('dog'))}")
print(f"sim(food, eat) {nlp('food').similarity(nlp('eat'))}")


0.8627204117787385
sim(fast food, pizza) 0.5443140268325806
sim((food, pizza)) 0.5924741625785828
sim(food, dog) 0.45913576505873555
sim(food, eat) 0.7164894513173897


## Build in pipeline components
- `Tagger` (part of speech tagger) -> Token.tag, Token,pos
- `Parser` (Dependency parser) -> Token.dep, Token,head, Doc.sents, Doc.noun_chunks
- `ner` (Named entity recognizer) -> Doc.ents, Token.ent_iob, Token.ent_type
- `textcat` (Text classifier) -> Doc.cats

Pipeline defined in model's `meta.json` in order  
- All models include a meta.json that defines the language to initialize, the pipeline component names to load as well as general meta information like the model name, version, license, data sources, author and accuracy figures (if available).  
- To predict linguistic annotations like part-of-speech tags, dependency labels or named entities, models include binary weights.  
- Model packages include a strings.json that stores the entries in the model’s vocabulary and the mapping to hashes. This allows spaCy to only communicate in hashes and look up the corresponding string if needed.  

```json
{
    "lang": "en":,
    "name", "core_web_lg",
    "pipeline": ["tagger", "parser", "ner"]
}
```
Build-in components need binary data to make predictions.

In [9]:
nlp.pipe_names  # ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x242a59e3180>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x242a5a38770>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x242a584ce80>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x242a5705880>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x242a5aa6b40>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x242a5aa9700>)]

## Custom pipeline components


In [10]:
from spacy.language import Language

@Language.component("custom_component")
def custom_component(doc):
    print(f"After tokenization, this doc has {len(doc)} tokens.")
    print("The part of speech tags are:", [token.pos_ for token in doc])
    return doc

In [11]:
import spacy
nlp = spacy.load("en_core_web_lg")
from spacy.language import Language
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [12]:
nlp.add_pipe("custom_component", first=True)
nlp.pipe_names

['custom_component',
 'tok2vec',
 'tagger',
 'parser',
 'ner',
 'attribute_ruler',
 'lemmatizer']

In [13]:
doc = nlp("1. Let's learn Spacy for $0 billion")
print(doc.text)
# for token in doc:
#     print(token.text)


After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']
1. Let's learn Spacy for $0 billion


## Custom Attributes

In [14]:
from spacy.tokens import Doc, Token, Span

In [15]:
# Attribute Extensions
Doc.set_extension("title", default=None)
Token.set_extension("is_important", default=False)
Span.set_extension("weight", default=0)

In [16]:
doc = nlp("1. Let's learn Spacy for $0 billion")
doc._.title = "Statement"
doc[5]._.is_important = True
doc[2:6]._.weight = 0.5
doc._.title, doc[5]._.is_important, doc[2:4]._.weight


After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']


('Statement', True, 0)

In [17]:
Token.remove_extension("is_important")
Token.set_extension("is_important", getter=lambda token : token.is_title or token.is_currency)

doc = nlp("1. Let's learn Spacy for $0 billion")
for token in doc:
    print(f"\"{token}\" (is imp): {token._.is_important}")


After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']
"1" (is imp): False
"." (is imp): False
"Let" (is imp): True
"'s" (is imp): False
"learn" (is imp): False
"Spacy" (is imp): True
"for" (is imp): False
"$" (is imp): True
"0" (is imp): False
"billion" (is imp): False


In [18]:
Span.set_extension("has_title", getter=lambda span : any(token.is_title for token in span))
Span.set_extension("has_currency", getter=lambda span : any(token.is_currency for token in span))
Span.set_extension("has_important", getter=lambda span : any((token.is_title or token.is_currency) for token in span))

doc = nlp("1. Let's learn Spacy for $0 billion")
print(f"\"{doc[:2]}\" (has title): {doc[:2]._.has_title}")
print(f"\"{doc[:3]}\" (has title): {doc[:3]._.has_title}")
print(f"\"{doc[6:]}\" (has currency): {doc[6:]._.has_currency}")
print(f"\"{doc[:]}\" (has important): {doc[:]._.has_currency}")

After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']
"1." (has title): False
"1. Let" (has title): True
"for $0 billion" (has currency): True
"1. Let's learn Spacy for $0 billion" (has important): True


In [19]:
# Method Extensions
Doc.set_extension("has_token", method=lambda doc, token_txt: token_txt in [token.text for token in doc])

In [20]:
doc = nlp("1. Let's learn Spacy for $0 billion")
print(f"Is \"billion\" in doc: {doc._.has_token('billion')}")
print(f"Is \"billions\" in doc: {doc._.has_token('billions')}")

After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']
Is "billion" in doc: True
Is "billions" in doc: False


In [21]:
# Processing large volume of Text

# BAD:
# docs = [nlp(text) for text in LOTS_OF_TEXTS]

# GOOD:
# docs = list(nlp.pipe(LOTS_OF_TEXTS))
#  It process the text as a String and "yields" the Docs Objects, it batches up the text


In [22]:
from spacy.tokens import Doc
Doc.set_extension("id", default=None)
Doc.set_extension("page_number", default=None)

In [23]:
data = [
    ("1. Let's Learn Spacy in $0 billion!", dict(id=1, page_number=1)),
    ("2. Go and get some BTC worth $100K.", dict(id=1, page_number=2)),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context["page_number"])

After tokenization, this doc has 11 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '', '']
After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']
1. Let's Learn Spacy in $0 billion! 1
2. Go and get some BTC worth $100K. 2


In [24]:
for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id=context["id"]
    doc._.page_number=context["page_number"]

After tokenization, this doc has 11 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '', '']
After tokenization, this doc has 10 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '']


In [25]:
# Using only tokenizer
doc = nlp("Hello World!")  # BAD: it will run whole pipeline
doc = nlp.make_doc("Hello World!")  # GOOD: it only tokenize the sentence don't run the whole pipeline

After tokenization, this doc has 3 tokens.
The part of speech tags are: ['', '', '']


In [26]:
# Disabling the pipeline components
with nlp.disable_pipes("tagger", "parser", "lemmatizer"):
    doc = nlp("1. Let's Learn Spacy in $0 billion!")
    print(doc.text)
    print(nlp.pipe_names)
doc = nlp("1. Let's Learn Spacy in $0 billion!")
print(doc.text)
print(nlp.pipe_names)

After tokenization, this doc has 11 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '', '']
1. Let's Learn Spacy in $0 billion!
['custom_component', 'tok2vec', 'ner', 'attribute_ruler']
After tokenization, this doc has 11 tokens.
The part of speech tags are: ['', '', '', '', '', '', '', '', '', '', '']
1. Let's Learn Spacy in $0 billion!
['custom_component', 'tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']
