# NLP `spacy` tutorial

https://realpython.com/natural-language-processing-spacy-python/

## Imports

In [1]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy import displacy

import pathlib
import re
from collections import Counter

## Load ```spacy```

In [2]:
nlp = spacy.load("en_core_web_sm")
type(nlp)

spacy.lang.en.English

## Working with `spacy`

In [3]:
introduction_doc = nlp(
    "Hello my friend. How are you?"
)

type(introduction_doc)

spacy.tokens.doc.Doc

In [4]:
[token.text for token in introduction_doc]

['Hello', 'my', 'friend', '.', 'How', 'are', 'you', '?']

### Import data from a file

In [5]:
filename = 'nlp_introduction.txt'
introduction_doc = nlp(pathlib.Path(filename).read_text(encoding='utf-8'))
[token.text for token in introduction_doc]

['This',
 'tutorial',
 'is',
 'about',
 'Natural',
 'Language',
 'Processing',
 'in',
 'spaCy',
 '.',
 '\n']

### Retrieve sentence informations

In [6]:
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_text_doc = nlp(about_text)
sents = list(about_text_doc.sents)

print(f'There are {len(sents)} sentences in the given text:')

for sent in sents:
    print(f'\t- {sent[:5]} ...')

There are 2 sentences in the given text:
	- Gus Proto is a Python ...
	- He is interested in learning ...


Check whether a word / token is start of a sentence.

In [7]:
for token in about_text_doc:
    print(f'{token} -> {token.is_sent_start}')

Gus -> True
Proto -> False
is -> False
a -> False
Python -> False
developer -> False
currently -> False
working -> False
for -> False
a -> False
London -> False
- -> False
based -> False
Fintech -> False
company -> False
. -> False
He -> True
is -> False
interested -> False
in -> False
learning -> False
Natural -> False
Language -> False
Processing -> False
. -> False


It is also possible to add custom delimiter.

In [8]:
from spacy.language import Language


@Language.component("set_custom_boundaries")
def set_custom_boundaries(doc):
    for token in doc[:-1]:
        if token.text == "...":
            doc[token.i+1].is_sent_start = True
            
    return doc


ellipsis_text = (
    "Gus, can you, ... never mind, I forgot"
    " what I was saying. So, do you think"
    " we should ..."
)

custom_nlp = spacy.load("en_core_web_sm")
custom_nlp.add_pipe("set_custom_boundaries", before="parser")

custom_ellipsis_doc = custom_nlp(ellipsis_text)
custom_ellipsis_sentences = list(custom_ellipsis_doc.sents)

for sentence in custom_ellipsis_sentences:
    print(sentence)

Gus, can you, ...
never mind, I forgot what I was saying.
So, do you think we should ...


The tokens object has few interesting attributes:

In [9]:
about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

about_text_doc = nlp(about_text)

for token in about_text_doc:
    print(f'{token.text:15}\t ith-word:{token.i} \t ith-char:{token.idx}')

Gus            	 ith-word:0 	 ith-char:0
Proto          	 ith-word:1 	 ith-char:4
is             	 ith-word:2 	 ith-char:10
a              	 ith-word:3 	 ith-char:13
Python         	 ith-word:4 	 ith-char:15
developer      	 ith-word:5 	 ith-char:22
currently      	 ith-word:6 	 ith-char:32
working        	 ith-word:7 	 ith-char:42
for            	 ith-word:8 	 ith-char:50
a              	 ith-word:9 	 ith-char:54
London         	 ith-word:10 	 ith-char:56
-              	 ith-word:11 	 ith-char:62
based          	 ith-word:12 	 ith-char:63
Fintech        	 ith-word:13 	 ith-char:69
company        	 ith-word:14 	 ith-char:77
.              	 ith-word:15 	 ith-char:84
He             	 ith-word:16 	 ith-char:86
is             	 ith-word:17 	 ith-char:89
interested     	 ith-word:18 	 ith-char:92
in             	 ith-word:19 	 ith-char:103
learning       	 ith-word:20 	 ith-char:106
Natural        	 ith-word:21 	 ith-char:115
Language       	 ith-word:22 	 ith-char:123
Processing     	 it

In [10]:
print(
    f"{'Text with Whitespace':22}"
    f"{'Is Alphanumeric?':18}"
    f"{'Is Punctuation?':18}"
    f"{'Is Stop Word?'}"
    )

for token in about_text_doc:
    print(
        f"{str(token.text_with_ws):22}"
        f"{str(token.is_alpha):18}"
        f"{str(token.is_punct):18}"
        f"{str(token.is_stop)}"
    )

Text with Whitespace  Is Alphanumeric?  Is Punctuation?   Is Stop Word?
Gus                   True              False             False
Proto                 True              False             False
is                    True              False             True
a                     True              False             True
Python                True              False             False
developer             True              False             False
currently             True              False             False
working               True              False             False
for                   True              False             True
a                     True              False             True
London                True              False             False
-                     False             True              False
based                 True              False             False
Fintech               True              False             False
company               True          

Custom Tokeninzer:

In [11]:
custom_about_text = (
    "Gus Proto is a Python developer currently"
    " working for a London@based Fintech"
    " company. He is interested in learning"
    " Natural Language Processing."
)

print([token.text for token in nlp(custom_about_text)[8:15]])

['for', 'a', 'London@based', 'Fintech', 'company', '.', 'He']


In [12]:
custom_nlp = spacy.load("en_core_web_sm")
prefix_re = spacy.util.compile_prefix_regex(
    custom_nlp.Defaults.prefixes
)

suffix_re = spacy.util.compile_suffix_regex(
    custom_nlp.Defaults.suffixes
)

custom_infixes = [r"@"]

infix_re = spacy.util.compile_infix_regex(
    list(custom_nlp.Defaults.infixes) + custom_infixes
)

custom_nlp.tokenizer = Tokenizer(
    nlp.vocab,
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
    token_match=None,
)

custom_tokenizer_about_doc = custom_nlp(custom_about_text)
print([token.text for token in custom_tokenizer_about_doc[8:15]])

['for', 'a', 'London', '@', 'based', 'Fintech', 'company']


### Stop Words

In [13]:
stop_words = spacy.lang.en.stop_words.STOP_WORDS
list(stop_words)[:10]

['‘s',
 'are',
 'through',
 'down',
 'latter',
 'same',
 'have',
 'being',
 'throughout',
 'your']

Remove stop words:

In [14]:
nlp = spacy.load("en_core_web_sm")
about_doc = nlp(custom_about_text)
print([token for token in about_doc if not token.is_stop])

[Gus, Proto, Python, developer, currently, working, London@based, Fintech, company, ., interested, learning, Natural, Language, Processing, .]


### Lemmatization

In [15]:
conference_help_text = (
    "Gus is helping organize a developer"
    " conference on Applications of Natural Language"
    " Processing. He keeps organizing local Python meetups"
    " and several internal talks at his workplace."
)

conference_doc = nlp(conference_help_text)

for token in conference_doc:
    print(f'{token} {token.lemma_}')


Gus Gus
is be
helping helping
organize organize
a a
developer developer
conference conference
on on
Applications Applications
of of
Natural Natural
Language Language
Processing Processing
. .
He he
keeps keep
organizing organize
local local
Python Python
meetups meetup
and and
several several
internal internal
talks talk
at at
his his
workplace workplace
. .


### Word Frequency

In [16]:
complete_text = (
    "Gus Proto is a Python developer currently"
    " working for a London-based Fintech company. He is"
    " interested in learning Natural Language Processing."
    " There is a developer conference happening on 21 July"
    ' 2019 in London. It is titled "Applications of Natural'
    ' Language Processing". There is a helpline number'
    " available at +44-1234567891. Gus is helping organize it."
    " He keeps organizing local Python meetups and several"
    " internal talks at his workplace. Gus is also presenting"
    ' a talk. The talk will introduce the reader about "Use'
    ' cases of Natural Language Processing in Fintech".'
    " Apart from his work, he is very passionate about music."
    " Gus is learning to play the Piano. He has enrolled"
    " himself in the weekend batch of Great Piano Academy."
    " Great Piano Academy is situated in Mayfair or the City"
    " of London and has world-class piano instructors."
)

complete_doc = nlp(complete_text)
words = [token.text for token in complete_doc if not token.is_punct and not token.is_stop]

Counter(words).most_common(5)

[('Gus', 4), ('London', 3), ('Natural', 3), ('Language', 3), ('Processing', 3)]

### Part of Speech tagging

There are typically 8 parts of speech:
1. Noun
2. Pronoun
3. Adjective
4. Verb
5. Adverb
6. Preposition
7. Conjunction
8. Interjection

In [17]:
for token in about_doc:
    print(f'{token.text:15} {token.tag_}\t{token.pos_}\t{spacy.explain(token.tag_)}')

Gus             NNP	PROPN	noun, proper singular
Proto           NNP	PROPN	noun, proper singular
is              VBZ	AUX	verb, 3rd person singular present
a               DT	DET	determiner
Python          NNP	PROPN	noun, proper singular
developer       NN	NOUN	noun, singular or mass
currently       RB	ADV	adverb
working         VBG	VERB	verb, gerund or present participle
for             IN	ADP	conjunction, subordinating or preposition
a               DT	DET	determiner
London@based    JJ	ADJ	adjective (English), other noun-modifier (Chinese)
Fintech         NNP	PROPN	noun, proper singular
company         NN	NOUN	noun, singular or mass
.               .	PUNCT	punctuation mark, sentence closer
He              PRP	PRON	pronoun, personal
is              VBZ	AUX	verb, 3rd person singular present
interested      JJ	ADJ	adjective (English), other noun-modifier (Chinese)
in              IN	ADP	conjunction, subordinating or preposition
learning        VBG	VERB	verb, gerund or present participle
N

In [18]:
nouns = []
verbs = []

for token in about_doc:
    if token.pos_ == 'NOUN':
        nouns.append(token.text)
        
    elif token.pos_ == 'VERB':
        verbs.append(token.text)

print(nouns)
print(verbs)

['developer', 'company']
['working', 'learning']


### Visualize

In [19]:
about_interest_text = (
    "He is interested in learning Natural Language Processing."
)

about_doc = nlp(about_interest_text)
displacy.render(complete_doc, style='ent', jupyter=True)
displacy.render(about_doc, style='dep', jupyter=True)

### Preprocessing

In [20]:
def is_valid_token(token):
    return bool(
        token
        and str(token).strip()
        and not token.is_stop
        and not token.is_punct
    )

def preprocess_token(token):
    return token.lemma_.strip().lower()

complete_filtered_tokens = [preprocess_token(token) for token in complete_doc if is_valid_token(token)]
complete_filtered_tokens

['gus',
 'proto',
 'python',
 'developer',
 'currently',
 'work',
 'london',
 'base',
 'fintech',
 'company',
 'interested',
 'learn',
 'natural',
 'language',
 'processing',
 'developer',
 'conference',
 'happen',
 '21',
 'july',
 '2019',
 'london',
 'title',
 'application',
 'natural',
 'language',
 'processing',
 'helpline',
 'number',
 'available',
 '+44',
 '1234567891',
 'gus',
 'helping',
 'organize',
 'keep',
 'organize',
 'local',
 'python',
 'meetup',
 'internal',
 'talk',
 'workplace',
 'gus',
 'present',
 'talk',
 'talk',
 'introduce',
 'reader',
 'use',
 'case',
 'natural',
 'language',
 'processing',
 'fintech',
 'apart',
 'work',
 'passionate',
 'music',
 'gus',
 'learn',
 'play',
 'piano',
 'enrol',
 'weekend',
 'batch',
 'great',
 'piano',
 'academy',
 'great',
 'piano',
 'academy',
 'situate',
 'mayfair',
 'city',
 'london',
 'world',
 'class',
 'piano',
 'instructor']

### Rule based matching

In [21]:
about_doc = nlp(about_text)

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)


def extract_full_name(nlp_doc):
    pattern = [{"POS": "PROPN"}, {"POS": "PROPN"}]
    matcher.add("FULL_NAME", [pattern])
    matches = matcher(nlp_doc)

    for _, start, end in matches:
        span = nlp_doc[start:end]
        yield span.text
        
next(extract_full_name(about_doc))

'Gus Proto'

In [22]:
matcher = Matcher(nlp.vocab)

conference_org_text = (
    "There is a developer conference"
    " happening on 21 July 2019 in London. It is titled"
    ' "Applications of Natural Language Processing".'
    " There is a helpline number available"
    " at (123) 456-7891"
)


def extract_phone_number(nlp_doc):
    pattern = [
        {"ORTH": "("},
        {"SHAPE": "ddd"},
        {"ORTH": ")"},
        {"SHAPE": "ddd"},
        {"ORTH": "-", "OP": "?"},
        {"SHAPE": "dddd"},
    ]

    matcher.add("PHONE_NUMBER", [pattern])
    matches = matcher(nlp_doc)
    
    for match_id, start, end in matches:
        span = nlp_doc[start:end]
        print(span.text)


conference_org_doc = nlp(conference_org_text)
extract_phone_number(conference_org_doc)

(123) 456-7891


### Dependency Parsing

In [23]:
piano_text = "Gus is learning piano"
piano_doc = nlp(piano_text)

for token in piano_doc:
    print(
        f"""
        TOKEN: {token.text}
        =====
        {token.tag_ = }
        {token.head.text = }
        {token.dep_ = }"""
    )


        TOKEN: Gus
        =====
        token.tag_ = 'NNP'
        token.head.text = 'learning'
        token.dep_ = 'nsubj'

        TOKEN: is
        =====
        token.tag_ = 'VBZ'
        token.head.text = 'learning'
        token.dep_ = 'aux'

        TOKEN: learning
        =====
        token.tag_ = 'VBG'
        token.head.text = 'learning'
        token.dep_ = 'ROOT'

        TOKEN: piano
        =====
        token.tag_ = 'NN'
        token.head.text = 'learning'
        token.dep_ = 'dobj'


In [24]:
displacy.render(piano_doc)

In [25]:
one_line_about_text = (
    "Gus Proto is a Python developer"
    " currently working for a London-based Fintech company"
)

one_line_about_doc = nlp(one_line_about_text)
displacy.render(one_line_about_doc)

In [26]:
# Extract children of `developer`
print([token.text for token in one_line_about_doc[5].children])

['a', 'Python', 'working']


In [27]:
# Extract all tokens on the left of `developer`
print([token.text for token in one_line_about_doc[5].lefts])

['a', 'Python']


In [28]:
# Extract tokens on the right of `developer`
print([token.text for token in one_line_about_doc[5].rights])

['working']


In [29]:
# Extract previous neighboring node of `developer`
print(one_line_about_doc[5].nbor(-1))

Python


In [30]:
# Extract next neighboring node of `developer`
print(one_line_about_doc[5].nbor())

currently


In [31]:
# Print subtree of `developer`
print(list(one_line_about_doc[5].subtree))

[a, Python, developer, currently, working, for, a, London, -, based, Fintech, company]


### Shallow Parsing

In [32]:
conference_text = (
    "There is a developer conference happening on 21 July 2019 in London."
)

conference_doc = nlp(conference_text)

# Extract non Phrases
for chunk in conference_doc.noun_chunks:
    print(chunk)

a developer conference
21 July
London


In [33]:
import textacy

about_talk_text = (
    "The talk will introduce reader about use"
    " cases of Natural Language Processing in"
    " Fintech, making use of"
    " interesting examples along the way."
)


patterns = [{"POS": "AUX"}, {"POS": "VERB"}]
about_talk_doc = nlp(about_talk_text)

verb_phrases = textacy.extract.token_matches(
    about_talk_doc, patterns=patterns
)

# Print all verb phrases
for chunk in verb_phrases:
    print(chunk.text)

will introduce


In [34]:
# Extract noun phrase to explain what nouns are involved
for chunk in about_talk_doc.noun_chunks:
    print (chunk)

The talk
reader
use cases
Natural Language Processing
Fintech
use
interesting examples
the way


### Named-Entity Recognition

In [35]:
piano_class_text = (
    "Great Piano Academy is situated"
    " in Mayfair or the City of London and has"
    " world-class piano instructors."
)
piano_class_doc = nlp(piano_class_text)

for ent in piano_class_doc.ents:
    print(
        f"""
        {ent.text = }
        {ent.start_char = }
        {ent.end_char = }
        {ent.label_ = }
        spacy.explain('{ent.label_}') = {spacy.explain(ent.label_)}"""
    )


        ent.text = 'Great Piano Academy'
        ent.start_char = 0
        ent.end_char = 19
        ent.label_ = 'ORG'
        spacy.explain('ORG') = Companies, agencies, institutions, etc.

        ent.text = 'Mayfair'
        ent.start_char = 35
        ent.end_char = 42
        ent.label_ = 'GPE'
        spacy.explain('GPE') = Countries, cities, states

        ent.text = 'the City of London'
        ent.start_char = 46
        ent.end_char = 64
        ent.label_ = 'GPE'
        spacy.explain('GPE') = Countries, cities, states


In [36]:
displacy.render(piano_class_doc, style="ent")

In [37]:
survey_text = (
    "Out of 5 people surveyed, James Robert,"
    " Julie Fuller and Benjamin Brooks like"
    " apples. Kelly Cox and Matthew Evans"
    " like oranges."
)


def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == "PERSON":
        return "[REDACTED] "
    
    return token.text_with_ws


def redact_names(nlp_doc):
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)

            tokens = map(replace_person_names, nlp_doc)

        return "".join(tokens)


survey_doc = nlp(survey_text)
print(redact_names(survey_doc))

Out of 5 people surveyed, [REDACTED] [REDACTED] , [REDACTED] [REDACTED] and [REDACTED] [REDACTED] like apples. [REDACTED] [REDACTED] and [REDACTED] [REDACTED] like oranges.
