# SETUP

## Import

In [1]:
import re

from shared.corpus import Corpus
from shared.constants import *

import spacy
from spacy.matcher import Matcher
from spacy import displacy

# DATA

In [2]:
# Build the corpus from DIR_HP and pull the text of the first chapter of the first book.
hp_corpus = Corpus(DIR_HP)
first_book = hp_corpus.books[0]
chapter1 = first_book.chapters[0].text

# Sanity check: preview the first 100 characters of the chapter.
chapter1_preview = chapter1[:100]
print(chapter1_preview)

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly norma


# SPACY

In [3]:
# Load the `en_core_web_lg` spaCy pipeline once; the cells below reuse `nlp`.
nlp = spacy.load("en_core_web_lg")

## Sentences

In [4]:
# NOTE: `doc` covers only Book 1, Chapter 1 -- not the whole corpus.
doc = nlp(chapter1)

# Materialise the sentence iterator so sentences can be indexed.
sentences = [sentence for sentence in doc.sents]

# Sentence at index 8 serves as the example in the cells that follow.
example_sentence = sentences[8]
print(example_sentence)

Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be.


## Named Entities

In [5]:
ents = example_sentence.ents

# Per entity: numeric label ID, label name, entity text -- one per line,
# followed by a blank line separating entities.
for ent in ents:
    print(ent.label, ent.label_, ent.text, sep='\n', end='\n\n')

380
PERSON
Potter

380
PERSON
Dursley

391
DATE
several years

380
PERSON
Dursley



In [6]:
# Collect the text of every PERSON entity, sentence by sentence.
person_mentions = (
    ent.text
    for sent in sentences
    for ent in sent.ents
    if ent.label_ == 'PERSON'
)

# De-duplicate while keeping first-occurrence order. dict.fromkeys does this
# in a single pass, replacing the repeated `not x in list` scan (which was
# both non-idiomatic and quadratic in the number of mentions).
people_from_chapter_1 = list(dict.fromkeys(person_mentions))

print(sorted(people_from_chapter_1))

['Albus', 'Albus Dumbledore', 'Dedalus Diggle', 'Dudley', 'Dumbledore', 'Dursley', 'Godric', 'Hagrid', 'Harold', 'Harry', 'Harry Potter', 'Harvey', 'Howard', 'James', 'James Potter', 'Jim', 'Jim McGuffin', 'Lily', 'McGonagall', 'Muggle', "Next Door's", 'Petunia', 'Pomfrey', 'Potter', 'Privet Drive', 'Ted', 'Voldemort', 'baker']


## Parts of Speech

In [7]:
# Aligned table for the last 20 tokens of the example sentence:
# token text, numeric POS ID, POS tag name.
tail_tokens = example_sentence[-20:]
for token in tail_tokens:
    table_row = f"{token.text:<15} {token.pos:<5} {token.pos_:<10}"
    print(table_row)

her             95    PRON      
sister          92    NOUN      
and             89    CCONJ     
her             95    PRON      
good            84    ADJ       
-               97    PUNCT     
for             85    ADP       
-               97    PUNCT     
nothing         95    PRON      
husband         92    NOUN      
were            87    AUX       
as              86    ADV       
unDursleyish    84    ADJ       
as              98    SCONJ     
it              95    PRON      
was             87    AUX       
possible        84    ADJ       
to              94    PART      
be              87    AUX       
.               97    PUNCT     


In [8]:
print(example_sentence)

# Individual tokens whose coarse POS tag is NOUN.
sentence_nouns = list(filter(lambda tok: tok.pos_ == 'NOUN', example_sentence))
print('Nouns: ', sentence_nouns)

# Noun chunks are multi-token spans, so they differ from the single-token list above.
sentence_noun_chunks = [chunk for chunk in example_sentence.noun_chunks]
print('Noun chunks: ', sentence_noun_chunks)

Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be.
Nouns:  [sister, years, fact, sister, sister, husband]
Noun chunks:  [Mrs. Potter, Mrs. Dursley's sister, they, several years, fact, Mrs. Dursley, she, a sister, her sister, nothing, it]


In [9]:
# Text of every noun chunk in the chapter that mentions "Dursley".
all_chunk_texts = (chunk.text for chunk in doc.noun_chunks)
Dursleys_chunks = [text for text in all_chunk_texts if 'Dursley' in text]

# Print as a set so repeated chunks show up only once.
print({*Dursleys_chunks})

{'Mrs. Dursley', "Mrs. Dursley's scream", "Mrs. Dursley's sister", 'Mr. Dursley', 'Mr. and Mrs. Dursley', 'The Dursleys', "the Dursleys' house", "the Dursleys' dark living-room window"}


### spaCy Matcher

In [10]:
# Token patterns, each a sequence of coarse POS tags:
#   pronoun -> verb -> adverb   and   adverb -> adjective -> noun
pron_verb_adv = [{"POS": tag} for tag in ("PRON", "VERB", "ADV")]
adv_adj_noun = [{"POS": tag} for tag in ("ADV", "ADJ", "NOUN")]
patterns = [pron_verb_adv, adv_adj_noun]

# Register both patterns under a single rule key.
# NOTE(review): the key "ADV_VERB" does not describe both patterns; kept as-is.
matcher = Matcher(nlp.vocab)
matcher.add("ADV_VERB", patterns)

# Each match is a (match_id, start, end) triple; slice the doc to get the span.
matches = matcher(doc)
phrases = [doc[match[1]:match[2]] for match in matches]

print(phrases[:5])

[very large mustache, she spent so, most boring tie, It stared back, strangely dressed people]


### Lemmatization

In [11]:
print(example_sentence)

# List only the tokens whose lemma differs from their surface form.
for token in example_sentence:
    if token.text == token.lemma_:
        continue  # lemma identical to the text: nothing to show
    print(f"{token.text:<15} {token.lemma_:<15}")

Mrs. Potter was Mrs. Dursley's sister, but they hadn't met for several years; in fact, Mrs. Dursley pretended she didn't have a sister, because her sister and her good-for-nothing husband were as unDursleyish as it was possible to be.
was             be             
had             have           
n't             not            
met             meet           
years           year           
pretended       pretend        
did             do             
n't             not            
were            be             
unDursleyish    undursleyish   
was             be             


## Displacy

In [12]:
# html = displacy.serve(example_sentence, style="dep")
# html = displacy.render(example_sentence, style="dep")

def visualize(doc, style, distance=70):
    """Render `doc` inline in the notebook with displaCy, using a dark theme.

    Args:
        doc: Object handed to `displacy.render`; this notebook passes a
            sentence taken from `doc.sents`.
        style: displaCy visualizer name; this notebook uses "dep" and "ent".
        distance: Value for the "distance" option (space between words).
            Defaults to 70.

    Returns:
        None. The result of `displacy.render` is not returned.
    """
    ents_colors = {
        "PERSON": "linear-gradient(90deg, red, yellow)",
        "ORG": "blue",
    }
    # One options dict is shared by every style.
    # NOTE(review): presumably displaCy ignores the keys that do not apply
    # to the selected visualizer -- confirm against the displaCy docs.
    render_options = {
        "compact": True,       # tighter layout
        "color": "#FFFF00",
        "bg": "#111111",
        "font": "Source Sans Pro",
        # Fix: honour the caller's `distance`; it used to be hard-coded to 70,
        # which silently ignored the parameter.
        "distance": distance,  # space between words
        "page_width": 500,     # NOTE(review): confirm displaCy supports this option
        "ents": ["PERSON", "ORG"],
        "colors": ents_colors,
    }
    displacy.render(
        doc,
        style=style,
        jupyter=True,
        options=render_options,
    )

In [13]:
# Render the example sentence with the "dep" style.
visualize(example_sentence, "dep")

In [14]:
# Render the same sentence with the "ent" style.
visualize(example_sentence, "ent")