Jupyter setup: make output more readable

In [1]:
%%html
<!-- Presentation tweak: enlarge the font of elements with the `rendered_html`
     class to 30px, and of table cells (`td`) to 20px. -->
<style>
.rendered_html {
    font-size: 30px;
}
td {
    font-size: 20px;
}
</style>

Jupyter setup: Import [pandas](https://pandas.pydata.org/) for table formatting.

In [2]:
import pandas as pd
# Short alias for the DataFrame constructor: later cells call
# `table(data=rows, columns=[...])` to show their results as a table.
table = pd.DataFrame

In [3]:
import spacy
#import neuralcoref

# Load the English model `en_core_web_md`.
# Overview of spaCy models:  https://spacy.io/models
# Details for this model:    https://spacy.io/models/en#en_core_web_md
nlp = spacy.load("en_core_web_md")

# Coreference resolution is currently disabled: neuralcoref is incompatible
# with more recent versions of spaCy, but a fix is in the works.
# Once it works again, load NeuralCoref and add it to the pipe of spaCy's model;
# see https://github.com/huggingface/neuralcoref#adding-neuralcoref-to-the-pipe-of-an-english-spacy-language
#neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f652571fb50>

Run the full spaCy pipeline (every component of the loaded model) on some sample text.

In [4]:
# The sample text contains a line break followed by three spaces; that run of
# whitespace shows up as its own token in the outputs below.
doc = nlp(
    "Dr. Jennifer Smith visited\n"
    "   China. She liked the country a lot."
)

Print out each of the sentences.

In [5]:
# One string per sentence found in the document.
[sentence.text for sentence in doc.sents]

['Dr. Jennifer Smith visited\n   China.', 'She liked the country a lot.']

Print out each of the tokens in each sentence.

In [6]:
# Outer list: sentences; inner lists: the token texts of each sentence.
[
    [tok.text for tok in sentence]
    for sentence in doc.sents
]

[['Dr.', 'Jennifer', 'Smith', 'visited', '\n   ', 'China', '.'],
 ['She', 'liked', 'the', 'country', 'a', 'lot', '.']]

Print out a table of tokens and their lemmas

In [7]:
# One row per token: its surface text next to its lemma.
table(
    data=[[tok.text, tok.lemma_] for tok in doc],
    columns=["TOKEN", "LEMMA"],
)

Unnamed: 0,TOKEN,LEMMA
0,Dr.,Dr.
1,Jennifer,Jennifer
2,Smith,Smith
3,visited,visit
4,\n,\n
5,China,China
6,.,.
7,She,-PRON-
8,liked,like
9,the,the


Print a sample embedding

In [8]:
# Pick three tokens by position. Position 4 is the whitespace token from the
# line break in the sample text, so "China" is at 5 and "country" at 10.
visited, china, country = doc[3], doc[5], doc[10]

# Display the word vector of "visited".
visited.vector

array([ 7.7700e-01, -3.9311e-02, -8.5042e-02,  3.3745e-02,  1.6042e-01,
        2.4297e-01,  2.2972e-01, -1.1241e-01, -1.9158e-01,  3.0601e+00,
       -7.0187e-02, -3.9855e-02,  5.7133e-02, -6.4045e-01, -3.0548e-01,
        8.3156e-02, -2.2163e-01, -2.4453e-01, -4.0568e-03,  8.6693e-02,
        7.4006e-02,  3.0738e-01, -2.4226e-01,  6.9430e-01,  2.3617e-02,
       -3.5779e-01, -2.4493e-02, -2.5985e-01, -5.8758e-01,  3.5571e-01,
        5.1124e-01, -3.7170e-01,  2.5344e-01,  2.1248e-01, -8.1111e-01,
       -1.2378e-01,  1.1823e-01,  1.0599e-01, -2.7711e-01,  5.3685e-02,
       -3.8109e-01, -4.4203e-01,  9.1727e-02,  5.0961e-02,  1.3432e-01,
        2.0528e-02, -1.8890e-01,  1.0488e-01,  4.6370e-01, -5.9452e-01,
       -4.9471e-01,  2.5003e-02,  2.2552e-01,  9.7131e-02,  3.2651e-01,
       -4.5753e-02,  5.4682e-02, -2.4251e-01,  4.2306e-01,  3.9414e-01,
        1.1117e-01,  2.8891e-01, -3.8979e-01,  2.2626e-01, -5.1952e-02,
        3.0413e-01, -4.6989e-04,  1.1571e-01, -1.7320e-01,  5.27

Print some embedding similarities

In [9]:
sim = china.similarity
# Similarity of "China" to, in order: "visited", "country" and "India".
# "India" does not occur in the sample text, so it is processed separately
# and its first (only) token is used.
tuple(sim(other) for other in (visited, country, nlp("India")[0]))

(0.11377088, 0.34879076, 0.62464035)

Print out a table of tokens and their part-of-speech tags

In [10]:
# One row per token: its surface text next to its coarse part-of-speech tag.
table(
    data=[[tok.text, tok.pos_] for tok in doc],
    columns=["TOKEN", "POS TAG"],
)

Unnamed: 0,TOKEN,POS TAG
0,Dr.,PROPN
1,Jennifer,PROPN
2,Smith,PROPN
3,visited,VERB
4,\n,SPACE
5,China,PROPN
6,.,PUNCT
7,She,PRON
8,liked,VERB
9,the,DET


Print out a table of named entities and their types.

In [11]:
# One row per named entity: the entity span's text next to its label.
table(
    data=[[ent.text, ent.label_] for ent in doc.ents],
    columns=["NAMED ENTITY", "NE TYPE"],
)

Unnamed: 0,NAMED ENTITY,NE TYPE
0,Jennifer Smith,PERSON
1,China,GPE


Use spaCy's visualizer to display the named entities.

In [12]:
# Entity view of the whole document, rendered inline in the notebook.
spacy.displacy.render(
    doc,
    style="ent",
    jupyter=True,
)

Print out a table of tokens and their grammatical heads

In [13]:
# One row per token: its surface text next to the text of its syntactic head.
table(
    data=[[tok.text, tok.head.text] for tok in doc],
    columns=["TOKEN", "HEAD"],
)

Unnamed: 0,TOKEN,HEAD
0,Dr.,Smith
1,Jennifer,Smith
2,Smith,visited
3,visited,visited
4,\n,visited
5,China,visited
6,.,visited
7,She,liked
8,liked,liked
9,the,country


Use spaCy's visualizer to display the dependency tree of each sentence.

In [14]:
# Dependency view, one sentence at a time, rendered inline in the notebook.
# The sentences are collected into a list before being handed to displaCy.
# see https://spacy.io/usage/visualizers#jupyter
spacy.displacy.render(
    list(doc.sents),
    style="dep",
    jupyter=True,
    options={"distance": 150},
)

Print out a table of tokens and the coreference clusters they belong to.

In [15]:
# NOTE: this cell is disabled. It reads `token._.coref_clusters`, which is
# presumably provided by neuralcoref; the neuralcoref import and
# `add_to_pipe` call are commented out in the model-loading cell, so the
# table displayed under this cell cannot be reproduced by re-running the
# notebook as it stands. Re-enable both together.
# table(data=[[token.text, [cluster.main.text for cluster in token._.coref_clusters]]
#        for token in doc], columns=["TOKEN", "COREF CLUSTERS"])

Unnamed: 0,TOKEN,COREF CLUSTERS
0,Dr.,[Dr. Jennifer Smith]
1,Jennifer,[Dr. Jennifer Smith]
2,Smith,[Dr. Jennifer Smith]
3,visited,[]
4,\n,[]
5,China,[China]
6,.,[]
7,She,[Dr. Jennifer Smith]
8,liked,[]
9,the,[China]


### Coref visualization

[See the example](https://huggingface.co/coref/?text=Dr.%20Jennifer%20Smith%20visited%20%20%20%20China.%20She%20liked%20the%20country%20a%20lot.)