In [1]:
import spacy

In [3]:
!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm') # import the data we installed with `python -m download en_core_web_sm`

### Reading text data

In [4]:
# Read the whole Wikipedia excerpt into memory. The encoding is given explicitly
# because the text contains non-ASCII characters (e.g. the en dash in "1775–1783")
# and the platform default encoding is not UTF-8 everywhere.
with open('data/wiki_us.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [5]:
# Display the raw text that was loaded from the local file.
print(text)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America. It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j] At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d] The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world. The national capital is Washington, D.C., and the most populous city is New York.

Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century. The United States emerged from the thirteen British colonies est

In [6]:
# Length of the raw string, i.e. the number of characters.
len(text)

3521

In [7]:
# Run the pipeline on the raw text to create a Doc object.
doc = nlp(text)
# len() of a Doc counts tokens, not characters, hence the much smaller number.
len(doc)

654

### Tokens and attributes

In [8]:
# Iterating over a plain str walks it one character at a time.
for char in text[:10]:
    print(char)

T
h
e
 
U
n
i
t
e
d


In [9]:
# Iterating over a Doc slice walks it one token at a time instead.
for tok in doc[:10]:
    print(tok)

The
United
States
of
America
(
U.S.A.
or
USA
)


In [10]:
# doc.sents segments the document at the sentence level.
for sentence in doc.sents:
    print(sentence)

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.
It consists of 50 states, a federal district, five major unincorporated territories, 326 Indian reservations, and some minor possessions.[j]
At 3.8 million square miles (9.8 million square kilometers), it is the world's third- or fourth-largest country by total area.[d]
The United States shares significant land borders with Canada to the north and Mexico to the south, as well as limited maritime borders with the Bahamas, Cuba, and Russia.[22] With a population of more than 331 million people, it is the third most populous country in the world.
The national capital is Washington, D.C., and the most populous city is New York.


Paleo-Indians migrated from Siberia to the North American mainland at least 12,000 years ago, and European colonization began in the 16th century.
The United States emerged from the thirteen British colonies es

In [11]:
# doc.sents yields sentences lazily, so take just the first one instead of
# building a list of every sentence in the document.
sentence1 = next(iter(doc.sents))
sentence1

The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a country primarily located in North America.

In [12]:
# Index into the sentence span to pick out its third token.
token2 = sentence1[2]
# .text gives the token's string rather than the Token object's representation.
token2.text

'States'

In [13]:
# left_edge / right_edge are the leftmost and rightmost tokens among this token's
# syntactic descendants, i.e. the boundaries of the phrase that "States" heads.
token2.left_edge, token2.right_edge

(The, America)

In [14]:
# Named-entity label from spaCy's NER (Named Entity Recognition) component;
# 'GPE' stands for geopolitical entity.
token2.ent_type_

'GPE'

In [15]:
# IOB code of the entity tag: 'B' begins an entity, 'I' is inside one and 'O' is
# outside any entity. "States" is inside the larger multi-token entity
# "The United States of America".
token2.ent_iob_

'I'

In [16]:
# Base form of the token; for "States" in this context it is left unchanged.
token2.lemma_

'States'

In [17]:
# For a verb ("known" is the token at index 12): the token itself, its lemmatized
# form, and its morphological features.
sentence1[12], sentence1[12].lemma_, sentence1[12].morph

(known, 'know', Aspect=Perf|Tense=Past|VerbForm=Part)

In [18]:
# Coarse-grained part-of-speech tag; "States" is a proper noun in this context.
token2.pos_

'PROPN'

In [19]:
# Syntactic dependency relation of the token. 'ROOT' would mark the root of the
# sentence; here "States" is the nominal subject ('nsubj').
token2.dep_

'nsubj'

In [20]:
# Language code of the vocabulary this token's document belongs to.
token2.lang_
    

'en'

### Working with tokens and entities

In [21]:
# Use a separate name for the sample sentence so `text` keeps holding the
# Wikipedia excerpt; re-running earlier cells then still rebuilds the same `doc`.
sample_text = "Waqas enjoys playing football"
doc2 = nlp(sample_text)
doc2

Waqas enjoys playing football

In [22]:
# One row per token: its text, part-of-speech tag and dependency relation.
for tok in doc2:
    print(tok.text, tok.pos_, tok.dep_)

Waqas PROPN nsubj
enjoys VERB ROOT
playing VERB xcomp
football NOUN dobj


In [23]:
from spacy import displacy
# Draw the dependency parse of the short example sentence inline in the notebook.
displacy.render(doc2, style='dep', jupyter=True)

In [24]:
# List every named entity found in the document together with its label.
for entity in doc.ents:
    print(entity.text, entity.label_)

The United States of America GPE
U.S.A. GPE
USA GPE
the United States GPE
U.S. GPE
US GPE
America GPE
North America LOC
50 CARDINAL
five CARDINAL
326 CARDINAL
Indian NORP
3.8 million square miles QUANTITY
9.8 million square kilometers QUANTITY
third- or DATE
fourth ORDINAL
The United States GPE
Canada GPE
Mexico GPE
Bahamas GPE
Cuba GPE
more than 331 million QUANTITY
third ORDINAL
Washington GPE
D.C. GPE
New York GPE
Siberia LOC
North American NORP
at least 12,000 years ago DATE
European NORP
the 16th century DATE
The United States GPE
thirteen CARDINAL
British NORP
the East Coast LOC
Great Britain GPE
the American Revolutionary War ORG
1775–1783 CARDINAL
the late 18th century DATE
U.S. GPE
North America LOC
Native Americans NORP
1848 DATE
the United States GPE
United States GPE
the second half of the 19th century DATE
the American Civil War EVENT
Spanish NORP
World War I EVENT
U.S. GPE
World War II EVENT
the Cold War EVENT
the United States GPE
the Korean War EVENT
the Vietnam War EVE

In [25]:
# Highlight the document's named entities inline in the notebook.
displacy.render(doc, style='ent', jupyter=True)

### Word Vectors:

In [None]:
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

In [33]:
with open ("data/wiki_us.txt", "r") as f:
    text = f.read()
doc = nlp(text)
sentence1 = list(doc.sents)[0]
# getting a specific word embedding:
nlp.vocab.vectors[nlp.vocab.strings['something']]

In [59]:
import numpy as np

target_word = 'country'
# Look up the target word's vector and wrap it in a 2-D array, since
# most_similar expects a batch of query vectors.
target_key = nlp.vocab.strings[target_word]
query = np.asarray([nlp.vocab.vectors[target_key]])
# most_similar returns a (keys, best_rows, scores) tuple, one row per query.
ms = nlp.vocab.vectors.most_similar(query, n=10)
# Turn the hash keys of the first (only) query back into strings.
words = [nlp.vocab.strings[key] for key in ms[0][0]]
# NOTE: despite the name, these are similarity scores (the third tuple element).
distances = ms[2]
words

['POVERTY',
 'inner-city',
 'Poverty',
 'INTERSECT',
 'INEQUALITY',
 'Inequality',
 'ILLITERACY',
 'illiteracy',
 'handicaps',
 'poorest']

In [63]:
# get similarity of 2 documents
# Compare one reference document against a related and an unrelated one.
doc1 = nlp("I like burgers and pizza")
doc2 = nlp("I love fast food")
doc3 = nlp("Museums are boring")

for other in (doc2, doc3):
    print(doc1, '<->',  other, doc1.similarity(other))

I like burgers and pizza <-> I love fast food 0.8655217850846202
I like burgers and pizza <-> Museums are boring 0.48845335287845937


### spaCy Pipelines

In [64]:
# Start from a blank English pipeline; this rebinds `nlp`, replacing the
# pipeline that was loaded earlier.
nlp = spacy.blank("en")

# Add the sentencizer component so that the pipeline sets sentence boundaries.
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7fc41b5a9ec0>

In [65]:
# Download the Shakespeare corpus as a large text to run through the
# sentencizer-only pipeline (nothing is trained here).
import requests
from bs4 import BeautifulSoup
s = requests.get(
    "https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt",
    timeout=60,  # do not hang the notebook forever if the server stalls
)
s.raise_for_status()  # stop on an HTTP error instead of processing an error page
# Name the parser explicitly so the extracted text does not depend on which
# optional parsers happen to be installed. Then re-join words hyphenated across
# line breaks and flatten the remaining newlines to spaces.
soup = BeautifulSoup(s.content, "html.parser").text.replace("-\n", "").replace("\n", " ")
# spaCy rejects texts longer than nlp.max_length, so size the limit from the
# corpus itself rather than from a hard-coded character count.
nlp.max_length = max(nlp.max_length, len(soup))

In [66]:
%%time
doc = nlp(soup)
print (len(list(doc.sents)))

94134
CPU times: user 12.9 s, sys: 45 ms, total: 12.9 s
Wall time: 12.9 s


In [67]:
# Summarise the pipeline: what each component assigns, requires and is scored on,
# plus any problems found for it.
nlp.analyze_pipes()

{'summary': {'sentencizer': {'assigns': ['token.is_sent_start', 'doc.sents'],
   'requires': [],
   'scores': ['sents_f', 'sents_p', 'sents_r'],
   'retokenizes': False}},
 'problems': {'sentencizer': []},
 'attrs': {'doc.sents': {'assigns': ['sentencizer'], 'requires': []},
  'token.is_sent_start': {'assigns': ['sentencizer'], 'requires': []}}}