In [None]:
import spacy

In [None]:
nlp = spacy.load('en')

In [None]:
doc = nlp(u'I am flying to Frisco')

In [None]:
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Frisco']


In [None]:
print(doc)

I am flying to Frisco


In [None]:
doc = nlp(u'this product integrates both libraries for downloading and applying patches.')

In [None]:
for token in doc:
  print(token.text, token.lemma_)

this this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch
. .


In [None]:
from spacy.symbols import ORTH, LEMMA

In [None]:
doc = nlp(u'I am flying to Frisco')

In [None]:
print([w.text for w in doc])

['I', 'am', 'flying', 'to', 'Frisco']


In [None]:
special_case = [{ORTH: u'Frisco', LEMMA: u'San Francisco'}]

In [None]:
special_case

[{65: 'Frisco', 73: 'San Francisco'}]

In [None]:
nlp.tokenizer.add_special_case(u'Frisco', special_case)

In [None]:
print([w.lemma_ for w in nlp(u'I am flying to Frisco')])

['-PRON-', 'be', 'fly', 'to', 'San Francisco']


In [None]:
doc = nlp(u'I have flown to LA. Now, I am flying to Frisco')

In [None]:
print([w.text for w in doc if w.tag_=='VBG' or w.tag_=='VB'])

['flying']


In [None]:
print([w.text for w in doc if w.pos_=='PROPN'])

['LA', 'Frisco']


In [None]:
for token in doc:
  print(token.text, token.pos_, token.dep_)

I PRON nsubj
have AUX aux
flown VERB ROOT
to ADP prep
LA PROPN pobj
. PUNCT punct
Now ADV advmod
, PUNCT punct
I PRON nsubj
am AUX aux
flying VERB ROOT
to ADP prep
Frisco PROPN pobj


In [None]:
for token in doc:
  print(token.head.text, token.dep_, token.text)

flown nsubj I
flown aux have
flown ROOT flown
flown prep to
to pobj LA
flown punct .
flying advmod Now
flying punct ,
flying nsubj I
flying aux am
flying ROOT flying
flying prep to
to pobj Frisco


In [None]:
for sent in doc.sents:
  print([w.text for w in sent if w.dep_ == 'ROOT' or w.dep_ == 'pobj'])

['flown', 'LA']
['flying', 'Frisco']


In [None]:
special_case = [{ORTH: u'Frisco', LEMMA: u'San Francisco'}]

In [None]:
nlp.tokenizer.add_special_case(u'Frisco', special_case)

In [None]:
doc

I have flown to LA. Now, I am flying to Frisco

In [None]:
print([w.lemma_ for w in doc])

['-PRON-', 'have', 'fly', 'to', 'LA', '.', 'now', ',', '-PRON-', 'be', 'fly', 'to', 'Frisco']


In [None]:
doc = nlp(u'I have flown to LA. Now, I am flying to Frisco')

In [None]:
print([w.lemma_ for w in doc])

['-PRON-', 'have', 'fly', 'to', 'LA', '.', 'now', ',', '-PRON-', 'be', 'fly', 'to', 'San Francisco']


In [None]:
for sent in doc.sents:
  print([w.text for w in sent if w.tag_ == 'VBG' or w.dep_=='pobj'])

['LA']
['flying', 'Frisco']


In [None]:
# For every present participle (tag VBG) found in a sentence, record the
# verb's lemma followed by the lemmas of all prepositional objects (dep pobj)
# of that same sentence.
list1 = []
for sent in doc.sents:
  for verb in sent:
    if verb.tag_ != 'VBG':
      continue
    list1.append(verb.lemma_)
    # Separate loop variable here, so the outer loop's token is never shadowed.
    list1.extend(obj.lemma_ for obj in sent if obj.dep_ == 'pobj')




In [None]:
print(list1)

['fly', 'San Francisco']


In [None]:
# Show the text and entity label of every token whose ent_type is non-zero.
for token in doc:
  if token.ent_type == 0:
    continue
  print(token.text, token.ent_type_)

LA GPE
Frisco ORG


In [None]:
l1 = 'simple list of words.'

In [None]:
len(l1)

21

In [None]:
from spacy.tokens.doc import Doc
from spacy.vocab import Vocab
doc = Doc(Vocab(), words=[u'Hi', u'There'] )
doc

Hi There 

## **Chapter 3**

In [None]:
doc = nlp(u'I want a green apple.')

In [None]:
[w for w in doc[4].lefts]

[a, green]

In [None]:
[w for w in doc[4].children]

[a, green]

In [None]:
[w for w in doc[1].rights]

[apple, .]

In [None]:
doc[2]

a

In [None]:
doc = nlp(u'A severe storm hit the beach. I started to rain.')
for sent in doc.sents:
  print([sent[i] for i in range(len(sent))])

[A, severe, storm, hit, the, beach, .]
[I, started, to, rain, .]


In [None]:
print([doc[i] for i in range(len(doc))])

[A, severe, storm, hit, the, beach, ., I, started, to, rain, .]


In [None]:
# Report when the sentence at index 1 opens with a pronoun.
for i, sent in enumerate(doc.sents):
  if i != 1:
    continue
  if sent[0].pos_ == 'PRON':
    print('The second sentence begins with a pronoun.')

The second sentence begins with a pronoun.


In [None]:
for i, sent in enumerate(doc.sents):
  print(i, sent)

0 A severe storm hit the beach.
1 I started to rain.


In [None]:
# Count the sentences whose second-to-last token (index len(sent) - 2) is a verb.
counter = sum(
  1 for sent in doc.sents
  if sent[len(sent) - 2].pos_ == 'VERB'
)
print(counter)

1


In [None]:
for sent in doc.sents:
  print(sent[len(sent)-2])

beach
rain


In [None]:
doc = nlp(u'A noun chunk is a phrase that has a noun as its head.')
for chunk in doc.noun_chunks:
  print(chunk)

A noun chunk
a phrase
a noun
its head


In [None]:
# Hand-built noun chunks: each noun prefixed by those of its syntactic
# children that are determiners or adjectives.
for token in doc:
  if token.pos_ != 'NOUN':
    continue
  modifiers = [w.text for w in token.children if w.pos_ in ('DET', 'ADJ')]
  chunk = ' '.join(modifiers + [token.text])
  print(chunk)

A noun chunk
a phrase
a noun
its head


In [None]:
# Same hand-built noun chunks as in the previous cell: determiner/adjective
# children first, then the noun itself.
for token in doc:
  if token.pos_ != 'NOUN':
    continue
  pieces = []
  for w in token.children:
    if w.pos_ in ('DET', 'ADJ'):
      pieces.append(w.text)
  pieces.append(token.text)
  chunk = ' '.join(pieces)
  print(chunk)


A noun chunk
a phrase
a noun
its head


In [None]:
# Variant that prefixes each noun with all of its left-hand children,
# whatever their part of speech.
for token in doc:
  if token.pos_ != 'NOUN':
    continue
  chunk = ' '.join([w.text for w in token.lefts] + [token.text])
  print(chunk)

A noun chunk
a phrase
a noun
its head


In [None]:
doc = nlp('I want a green apple.')

In [None]:
doc[2:5]

a green apple

In [None]:
doc = nlp(u'The Golden Gate Bridge is an iconic landmark in San Francisco.')

In [None]:
print([doc[i] for i in range(len(doc))])

[The, Golden, Gate, Bridge, is, an, iconic, landmark, in, San, Francisco, .]


In [None]:
span = doc[1:4]

In [None]:
span

Golden Gate Bridge

In [None]:
doc.vocab.strings[span.text]

17145121013916591709

In [None]:
lem_id = doc.vocab.strings[span.text]

In [None]:
# Collapse the span into a single token whose lemma is the string ID stored in lem_id.
# NOTE(review): Span.merge appears to be deprecated in later spaCy 2.x releases in
# favour of Doc.retokenize() — confirm against the installed spaCy version.
span.merge(lemma=lem_id)

Golden Gate Bridge

In [None]:
for token in doc:
  print(token.text, token.lemma_, token.pos_, token.dep_)

The the DET det
Golden Gate Bridge Golden Gate Bridge PROPN nsubj
is be AUX ROOT
an an DET det
iconic iconic ADJ amod
landmark landmark NOUN attr
in in ADP prep
San San PROPN compound
Francisco Francisco PROPN pobj
. . PUNCT punct


In [None]:
span = doc[-3:-1]

In [None]:
span

San Francisco

In [None]:
doc.vocab.strings[span.text]

10393695292957549069

In [None]:
lem_id = doc.vocab.strings[span.text]

In [None]:
# Collapse the span into a single token whose lemma is the string ID stored in lem_id.
# NOTE(review): Span.merge appears to be deprecated in later spaCy 2.x releases in
# favour of Doc.retokenize() — confirm against the installed spaCy version.
span.merge(lemma=lem_id)

San Francisco

In [None]:
for token in doc:
  print(token.text, token.lemma_, token.pos_, token.dep_)

The the DET det
Golden Gate Bridge Golden Gate Bridge PROPN nsubj
is be AUX ROOT
an an DET det
iconic iconic ADJ amod
landmark landmark NOUN attr
in in ADP prep
San Francisco San Francisco PROPN pobj
. . PUNCT punct


In [None]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
print(nlp.meta['lang'] + '_' + nlp.meta['name'])

en_core_web_sm


In [None]:
print(nlp.meta['lang'])

en


In [None]:
print(nlp.meta['name'])

core_web_sm


In [None]:
from spacy import util
util.get_package_path('en_core_web_sm')

PosixPath('/usr/local/lib/python3.6/dist-packages/en_core_web_sm')

In [None]:
doc = nlp(u'I need a taxi to Fasty.')

In [None]:
for ent in doc.ents:
  print(ent.text, ent.label_)

Fasty ORG


In [None]:
# Name of the custom entity label taught to the NER component.
LABEL = 'DISTRICT'

# Training examples as (text, annotations) pairs. Each entry in 'entities' is
# (start_char, end_char, label); (25, 30) selects 'Festy' in the first text.
# The second example deliberately carries no entities.
TRAIN_DATA = [
  ('We need to deliver it to Festy.', {'entities': [(25, 30, LABEL)]}),
  ('I like red oranges', {'entities': []}),
]

In [None]:
l = 'We need to deliver it to Festy.'

In [None]:
l[25:31]

'Festy.'

In [None]:
ner = nlp.get_pipe('ner')

In [None]:
ner.add_label(LABEL)

In [None]:
nlp.disable_pipes('tagger')
nlp.disable_pipes('parser')

[('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fee1fc3e2e8>)]

In [None]:
optimizer = nlp.entity.create_optimizer()
import random

In [None]:
# Run 25 passes over the training examples, reshuffling them in place before each pass.
for i in range(25):
  random.shuffle(TRAIN_DATA)
  for text, annotations in TRAIN_DATA:
    # One update call per example (a batch of one text and one annotation dict),
    # using the optimizer created in an earlier cell.
    nlp.update([text], [annotations], sgd=optimizer)

In [None]:
doc = nlp(u'I need a taxi to Festy.')
for ent in doc.ents:
  print(ent.text, ent.label_)


Festy DISTRICT


In [None]:
ner.to_disk('/usr/ner')

In [None]:
'/content/sample_data'

In [None]:
import spacy
from spacy.pipeline import EntityRecognizer
# Load the pipeline with its own 'ner' component disabled.
nlp = spacy.load('en', disable=['ner'])
# Create a new recognizer on the pipeline's vocab, then read back the state
# that an earlier cell wrote to '/usr/ner'.
ner = EntityRecognizer(nlp.vocab)
ner.from_disk('/usr/ner')
# Register the restored recognizer as a component of the pipeline.
# NOTE(review): the following cell's output labels 'Festy' as ORG rather than
# DISTRICT — verify that the saved model state is actually being restored.
nlp.add_pipe(ner)

In [None]:
doc = nlp(u'We need to deliver it to Festy.')
for ent in doc.ents:
  print(ent.text, ent.label_)

Festy ORG


In [None]:
!pip install Cython



In [None]:
from gensim.test.utils import datapath, get_tmpfile

In [None]:
from gensim.corpora import WikiCorpus, MmCorpus

In [None]:
# Location of the Wikipedia dump. Given directly as a path: gensim's datapath()
# resolves names inside gensim's bundled test-data folder and only worked here
# because the argument was already absolute.
path_to_wiki_dump = "/content/enwiki-latest-pages-articles14.xml-p7697595p7744800.bz2"

In [None]:
corpus_path = get_tmpfile("wiki-corpus.mm")

In [None]:
wiki = WikiCorpus(path_to_wiki_dump)

In [None]:
MmCorpus.serialize(corpus_path, wiki)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
!python setup.py build_ext --inplace

running build_ext
building 'spacytext' extension
creating build
creating build/temp.linux-x86_64-3.6
x86_64-linux-gnu-gcc -pthread -DNDEBUG -g -fwrapv -O2 -Wall -g -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2 -fPIC -I/usr/local/lib/python3.6/dist-packages/numpy/core/include -I/usr/include/python3.6m -c spacytext.cpp -o build/temp.linux-x86_64-3.6/spacytext.o
In file included from [01m[K/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1832:0[m[K,
                 from [01m[K/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12[m[K,
                 from [01m[K/usr/local/lib/python3.6/dist-packages/numpy/core/include/numpy/arrayobject.h:4[m[K,
                 from [01m[Kspacytext.cpp:627[m[K:
  [01;35m[K^~~~~~~[m[K
x86_64-linux-gnu-g++ -pthread -shared -Wl,-O1 -Wl,-Bsymbolic-functions -Wl,-Bsymbolic-functions -Wl,-z,relro -Wl,-Bsymbolic-functions -Wl,-z,relro -g -f

In [None]:
from spacytext import main

In [None]:
import spacy
nlp = spacy.load('en')
# Read the file inside a context manager so the handle is always closed.
with open('test.txt', 'rb') as f:
  contents = f.read()
# Only the first 100000 bytes are parsed. Cutting at a byte offset can split a
# multi-byte UTF-8 character, so undecodable bytes are dropped instead of
# raising UnicodeDecodeError.
doc = nlp(contents[:100000].decode('utf8', errors='ignore'))
main(doc)

FileNotFoundError: ignored

## **Chapter 4**

In [None]:
import spacy

In [None]:
nlp = spacy.load('en')
doc = nlp(u"The firm earned 1.5 million in 2017.")
for token in doc:
  print(token.text, token.pos_, spacy.explain(token.pos_))

The DET determiner
firm NOUN noun
earned VERB verb
1.5 NUM numeral
million NUM numeral
in ADP adposition
2017 NUM numeral
. PUNCT punctuation


In [None]:
for token in doc:
  print(token.text, token.pos_, token.tag_, spacy.explain(token.tag_))

The DET DT determiner
firm NOUN NN noun, singular or mass
earned VERB VBD verb, past tense
1.5 NUM CD cardinal number
million NUM CD cardinal number
in ADP IN conjunction, subordinating or preposition
2017 NUM CD cardinal number
. PUNCT . punctuation mark, sentence closer


In [None]:
import spacy

In [None]:
nlp = spacy.load('en')
doc = nlp(u'The firm earned $1.5 million in 2017.')

# Extract the first money phrase: a currency symbol (tag '$') followed by the
# run of cardinal numbers (tag 'CD') that comes right after it.
phrase = ''
for token in doc:
  if token.tag_ == '$':
    i = token.i + 1
    numbers = []
    # Stop at the end of the doc as well as at the first non-number token.
    while i < len(doc) and doc[i].tag_ == 'CD':
      numbers.append(doc[i].text)
      i += 1
    # Joining the parts avoids trimming a trailing space afterwards, which
    # would cut off the symbol itself when no number follows it.
    phrase = token.text + ' '.join(numbers)
    break

print(phrase)

$1.5 million


In [None]:
nlp = spacy.load('en')
doc = nlp(u'The firm earned $1.5 million in 2017in comparison with $1.2 million in 2016.')

# Print every money phrase: each currency symbol (tag '$') together with the
# run of cardinal numbers (tag 'CD') that immediately follows it.
phrase = ''
for token in doc:
  if token.tag_ == '$':
    i = token.i + 1
    numbers = []
    # Stop at the end of the doc as well as at the first non-number token.
    while i < len(doc) and doc[i].tag_ == 'CD':
      numbers.append(doc[i].text)
      i += 1
    # Joining the parts avoids trimming a trailing space afterwards, which
    # would cut off the symbol itself when no number follows it.
    phrase = token.text + ' '.join(numbers)
    print(phrase)

$1.5 million
$1.2 million


In [None]:
doc = nlp(u'I can promise it is worth your time.')
for token in doc:
  print(token.text, token.pos_, token.tag_)

I PRON PRP
can VERB MD
promise VERB VB
it PRON PRP
is AUX VBZ
worth ADJ JJ
your DET PRP$
time NOUN NN
. PUNCT .


In [None]:
doc = nlp(u'I can promise it is worth your time.')

In [None]:
sent = ' '

In [None]:
# Rewrite the first "<pronoun> <modal> <base verb> ..." pattern as
# "<Modal> <pronoun> <base verb> ..." and keep the result in `sent`.
for i, token in enumerate(doc):
  # The bounds check keeps doc[i+1] and doc[i+2] from running past the last token.
  if (i + 2 < len(doc) and token.tag_ == 'PRP'
      and doc[i+1].tag_ == 'MD' and doc[i+2].tag_ == 'VB'):
    sent = doc[i+1].text.capitalize() + ' ' + token.text + ' ' + doc[i+2:].text
    break



In [None]:
doc = nlp(sent)

In [None]:
for i, token in enumerate(doc):
  if token.tag_ == 'PRP' and token.text == 'I':
    sent = doc[:i].text + ' you ' + doc[i+1:].text
    break

In [None]:
doc = nlp(sent)

In [None]:
for i, token in enumerate(doc):
  if token.tag_ == 'PRP$' and token.text == 'your':
    sent = doc[:i].text + ' my ' + doc[i+1:].text
    break

In [None]:
doc = nlp(sent)

In [None]:
for i, token in enumerate(doc):
  if token.tag_ == 'VB':
    sent = doc[:i].text + ' really ' + doc[i:].text
    break

In [None]:
doc = nlp(sent)

In [None]:
sent = doc[:len(doc)-1].text + '?'

In [None]:
print(sent)

Can you really promise it is worth my time?


In [None]:
def doc_Q(stringg, sent=' '):
  """Turn a statement such as "I can promise ..." into a question and print it.

  The text is re-parsed with the global `nlp` pipeline before each step:
    1. Find the first personal pronoun (tag PRP) directly followed by a modal
       (MD) and a base-form verb (VB); move the capitalized modal to the front
       and put "you" in place of the pronoun.
    2. Replace every possessive-pronoun token "your" with "my".
    3. Insert "really" before the first base-form verb.
    4. Replace any trailing '.', '!' or '?' with a single '?'.

  Args:
    stringg: the statement to transform.
    sent: starting text used when step 1 finds no match; when it is blank
      (the default), `stringg` itself is used.

  Returns:
    None. The resulting question is printed twice, matching the output this
    function has always produced.
  """
  doc = nlp(stringg)

  # Step 1: subject/modal inversion. The bounds check prevents an IndexError
  # when a pronoun is one of the last two tokens.
  for i, token in enumerate(doc):
    if (token.tag_ == 'PRP' and i + 2 < len(doc)
        and doc[i+1].tag_ == 'MD' and doc[i+2].tag_ == 'VB'):
      sent = doc[i+1].text.capitalize() + ' you ' + doc[i+2:].text
      break
  else:
    # No invertible pattern: carry on with the statement itself.
    if not sent.strip():
      sent = stringg

  # Step 2: swap the possessive token by token, so longer words that merely
  # contain "your" are left alone; whitespace_ keeps the original spacing.
  doc = nlp(sent)
  sent = ''.join(
      ('my' if token.tag_ == 'PRP$' and token.text == 'your' else token.text)
      + token.whitespace_
      for token in doc)

  # Step 3: add the adverb before the first base-form verb only.
  doc = nlp(sent)
  for i, token in enumerate(doc):
    if token.tag_ == 'VB':
      sent = (doc[:i].text + ' really ' + doc[i:].text).lstrip()
      break

  # Step 4: strip sentence-final punctuation only, never a letter.
  sent = sent.rstrip().rstrip('.!?') + '?'

  print(sent)
  print(sent)

In [None]:
doc_Q('I can promise it is worth your time.')

Can you really promise it is worth my time?
Can you really promise it is worth my time?


In [None]:
doc = nlp(u'I can promise it is worth your time.')

In [None]:
for token in doc:
  print(token.text, token.pos_, token.tag_, token.dep_, spacy.explain(token.dep_))

I PRON PRP nsubj nominal subject
can VERB MD aux auxiliary
promise VERB VB ROOT None
it PRON PRP nsubj nominal subject
is AUX VBZ ccomp clausal complement
worth ADJ JJ acomp adjectival complement
your DET PRP$ poss possession modifier
time NOUN NN npadvmod noun phrase as adverbial modifier
. PUNCT . punct punctuation


In [None]:
def find_chunk(doc):
  """Locate the direct object of a parsed sentence plus its left-hand modifiers.

  Scans `doc` for the first token whose dependency label is 'dobj' and slices
  out that token together with as many preceding tokens as it has left
  children (for example "a green apple" in "I want a green apple").

  Args:
    doc: a sliceable sequence of parsed tokens exposing `dep_` and `lefts`
      (for example a spaCy Doc).

  Returns:
    A tuple (chunk, shift, i):
      chunk -- the slice doc[i-shift:i+1], or '' when there is no direct object;
      shift -- number of left children of the direct object (0 when none found);
      i     -- index of the direct object in `doc`, or None when none found.
  """
  for i, token in enumerate(doc):
    if token.dep_ == 'dobj':
      # Only children standing to the left of the object may widen the slice
      # leftwards; right-hand children (such as a trailing prepositional
      # phrase) must not move the start.
      shift = len(list(token.lefts))
      return doc[i - shift:i + 1], shift, i

  # No direct object found: return a well-defined empty result.
  return '', 0, None

In [None]:
doc = nlp(u'I want a green apple')
find_chunk(doc)

(a green apple, 2, 4)

In [None]:
!python question.py 'I want a green apple.'

Do you?


In [None]:
!python question.py 'I want an apple.'

Do you?
