In [1]:
import spacy

In [2]:
# Load the small English language model ('en_core_web_sm').
# `nlp` is the language object that later cells call on raw text.
nlp = spacy.load('en_core_web_sm')

# Echo the object to confirm what was loaded.
nlp

<spacy.lang.en.English at 0x7fee89fd68e0>

In [3]:
# Sample passage (three sentences) that serves as the input string.
# Adjacent string literals are joined at compile time, so `text` is still
# one single-line string; the split is purely for readability.
text = (
    'This simple replacement operation would allow us to remove erroneous '
    'line breaks within the text. '
    'We can still identify actual line breaks by the indentations at the '
    'beginning of each paragraph, which are marked by three whitespaces. '
    'We can use this information to insert actual line breaks back into '
    'the text by replacing sequences of three whitespaces with the line '
    'break sequence and three whitespaces.'
)
text

'This simple replacement operation would allow us to remove erroneous line breaks within the text. We can still identify actual line breaks by the indentations at the beginning of each paragraph, which are marked by three whitespaces. We can use this information to insert actual line breaks back into the text by replacing sequences of three whitespaces with the line break sequence and three whitespaces.'

In [4]:
# Feed the string to the language object (language model); the result is
# stored in `doc` for the cells that follow.
doc = nlp(text)

In [5]:
# Echo the processed object; it displays as the original text.
doc

This simple replacement operation would allow us to remove erroneous line breaks within the text. We can still identify actual line breaks by the indentations at the beginning of each paragraph, which are marked by three whitespaces. We can use this information to insert actual line breaks back into the text by replacing sequences of three whitespaces with the line break sequence and three whitespaces.

In [6]:
# Loop over the items in the Doc object and print each token's text,
# one token per line (punctuation marks are tokens of their own).
for token in doc:
    print(token.text)

This
simple
replacement
operation
would
allow
us
to
remove
erroneous
line
breaks
within
the
text
.
We
can
still
identify
actual
line
breaks
by
the
indentations
at
the
beginning
of
each
paragraph
,
which
are
marked
by
three
whitespaces
.
We
can
use
this
information
to
insert
actual
line
breaks
back
into
the
text
by
replacing
sequences
of
three
whitespaces
with
the
line
break
sequence
and
three
whitespaces
.


In [7]:
# For every token, print three space-separated columns:
# the token text, its coarse POS tag (pos_) and its fine-grained tag (tag_).
for token in doc:
    print(f'{token.text} {token.pos_} {token.tag_}')

This DET DT
simple ADJ JJ
replacement NOUN NN
operation NOUN NN
would AUX MD
allow VERB VB
us PRON PRP
to PART TO
remove VERB VB
erroneous ADJ JJ
line NOUN NN
breaks NOUN NNS
within ADP IN
the DET DT
text NOUN NN
. PUNCT .
We PRON PRP
can AUX MD
still ADV RB
identify VERB VB
actual ADJ JJ
line NOUN NN
breaks NOUN NNS
by ADP IN
the DET DT
indentations NOUN NNS
at ADP IN
the DET DT
beginning NOUN NN
of ADP IN
each DET DT
paragraph NOUN NN
, PUNCT ,
which DET WDT
are AUX VBP
marked VERB VBN
by ADP IN
three NUM CD
whitespaces NOUN NNS
. PUNCT .
We PRON PRP
can AUX MD
use VERB VB
this DET DT
information NOUN NN
to PART TO
insert VERB VB
actual ADJ JJ
line NOUN NN
breaks VERB VBZ
back ADV RB
into ADP IN
the DET DT
text NOUN NN
by ADP IN
replacing VERB VBG
sequences NOUN NNS
of ADP IN
three NUM CD
whitespaces NOUN NNS
with ADP IN
the DET DT
line NOUN NN
break NOUN NN
sequence NOUN NN
and CCONJ CC
three NUM CD
whitespaces NOUN NNS
. PUNCT .


In [8]:
# For every token, print its text followed by the result of the
# morphological analysis (an empty string when no features are assigned).
for token in doc:
    features = token.morph
    print(token.text, features)

This Number=Sing|PronType=Dem
simple Degree=Pos
replacement Number=Sing
operation Number=Sing
would VerbForm=Fin
allow VerbForm=Inf
us Case=Acc|Number=Plur|Person=1|PronType=Prs
to 
remove VerbForm=Inf
erroneous Degree=Pos
line Number=Sing
breaks Number=Plur
within 
the Definite=Def|PronType=Art
text Number=Sing
. PunctType=Peri
We Case=Nom|Number=Plur|Person=1|PronType=Prs
can VerbForm=Fin
still 
identify VerbForm=Inf
actual Degree=Pos
line Number=Sing
breaks Number=Plur
by 
the Definite=Def|PronType=Art
indentations Number=Plur
at 
the Definite=Def|PronType=Art
beginning Number=Sing
of 
each 
paragraph Number=Sing
, PunctType=Comm
which 
are Mood=Ind|Tense=Pres|VerbForm=Fin
marked Aspect=Perf|Tense=Past|VerbForm=Part
by 
three NumType=Card
whitespaces Number=Plur
. PunctType=Peri
We Case=Nom|Number=Plur|Person=1|PronType=Prs
can VerbForm=Fin
use VerbForm=Inf
this Number=Sing|PronType=Dem
information Number=Sing
to 
insert VerbForm=Inf
actual Degree=Pos
line Number=Sing
breaks Number=

In [21]:
# Token 35 of this text is the participle "marked"; look up the value(s)
# of its 'Aspect' morphological feature. The result is a list.
marked = doc[35]
marked.morph.get('Aspect')

['Perf']

In [22]:
# Print one dependency-parse row per token:
# token index, token text, dependency label, head index, head text.
# A ROOT token is its own head, so both indices coincide on that row.
for token in doc:
    head = token.head
    print(token.i, token.text, token.dep_, head.i, head.text)

0 This det 3 operation
1 simple amod 3 operation
2 replacement compound 3 operation
3 operation nsubj 5 allow
4 would aux 5 allow
5 allow ROOT 5 allow
6 us nsubj 8 remove
7 to aux 8 remove
8 remove ccomp 5 allow
9 erroneous amod 11 breaks
10 line compound 11 breaks
11 breaks dobj 8 remove
12 within prep 8 remove
13 the det 14 text
14 text pobj 12 within
15 . punct 5 allow
16 We nsubj 19 identify
17 can aux 19 identify
18 still advmod 19 identify
19 identify ROOT 19 identify
20 actual amod 22 breaks
21 line compound 22 breaks
22 breaks dobj 19 identify
23 by prep 19 identify
24 the det 25 indentations
25 indentations pobj 23 by
26 at prep 25 indentations
27 the det 28 beginning
28 beginning pobj 26 at
29 of prep 28 beginning
30 each det 31 paragraph
31 paragraph pobj 29 of
32 , punct 31 paragraph
33 which nsubjpass 35 marked
34 are auxpass 35 marked
35 marked relcl 31 paragraph
36 by agent 35 marked
37 three nummod 38 whitespaces
38 whitespaces pobj 36 by
39 . punct 19 identify
40 We ns

In [23]:
from spacy import displacy

In [24]:
# Draw the dependency parse of `doc` with displaCy, using the compact layout.
render_options = {'compact': True}
displacy.render(doc, style='dep', options=render_options)

In [25]:
# Ask spaCy for a human-readable description of the 'pobj' dependency label.
spacy.explain('pobj')

'object of preposition'