## Spacy Pipeline

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')



In [3]:
type(nlp)

spacy.lang.en.English

In [4]:
doc_1 = nlp('You are about to log in to the world of Online Learning at NMIMS. A world made possible due to a combination of 30 years of legacy of best in class education and state of the art learning technology! Log in using the credentials given by the University. Please go through your profile details and update your contact information, it will help University to stay in touch with you.With this Portal, we hope to provide you all the support you need during your enrollment with the Program offered by the University. It will be our endeavour to keep improving your experience with this Portal as we go along. Happy Learning! - Team')

In [5]:
type(doc_1)

spacy.tokens.doc.Doc

In [6]:
count = 0
for token in doc_1:
    print (token)
    count = count + 1

You
are
about
to
log
in
to
the
world
of
Online
Learning
at
NMIMS
.
A
world
made
possible
due
to
a
combination
of
30
years
of
legacy
of
best
in
class
education
and
state
of
the
art
learning
technology
!
Log
in
using
the
credentials
given
by
the
University
.
Please
go
through
your
profile
details
and
update
your
contact
information
,
it
will
help
University
to
stay
in
touch
with
you
.
With
this
Portal
,
we
hope
to
provide
you
all
the
support
you
need
during
your
enrollment
with
the
Program
offered
by
the
University
.
It
will
be
our
endeavour
to
keep
improving
your
experience
with
this
Portal
as
we
go
along
.
Happy
Learning
!
-
Team


In [7]:
print (count)

122


### Text as stream of strings

**We need to use nlp.pipe() instead of nlp, when there is a stream of strings involved.**

### List of Strings

In [8]:
text_2 = ['Today is Monday', 'Tomorrow is Tuesday', 'Yesterday was Sunday, which was a holiday']

In [9]:
type (text_2)

list

In [10]:
for sentence in nlp.pipe(text_2):
    print (sentence)

Today is Monday
Tomorrow is Tuesday
Yesterday was Sunday, which was a holiday


In [11]:
sent_count = 0
for sentence in nlp.pipe(text_2):
    sent_count = sent_count + 1
    print (sent_count, '======>', sentence)



In [12]:
sent_count = 0
for sentence in nlp.pipe(text_2):
    sent_count = sent_count + 1
    print (sent_count, '=====>', sentence)
    for token in sentence:
        print (token)

1 =====> Today is Monday
Today
is
Monday
2 =====> Tomorrow is Tuesday
Tomorrow
is
Tuesday
3 =====> Yesterday was Sunday, which was a holiday
Yesterday
was
Sunday
,
which
was
a
holiday


### Tuple of Strings

In [13]:
text_3 = ('Today is Monday', 'Tomorrow is Tuesday', 'Yesterday was Sunday, which was a holiday')

In [14]:
type(text_3)

tuple

In [15]:
sent_count = 0
for sent in nlp.pipe(text_3):
    sent_count = sent_count + 1
    print (sent_count, '====>', sent)
    print ('Tokens:')
    for token in sent:
        print (token)

1 ====> Today is Monday
Tokens:
Today
is
Monday
2 ====> Tomorrow is Tuesday
Tokens:
Tomorrow
is
Tuesday
3 ====> Yesterday was Sunday, which was a holiday
Tokens:
Yesterday
was
Sunday
,
which
was
a
holiday


### List of Tuples

In [16]:
text_4 = [('Today is Monday'), ('Tomorrow is Tuesday'), ('Yesterday was Sunday, which was a holiday')]

In [17]:
type(text_4)

list

In [18]:
sent_count = 0
for sent in nlp.pipe(text_4):
    sent_count = sent_count + 1
    print (sent_count, '====>', sent)
    print ('Tokens:')
    for token in sent:
        print (token)

1 ====> Today is Monday
Tokens:
Today
is
Monday
2 ====> Tomorrow is Tuesday
Tokens:
Tomorrow
is
Tuesday
3 ====> Yesterday was Sunday, which was a holiday
Tokens:
Yesterday
was
Sunday
,
which
was
a
holiday


### DataFrame of Strings

In [19]:
text_2

['Today is Monday',
 'Tomorrow is Tuesday',
 'Yesterday was Sunday, which was a holiday']

In [20]:
# Converting to a DataFrame

import pandas as pd
text_df = pd.DataFrame(text_2, columns = ['Sentence'])
text_df

Unnamed: 0,Sentence
0,Today is Monday
1,Tomorrow is Tuesday
2,"Yesterday was Sunday, which was a holiday"


In [21]:
sent_count = 0
for sent in nlp.pipe(text_df['Sentence']):
    sent_count = sent_count + 1
    print (sent_count, '====>', sent)
    print ('Tokens:')
    for token in sent:
        print (token)

1 ====> Today is Monday
Tokens:
Today
is
Monday
2 ====> Tomorrow is Tuesday
Tokens:
Tomorrow
is
Tuesday
3 ====> Yesterday was Sunday, which was a holiday
Tokens:
Yesterday
was
Sunday
,
which
was
a
holiday


### Tagger

In [22]:
for token in doc_1:
    print (token.text, '====>', token.tag_)

You ====> PRP
are ====> VBP
about ====> JJ
to ====> TO
log ====> VB
in ====> RP
to ====> IN
the ====> DT
world ====> NN
of ====> IN
Online ====> NNP
Learning ====> NNP
at ====> IN
NMIMS ====> NNP
. ====> .
A ====> DT
world ====> NN
made ====> VBD
possible ====> JJ
due ====> IN
to ====> IN
a ====> DT
combination ====> NN
of ====> IN
30 ====> CD
years ====> NNS
of ====> IN
legacy ====> NN
of ====> IN
best ====> JJS
in ====> IN
class ====> NN
education ====> NN
and ====> CC
state ====> NN
of ====> IN
the ====> DT
art ====> NN
learning ====> VBG
technology ====> NN
! ====> .
Log ====> VB
in ====> IN
using ====> VBG
the ====> DT
credentials ====> NNS
given ====> VBN
by ====> IN
the ====> DT
University ====> NNP
. ====> .
Please ====> UH
go ====> VB
through ====> IN
your ====> PRP$
profile ====> JJ
details ====> NNS
and ====> CC
update ====> VB
your ====> PRP$
contact ====> NN
information ====> NN
, ====> ,
it ====> PRP
will ====> MD
help ====> VB
University ====> NNP
to ====> TO
stay ====> 

In [23]:
for token in doc_1:
    print (token.text, '====>', token.tag)

You ====> 13656873538139661788
are ====> 9188597074677201817
about ====> 10554686591937588953
to ====> 5595707737748328492
log ====> 14200088355797579614
in ====> 6860118812490040284
to ====> 1292078113972184607
the ====> 15267657372422890137
world ====> 15308085513773655218
of ====> 1292078113972184607
Online ====> 15794550382381185553
Learning ====> 15794550382381185553
at ====> 1292078113972184607
NMIMS ====> 15794550382381185553
. ====> 12646065887601541794
A ====> 15267657372422890137
world ====> 15308085513773655218
made ====> 17109001835818727656
possible ====> 10554686591937588953
due ====> 1292078113972184607
to ====> 1292078113972184607
a ====> 15267657372422890137
combination ====> 15308085513773655218
of ====> 1292078113972184607
30 ====> 8427216679587749980
years ====> 783433942507015291
of ====> 1292078113972184607
legacy ====> 15308085513773655218
of ====> 1292078113972184607
best ====> 14753207560692742245
in ====> 1292078113972184607
class ====> 15308085513773655218
ed

In [28]:
spacy.explain('VBG')

'verb, gerund or present participle'

In [29]:
spacy.explain('NNS')

'noun, plural'

In [30]:
spacy.explain('NNP')

'noun, proper singular'

In [32]:
for token in doc_1:
    print (token.text, '====>', token.tag) # We get a hash number. Internally, the tokens get stored as a hash number.

You ====> 13656873538139661788
are ====> 9188597074677201817
about ====> 10554686591937588953
to ====> 5595707737748328492
log ====> 14200088355797579614
in ====> 6860118812490040284
to ====> 1292078113972184607
the ====> 15267657372422890137
world ====> 15308085513773655218
of ====> 1292078113972184607
Online ====> 15794550382381185553
Learning ====> 15794550382381185553
at ====> 1292078113972184607
NMIMS ====> 15794550382381185553
. ====> 12646065887601541794
A ====> 15267657372422890137
world ====> 15308085513773655218
made ====> 17109001835818727656
possible ====> 10554686591937588953
due ====> 1292078113972184607
to ====> 1292078113972184607
a ====> 15267657372422890137
combination ====> 15308085513773655218
of ====> 1292078113972184607
30 ====> 8427216679587749980
years ====> 783433942507015291
of ====> 1292078113972184607
legacy ====> 15308085513773655218
of ====> 1292078113972184607
best ====> 14753207560692742245
in ====> 1292078113972184607
class ====> 15308085513773655218
ed

### Parts of Speech (POS)

In [33]:
for token in doc_1:
    print (token.text, '===>', token.pos_)

You ===> PRON
are ===> AUX
about ===> ADJ
to ===> PART
log ===> VERB
in ===> ADP
to ===> ADP
the ===> DET
world ===> NOUN
of ===> ADP
Online ===> PROPN
Learning ===> PROPN
at ===> ADP
NMIMS ===> PROPN
. ===> PUNCT
A ===> DET
world ===> NOUN
made ===> VERB
possible ===> ADJ
due ===> ADP
to ===> ADP
a ===> DET
combination ===> NOUN
of ===> ADP
30 ===> NUM
years ===> NOUN
of ===> ADP
legacy ===> NOUN
of ===> ADP
best ===> ADJ
in ===> ADP
class ===> NOUN
education ===> NOUN
and ===> CCONJ
state ===> NOUN
of ===> ADP
the ===> DET
art ===> NOUN
learning ===> VERB
technology ===> NOUN
! ===> PUNCT
Log ===> VERB
in ===> ADP
using ===> VERB
the ===> DET
credentials ===> NOUN
given ===> VERB
by ===> ADP
the ===> DET
University ===> PROPN
. ===> PUNCT
Please ===> INTJ
go ===> VERB
through ===> ADP
your ===> PRON
profile ===> ADJ
details ===> NOUN
and ===> CCONJ
update ===> VERB
your ===> PRON
contact ===> NOUN
information ===> NOUN
, ===> PUNCT
it ===> PRON
will ===> AUX
help ===> VERB
University

In [24]:
spacy.explain('PRON')

'pronoun'

In [34]:
for token in doc_1:
    print (token.text, '===>', token.pos)

You ===> 95
are ===> 87
about ===> 84
to ===> 94
log ===> 100
in ===> 85
to ===> 85
the ===> 90
world ===> 92
of ===> 85
Online ===> 96
Learning ===> 96
at ===> 85
NMIMS ===> 96
. ===> 97
A ===> 90
world ===> 92
made ===> 100
possible ===> 84
due ===> 85
to ===> 85
a ===> 90
combination ===> 92
of ===> 85
30 ===> 93
years ===> 92
of ===> 85
legacy ===> 92
of ===> 85
best ===> 84
in ===> 85
class ===> 92
education ===> 92
and ===> 89
state ===> 92
of ===> 85
the ===> 90
art ===> 92
learning ===> 100
technology ===> 92
! ===> 97
Log ===> 100
in ===> 85
using ===> 100
the ===> 90
credentials ===> 92
given ===> 100
by ===> 85
the ===> 90
University ===> 96
. ===> 97
Please ===> 91
go ===> 100
through ===> 85
your ===> 95
profile ===> 84
details ===> 92
and ===> 89
update ===> 100
your ===> 95
contact ===> 92
information ===> 92
, ===> 97
it ===> 95
will ===> 87
help ===> 100
University ===> 96
to ===> 94
stay ===> 100
in ===> 85
touch ===> 92
with ===> 85
you ===> 95
. ===> 97
With ===> 85

### POS Count

In [35]:
doc_1

You are about to log in to the world of Online Learning at NMIMS. A world made possible due to a combination of 30 years of legacy of best in class education and state of the art learning technology! Log in using the credentials given by the University. Please go through your profile details and update your contact information, it will help University to stay in touch with you.With this Portal, we hope to provide you all the support you need during your enrollment with the Program offered by the University. It will be our endeavour to keep improving your experience with this Portal as we go along. Happy Learning! - Team

In [37]:
# Splitting into sentences
sent_count = 0
for sent in doc_1.sents:
    sent_count = sent_count + 1
    print (sent_count, '===>', sent)

1 ===> You are about to log in to the world of Online Learning at NMIMS.
2 ===> A world made possible due to a combination of 30 years of legacy of best in class education and state of the art learning technology!
3 ===> Log in using the credentials given by the University.
4 ===> Please go through your profile details and update your contact information, it will help University to stay in touch with you.
5 ===> With this Portal, we hope to provide you all the support you need during your enrollment with the Program offered by the University.
6 ===> It will be our endeavour to keep improving your experience with this Portal as we go along.
7 ===> Happy Learning! - Team


### POS

In [38]:
pos_count = doc_1.count_by(spacy.attrs.POS)
pos_count

{95: 13,
 87: 4,
 84: 4,
 94: 4,
 100: 17,
 85: 21,
 90: 12,
 92: 20,
 96: 11,
 97: 10,
 93: 1,
 89: 2,
 91: 1,
 98: 1,
 86: 1}

In [39]:
for x, y in sorted(pos_count.items()):
    print (x, doc_1.vocab[x].text, y)

84 ADJ 4
85 ADP 21
86 ADV 1
87 AUX 4
89 CCONJ 2
90 DET 12
91 INTJ 1
92 NOUN 20
93 NUM 1
94 PART 4
95 PRON 13
96 PROPN 11
97 PUNCT 10
98 SCONJ 1
100 VERB 17


### Visualization of POS structure

In [40]:
from spacy import displacy
displacy.render(doc_1, style = 'dep')

In [42]:
options = {'compact': 'True', 'color': 'blue'}
options

{'compact': 'True', 'color': 'blue'}

In [44]:
displacy.render(doc_1, style = 'dep', options = options)

### Converting text into DF with tokens, POS

In [45]:
text_df

Unnamed: 0,Sentence
0,Today is Monday
1,Tomorrow is Tuesday
2,"Yesterday was Sunday, which was a holiday"


In [46]:
token = []
for sent in nlp.pipe(text_df['Sentence']):
    if sent.has_annotation('DEP'):
        token.append([word.text for word in sent])

In [47]:
token

[['Today', 'is', 'Monday'],
 ['Tomorrow', 'is', 'Tuesday'],
 ['Yesterday', 'was', 'Sunday', ',', 'which', 'was', 'a', 'holiday']]

In [49]:
token = []
pos = []
for sent in nlp.pipe(text_df['Sentence']):
    if sent.has_annotation('Dep'):
        token.append([word.text for word in sent])
        pos.append([word.pos_ for word in sent])

In [50]:
pos

[['NOUN', 'AUX', 'PROPN'],
 ['NOUN', 'AUX', 'PROPN'],
 ['NOUN', 'AUX', 'PROPN', 'PUNCT', 'PRON', 'AUX', 'DET', 'NOUN']]

In [51]:
# Updating DataFrame

text_df['Token'] = token
text_df['POS'] = pos

text_df

Unnamed: 0,Sentence,Token,POS
0,Today is Monday,"[Today, is, Monday]","[NOUN, AUX, PROPN]"
1,Tomorrow is Tuesday,"[Tomorrow, is, Tuesday]","[NOUN, AUX, PROPN]"
2,"Yesterday was Sunday, which was a holiday","[Yesterday, was, Sunday, ,, which, was, a, hol...","[NOUN, AUX, PROPN, PUNCT, PRON, AUX, DET, NOUN]"
