## Pre-processing using Spacy

In [1]:
import spacy

In [2]:
nlp = spacy.load('en_core_web_sm')



In [3]:
nlp

<spacy.lang.en.English at 0x1dfea096a60>

In [4]:
type(nlp)

spacy.lang.en.English

In [5]:
doc_1 = nlp ('India, a South Asian nation, is the seventh-largest country by area, the second-most populous country with over 1.38 billion people, and the most populous democracy in the world. India boasts of an immensely rich cultural heritage, including numerous languages, traditions, and people. The country holds its uniqueness in its diversity, and hence has adapted itself to international changes with poise and comfort. While the economy has welcomed international companies to invest in it with open arms since liberalisation in the 1990s, Indians have been prudent and proactive in adopting global approaches and skills. Indian villagers have proudly taken up farming, advanced agriculture and unique handicrafts as their profession on one hand, while the modern industries and professional services sectors are coming up in a big way on the other. Thus, the country is attracting many global majors for strategic investments owing to the presence of a vast range of industries, investment avenues and a supportive Government. A huge population, mostly comprising the youth, is a strong driver for demand and an ample source of manpower')

In [6]:
doc_1

India, a South Asian nation, is the seventh-largest country by area, the second-most populous country with over 1.38 billion people, and the most populous democracy in the world. India boasts of an immensely rich cultural heritage, including numerous languages, traditions, and people. The country holds its uniqueness in its diversity, and hence has adapted itself to international changes with poise and comfort. While the economy has welcomed international companies to invest in it with open arms since liberalisation in the 1990s, Indians have been prudent and proactive in adopting global approaches and skills. Indian villagers have proudly taken up farming, advanced agriculture and unique handicrafts as their profession on one hand, while the modern industries and professional services sectors are coming up in a big way on the other. Thus, the country is attracting many global majors for strategic investments owing to the presence of a vast range of industries, investment avenues and a

### Tokenization

In [7]:
for token in doc_1:
    print (token)

India
,
a
South
Asian
nation
,
is
the
seventh
-
largest
country
by
area
,
the
second
-
most
populous
country
with
over
1.38
billion
people
,
and
the
most
populous
democracy
in
the
world
.
India
boasts
of
an
immensely
rich
cultural
heritage
,
including
numerous
languages
,
traditions
,
and
people
.
The
country
holds
its
uniqueness
in
its
diversity
,
and
hence
has
adapted
itself
to
international
changes
with
poise
and
comfort
.
While
the
economy
has
welcomed
international
companies
to
invest
in
it
with
open
arms
since
liberalisation
in
the
1990s
,
Indians
have
been
prudent
and
proactive
in
adopting
global
approaches
and
skills
.
Indian
villagers
have
proudly
taken
up
farming
,
advanced
agriculture
and
unique
handicrafts
as
their
profession
on
one
hand
,
while
the
modern
industries
and
professional
services
sectors
are
coming
up
in
a
big
way
on
the
other
.
Thus
,
the
country
is
attracting
many
global
majors
for
strategic
investments
owing
to
the
presence
of
a
vast
range
of
industries
,
in

In [8]:
# Length of the document

len(doc_1)

200

In [9]:
# Count of tokens

t_count = 0

for token in doc_1:
    t_count = t_count + 1
print ("The total no of token is:", t_count)

The total no of token is: 200


In [10]:
from spacy.lang.en.stop_words import STOP_WORDS
print (STOP_WORDS)

{'ourselves', 'see', 'others', 'afterwards', 'bottom', 'hers', 'you', 'yours', 'no', 'still', 'together', 'after', 'against', 'namely', 'so', 'never', 'eleven', 'such', 'perhaps', 'down', 'was', 'wherein', '’d', 'during', 'up', 'not', 'though', 'just', 'although', 'serious', 'why', 'above', 'all', 'well', 'where', 'within', 'many', 'those', 'thus', 'latter', 'somewhere', 'being', 'into', 'call', 'my', 'had', 'third', 'but', "'ll", 'can', 'them', 'almost', 'noone', 'even', 'forty', 'top', "'d", 'himself', 'alone', 'whom', '’ve', 'herein', 'several', 'as', 'six', 'becoming', 'quite', 'seeming', 'seem', 'through', 'they', 'due', 'anywhere', 'she', 'however', 'and', 'two', 'if', 'three', 'me', 'hence', 'whatever', 'some', 'how', 'her', 'name', 'amount', 'are', 'everywhere', "'s", '’m', 'much', 'once', '’ll', 'nobody', 'is', 'in', 'must', 'few', "'m", 'towards', 'really', 'already', 'meanwhile', 'somehow', 'say', 'own', 'go', 'because', 'fifteen', 'nowhere', 'it', 'regarding', 'under', 'acr

In [11]:
# Checking if the tokens are stop words

for token in doc_1:
    print (token, '=====>', token.is_stop)

India =====> False
, =====> False
a =====> True
South =====> False
Asian =====> False
nation =====> False
, =====> False
is =====> True
the =====> True
seventh =====> False
- =====> False
largest =====> False
country =====> False
by =====> True
area =====> False
, =====> False
the =====> True
second =====> False
- =====> False
most =====> True
populous =====> False
country =====> False
with =====> True
over =====> True
1.38 =====> False
billion =====> False
people =====> False
, =====> False
and =====> True
the =====> True
most =====> True
populous =====> False
democracy =====> False
in =====> True
the =====> True
world =====> False
. =====> False
India =====> False
boasts =====> False
of =====> True
an =====> True
immensely =====> False
rich =====> False
cultural =====> False
heritage =====> False
, =====> False
including =====> False
numerous =====> False
languages =====> False
, =====> False
traditions =====> False
, =====> False
and =====> True
people =====> False
. =====> False
Th

In [12]:
# Counting the no of non-stop words

s_count = 0
for token in doc_1:
    if token.is_stop != True:
        s_count = s_count + 1
        print (token)
        
print ('\n The total non-stop words are:', s_count)

India
,
South
Asian
nation
,
seventh
-
largest
country
area
,
second
-
populous
country
1.38
billion
people
,
populous
democracy
world
.
India
boasts
immensely
rich
cultural
heritage
,
including
numerous
languages
,
traditions
,
people
.
country
holds
uniqueness
diversity
,
adapted
international
changes
poise
comfort
.
economy
welcomed
international
companies
invest
open
arms
liberalisation
1990s
,
Indians
prudent
proactive
adopting
global
approaches
skills
.
Indian
villagers
proudly
taken
farming
,
advanced
agriculture
unique
handicrafts
profession
hand
,
modern
industries
professional
services
sectors
coming
big
way
.
,
country
attracting
global
majors
strategic
investments
owing
presence
vast
range
industries
,
investment
avenues
supportive
Government
.
huge
population
,
comprising
youth
,
strong
driver
demand
ample
source
manpower

 The total non-stop words are: 120


In [13]:
print ('The total non-stop words are:', s_count)

The total non-stop words are: 120


### Checking for punctuation

In [14]:
# Count of non-punctuation tokens

p_count = 0
for token in doc_1:
    print (token, '======>', token.is_punct)
    print ('\n \n The non-punctuation tokens are:')
    if token.is_punct == False:
        p_count = p_count + 1
        print (token)
print ('\n The total non-punctuation words are:', p_count)


 
 The non-punctuation tokens are:
India

 
 The non-punctuation tokens are:

 
 The non-punctuation tokens are:
a

 
 The non-punctuation tokens are:
South

 
 The non-punctuation tokens are:
Asian

 
 The non-punctuation tokens are:
nation

 
 The non-punctuation tokens are:

 
 The non-punctuation tokens are:
is

 
 The non-punctuation tokens are:
the

 
 The non-punctuation tokens are:
seventh

 
 The non-punctuation tokens are:

 
 The non-punctuation tokens are:
largest

 
 The non-punctuation tokens are:
country

 
 The non-punctuation tokens are:
by

 
 The non-punctuation tokens are:
area

 
 The non-punctuation tokens are:

 
 The non-punctuation tokens are:
the

 
 The non-punctuation tokens are:
second

 
 The non-punctuation tokens are:

 
 The non-punctuation tokens are:
most

 
 The non-punctuation tokens are:
populous

 
 The non-punctuation tokens are:
country

 
 The non-punctuation tokens are:
with

 
 The non-punctuation tokens are:
over

 
 The non-punctuation tok

In [15]:
# Count of punctuation tokens

pu_count = 0
for token in doc_1:
    print (token, '======>', token.is_punct)
    print ('\n \n The punctuation tokens are:')
    if token.is_punct == True:
        pu_count = pu_count + 1
        print (token)
print ('\n The total non-punctuation words are:', pu_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:
,

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
,

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
-

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
,

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
-

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
,

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens a

In [16]:
# Count of left punctuation tokens

left_p_count = 0
for token in doc_1:
    print (token, '======>', token.is_left_punct)
    print ('\n \n The punctuation tokens are:')
    if token.is_left_punct == True:
        left_p_count = left_p_count + 1
        print (token)
print ('\n The total non-punctuation words are:', left_p_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

In [17]:
# Count of right punctuation tokens

right_p_count = 0
for token in doc_1:
    print (token, '======>', token.is_right_punct)
    print ('\n \n The punctuation tokens are:')
    if token.is_right_punct == True:
        right_p_count = right_p_count + 1
        print (token)
print ('\n The total non-punctuation words are:', right_p_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

### Checking for an alphabet

In [18]:
# Count of alphabetical tokens

a_count = 0
for token in doc_1:
    print (token, '======>', token.is_alpha)
    print ('\n \n The punctuation tokens are:')
    if token.is_alpha == True:
        a_count = a_count + 1
        print (token)
print ('\n The total non-punctuation words are:', a_count)


 
 The punctuation tokens are:
India

 
 The punctuation tokens are:

 
 The punctuation tokens are:
a

 
 The punctuation tokens are:
South

 
 The punctuation tokens are:
Asian

 
 The punctuation tokens are:
nation

 
 The punctuation tokens are:

 
 The punctuation tokens are:
is

 
 The punctuation tokens are:
the

 
 The punctuation tokens are:
seventh

 
 The punctuation tokens are:

 
 The punctuation tokens are:
largest

 
 The punctuation tokens are:
country

 
 The punctuation tokens are:
by

 
 The punctuation tokens are:
area

 
 The punctuation tokens are:

 
 The punctuation tokens are:
the

 
 The punctuation tokens are:
second

 
 The punctuation tokens are:

 
 The punctuation tokens are:
most

 
 The punctuation tokens are:
populous

 
 The punctuation tokens are:
country

 
 The punctuation tokens are:
with

 
 The punctuation tokens are:
over

 
 The punctuation tokens are:

 
 The punctuation tokens are:
billion

 
 The punctuation tokens are:
people

 
 The punc

### Checking for digit tokens

In [19]:
# Count of digit tokens

d_count = 0
for token in doc_1:
    print (token, '======>', token.is_digit)
    print ('\n \n The punctuation tokens are:')
    if token.is_digit == True:
        d_count = d_count + 1
        print (token)
print ('\n The total non-punctuation words are:', d_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

In [20]:
doc_2 = nlp('10000 is a big number')
for token in doc_2:
    print (token, '====>', token.is_digit)

10000 ====> True
is ====> False
a ====> False
big ====> False
number ====> False


### Checking for lower case tokens

In [21]:
# Count of lower case tokens

low_count = 0
for token in doc_1:
    print (token, '======>', token.is_lower)
    print ('\n \n The punctuation tokens are:')
    if token.is_lower == True:
        low_count = low_count + 1
        print (token)
print ('\n The total non-punctuation words are:', low_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
a

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
nation

 
 The punctuation tokens are:

 
 The punctuation tokens are:
is

 
 The punctuation tokens are:
the

 
 The punctuation tokens are:
seventh

 
 The punctuation tokens are:

 
 The punctuation tokens are:
largest

 
 The punctuation tokens are:
country

 
 The punctuation tokens are:
by

 
 The punctuation tokens are:
area

 
 The punctuation tokens are:

 
 The punctuation tokens are:
the

 
 The punctuation tokens are:
second

 
 The punctuation tokens are:

 
 The punctuation tokens are:
most

 
 The punctuation tokens are:
populous

 
 The punctuation tokens are:
country

 
 The punctuation tokens are:
with

 
 The punctuation tokens are:
over

 
 The punctuation tokens are:

 
 The punctuation tokens are:
billion

 
 The punctuation tokens are:
people

 
 The punctuation tokens are

### Checking for upper case

In [22]:
# Count of upper case tokens

up_count = 0
for token in doc_1:
    print (token, '======>', token.is_upper)
    print ('\n \n The punctuation tokens are:')
    if token.is_upper == True:
        up_count = up_count + 1
        print (token)
print ('\n The total non-punctuation words are:', up_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

### Checking for title case tokens

In [23]:
# Count of title case tokens

t_count = 0
for token in doc_1:
    print (token, '======>', token.is_title)
    print ('\n \n The punctuation tokens are:')
    if token.is_title == True:
        t_count = t_count + 1
        print (token)
print ('\n The total non-punctuation words are:', t_count)


 
 The punctuation tokens are:
India

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
South

 
 The punctuation tokens are:
Asian

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation to

### Checking for quote tokens

In [24]:
# Count of quote tokens

q_count = 0
for token in doc_1:
    print (token, '======>', token.is_quote)
    print ('\n \n The punctuation tokens are:')
    if token.is_quote == True:
        q_count = q_count + 1
        print (token)
print ('\n The total non-punctuation words are:', q_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

### Checking for bracket tokens

In [25]:
# Count of bracket tokens

b_count = 0
for token in doc_1:
    print (token, '======>', token.is_bracket)
    print ('\n \n The punctuation tokens are:')
    if token.is_bracket == True:
        b_count = b_count + 1
        print (token)
print ('\n The total non-punctuation words are:', b_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The 

### Checking for a number

In [26]:
# Count of number tokens

n_count = 0
for token in doc_1:
    print (token, '======>', token.like_num)
    print ('\n \n The punctuation tokens are:')
    if token.like_num == True:
        n_count = n_count + 1
        print (token)
        
print ('\n The total non-punctuation words are:', n_count)


 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
seventh

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
second

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:
1.38

 
 The punctuation tokens are:
billion

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punctuation tokens are:

 
 The punc

### Checking for URL token

In [27]:
doc_4 = nlp('The website for NMIMS is www.nmims.edu.in')

In [28]:
u_count = 0
for token in doc_4:
    print (token, '=====>', token.like_url)
    if token.like_url == True:
        u_count = u_count + 1
print ("\n The no of URL tokens in the doc is:", u_count)

The =====> False
website =====> False
for =====> False
NMIMS =====> False
is =====> False
www.nmims.edu.in =====> True

 The no of URL tokens in the doc is: 1


### Checking for email token

In [29]:
doc_5 = nlp('My email id is abinash_mishra001@nmims.edu.in')

In [30]:
# Checking for email token

e_count = 0
for token in doc_5:
    print (token, '=====>', token.like_email)
    if token.like_email == True:
        e_count = e_count + 1
        
print ("\n The no of email tokens in the doc is:", e_count)

My =====> False
email =====> False
i =====> False
d =====> False
is =====> False
abinash_mishra001@nmims.edu.in =====> True

 The no of email tokens in the doc is: 1


### Parts of Speech - POS

In [31]:
doc_2

10000 is a big number

In [32]:
for token in doc_2:
    print (token, '===>', token.pos_) #pos_ shows the Parts of Speech, while pos shows the position of the token.

10000 ===> NUM
is ===> AUX
a ===> DET
big ===> ADJ
number ===> NOUN


### Converting into a Dataframe

In [33]:
spacy.explain('AUX') # This provides some details about the POS

'auxiliary'

In [34]:
spacy.explain('ADJ')

'adjective'

In [35]:
cols = ['Token', 'POS', 'Explain_POS', 'TAG', 'Explain_TAG']
cols

['Token', 'POS', 'Explain_POS', 'TAG', 'Explain_TAG']

In [36]:
rows = []
for token in doc_1:
    row = token, token.pos_, spacy.explain(token.pos_), token.tag_, spacy.explain(token.tag_)
    rows.append(row)
rows

[(India, 'PROPN', 'proper noun', 'NNP', 'noun, proper singular'),
 (,, 'PUNCT', 'punctuation', ',', 'punctuation mark, comma'),
 (a, 'DET', 'determiner', 'DT', 'determiner'),
 (South,
  'ADJ',
  'adjective',
  'JJ',
  'adjective (English), other noun-modifier (Chinese)'),
 (Asian,
  'ADJ',
  'adjective',
  'JJ',
  'adjective (English), other noun-modifier (Chinese)'),
 (nation, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (,, 'PUNCT', 'punctuation', ',', 'punctuation mark, comma'),
 (is, 'AUX', 'auxiliary', 'VBZ', 'verb, 3rd person singular present'),
 (the, 'DET', 'determiner', 'DT', 'determiner'),
 (seventh, 'ADV', 'adverb', 'RB', 'adverb'),
 (-, 'PUNCT', 'punctuation', 'HYPH', 'punctuation mark, hyphen'),
 (largest, 'ADJ', 'adjective', 'JJS', 'adjective, superlative'),
 (country, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (by, 'ADP', 'adposition', 'IN', 'conjunction, subordinating or preposition'),
 (area, 'NOUN', 'noun', 'NN', 'noun, singular or mass'),
 (,, 'PUNCT', 'p

In [37]:
import pandas as pd

In [38]:
token_df = pd.DataFrame(rows, columns = cols)
token_df

Unnamed: 0,Token,POS,Explain_POS,TAG,Explain_TAG
0,India,PROPN,proper noun,NNP,"noun, proper singular"
1,",",PUNCT,punctuation,",","punctuation mark, comma"
2,a,DET,determiner,DT,determiner
3,South,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
4,Asian,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
...,...,...,...,...,...
195,an,DET,determiner,DT,determiner
196,ample,ADJ,adjective,JJ,"adjective (English), other noun-modifier (Chin..."
197,source,NOUN,noun,NN,"noun, singular or mass"
198,of,ADP,adposition,IN,"conjunction, subordinating or preposition"


In [39]:
# Count of each POS

token_df['POS'].value_counts()

NOUN     48
ADJ      29
ADP      24
PUNCT    23
DET      20
VERB     12
CCONJ    10
AUX       9
ADV       9
PRON      5
NUM       4
PROPN     3
SCONJ     3
PART      1
Name: POS, dtype: int64