# Installation

In [41]:
#pip install -U spacy

In [8]:
# import libraries

import spacy
import pandas as pd
#ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
spacy.__version__

'2.1.9'

In [12]:

from spacy.lang.en import English
from spacy import displacy
nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_lg')

In [23]:
test_sent= "Pakistan got independence in 1947. Karachi, Lahore and Islamabad are few of the major cities Pakistan."

# Tokenization

In [33]:
parsed_sent = nlp(test_sent)

In [34]:
type(parsed_sent)

spacy.tokens.doc.Doc

In [25]:
parsed_sent.text.split()

['Pakistan',
 'got',
 'independence',
 'in',
 '1947.',
 'Karachi,',
 'Lahore',
 'and',
 'Islamabad',
 'are',
 'few',
 'of',
 'the',
 'major',
 'cities',
 'Pakistan.']

In [None]:
.orth_ method, which returns a string representation of the token rather than a SpaCy token object, this might not always be desirable, but worth noting. SpaCy recognises punctuation and is able to split these punctuation tokens from word tokens.

In [26]:
for token in parsed_sent:
    print(token.orth_ )

Pakistan
got
independence
in
1947
.
Karachi
,
Lahore
and
Islamabad
are
few
of
the
major
cities
Pakistan
.


In [27]:
for token in parsed_sent:
    if not token.is_punct | token.is_space:
        print(token.orth_ )

Pakistan
got
independence
in
1947
Karachi
Lahore
and
Islamabad
are
few
of
the
major
cities
Pakistan


In [28]:
[token.orth_ for token in parsed_sent if not token.is_punct | token.is_space] 

['Pakistan',
 'got',
 'independence',
 'in',
 '1947',
 'Karachi',
 'Lahore',
 'and',
 'Islamabad',
 'are',
 'few',
 'of',
 'the',
 'major',
 'cities',
 'Pakistan']

# Named Entities

In [29]:
parsed_sent = nlp(test_sent)
spacy.displacy.render(parsed_sent, style='ent',jupyter=True)

# Part-of-Speech Tagging

In [30]:
sentence_spans = list(parsed_sent.sents)
displacy.render(sentence_spans, style='dep', jupyter=True)

In [32]:
for token in parsed_sent:
    print(token.orth_, token.ent_type_ if token.ent_type_ != "" else "(not an entity)")

Pakistan GPE
got (not an entity)
independence (not an entity)
in (not an entity)
1947 DATE
. (not an entity)
Karachi GPE
, (not an entity)
Lahore GPE
and (not an entity)
Islamabad GPE
are (not an entity)
few (not an entity)
of (not an entity)
the (not an entity)
major (not an entity)
cities (not an entity)
Pakistan GPE
. (not an entity)


# Lemmatization

In [38]:
for token in parsed_sent:
    print(token, ' -> Its Lemma word ', token.lemma_)
    print()

Pakistan  -> Its Lemma word  Pakistan

got  -> Its Lemma word  get

independence  -> Its Lemma word  independence

in  -> Its Lemma word  in

1947  -> Its Lemma word  1947

.  -> Its Lemma word  .

Karachi  -> Its Lemma word  Karachi

,  -> Its Lemma word  ,

Lahore  -> Its Lemma word  Lahore

and  -> Its Lemma word  and

Islamabad  -> Its Lemma word  Islamabad

are  -> Its Lemma word  be

few  -> Its Lemma word  few

of  -> Its Lemma word  of

the  -> Its Lemma word  the

major  -> Its Lemma word  major

cities  -> Its Lemma word  city

Pakistan  -> Its Lemma word  Pakistan

.  -> Its Lemma word  .



In [39]:
df_token = pd.DataFrame()

for i, token in enumerate(parsed_sent):
    df_token.loc[i, 'text'] = token.text
    df_token.loc[i, 'lemma'] = token.lemma_,
    df_token.loc[i, 'pos'] = token.pos_
    df_token.loc[i, 'tag'] = token.tag_
    df_token.loc[i, 'dep'] = token.dep_
    #df_token.loc[i, 'shape'] = token.shape_
    #df_token.loc[i, 'is_alpha'] = token.is_alpha
    df_token.loc[i, 'is_stop'] = token.is_stop
    
print(df_token)

            text            lemma    pos  tag    dep is_stop
0       Pakistan         Pakistan  PROPN  NNP  nsubj   False
1            got           (get,)   VERB  VBD   ROOT   False
2   independence  (independence,)   NOUN   NN   dobj   False
3             in            (in,)    ADP   IN   prep    True
4           1947          (1947,)    NUM   CD   pobj   False
5              .             (.,)  PUNCT    .  punct   False
6        Karachi       (Karachi,)  PROPN  NNP  nsubj   False
7              ,             (,,)  PUNCT    ,  punct   False
8         Lahore        (Lahore,)  PROPN  NNP   conj   False
9            and           (and,)  CCONJ   CC     cc    True
10     Islamabad     (Islamabad,)  PROPN  NNP   conj   False
11           are            (be,)   VERB  VBP   ROOT    True
12           few           (few,)    ADJ   JJ   attr    True
13            of            (of,)    ADP   IN   prep    True
14           the           (the,)    DET   DT    det    True
15         major        

In [40]:
df_token.to_excel('Tokens Data.xlsx', index=False)