## Tokenization

In [26]:
import spacy
from spacy import displacy

# Load the small English pipeline by its full package name. The bare "en"
# shortcut only resolves on spaCy 2.x installs where a shortcut link was
# created; spaCy 3 removed shortcut links, so spacy.load("en") fails there.
# NOTE(review): assumes the "en" link pointed at en_core_web_sm - confirm; a
# different linked model could tag/label slightly differently than the
# recorded outputs in this notebook.
nlp = spacy.load("en_core_web_sm")

In [20]:
# Sample text for the tokenization demo. "St.Louis" is deliberately left
# without a space and "U.S." keeps its dots so the tokenizer's handling of
# punctuation inside words can be observed.
mystring = (
    "We are moving to Mumbai from St.Louis in U.S. "
    "I really like this new house as we have invested close to $6 million in it. "
    "I work for Apple in India."
)

In [21]:
# Run the loaded pipeline over the sample text; the resulting `doc` is what
# the token, entity, noun-chunk and displacy cells below operate on.
doc = nlp(mystring)

In [22]:
# Write every token on a single line, each one followed by '|' so the
# token boundaries chosen by the tokenizer are easy to see.
for tok in doc:
    print(f"{tok}|", end="")

We|are|moving|to|Mumbai|from|St|.|Louis|in|U.S.|I|really|like|this|new|house|as|we|have|invested|close|to|$|6|million|in|it|.|I|work|for|Apple|in|India|.|

In [23]:
# For each named entity print "<text>|<label>|<label description>|"
# followed by a blank line.
for ent in doc.ents:
    label = ent.label_
    description = str(spacy.explain(label))
    print(ent, label, description, sep='|', end='|\n\n')

Mumbai|GPE|Countries, cities, states|

St.|GPE|Countries, cities, states|

U.S.|GPE|Countries, cities, states|

close to $6 million|MONEY|Monetary values, including unit|

Apple|ORG|Companies, agencies, institutions, etc.|

India|GPE|Countries, cities, states|



In [24]:
# List the noun chunks of the document on one line, '|'-terminated.
for chunk in doc.noun_chunks:
    print(str(chunk) + '|', end='')

We|Mumbai|St|Louis|U.S.|I|this new house|we|it|I|Apple|India|

In [28]:
# Draw the dependency parse of `doc` inline in the notebook (jupyter = True).
# The 'distance' option is set to 110 - presumably the spacing between words
# in the diagram; see the displaCy options documentation to confirm.
displacy.render(doc,style = 'dep',jupyter = True, options = {'distance':110})

In [30]:
# Render `doc` again inline, this time with the 'ent' style, which highlights
# the named entities printed in the earlier entity cell.
displacy.render(doc,style ='ent',jupyter=True)

## Stemming

In [33]:
import nltk

from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Two NLTK stemmers to compare side by side in the cells below:
# the Porter stemmer and the Snowball stemmer configured for English.
p_stemmer = PorterStemmer()
s_stemmer = SnowballStemmer(language = 'english')

In [41]:
# Sample inputs for the stemmer comparison.
# NOTE(review): "easyily" looks like a misspelling of "easily"; it is kept
# unchanged because the recorded outputs below were produced from it.
words = [
    "run",
    "ran",
    "runs",
    "running",
    "easyily",
    "fairly",
    "runner",
    "fairness",
    "fairer",
    "fairy",
]

In [42]:
# Show each sample word next to the stem the Porter stemmer produces for it.
for original in words:
    porter_stem = p_stemmer.stem(original)
    print(f"{original} ---> {porter_stem}")

run ---> run
ran ---> ran
runs ---> run
running ---> run
easyily ---> easyili
fairly ---> fairli
runner ---> runner
fairness ---> fair
fairer ---> fairer
fairy ---> fairi


In [43]:
# Show each sample word next to the stem the Snowball stemmer produces for it.
for original in words:
    snowball_stem = s_stemmer.stem(original)
    print(f"{original} -----> {snowball_stem}")

run -----> run
ran -----> ran
runs -----> run
running -----> run
easyily -----> easyili
fairly -----> fair
runner -----> runner
fairness -----> fair
fairer -----> fairer
fairy -----> fairi


## Lemmatization

In [52]:
# Parse a sentence that contains several inflected forms of "run" so their
# lemmas can be compared. (The Python 2 style u"" prefix is dropped; on
# Python 3 the string value is the same.)
text = nlp(
    "I am a runner running a race today as I like to run and I ran today."
)

In [53]:
def show_lemma(text):
    """Print one line per token: token, POS tag, numeric lemma ID, lemma string.

    text: an iterable of token-like objects exposing ``pos_``, ``lemma``
          and ``lemma_`` attributes (e.g. a parsed spaCy document).
    """
    for tok in text:
        fields = (tok, tok.pos_, tok.lemma, tok.lemma_)
        # Space-separated, exactly as print() would join the four values.
        print(" ".join(str(field) for field in fields))

In [54]:
# Print token, POS tag, integer lemma ID and lemma string for every token of
# the parsed sentence; "running", "run" and "ran" share one lemma in the output.
show_lemma(text)

I PRON 561228191312463089 -PRON-
am AUX 10382539506755952630 be
a DET 11901859001352538922 a
runner NOUN 12640964157389618806 runner
running VERB 12767647472892411841 run
a DET 11901859001352538922 a
race NOUN 8048469955494714898 race
today NOUN 11042482332948150395 today
as SCONJ 7437575085468336610 as
I PRON 561228191312463089 -PRON-
like VERB 18194338103975822726 like
to PART 3791531372978436496 to
run VERB 12767647472892411841 run
and CCONJ 2283656566040971221 and
I PRON 561228191312463089 -PRON-
ran VERB 12767647472892411841 run
today NOUN 11042482332948150395 today
. PUNCT 12646065887601541794 .


## Stop Words

##### Stop words are very common words (for example "is" or "a") that are usually filtered out before further processing, because they carry little meaning on their own and mostly add noise.

In [55]:
# Dump the pipeline's default stop-word collection. It prints with set braces,
# so the order of the words shown carries no meaning.
print(nlp.Defaults.stop_words)

{'themselves', 'us', '’ll', 'almost', 'could', 'keep', 'due', 'thereupon', 'otherwise', 'under', 'everywhere', 'still', 'if', 'beyond', 'please', 'their', 'among', 'behind', 'anyone', 'enough', 'yourselves', 'therefore', 'then', 'really', 'which', 'per', 'there', 'after', 'anyway', 'whoever', 'your', 'around', 'most', 'move', 'say', 'well', 'upon', 'when', 'n’t', 'anyhow', 'along', 'thereby', 'serious', 'everyone', 'might', 'at', 'myself', 'sixty', 'over', 'during', 'amount', 'his', 'quite', 'whereupon', 'used', 'front', 'whom', 'see', 'too', 'with', 'except', 'himself', 'through', 'thru', '’re', 'who', 'hers', 'using', 'namely', 'hereupon', 'indeed', 'being', '’ve', 'out', 'top', 'moreover', 'whatever', 'becoming', 'latterly', 'done', 'seem', 'further', 'whereby', 'without', '‘ve', 'mine', 'where', 'neither', 'besides', 'alone', 'one', 'whereas', 'now', 'for', 'nowhere', 'these', 'what', 'hereby', 'somewhere', "'ve", '‘re', 'eight', 'become', 'they', 'becomes', 'five', 'few', 'seeming

In [62]:
# Number of entries currently in the default stop-word collection. The cells
# further down add 'btw' and remove 'twelve' from this same collection, so the
# figure depends on which of those cells have already been run.
print(len(nlp.Defaults.stop_words))

327


In [63]:
# To check whether a word is treated as a stop word, look it up in the
# pipeline's vocabulary and read the entry's is_stop flag.

nlp.vocab['is'].is_stop

True

In [64]:
# 'btw' is only added to the stop words by a later cell in this notebook.
# NOTE(review): the recorded True suggests that cell had already been run in
# this kernel; on a fresh top-to-bottom run this presumably shows False - confirm.
nlp.vocab['btw'].is_stop

True

In [65]:
# 'twelve' is flagged as a stop word at this point; a later cell removes it.
nlp.vocab['twelve'].is_stop

True

In [66]:
# Register 'btw' as a custom stop word: put it in the default stop-word set,
# set the flag on its vocabulary entry, then show the flag as the cell output.
nlp.Defaults.stop_words.add('btw')
btw_lexeme = nlp.vocab['btw']
btw_lexeme.is_stop = True
btw_lexeme.is_stop

True

In [68]:
# Stop treating 'twelve' as a stop word: take it out of the default stop-word
# set and clear the flag on its vocabulary entry, then show the flag.
# discard() is used instead of remove() so that re-running this cell does not
# raise KeyError once 'twelve' has already been taken out.
nlp.Defaults.stop_words.discard('twelve')
nlp.vocab['twelve'].is_stop = False
nlp.vocab['twelve'].is_stop

False