## Natural Language Toolkit (NLTK)

In [32]:
# Importing the NLTK library
import nltk
# pandas is needed by the DataFrame cells in the vectorizer sections
import pandas as pd

In [33]:
# Sample corpus for this notebook: a short story whose paragraphs are separated by blank lines.
data = '''A man and a young teenage boy checked into a hotel and were shown to their room. The receptionist noted the quiet manner of the guests and the pale appearance of the boy. Later, the man and boy ate dinner in the hotel restaurant.

The staff again noticed that the two guests were very quiet and that the boy seemed disinterested in his food.

After eating, the boy went to his room and the man went to ask the receptionist to see the manager. The receptionist initially asked if there was a problem with the service or the room, and offered to fix things, but the man said that there was no problem of the sort and repeated his request.

When the manager appeared, he took him aside and explained that he was spending the night in the hotel with his fourteen-year-old son, who was seriously ill, probably terminally so. The boy was very soon to undergo therapy, which would cause him to lose his hair. They had come to the hotel to have a break together and also because the boy planned to shave his head, that night, rather than feel that the illness was beating him. The father said that he would be shaving his own head too, in support of his son.

He asked that staff be respectful when the two of them came to breakfast with their shaved heads.

The manager assured the father that he would inform all staff and that they would behave appropriately.

The following morning the father and son entered the restaurant for breakfast. There they saw the four male restaurant staff attending to their duties, perfectly normally, all with shaved heads.

No matter what business you are in, you can help people and you can make a difference.'''

In [34]:
# Confirm the corpus is held as a single Python string
type(data)

str

In [35]:
# Total number of characters in the corpus (whitespace and punctuation included)
len(data)

1638

### Frequency Distribution

In [36]:
#Tokenizer
from nltk.tokenize import sent_tokenize, word_tokenize, TreebankWordTokenizer, wordpunct_tokenize

In [37]:
# Split the corpus into word and punctuation tokens
tokens = word_tokenize(data)

In [38]:
from nltk.probability import FreqDist

# Case-insensitive frequency count: every token is lower-cased before it is
# tallied, so 'The' and 'the' share one entry. Punctuation tokens are counted too.
fdist = FreqDist(token.lower() for token in tokens)
print(fdist.most_common(8))

[('the', 34), ('.', 15), (',', 14), ('and', 13), ('to', 12), ('that', 10), ('his', 8), ('boy', 7)]


In [39]:
# Ten most frequent tokens (punctuation included) as (token, count) pairs
fdist_top10 = fdist.most_common(10)
fdist_top10

[('the', 34),
 ('.', 15),
 (',', 14),
 ('and', 13),
 ('to', 12),
 ('that', 10),
 ('his', 8),
 ('boy', 7),
 ('a', 6),
 ('was', 6)]

### Tokenization

In [40]:
# word_tokenize: break a single sentence into word tokens
from nltk.tokenize import word_tokenize

txt1 = 'Mumbai is one of the most populous city in the world'
txt1_words = word_tokenize(txt1)
print(txt1_words)

['Mumbai', 'is', 'one', 'of', 'the', 'most', 'populous', 'city', 'in', 'the', 'world']


In [41]:
# sent_tokenize: split a text into its individual sentences
from nltk.tokenize import sent_tokenize

txt2 = 'All pencils are pens. Some books are pens.'
txt2_sentences = sent_tokenize(txt2)
print(txt2_sentences)

['All pencils are pens.', 'Some books are pens.']


In [42]:
# blankline_tokenize splits the data into paragraphs (text separated by blank lines)
from nltk.tokenize import blankline_tokenize
bl_tokenize = blankline_tokenize(data)
len(bl_tokenize)

8

In [43]:
# wordpunct_tokenize: words and punctuation marks come out as separate tokens
from nltk.tokenize import wordpunct_tokenize

txt = 'Hello John, How are you doing ?'
txt_tokens = wordpunct_tokenize(txt)
print(txt_tokens)

['Hello', 'John', ',', 'How', 'are', 'you', 'doing', '?']


In [44]:
# Treebank tokenizer applied to the whole corpus in one call.
# As the printed tokens show, periods inside the text stay attached to the
# preceding word (e.g. 'room.', 'boy.') instead of becoming separate tokens.
treebank_tokenizer = TreebankWordTokenizer()
x = treebank_tokenizer.tokenize(data)

print(x)

['A', 'man', 'and', 'a', 'young', 'teenage', 'boy', 'checked', 'into', 'a', 'hotel', 'and', 'were', 'shown', 'to', 'their', 'room.', 'The', 'receptionist', 'noted', 'the', 'quiet', 'manner', 'of', 'the', 'guests', 'and', 'the', 'pale', 'appearance', 'of', 'the', 'boy.', 'Later', ',', 'the', 'man', 'and', 'boy', 'ate', 'dinner', 'in', 'the', 'hotel', 'restaurant.', 'The', 'staff', 'again', 'noticed', 'that', 'the', 'two', 'guests', 'were', 'very', 'quiet', 'and', 'that', 'the', 'boy', 'seemed', 'disinterested', 'in', 'his', 'food.', 'After', 'eating', ',', 'the', 'boy', 'went', 'to', 'his', 'room', 'and', 'the', 'man', 'went', 'to', 'ask', 'the', 'receptionist', 'to', 'see', 'the', 'manager.', 'The', 'receptionist', 'initially', 'asked', 'if', 'there', 'was', 'a', 'problem', 'with', 'the', 'service', 'or', 'the', 'room', ',', 'and', 'offered', 'to', 'fix', 'things', ',', 'but', 'the', 'man', 'said', 'that', 'there', 'was', 'no', 'problem', 'of', 'the', 'sort', 'and', 'repeated', 'his', 

### Bigrams, Trigrams and Ngrams

In [45]:
#importing bigrams, trigrams and ngrams
from nltk.util import bigrams, trigrams, ngrams

In [46]:
# First two sentences of the corpus, used as the sample text for the n-gram examples
string = 'A man and a young teenage boy checked into a hotel and were shown to their room. The receptionist noted the quiet manner of the guests and the pale appearance of the boy.'

In [47]:
# Unigrams: tokenize the sample text into individual word and punctuation tokens
string_tokens = nltk.word_tokenize(string)
string_tokens

['A',
 'man',
 'and',
 'a',
 'young',
 'teenage',
 'boy',
 'checked',
 'into',
 'a',
 'hotel',
 'and',
 'were',
 'shown',
 'to',
 'their',
 'room',
 '.',
 'The',
 'receptionist',
 'noted',
 'the',
 'quiet',
 'manner',
 'of',
 'the',
 'guests',
 'and',
 'the',
 'pale',
 'appearance',
 'of',
 'the',
 'boy',
 '.']

In [48]:
# Bigrams: every pair of consecutive tokens, collected into a list of 2-tuples
string_bigrams = list(nltk.bigrams(string_tokens))
string_bigrams

[('A', 'man'),
 ('man', 'and'),
 ('and', 'a'),
 ('a', 'young'),
 ('young', 'teenage'),
 ('teenage', 'boy'),
 ('boy', 'checked'),
 ('checked', 'into'),
 ('into', 'a'),
 ('a', 'hotel'),
 ('hotel', 'and'),
 ('and', 'were'),
 ('were', 'shown'),
 ('shown', 'to'),
 ('to', 'their'),
 ('their', 'room'),
 ('room', '.'),
 ('.', 'The'),
 ('The', 'receptionist'),
 ('receptionist', 'noted'),
 ('noted', 'the'),
 ('the', 'quiet'),
 ('quiet', 'manner'),
 ('manner', 'of'),
 ('of', 'the'),
 ('the', 'guests'),
 ('guests', 'and'),
 ('and', 'the'),
 ('the', 'pale'),
 ('pale', 'appearance'),
 ('appearance', 'of'),
 ('of', 'the'),
 ('the', 'boy'),
 ('boy', '.')]

In [49]:
# 4-grams: every run of four consecutive tokens (the second argument is n)
string_ngrams = list(nltk.ngrams(string_tokens,4))
string_ngrams

[('A', 'man', 'and', 'a'),
 ('man', 'and', 'a', 'young'),
 ('and', 'a', 'young', 'teenage'),
 ('a', 'young', 'teenage', 'boy'),
 ('young', 'teenage', 'boy', 'checked'),
 ('teenage', 'boy', 'checked', 'into'),
 ('boy', 'checked', 'into', 'a'),
 ('checked', 'into', 'a', 'hotel'),
 ('into', 'a', 'hotel', 'and'),
 ('a', 'hotel', 'and', 'were'),
 ('hotel', 'and', 'were', 'shown'),
 ('and', 'were', 'shown', 'to'),
 ('were', 'shown', 'to', 'their'),
 ('shown', 'to', 'their', 'room'),
 ('to', 'their', 'room', '.'),
 ('their', 'room', '.', 'The'),
 ('room', '.', 'The', 'receptionist'),
 ('.', 'The', 'receptionist', 'noted'),
 ('The', 'receptionist', 'noted', 'the'),
 ('receptionist', 'noted', 'the', 'quiet'),
 ('noted', 'the', 'quiet', 'manner'),
 ('the', 'quiet', 'manner', 'of'),
 ('quiet', 'manner', 'of', 'the'),
 ('manner', 'of', 'the', 'guests'),
 ('of', 'the', 'guests', 'and'),
 ('the', 'guests', 'and', 'the'),
 ('guests', 'and', 'the', 'pale'),
 ('and', 'the', 'pale', 'appearance'),
 ('th

### Stemming

In [50]:
# Stemming: reduce inflected forms to a common stem with the Porter stemmer
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()
word = ['connecting', 'connected', 'connects']

for term in word:
    print(f"{term} : {ps.stem(term)}")

connecting : connect
connected : connect
connects : connect


### Lemmatization

In [51]:
# Lemmatization: map each word to its dictionary form.
# The second argument 'a' asks WordNet to treat the word as an adjective.
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
words = ['better', 'cleverer', 'farest']

for adjective in words:
    lemma = lem.lemmatize(adjective, 'a')
    print(f"{adjective} : {lemma}")

better : good
cleverer : clever
farest : far


In [52]:
# Lemmatizer vs. stemmer on the same word: the verb lemma is printed first,
# the Porter stem second, one per line.
word = 'fries'
z = lem.lemmatize(word, 'v')   # 'v' = lemmatize as a verb
y = ps.stem(word)
print(z, y, sep='\n')

fry
fri


### StopWords

In [53]:
# Stopword removal: drop common English function words from a tokenized sentence
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

text = 'Hello John, How are you doing ?'
tokens = word_tokenize(text)

# NOTE(review): the membership test is case-sensitive, so a capitalised token
# such as 'How' is kept; lower-case the token first if that is not intended.
filtered_text = [token for token in tokens if token not in stop_words]

print(tokens)
print(filtered_text)

['Hello', 'John', ',', 'How', 'are', 'you', 'doing', '?']
['Hello', 'John', ',', 'How', '?']


### Part-of-Speech (POS) Tagging

In [54]:
# POS tagging: label every token with its part-of-speech tag
from nltk import pos_tag

txtt = 'She likes classical music'
tokens = nltk.word_tokenize(txtt)
tagged_tokens = pos_tag(tokens)

print(tagged_tokens)

[('She', 'PRP'), ('likes', 'VBZ'), ('classical', 'JJ'), ('music', 'NN')]


### Named Entity Recognition

In [55]:
# Named Entity Recognition pipeline: tokenize -> POS-tag -> chunk named entities
from nltk import ne_chunk
file = 'Sundar Pichai is the CEO of Google'

# split the sentence into tokens
ne_tokens = nltk.word_tokenize(file)
# ne_chunk is fed the POS-tagged tokens, not the raw text
ne_tags = nltk.pos_tag(ne_tokens)
# result is a tree whose subtrees carry entity labels (PERSON, ORGANIZATION, ...)
ne_ner = ne_chunk(ne_tags)
print(ne_ner)

(S
  (PERSON Sundar/NNP)
  (PERSON Pichai/NNP)
  is/VBZ
  the/DT
  (ORGANIZATION CEO/NN of/IN Google/NNP))


### TF-IDF Vectorization

In [56]:
import re

In [57]:
# Split the corpus into sentences; each sentence is treated as one document below
senten = nltk.sent_tokenize(data)
senten

['A man and a young teenage boy checked into a hotel and were shown to their room.',
 'The receptionist noted the quiet manner of the guests and the pale appearance of the boy.',
 'Later, the man and boy ate dinner in the hotel restaurant.',
 'The staff again noticed that the two guests were very quiet and that the boy seemed disinterested in his food.',
 'After eating, the boy went to his room and the man went to ask the receptionist to see the manager.',
 'The receptionist initially asked if there was a problem with the service or the room, and offered to fix things, but the man said that there was no problem of the sort and repeated his request.',
 'When the manager appeared, he took him aside and explained that he was spending the night in the hotel with his fourteen-year-old son, who was seriously ill, probably terminally so.',
 'The boy was very soon to undergo therapy, which would cause him to lose his hair.',
 'They had come to the hotel to have a break together and also becaus

In [58]:
# Cleaning the text: produce one normalised string per sentence, ready for vectorization.
# The stopword set is built once, outside the loop, because it does not change
# between sentences or words.
english_stopwords = set(stopwords.words('english'))

filtered = []
for sentence in senten:
    # replace every non-letter character (digits, punctuation, symbols) with a space
    c_txt = re.sub('[^a-zA-Z]', ' ', sentence)
    # lower-case, then split on whitespace into tokens
    c_txt = c_txt.lower().split()
    # drop stopwords, then lemmatize what is left (lemmatize() is called without a POS argument)
    c_txt = [lem.lemmatize(word) for word in c_txt if word not in english_stopwords]
    # rejoin the tokens into a single space-separated string
    c_txt = ' '.join(c_txt)
    filtered.append(c_txt)

In [59]:
# Inspect the cleaned sentences
filtered

['man young teenage boy checked hotel shown room',
 'receptionist noted quiet manner guest pale appearance boy',
 'later man boy ate dinner hotel restaurant',
 'staff noticed two guest quiet boy seemed disinterested food',
 'eating boy went room man went ask receptionist see manager',
 'receptionist initially asked problem service room offered fix thing man said problem sort repeated request',
 'manager appeared took aside explained spending night hotel fourteen year old son seriously ill probably terminally',
 'boy soon undergo therapy would cause lose hair',
 'come hotel break together also boy planned shave head night rather feel illness beating',
 'father said would shaving head support son',
 'asked staff respectful two came breakfast shaved head',
 'manager assured father would inform staff would behave appropriately',
 'following morning father son entered restaurant breakfast',
 'saw four male restaurant staff attending duty perfectly normally shaved head',
 'matter business he

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [61]:
# TF-IDF vectorizer created with no arguments, i.e. library defaults
model = TfidfVectorizer()

In [62]:
# Fit on the cleaned sentences and convert the resulting TF-IDF matrix to a dense array.
# Note: this rebinds `x`, which earlier held the Treebank tokens.
x = model.fit_transform(filtered).toarray()

In [63]:
# Small worked example: TF-IDF weights for three one-sentence documents
documentA = 'All pens are pencils.'
documentB = 'some pencils are books'
documentC = 'Some books are sketches'

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([documentA, documentB, documentC])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. .tolist() keeps feature_names a plain list of str.
feature_names = vectorizer.get_feature_names_out().tolist()
dense = vectors.todense()
denselist = dense.tolist()
# one row per document, one column per vocabulary term
df = pd.DataFrame(denselist, columns=feature_names)

print(df)

        all       are     books   pencils      pens  sketches      some
0  0.584483  0.345205  0.000000  0.444514  0.584483  0.000000  0.000000
1  0.000000  0.409123  0.526820  0.526820  0.000000  0.000000  0.526820
2  0.000000  0.373119  0.480458  0.000000  0.000000  0.631745  0.480458


### Count Vectorizer

In [64]:
# Two short documents used to demonstrate bag-of-words counts
text1 = ['You will be prompted for your password.','The password is your PAN number in upper case.']

In [65]:
from sklearn.feature_extraction.text import CountVectorizer

In [66]:
# Count vectorizer created with no arguments, i.e. library defaults
cv = CountVectorizer()

In [67]:
# Fit on text1 and build the document-term count matrix
vect = cv.fit_transform(text1)

In [68]:
# Vocabulary terms in column order. get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() replaces it, and .tolist() keeps
# cnt_vect a plain list of str.
cnt_vect = cv.get_feature_names_out().tolist()

In [69]:
# Document-term counts as a labelled table: one row per document, one column per term
pd.DataFrame(
    vect.toarray(),
    index=['doc1', 'doc2'],
    columns=cnt_vect,
)

Unnamed: 0,be,case,for,in,is,number,pan,password,prompted,the,upper,will,you,your
doc1,1,0,1,0,0,0,0,1,1,0,0,1,1,1
doc2,0,1,0,1,1,1,1,1,0,1,1,0,0,1
