# NLTK - Natural Language Tool Processing Toolkit

In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 811 kB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449905 sha256=703c52df4b27834f70bd7d686d01653d57e2dbe079846696d6cf30e0b279bbdb
  Stored in directory: /home/yatin/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.5


In [2]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /home/yatin/anaconda3/lib/python3.7/site-packages (20.0.2)


In [3]:
import nltk

## Download Corpora 

In [4]:
nltk.download()
#### downloads all the corpora

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
nltk.download('brown')
#### downloads only the brown corpora

[nltk_data] Downloading package brown to /home/yatin/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [6]:
from nltk.corpus import brown

In [8]:
print(brown.categories())
print(len(brown.categories()))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [9]:
## let's access the corpora related to adventure category and romance category and science_fiction

In [15]:
adventure_corpora = brown.sents(categories=['adventure'])
romance_corpora = brown.sents(categories=['romance'])
science_fiction_corpora = brown.sents(categories=['science_fiction'])

In [16]:
print(len(adventure_corpora))
print(len(romance_corpora))
print(len(science_fiction_corpora))

4637
4431
948


In [21]:
## let's print the first 2 sentences of each category

In [22]:
print(' '.join(adventure_corpora[0]))
print(' '.join(adventure_corpora[1]))

Dan Morgan told himself he would forget Ann Turner .
He was well rid of her .


In [24]:
print(' '.join(romance_corpora[0]))
print(' '.join(romance_corpora[1]))

They neither liked nor disliked the Old Man .
To them he could have been the broken bell in the church tower which rang before and after Mass , and at noon , and at six each evening -- its tone , repetitive , monotonous , never breaking the boredom of the streets .


In [25]:
print(' '.join(science_fiction_corpora[0]))
print(' '.join(science_fiction_corpora[1]))

Now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .
Self's integrity was and is and ever had been .


# Bag of Words Pipeline

### 1 - get the data

### 2 - tokenisation,step word removal

### 3 - lemmetisation and stemming

### 4 - vocabulary formation

### 5 - vectorization

### 6 - classification

# Tokenisation and Stop Word Removal

In [27]:
## there are 2 types of tokenizers
## 1 is a sentence tokenizer and 2 is a word tokenizer

In [28]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [32]:
document = """howdy dudy! Joey does'nt share food. I know!!We were on a break. Could you be more stupider."""

In [33]:
sentences = sent_tokenize(document)

In [34]:
print(sentences)

['howdy dudy!', "Joey does'nt share food.", 'I know!', '!We were on a break.', 'Could you be more stupider.']


In [35]:
sentence = "How you doing??"

In [36]:
words = word_tokenize(sentence)

In [37]:
print(words)

['How', 'you', 'doing', '?', '?']


In [38]:
## step word removal

In [39]:
from nltk.corpus import stopwords

In [53]:
sw = stopwords.words('english')

In [54]:
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [55]:
print(len(sw))

179


In [56]:
sw.remove('not')

In [57]:
print(len(sw))

178


In [65]:
test = 'Delhi is the capital of India'.split()
sw = set(sw)

In [66]:
def remove_stopwords(text,sw):
    unique_words = [w for w in text if w not in sw]
    return unique_words

In [67]:
result = remove_stopwords(test,sw)

In [68]:
print(result)

['Delhi', 'capital', 'India']


# Regex Based Tokenisation

In [69]:
## splitting the sentence into tokens on the basis of regex expressions

In [70]:
test = 'Please send me your 100 research paper at yatin88617@gmail.com'.split()
print(remove_stopwords(test,sw))

['Please', 'send', '100', 'research', 'paper', 'yatin88617@gmail.com']


In [72]:
## let's try to remove numbers from the test using regex

In [73]:
from nltk.tokenize import RegexpTokenizer

In [94]:
tokeniser = RegexpTokenizer('[a-zA-Z@.88617]+')

In [95]:
t = 'Please send me your 100 research papers at yatin88617@gmail.com'
test = tokeniser.tokenize(t)

In [144]:
print(test)

['Please', 'send', 'me', 'your', '1', 'research', 'papers', 'at', 'yatin88617@gmail.com']


In [97]:
print(remove_stopwords(test,sw))

['Please', 'send', '1', 'research', 'papers', 'yatin88617@gmail.com']


# Stemming and Lemmatization

In [98]:
## stemming

In [114]:
## first import the classes of different Stemmers from their respective packages
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer

In [116]:
## create an object and then call the stem method
ps = PorterStemmer()
ls = LancasterStemmer()
ss = SnowballStemmer('english')

In [117]:
ps.stem('loving')

'love'

In [118]:
ps.stem('loved')

'love'

In [119]:
ps.stem('lovely')

'love'

In [120]:
ss.stem('lovely')

'love'

In [121]:
ss.stem('loved')

'love'

In [122]:
ss.stem('loving')

'love'

In [126]:
ps.stem('hating')

'hate'

In [127]:
ss.stem('hating')

'hate'

In [128]:
## Lemmatizer

In [129]:
from nltk.stem import WordNetLemmatizer

In [130]:
wn = WordNetLemmatizer()

In [136]:
wn.lemmatize('better',pos='a')

'good'

In [138]:
wn.lemmatize('smarter',pos='a')

'smart'

# Vocabulary formation and Vectorisation

In [192]:
corpus = ["Indian Prime Minister assures that his party will win the elections",
         "Team India emerged victorious in the World Cup",
         "Raazi is an Indian Spy Movie based on a true story",
         "Yes, We Can!!",
         "For the people, with the people and by the people"]

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

In [148]:
cv = CountVectorizer()

In [150]:
## we have trained the model for this particular corpus
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [154]:
## vectorisation
## converting the the given corpus into a vector on the basis of the vocabulary formed in the above step
vectorized_corpus = cv.transform(corpus).toarray()

In [155]:
print(vectorized_corpus)

[[0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0]]


In [169]:
## the CountVectoriser class by default does not remove stopwords and neither performs lemmatization
cv.vocabulary_
## it is a parametrized algorithm because we have learned new parameters

{'indian': 13,
 'prime': 20,
 'minister': 15,
 'assures': 2,
 'that': 25,
 'his': 10,
 'party': 18,
 'will': 30,
 'win': 31,
 'the': 26,
 'elections': 7,
 'team': 24,
 'india': 12,
 'emerged': 8,
 'victorious': 28,
 'in': 11,
 'world': 33,
 'cup': 6,
 'raazi': 21,
 'is': 14,
 'an': 0,
 'spy': 22,
 'movie': 16,
 'based': 3,
 'on': 17,
 'true': 27,
 'story': 23,
 'yes': 34,
 'we': 29,
 'can': 5,
 'for': 9,
 'people': 19,
 'with': 32,
 'and': 1,
 'by': 4}

In [162]:
print(len(vectorized_corpus[0]))

35


In [163]:
print(len(cv.vocabulary_))

35


In [165]:
## the size of the vectorized document must be same as the size of the dictionary/vocabulary
## because the length of the vectorized document depends upon the the number of unique features/words in the vocabulary

In [166]:
first_vector = vectorized_corpus[0]

In [168]:
print(cv.inverse_transform(first_vector))
## the order is jumbled because the dictionary assigns random numbers to different words instead of their 
## actual indexes
## that is why this pipeline is know as bag of words because only the words matter and not their order in classification

[array(['assures', 'elections', 'his', 'indian', 'minister', 'party',
       'prime', 'that', 'the', 'will', 'win'], dtype='<U10')]


# Vectorisation with StopWards Removal

In [170]:
def myTokeniser(document):
    
    ## converts the document to lower case so that the words Indian and indian are same
    ## and this will increase the frequency of the word indian
    words = tokeniser.tokenize(document.lower())
    return remove_stopwords(words,sw)

In [172]:
cv = CountVectorizer(tokenizer=myTokeniser)

In [173]:
cv.fit(corpus)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function myTokeniser at 0x7f57a0ac0158>,
        vocabulary=None)

In [177]:
vect_corpus = cv.transform(corpus).toarray()

In [178]:
print(len(vect_corpus[0]))

21


In [179]:
inv_transform = cv.inverse_transform(vect_corpus[0])

In [180]:
print(inv_transform)

[array(['assures', 'elections', 'indian', 'minister', 'party', 'prime',
       'win'], dtype='<U10')]


In [185]:
sent1 = ["this is a good movie"]
sent2 = ["this is not a good movie"]
docs = [sent1[0],sent2[0]]

In [186]:
print(docs)

['this is a good movie', 'this is not a good movie']


# Bag of Words Model

## unigram - single word is treated as a feature

## bigram - two words are treated as a single feature

## trigram - three words are treated as a single feature

## ngrams - n words are treated as a single feature

In [187]:
ps.stem("quiclky")

'quiclki'

In [188]:
word_tokenize("Hey ! Welcome To Coding Blocks ? .")

['Hey', '!', 'Welcome', 'To', 'Coding', 'Blocks', '?', '.']

# TF-IDF normalization

In [189]:
print(vectorized_corpus)

[[0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 1 0 0 0]
 [0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0]
 [1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 3 0 0 0 0 0 1 0 0]]


In [190]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [191]:
tfidf = TfidfVectorizer()

In [193]:
tfidf_vectorized_corpus = tfidf.fit(corpus)

In [194]:
tfidf.vocabulary_

{'indian': 13,
 'prime': 20,
 'minister': 15,
 'assures': 2,
 'that': 25,
 'his': 10,
 'party': 18,
 'will': 30,
 'win': 31,
 'the': 26,
 'elections': 7,
 'team': 24,
 'india': 12,
 'emerged': 8,
 'victorious': 28,
 'in': 11,
 'world': 33,
 'cup': 6,
 'raazi': 21,
 'is': 14,
 'an': 0,
 'spy': 22,
 'movie': 16,
 'based': 3,
 'on': 17,
 'true': 27,
 'story': 23,
 'yes': 34,
 'we': 29,
 'can': 5,
 'for': 9,
 'people': 19,
 'with': 32,
 'and': 1,
 'by': 4}

In [195]:
tfidf_vectorized_corpus = tfidf.transform(corpus)