# NLTK - Natural Language Toolkit

In [1]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 811 kB/s eta 0:00:01
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25ldone
[?25h  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449905 sha256=703c52df4b27834f70bd7d686d01653d57e2dbe079846696d6cf30e0b279bbdb
  Stored in directory: /home/yatin/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built nltk
Installing collected packages: nltk
Successfully installed nltk-3.4.5


In [2]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /home/yatin/anaconda3/lib/python3.7/site-packages (20.0.2)


In [3]:
import nltk

## Download Corpora 

In [4]:
nltk.download()
#### opens the interactive NLTK downloader; use nltk.download('all') to fetch every package

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [5]:
nltk.download('brown')
#### downloads only the brown corpus

[nltk_data] Downloading package brown to /home/yatin/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [6]:
from nltk.corpus import brown

In [8]:
# List the Brown corpus categories once, then show them and how many there are.
brown_categories = brown.categories()
print(brown_categories)
print(len(brown_categories))

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
15


In [9]:
## let's access the corpora related to adventure category and romance category and science_fiction

In [15]:
# Load the sentences of three Brown categories, one variable per category.
# Each sentence comes back as a sequence of tokens (they are re-joined with
# spaces when printed further down).
adventure_corpora = brown.sents(categories=['adventure'])
romance_corpora = brown.sents(categories=['romance'])
science_fiction_corpora = brown.sents(categories=['science_fiction'])

In [16]:
# Number of sentences per category, in the order: adventure, romance, science fiction.
for category_sents in (adventure_corpora, romance_corpora, science_fiction_corpora):
    print(len(category_sents))

4637
4431
948


In [21]:
## let's print the first 2 sentences of each category

In [22]:
# Show the first two adventure sentences, re-joining their tokens with spaces.
for sent_idx in range(2):
    print(' '.join(adventure_corpora[sent_idx]))

Dan Morgan told himself he would forget Ann Turner .
He was well rid of her .


In [24]:
# Show the first two romance sentences, re-joining their tokens with spaces.
for sent_idx in range(2):
    print(' '.join(romance_corpora[sent_idx]))

They neither liked nor disliked the Old Man .
To them he could have been the broken bell in the church tower which rang before and after Mass , and at noon , and at six each evening -- its tone , repetitive , monotonous , never breaking the boredom of the streets .


In [25]:
# Show the first two science-fiction sentences, re-joining their tokens with spaces.
for sent_idx in range(2):
    print(' '.join(science_fiction_corpora[sent_idx]))

Now that he knew himself to be self he was free to grok ever closer to his brothers , merge without let .
Self's integrity was and is and ever had been .


# Bag of Words Pipeline

### 1 - get the data

### 2 - tokenisation, stop word removal

### 3 - lemmatisation and stemming

### 4 - vocabulary formation

### 5 - vectorization

### 6 - classification

# Tokenisation and Stop Word Removal

In [27]:
## there are 2 types of tokenizers
## 1 is a sentence tokenizer and 2 is a word tokenizer

In [28]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [32]:
document = """howdy dudy! Joey does'nt share food. I know!!We were on a break. Could you be more stupider."""

In [33]:
sentences = sent_tokenize(document)

In [34]:
print(sentences)

['howdy dudy!', "Joey does'nt share food.", 'I know!', '!We were on a break.', 'Could you be more stupider.']


In [35]:
sentence = "How you doing??"

In [36]:
words = word_tokenize(sentence)

In [37]:
print(words)

['How', 'you', 'doing', '?', '?']


In [38]:
## stop word removal

In [39]:
from nltk.corpus import stopwords

In [53]:
sw = stopwords.words('english')

In [54]:
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [55]:
print(len(sw))

179


In [56]:
# Take 'not' out of the stop-word collection so it is no longer filtered out.
# Guarded so that re-running this cell does not raise once 'not' is already gone
# (list.remove raises ValueError, set.remove raises KeyError on a missing item).
if 'not' in sw:
    sw.remove('not')

In [57]:
print(len(sw))

178


In [65]:
# Whitespace-split sample sentence used to try out stop-word removal.
test = 'Delhi is the capital of India'.split()
# Rebind sw to a set; the filter below only does membership tests (`w not in sw`).
sw = set(sw)

In [66]:
def remove_stopwords(text, sw):
    """Return the tokens of ``text`` that are not stop words.

    A token is dropped when either the token itself or its lower-cased form
    is found in ``sw``. The lower-cased check matters because stop-word lists
    are typically all lowercase, so capitalised tokens such as "The" at the
    start of a sentence would otherwise slip through.

    Args:
        text: Iterable of string tokens (e.g. the result of ``str.split()``
            or a tokenizer).
        sw: Container of stop words supporting ``in``; a set keeps each
            lookup cheap.

    Returns:
        A list of the kept tokens in their original order and casing.
        Duplicates are preserved - nothing is de-duplicated.
    """
    return [w for w in text if w not in sw and w.lower() not in sw]

In [67]:
result = remove_stopwords(test,sw)

In [68]:
print(result)

['Delhi', 'capital', 'India']


# Regex Based Tokenisation

In [69]:
## splitting the sentence into tokens on the basis of regular expressions

In [70]:
test = 'Please send me your 100 research paper at yatin88617@gmail.com'.split()
print(remove_stopwords(test,sw))

['Please', 'send', '100', 'research', 'paper', 'yatin88617@gmail.com']


In [72]:
## let's try to remove numbers from the test using regex

In [73]:
from nltk.tokenize import RegexpTokenizer

In [90]:
# A token is any run of letters, '@' or '.'; digits are not part of the pattern,
# so standalone numbers are dropped. Side effect: digits inside an email address
# also act as separators, which splits the address into two tokens.
tokeniser = RegexpTokenizer('[a-zA-Z@.]+')

In [91]:
t = 'Please send me your 100 research papers at yatin88617@gmail.com'
test = tokeniser.tokenize(t)

In [92]:
print(test)

['Please', 'send', 'me', 'your', 'research', 'papers', 'at', 'yatin', '@gmail.com']


In [93]:
print(remove_stopwords(test,sw))

['Please', 'send', 'research', 'papers', 'yatin', '@gmail.com']
