# NLP: Preprocessing
## Submitted by:
### Vibhakar Gupta(189301098)
### Mrityunjoy Chowdhury(189301071)

## Tokenization

In [1]:
# Sample text used by every cell in this notebook.
# NOTE: the literal starts with a space and still contains the bracketed
# markers "[23]" and "[f]"; nothing below strips them before tokenizing.
paragraph = """ India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya),[23] is a country in South Asia. It is the second-most populous country, the seventh-largest country by land area, and the most populous democracy in the world. Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east. In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia."""

In [2]:
import nltk
# Download the Punkt tokenizer data before calling NLTK's tokenizers.
nltk.download('punkt')
# Split the paragraph into a list of sentences ...
sentences = nltk.sent_tokenize(paragraph)
# ... and, separately, into a flat list of word/punctuation tokens.
words = nltk.word_tokenize(paragraph)

[nltk_data] Downloading package punkt to C:\Users\Vibhakar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Display the list returned by nltk.sent_tokenize(paragraph).
sentences

[' India (Hindi: Bhārat), officially the Republic of India (Hindi: Bhārat Gaṇarājya),[23] is a country in South Asia.',
 'It is the second-most populous country, the seventh-largest country by land area, and the most populous democracy in the world.',
 'Bounded by the Indian Ocean on the south, the Arabian Sea on the southwest, and the Bay of Bengal on the southeast, it shares land borders with Pakistan to the west;[f] China, Nepal, and Bhutan to the north; and Bangladesh and Myanmar to the east.',
 'In the Indian Ocean, India is in the vicinity of Sri Lanka and the Maldives; its Andaman and Nicobar Islands share a maritime border with Thailand and Indonesia.']

## Stemming

In [6]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
nltk.download('stopwords')

# Stem every token that is not an English stop word, sentence by sentence,
# and store the re-joined text back into `sentences`.
stemmer = PorterStemmer()

# Build the stop-word set once instead of rebuilding it for every token.
english_stopwords = set(stopwords.words('english'))

# Start from the raw paragraph so that re-running this cell does not stem
# text that an earlier run (or another cell) has already rewritten.
sentences = nltk.sent_tokenize(paragraph)

for i in range(len(sentences)):
    # NOTE: the stop-word check is case-sensitive, so capitalised tokens such
    # as "It" and "In" are not filtered out (see this cell's output).
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(sentences[i])
        if word not in english_stopwords
    ]
    sentences[i] = ' '.join(words)
sentences

[nltk_data] Downloading package stopwords to C:\Users\Vibhakar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['india ( hindi : bhārat ) , offici republ india ( hindi : bhārat gaṇarājya ) , [ 23 ] countri south asia .',
 'It second-most popul countri , seventh-largest countri land area , popul democraci world .',
 'bound indian ocean south , arabian sea southwest , bay bengal southeast , share land border pakistan west ; [ f ] china , nepal , bhutan north ; bangladesh myanmar east .',
 'In indian ocean , india vicin sri lanka maldiv ; andaman nicobar island share maritim border thailand indonesia .']

## Lemmatization

In [10]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

# Lemmatize every token that is not an English stop word, sentence by
# sentence, and store the re-joined text back into `sentences`.
lemmatizer = WordNetLemmatizer()

# Build the stop-word set once instead of rebuilding it for every token.
english_stopwords = set(stopwords.words('english'))

# Re-tokenize from the raw paragraph so this cell does not depend on what
# earlier cells left in `sentences`.
sentences = nltk.sent_tokenize(paragraph)

for i in range(len(sentences)):
    # NOTE: the stop-word check is case-sensitive, so capitalised tokens such
    # as "It" and "In" are not filtered out (see this cell's output).
    words = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(sentences[i])
        if word not in english_stopwords
    ]
    sentences[i] = ' '.join(words)

sentences

[nltk_data] Downloading package stopwords to C:\Users\Vibhakar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vibhakar
[nltk_data]     Gupta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


['India ( Hindi : Bhārat ) , officially Republic India ( Hindi : Bhārat Gaṇarājya ) , [ 23 ] country South Asia .',
 'It second-most populous country , seventh-largest country land area , populous democracy world .',
 'Bounded Indian Ocean south , Arabian Sea southwest , Bay Bengal southeast , share land border Pakistan west ; [ f ] China , Nepal , Bhutan north ; Bangladesh Myanmar east .',
 'In Indian Ocean , India vicinity Sri Lanka Maldives ; Andaman Nicobar Islands share maritime border Thailand Indonesia .']

## Count Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Build a term-count matrix with one row per sentence of the paragraph.
sentences = nltk.sent_tokenize(paragraph)
vectorizer = CountVectorizer()
voc = vectorizer.fit_transform(sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# .tolist() keeps the printed vocabulary a plain list of Python strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)
print(voc.toarray())


['23', 'and', 'andaman', 'arabian', 'area', 'asia', 'bangladesh', 'bay', 'bengal', 'bhutan', 'bhārat', 'border', 'borders', 'bounded', 'by', 'china', 'country', 'democracy', 'east', 'gaṇarājya', 'hindi', 'in', 'india', 'indian', 'indonesia', 'is', 'islands', 'it', 'its', 'land', 'lanka', 'largest', 'maldives', 'maritime', 'most', 'myanmar', 'nepal', 'nicobar', 'north', 'ocean', 'of', 'officially', 'on', 'pakistan', 'populous', 'republic', 'sea', 'second', 'seventh', 'share', 'shares', 'south', 'southeast', 'southwest', 'sri', 'thailand', 'the', 'to', 'vicinity', 'west', 'with', 'world']
[[1 0 0 0 0 1 0 0 0 0 2 0 0 0 0 0 1 0 0 1 2 1 2 0 0 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 2 1 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 0 2 0
  0 0 0 0 0 0 0 0 2 0 0 1 1 0 0 0 0 0 0 0 4 0 0 0 0 1]
 [0 4 0 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1
  1 0 1 1 1 0 3 1 0 0 1 0 0 0 1 1 1 1 0 0 9 3 0 1 1 0]
 [0 3 1 0 0 0 0 0 0

## TF-IDF Vectorizer

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Build a TF-IDF weighted matrix with one row per sentence of the paragraph.
sentences = nltk.sent_tokenize(paragraph)
vectorizer = TfidfVectorizer()
voc = vectorizer.fit_transform(sentences)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases;
# .tolist() keeps the printed vocabulary a plain list of Python strings.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

print(feature_names)
print(voc.toarray())

['23', 'and', 'andaman', 'arabian', 'area', 'asia', 'bangladesh', 'bay', 'bengal', 'bhutan', 'bhārat', 'border', 'borders', 'bounded', 'by', 'china', 'country', 'democracy', 'east', 'gaṇarājya', 'hindi', 'in', 'india', 'indian', 'indonesia', 'is', 'islands', 'it', 'its', 'land', 'lanka', 'largest', 'maldives', 'maritime', 'most', 'myanmar', 'nepal', 'nicobar', 'north', 'ocean', 'of', 'officially', 'on', 'pakistan', 'populous', 'republic', 'sea', 'second', 'seventh', 'share', 'shares', 'south', 'southeast', 'southwest', 'sri', 'thailand', 'the', 'to', 'vicinity', 'west', 'with', 'world']
[[0.23424854 0.         0.         0.         0.         0.23424854
  0.         0.         0.         0.         0.46849707 0.
  0.         0.         0.         0.         0.18468424 0.
  0.         0.23424854 0.46849707 0.14951781 0.36936848 0.
  0.         0.14951781 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0. 

## Bag of words

In [16]:
import nltk
import re
import numpy as np
import heapq

sentences = nltk.sent_tokenize(paragraph)
words = nltk.word_tokenize(paragraph)

# Count how often each token (punctuation included) occurs in the paragraph.
word_to_count = {}
for word in words:
    word_to_count[word] = word_to_count.get(word, 0) + 1

# Vocabulary: at most the 100 most frequent tokens, most frequent first.
freq_words = heapq.nlargest(100, word_to_count, key=word_to_count.get)

# Binary presence matrix: X[r][c] is 1 when vocabulary token c appears in
# sentence r, else 0 (occurrences are not counted).
X = []
for sentence in sentences:
    # Tokenize each sentence once; previously this ran again for every
    # vocabulary word inside the inner loop.
    sentence_tokens = set(nltk.word_tokenize(sentence))
    vector = [1 if word in sentence_tokens else 0 for word in freq_words]
    X.append(vector)
X = np.asarray(X)
print(X)
print(freq_words)

[[1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]
 [1 1 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 1 1 1
  1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]
 [1 1 1 1 0 1 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]
 [1 1 1 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
  1 1]]
['the', ',', 'and', '.', 'India', 'of', 'is', 'country', 'in', 'on', 'to', ';', '(', 'Hindi', ':', 'Bhārat', ')', '[', ']', 'a', 'populous', 'by', 'land', 'Indian', 'Ocean', 'with', 'officially', 'Republic', 'Gaṇarājya', '23', 'South', 'Asia', 'It', 'second-most', 'seventh-largest', 'area', 'most', 'democracy', 'world', 'Bounded', 'south', 'Arabian', 'Sea', 'southwest', 'Bay',