## Binary Encoding

Uses 1 and 0 to mark whether each vocabulary word is present in (1) or absent from (0) the sentence.

In [1]:
# Toy corpus reused by every vectorizer example in this notebook.
texts = [
    "blue car and blue window",
    "black crow in the window",
    "i see my reflection in the window"
]
# Vocabulary: every distinct whitespace-separated token, in alphabetical order.
vocab = sorted(set().union(*map(str.split, texts)))
print(vocab)

['and', 'black', 'blue', 'car', 'crow', 'i', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [2]:
import numpy as np

def binary_transform(text, vocabulary=None):
    """Encode ``text`` as a binary presence vector over a vocabulary.

    Parameters
    ----------
    text : str
        Sentence to encode; it is split on whitespace.
    vocabulary : sequence of str, optional
        Ordered words that define the vector positions. When omitted, the
        notebook-level ``vocab`` is used (looked up at call time), which is
        the original behaviour.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per vocabulary word: 1.0 if the word
        occurs in ``text``, 0.0 otherwise. Words of ``text`` that are not in
        the vocabulary are ignored.
    """
    if vocabulary is None:
        # Fall back to the global so existing single-argument calls keep working.
        vocabulary = vocab
    words = set(text.split())  # set membership makes each lookup O(1)
    # dtype=float keeps the float64 output that np.zeros produced originally.
    return np.array([word in words for word in vocabulary], dtype=float)

In [3]:
# "saw" is not in `vocab`, so it contributes nothing; only the positions of "car" and "i" are set to 1.
binary_transform("i saw car")

array([0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.])

## CountVectorizer
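Counts how many times each word appears in a text. With `binary=True` (used in the cell below), every non-zero count is capped at 1, which reproduces the binary encoding above. Note that the default tokenizer ignores single-character tokens, so `i` is missing from the vocabulary.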

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only presence/absence of each term, not how often it occurs.
vec = CountVectorizer(binary=True).fit(texts)
# Iterating the vocabulary_ mapping yields its terms; sorted() already returns a list.
print(sorted(vec.vocabulary_))

['and', 'black', 'blue', 'car', 'crow', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [5]:
# The values of vocabulary_ are the column indices assigned to each term.
print(sorted(vec.vocabulary_.values()))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [6]:
import pandas as pd
# One row per text, one column per vocabulary term (terms listed alphabetically).
pd.DataFrame(
    data=vec.transform(texts).toarray(),
    columns=sorted(vec.vocabulary_),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,1,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


## Bag of Words

In [7]:
# binary=False keeps the raw term counts, e.g. "blue" is counted twice in the first text.
vec = CountVectorizer(binary=False).fit(texts)
X = vec.transform(texts)

pd.DataFrame(data=X.toarray(), columns=sorted(vec.vocabulary_))

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,2,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


#### Limitation of Bag of Words (BoW)
1. Word order (and therefore context) is ignored.
2. Adjacent sentences are treated as independent, even though in reality they are related to each other.

## TF-IDF Vectorizer
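$$\mathrm{tfidf}(t, d, D) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t, D)$$

Here $t$ is a term, $d$ a document and $D$ the whole corpus. It is based on the assumption that words appearing in fewer documents are more informative.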

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Instantiate a TfidfVectorizer with its default settings and fit it to our texts.
vec = TfidfVectorizer().fit(texts)

import pandas as pd
# One row per text, one column per vocabulary term, holding the TF-IDF weights.
pd.DataFrame(
    data=vec.transform(texts).toarray(),
    columns=sorted(vec.vocabulary_),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,0.396875,0.0,0.793749,0.396875,0.0,0.0,0.0,0.0,0.0,0.0,0.2344
1,0.0,0.534093,0.0,0.0,0.534093,0.406192,0.0,0.0,0.0,0.406192,0.315444
2,0.0,0.0,0.0,0.0,0.0,0.358291,0.47111,0.47111,0.47111,0.358291,0.278245


## N-Gram Model
As the vocabulary size increases, a more sophisticated approach is to group adjacent words into n-grams.

* Markov assumption for conditional probability.
* Probability estimation can be done by Maximum Likelihood Estimation.

In [9]:
import nltk

word_data = "The best performance can bring in sky high success."
nltk_tokens = nltk.word_tokenize(word_data)

# Pair every token with its successor to form the bigrams.
print([*nltk.bigrams(nltk_tokens)])

[('The', 'best'), ('best', 'performance'), ('performance', 'can'), ('can', 'bring'), ('bring', 'in'), ('in', 'sky'), ('sky', 'high'), ('high', 'success'), ('success', '.')]


#### Limitation
Curse of dimensionality: as the number of variables increases, the number of possible combinations also increases.

### NLP Preprocessing
* Tokenization
* Part of Speech (POS) tagging
* Stemming
* Lemmatization
* Chunking
* Removal of stopwords, punctuations and emojis.
* Parsing

These are preprocessing methods for refining raw text; afterwards, any of the vectorization methods above can be applied.

In [11]:
# Tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

text="""Hello Mr. Smith, how are you doing today? The weather is great, and city is awesome.
The sky is pinkish-blue. You shouldn't eat cardboard"""

# Sentence tokenization: split the paragraph into sentences.
tokenized_text = sent_tokenize(text)
print(tokenized_text)
print('---------------------------------------------------------------')
# Word tokenization: split the text into word and punctuation tokens.
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Hello Mr. Smith, how are you doing today?', 'The weather is great, and city is awesome.', 'The sky is pinkish-blue.', "You shouldn't eat cardboard"]
---------------------------------------------------------------
['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'city', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard']


In [13]:
# Download the 'averaged_perceptron_tagger' NLTK resource (presumably the model behind nltk.pos_tag used below — confirm for your NLTK version).
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\avi00\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [14]:
# POS tagging: label each token of the word-tokenized text with its part-of-speech tag.
import nltk
nltk.pos_tag(tokenized_word)

[('Hello', 'NNP'),
 ('Mr.', 'NNP'),
 ('Smith', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('today', 'NN'),
 ('?', '.'),
 ('The', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('great', 'JJ'),
 (',', ','),
 ('and', 'CC'),
 ('city', 'NN'),
 ('is', 'VBZ'),
 ('awesome', 'JJ'),
 ('.', '.'),
 ('The', 'DT'),
 ('sky', 'NN'),
 ('is', 'VBZ'),
 ('pinkish-blue', 'JJ'),
 ('.', '.'),
 ('You', 'PRP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('eat', 'VB'),
 ('cardboard', 'NN')]

# Stemming & Lemmatization
Stemming reduces a word to its root by stripping suffixes. This is not always a good option, because the result may not be a real word or may change the meaning. Lemmatization can be used instead; it maps each word to a true dictionary form (its lemma).

* Accuracy priority: Lemmatization
* Time priority: Stemming

In [15]:
# Compare lemmatization and stemming on the same word.
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
stem = PorterStemmer()

word = "flying"
# pos="v" tells the lemmatizer to treat the word as a verb.
print("Lemmatized Word:", lem.lemmatize(word, pos="v"))
print("Stemmed Word:", stem.stem(word))

Lemmatized Word: fly
Stemmed Word: fli
