# Word Similarity (using Levenshtein distance)
Why is word similarity important? It can be used for the following:<br/>
▪ Spell check<br/>
▪ Speech recognition<br/>
▪ Plagiarism detection

In [27]:
# TextBlob : Wraps around NLTK and makes it easier to use

from textblob import TextBlob

In [28]:
# similar like word_tokenize but little bit different which can be observed from output

text=TextBlob("We're moving from NLTK to TextBlob. How fun!")
text.words

WordList(['We', "'re", 'moving', 'from', 'NLTK', 'to', 'TextBlob', 'How', 'fun'])

In [29]:
from nltk.tokenize import word_tokenize

word_tokenize("We're moving from NLTK to TextBlob. How fun!")

['We',
 "'re",
 'moving',
 'from',
 'NLTK',
 'to',
 'TextBlob',
 '.',
 'How',
 'fun',
 '!']

### Spell check

In [30]:
# carefully see the input and output

blob=TextBlob("I'm graat at speling.")
blob.correct()

TextBlob("I'm great at spelling.")

## How does the correct function work?
▪ Calculates the Levenshtein distance between the word ‘graat’ and all words in its word list<br/>
▪ Of the words with the smallest Levenshtein distance, it outputs the most popular word

# Document Similarity
When is document similarity used?<br/>
▪ When sifting through a large number of documents and trying to find similar ones<br/>
▪ When trying to group, or cluster, together similar documents

### 1.Tokenization,Document-Term matrix,Bag of words model

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [32]:
corpus2 = ['This is the first document.','This is the second document.','And the third one. One is fun.']

cv=CountVectorizer()          
x=cv.fit_transform(corpus2)
print(x) # not in matrix form
Document_term_matrix=pd.DataFrame(x.toarray(),columns=cv.get_feature_names())

# Known as Document-Term matrix
Document_term_matrix

  (0, 1)	1
  (0, 2)	1
  (0, 7)	1
  (0, 4)	1
  (0, 9)	1
  (1, 6)	1
  (1, 1)	1
  (1, 7)	1
  (1, 4)	1
  (1, 9)	1
  (2, 3)	1
  (2, 5)	2
  (2, 8)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1


Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,1,0,1
2,1,0,0,1,1,2,0,1,1,0


### 2. Cosine Similarity(It is a way to quantify the similarity between documents)

In [33]:
from numpy import dot
from numpy.linalg import norm

In [34]:
cosine=lambda v1,v2 : dot(v1,v2)/(norm(v1)*norm(v2))
cosine([1,1,1,0],[1,1,0,1])

0.6666666666666667

### Document Similarity: Example

In [35]:
corpus = ['The weather is hot under the sun','I make my hot chocolate with milk','One hot encoding',
'I will have a chai latte with milk',
'There is a hot sale today']
corpus

['The weather is hot under the sun',
 'I make my hot chocolate with milk',
 'One hot encoding',
 'I will have a chai latte with milk',
 'There is a hot sale today']

In [36]:
cv1=CountVectorizer(stop_words='english')
x1=cv1.fit_transform(corpus).toarray()
pd.DataFrame(x1,columns=cv1.get_feature_names())

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0,0,0,1,0,0,0,0,1,0,1
1,0,1,0,1,0,1,1,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,1,0


In [37]:
# calculate the cosine similarity between all combinations of documents

from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity


# list all of the combinations of 5 take 2 as well as the pairs of phrases
pairs = list(combinations(range(len(corpus)),2))
print(len(corpus))
print(pairs)
print('------------------------------------')

combos = [(corpus[a_index], corpus[b_index]) for (a_index, b_index) in pairs]
print(combos)
print('------------------------------------')

# calculate the cosine similarity for all pairs of phrases and sort by most similar
results = [cosine_similarity([x1[a_index]],[x1[b_index]]) for (a_index, b_index) in pairs]
print(results)
print('------------------------------------')

sorted(zip(results, combos), reverse=True)

5
[(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]
------------------------------------
[('The weather is hot under the sun', 'I make my hot chocolate with milk'), ('The weather is hot under the sun', 'One hot encoding'), ('The weather is hot under the sun', 'I will have a chai latte with milk'), ('The weather is hot under the sun', 'There is a hot sale today'), ('I make my hot chocolate with milk', 'One hot encoding'), ('I make my hot chocolate with milk', 'I will have a chai latte with milk'), ('I make my hot chocolate with milk', 'There is a hot sale today'), ('One hot encoding', 'I will have a chai latte with milk'), ('One hot encoding', 'There is a hot sale today'), ('I will have a chai latte with milk', 'There is a hot sale today')]
------------------------------------
[array([[0.28867513]]), array([[0.40824829]]), array([[0.]]), array([[0.33333333]]), array([[0.35355339]]), array([[0.28867513]]), array([[0.28867513]]), array([[0.]]), array([[0.408

[(array([[0.40824829]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.40824829]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.35355339]]),
  ('I make my hot chocolate with milk', 'One hot encoding')),
 (array([[0.33333333]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('The weather is hot under the sun', 'I make my hot chocolate with milk')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]

# 3. Term Frequency-Inverse Document Frequency(TF-IDF)

In [38]:
corpus1 = ['This is the first document.',
'This is the second document.',
'And the third one. One is fun.']

# compare the result with next code implementation

# without removing stop_words
cv2=CountVectorizer()
b=cv2.fit_transform(corpus1)
pd.DataFrame(b.toarray(),columns=cv2.get_feature_names())

Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,1,1,0,1
2,1,0,0,1,1,2,0,1,1,0


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf=TfidfVectorizer()
c=tf.fit_transform(corpus1)
pd.DataFrame(c.toarray(),columns=tf.get_feature_names())

Unnamed: 0,and,document,first,fun,is,one,second,the,third,this
0,0.0,0.450145,0.591887,0.0,0.349578,0.0,0.0,0.349578,0.0,0.450145
1,0.0,0.450145,0.0,0.0,0.349578,0.0,0.591887,0.349578,0.0,0.450145
2,0.36043,0.0,0.0,0.36043,0.212876,0.72086,0.0,0.212876,0.36043,0.0


In [40]:
# after removing stop_words

# compare with next code of tfidf

cv3=CountVectorizer(stop_words='english')
d=cv3.fit_transform(corpus1)
pd.DataFrame(d.toarray(),columns=cv3.get_feature_names())

Unnamed: 0,document,fun,second
0,1,0,0
1,1,0,1
2,0,1,0


In [41]:
tf1=TfidfVectorizer(stop_words='english')
e=tf1.fit_transform(corpus1)
pd.DataFrame(e.toarray(),columns=tf1.get_feature_names())

Unnamed: 0,document,fun,second
0,1.0,0.0,0.0
1,0.605349,0.0,0.795961
2,0.0,1.0,0.0


### Document Similarity: Example(solving this time using TF-IDF and compare with previous result that was using CountVectorizer)

In [42]:
tf_idf=TfidfVectorizer(stop_words='english')
f=tf_idf.fit_transform(corpus)
pd.DataFrame(f.toarray(),columns=tf_idf.get_feature_names())

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0.0,0.0,0.0,0.370086,0.0,0.0,0.0,0.0,0.6569,0.0,0.6569
1,0.0,0.580423,0.0,0.327,0.0,0.580423,0.468282,0.0,0.0,0.0,0.0
2,0.0,0.0,0.871247,0.490845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.614189,0.0,0.0,0.0,0.614189,0.0,0.495524,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.370086,0.0,0.0,0.0,0.6569,0.0,0.6569,0.0


In [43]:
# some part of the code is present in line number 11

results_tfidf = [cosine_similarity(f[a_index], f[b_index]) for (a_index, b_index) in pairs]
sorted(zip(results_tfidf, combos), reverse=True)

[(array([[0.23204486]]),
  ('I make my hot chocolate with milk', 'I will have a chai latte with milk')),
 (array([[0.18165505]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.18165505]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.16050661]]),
  ('I make my hot chocolate with milk', 'One hot encoding')),
 (array([[0.1369638]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.12101835]]),
  ('The weather is hot under the sun', 'I make my hot chocolate with milk')),
 (array([[0.12101835]]),
  ('I make my hot chocolate with milk', 'There is a hot sale today')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]

### Note:
Compare this last result with line number 11 result, both are different because countvectorizer give more preference to most occuring word and Tfidf give more preference to less occuring word