### EXERCISE-1: Print TFIDF values

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer 
import pandas as pd

In [25]:
# Toy corpus of five short reviews; Exercises 1-3 are fitted on it.
docs = [
    "good movie",
    "not a good movie",
    "did not like",
    "i like it",
    "good one",
]

In [26]:
# Fit TF-IDF on the corpus with TfidfVectorizer's default tokenizer.
# - ngram_range=(1, 2): both single words and two-word phrases become features.
# - min_df=2:   a term must occur in at least 2 documents to be kept.
# - max_df=0.5: a term occurring in more than half of the documents is dropped
#               (e.g. "good", which appears in 3 of the 5 documents).
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)
features = tfidf.fit_transform(docs)

# Printing the sparse matrix lists "(row, column)  weight" for non-zero cells only.
print(features)

  (0, 0)	0.7071067811865476
  (0, 2)	0.7071067811865476
  (1, 3)	0.5773502691896257
  (1, 0)	0.5773502691896257
  (1, 2)	0.5773502691896257
  (2, 1)	0.7071067811865476
  (2, 3)	0.7071067811865476
  (3, 1)	1.0


In [28]:
# Readable view of the TF-IDF matrix: one row per document, one column per
# retained term (column labels come from the fitted vectorizer).
feature_names = tfidf.get_feature_names_out()
df = pd.DataFrame(
    data=features.todense(),
    columns=feature_names,
)
print(df)

   good movie      like     movie       not
0    0.707107  0.000000  0.707107  0.000000
1    0.577350  0.000000  0.577350  0.577350
2    0.000000  0.707107  0.000000  0.707107
3    0.000000  1.000000  0.000000  0.000000
4    0.000000  0.000000  0.000000  0.000000


### EXERCISE-2:
#### 1. Change the values of min_df and ngram_range and observe various outputs

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same setup as Exercise 1, but with min_df lowered to 1 so that terms seen in
# a single document are kept too; this grows the vocabulary considerably.
# NOTE: this rebinds `tfidf` and `features`, which the following cells reuse.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=1,
    max_df=0.5,
)
features = tfidf.fit_transform(docs)
print(features)


  (0, 2)	0.7071067811865476
  (0, 7)	0.7071067811865476
  (1, 9)	0.5819514978671799
  (1, 8)	0.46951480421464364
  (1, 2)	0.46951480421464364
  (1, 7)	0.46951480421464364
  (2, 10)	0.4821401170833009
  (2, 1)	0.4821401170833009
  (2, 5)	0.3889876106617681
  (2, 0)	0.4821401170833009
  (2, 8)	0.3889876106617681
  (3, 6)	0.6141889663426562
  (3, 4)	0.6141889663426562
  (3, 5)	0.49552379079705033
  (4, 3)	0.7071067811865475
  (4, 11)	0.7071067811865475


In [34]:
# Readable view of the larger min_df=1 matrix: documents as rows, terms as
# columns. With this many columns pandas wraps the printed table.
feature_names = tfidf.get_feature_names_out()
df = pd.DataFrame(
    data=features.todense(),
    columns=feature_names,
)
print(df)

       did  did not  good movie  good one        it      like   like it  \
0  0.00000  0.00000    0.707107  0.000000  0.000000  0.000000  0.000000   
1  0.00000  0.00000    0.469515  0.000000  0.000000  0.000000  0.000000   
2  0.48214  0.48214    0.000000  0.000000  0.000000  0.388988  0.000000   
3  0.00000  0.00000    0.000000  0.000000  0.614189  0.495524  0.614189   
4  0.00000  0.00000    0.000000  0.707107  0.000000  0.000000  0.000000   

      movie       not  not good  not like       one  
0  0.707107  0.000000  0.000000   0.00000  0.000000  
1  0.469515  0.469515  0.581951   0.00000  0.000000  
2  0.000000  0.388988  0.000000   0.48214  0.000000  
3  0.000000  0.000000  0.000000   0.00000  0.000000  
4  0.000000  0.000000  0.000000   0.00000  0.707107  


### EXERCISE-3: Compute Cosine Similarity between 2 Documents

In [35]:
from sklearn.metrics.pairwise import linear_kernel

In [36]:
# Cosine similarity between the 1st and the 2nd document.
# linear_kernel is a plain dot product; it equals the cosine similarity here
# because TfidfVectorizer L2-normalises each row by default (norm="l2").
# Slices (rather than integer indexing) keep each document as a 1-row matrix.
doc1, doc2 = features[:1], features[1:2]
score = linear_kernel(doc1, doc2)
print(score)

[[0.6639942]]


In [37]:
# Cosine similarity of the 1st document against every document in the corpus.
# The result is a single row with one score per document; position 0 is the
# 1st document compared with itself.
scores = linear_kernel(doc1, features)
print(scores)

[[1.        0.6639942 0.        0.        0.       ]]


In [38]:
# Cosine similarity between a new, unseen document and the fitted corpus.
query = "I like this good movie"

# transform (not fit_transform) reuses the vocabulary and IDF weights learned
# from the corpus, so the query vector lives in the same feature space.
qfeature = tfidf.transform([query])

scores2 = linear_kernel(qfeature, features)
print(scores2)

[[0.81649658 0.542149   0.2245821  0.28609079 0.        ]]


### EXERCISE-4: Find Top-N similar documents
#### Question-1. Consider the following documents and compute TFIDF values

In [39]:
# Corpus for Exercise 4. NOTE: this rebinds `docs`, replacing the earlier corpus.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [40]:
# Re-fit TF-IDF on the new corpus (rebinds `tfidf` and `features`).
# min_df=2 together with max_df=0.5 keeps only terms found in exactly 2 of the
# 5 documents, so words present in every document ("the", "mouse") are dropped.
# A document containing none of the surviving terms gets an all-zero row and
# therefore has no entries in the printed sparse matrix.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)
features = tfidf.fit_transform(docs)
print(features)

  (0, 3)	0.7071067811865476
  (0, 1)	0.7071067811865476
  (1, 2)	0.7071067811865476
  (1, 0)	0.7071067811865476
  (2, 3)	0.7071067811865476
  (2, 1)	0.7071067811865476
  (3, 2)	0.7071067811865476
  (3, 0)	0.7071067811865476


#### Question-2. Compute the cosine similarity between the 3rd document ("the mouse ran away from the house") and all other documents. Which is the most similar document?

In [41]:
# The 3rd document is row 2 (0-based); slicing keeps it as a 1-row matrix.
doc3 = features[2:3]

# One similarity score per corpus document, including the 3rd document itself
# (position 2), so the most similar *other* document is the highest score
# found at any other position.
scores3 = linear_kernel(doc3, features)
print(scores3)

[[1. 0. 1. 0. 0.]]


#### Question-3. Find Top-2 similar documents for the 3rd document based on Cosine similarity values

In [42]:
# Top-2 documents most similar to the 3rd document.
# Every document is ranked by its cosine similarity to doc3; the previous
# version simply compared doc3 with the first two rows of the corpus, which is
# not a ranking at all.
query_row = 2  # doc3 is features[2:3]; it is excluded since it trivially matches itself
similarities = linear_kernel(doc3, features).ravel()

# sorted() is stable (also with reverse=True), so documents with tied scores
# keep their original corpus order and the result is deterministic.
ranked = sorted(
    range(len(similarities)),
    key=lambda row: similarities[row],
    reverse=True,
)
top2_idx = [row for row in ranked if row != query_row][:2]

doc4 = features[top2_idx]            # rows of the two most similar documents
scores4 = linear_kernel(doc3, doc4)  # their scores, shape (1, 2), best first
print(top2_idx)
print(scores4)

[[1. 0.]]
