Task-1

Implementation using Sklearn

Corpus

In [0]:
# Toy corpus of four short documents used throughout Task 1.
corpus = [
    'this is the first document',
    'this document is the second document',
    'and this is the third one',
    'is this the first document',
]

sklearn Implementation

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the corpus (fit returns the
# fitted vectorizer itself), then encode the same documents as TF-IDF rows.
vectorizer = TfidfVectorizer().fit(corpus)
skl_output = vectorizer.transform(corpus)

Feature names that sklearn learned from the given corpus during fit. By default they are returned in sorted (alphabetical) order.

In [4]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when available and fall back to the old
# accessor on older installations; .tolist() keeps the printed value a plain
# Python list of str, as before.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


Shape of the sklearn TF-IDF vectorizer output after applying the transform method: (number of documents, vocabulary size).

In [5]:
# Dimensions of the transformed matrix: one row per document, one column per vocabulary term.
skl_output.shape

(4, 9)

Here we will print the sklearn tfidf vectorizer idf values after applying the fit method

After calling fit on the corpus, the vocabulary contains 9 words, and each word has its own IDF value.

In [6]:
# idf_ holds one learned IDF weight per vocabulary term.
print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


sklearn TF-IDF values for the first document of the above corpus.

Here the output is a sparse matrix

In [7]:
# Printing a sparse row lists only its non-zero entries as "(row, column) value".
print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


To understand the output better, we convert the sparse output matrix to a dense matrix and print it.

This output is normalized using L2 normalization. sklearn does this by default.

In [8]:
# toarray() expands the sparse row into a dense array with explicit zeros.
print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


Custom Implementation

Importing the libraries

In [0]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn import preprocessing
import numpy
from sklearn.preprocessing import normalize

In [0]:
def fit(dataset):
  """Learn a sorted vocabulary and smoothed IDF values from `dataset`.

  Tokens are obtained with ``row.split(" ")``; tokens shorter than two
  characters are ignored.

  Parameters
  ----------
  dataset : list of str
      The documents to learn from.

  Returns
  -------
  (vocab, IDF_values) : (dict, list of float)
      ``vocab`` maps each word to its column index (alphabetical order) and
      ``IDF_values[i]`` is ``1 + ln((1 + N) / (1 + df))`` for the word with
      index ``i``, where ``N`` is ``len(dataset)`` and ``df`` is the number of
      documents containing the word.
      If `dataset` is not a list, a message is printed and ``None`` is
      returned.
  """
  if not isinstance(dataset, (list,)):
    print("you need to pass list of sentance")
    return None

  # Count, in a single pass, how many documents contain each word. The
  # statistics come from `dataset` itself, not from a global variable, so the
  # function is correct for any list passed in.
  doc_freq = Counter()
  for row in dataset:
    doc_freq.update({word for word in row.split(" ") if len(word) >= 2})

  unique_words = sorted(doc_freq)
  vocab = {word: index for index, word in enumerate(unique_words)}
  n_docs = len(dataset)
  IDF_values = [1+math.log((1+n_docs)/(1+doc_freq[word])) for word in unique_words]
  return vocab,IDF_values

In [5]:
# Fit the custom implementation on the corpus and show the learned vocabulary
# and IDF values for comparison with the sklearn output.
vocab,IDF_values=fit(corpus)
print("vocab =",vocab.keys())
print("IDF_values =",IDF_values)

vocab = dict_keys(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'])
IDF_values = [1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [0]:
def transform(dataset,vocab):
    """Return the L2-normalised TF-IDF matrix of `dataset` over `vocab`.

    Term frequency is ``count / number_of_tokens_in_document`` (tokens from
    ``row.split()``), and IDF is ``1 + ln((1 + N) / (1 + df))``, where ``N`` is
    ``len(dataset)`` and ``df`` is the number of documents in `dataset` whose
    ``split(" ")`` tokens contain the word. IDF is therefore derived from the
    documents passed in here, not from values produced by a previous fit.
    Words shorter than two characters or absent from `vocab` are skipped.

    Parameters
    ----------
    dataset : list of str
        Documents to encode. A tqdm progress bar is shown while iterating.
    vocab : dict
        Maps each word to its column index.

    Returns
    -------
    Dense array of shape ``(len(dataset), len(vocab))`` with each row scaled
    to unit L2 norm by ``preprocessing.normalize``. If `dataset` is not a
    list, a message is printed and ``None`` is returned.
    """
    if not isinstance(dataset, (list,)):
        print("you need to pass list of strings")
        return None

    # Document frequencies are computed once up front instead of rescanning
    # the whole dataset for every word of every row.
    doc_freq = Counter()
    for doc in dataset:
        doc_freq.update(set(doc.split(" ")))
    # Use the size of the dataset being transformed, not a global `corpus`.
    n_docs = len(dataset)

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(tqdm(dataset)):
        tokens = row.split()
        for word, freq in Counter(tokens).items():
            if len(word) < 2:
                continue
            col_index = vocab.get(word, -1)
            if col_index == -1:
                continue
            # Counter returns 0 for unseen words, matching a manual count.
            no_of_document_with_t = doc_freq[word]
            ans=(freq/len(tokens))*(math.log((1+n_docs)/(no_of_document_with_t+1))+1)
            rows.append(idx)
            columns.append(col_index)
            values.append(ans)
    nor= csr_matrix((values, (rows,columns)), shape=(len(dataset),len(vocab)))
    X_normalized=preprocessing.normalize(nor.toarray(), norm='l2')
    return X_normalized

In [9]:
# Re-fit on the corpus, then encode it with the custom transform and print the
# resulting matrix and its shape for comparison with the sklearn output.
vocab,IDF_values = fit(corpus)
print()
print()
print("IDF_values : ",IDF_values)
print()
x=transform(corpus, vocab)
print(x)
print()
print("x shape: ",x.shape)

100%|██████████| 4/4 [00:00<00:00, 1422.52it/s]



IDF_values :  [1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

x shape:  (4, 9)





Task 2

Loading the Document

In [24]:
import pickle
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load 'cleaned_strings' if it comes from a trusted source.
# This rebinds the name `corpus`, so every later cell that reads `corpus`
# sees the loaded data instead of any earlier value.
with open('cleaned_strings', 'rb') as f:
    corpus = pickle.load(f)
    
# printing the length of the corpus loaded
print("Number of documents in corpus = ",len(corpus))

Number of documents in corpus =  746


In [0]:
def fit1(dataset, top_n=50):
  """Learn a vocabulary limited to the `top_n` words with the highest IDF.

  Tokens are obtained with ``row.split(" ")``; tokens shorter than two
  characters are ignored. IDF is ``1 + ln((1 + N) / (1 + df))``, where ``N``
  is ``len(dataset)`` and ``df`` is the number of documents containing the
  word.

  Parameters
  ----------
  dataset : list of str
      The documents to learn from.
  top_n : int, optional
      How many of the highest-IDF words to keep (default 50).

  Returns
  -------
  (vocab1, IDF_values1) : (dict, list of float)
      ``vocab1`` maps each kept word to its column index, numbered in
      descending-IDF order; words with equal IDF stay in alphabetical order
      because the sort is stable. ``IDF_values1[i]`` is the IDF of the word
      with index ``i``.
      If `dataset` is not a list, a message is printed and ``None`` is
      returned.
  """
  if not isinstance(dataset, (list,)):
    print("you need to pass list of sentance")
    return None

  # Count, in a single pass, how many documents contain each word. The
  # statistics come from `dataset` itself, not from a global variable.
  doc_freq = Counter()
  for row in dataset:
    doc_freq.update({word for word in row.split(" ") if len(word) >= 2})

  n_docs = len(dataset)
  # Build in alphabetical order so the stable sort below breaks IDF ties
  # alphabetically.
  idf_by_word = {word: 1+math.log((1+n_docs)/(1+doc_freq[word])) for word in sorted(doc_freq)}
  ranked = sorted(idf_by_word.items(), key=lambda item: item[1], reverse=True)[:top_n]

  vocab1 = {word: index for index, (word, _) in enumerate(ranked)}
  IDF_values1 = [idf for _, idf in ranked]
  return vocab1,IDF_values1

In [36]:
# Fit on the loaded corpus, keeping only the terms with the highest IDF values.
part2_vocab1,part2_IDF_values=fit1(corpus)
print("TOP 50 IDF values:\n",part2_IDF_values)

TOP 50 IDF values:
 [6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872, 6.922918004572872]


In [37]:
# Display the selected words and the column index assigned to each.
part2_vocab1


{'aailiyah': 0,
 'abandoned': 1,
 'abroad': 2,
 'abstruse': 3,
 'academy': 4,
 'accents': 5,
 'accessible': 6,
 'acclaimed': 7,
 'accolades': 8,
 'accurate': 9,
 'accurately': 10,
 'achille': 11,
 'ackerman': 12,
 'actions': 13,
 'adams': 14,
 'add': 15,
 'added': 16,
 'admins': 17,
 'admiration': 18,
 'admitted': 19,
 'adrift': 20,
 'adventure': 21,
 'aesthetically': 22,
 'affected': 23,
 'affleck': 24,
 'afternoon': 25,
 'aged': 26,
 'ages': 27,
 'agree': 28,
 'agreed': 29,
 'aimless': 30,
 'aired': 31,
 'akasha': 32,
 'akin': 33,
 'alert': 34,
 'alike': 35,
 'allison': 36,
 'allow': 37,
 'allowing': 38,
 'alongside': 39,
 'amateurish': 40,
 'amaze': 41,
 'amazed': 42,
 'amazingly': 43,
 'amusing': 44,
 'amust': 45,
 'anatomist': 46,
 'angel': 47,
 'angela': 48,
 'angelina': 49}

In [39]:
# Encode the loaded corpus over the reduced vocabulary and show the row for the first document.
a=transform(corpus,part2_vocab1)
print(a[0])


100%|██████████| 746/746 [00:04<00:00, 176.11it/s]

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]



