<a href="https://colab.research.google.com/github/vishnuvardhan-jadava/TF-IDF_Assignment/blob/main/TF_IDF_FIT_TRANSFORM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [128]:
#importing modules

from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy

In [129]:
# Toy corpus: each list element is one document (a plain whitespace-separated string).
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [131]:
# Bare expression: the notebook echoes the list so the documents can be inspected.
corpus

['this is the first document',
 'this document is the second document',
 'and this is the third one',
 'is this the first document']

In [137]:
def fit(corpus):

  """Build the TF-IDF vocabulary from a list of documents.

  Every document is tokenized on runs of whitespace (the same tokenization
  that ``transform`` applies), tokens shorter than two characters are
  dropped, and the remaining unique tokens are sorted alphabetically.

  Parameters
  ----------
  corpus : list of str
      The documents to learn the vocabulary from.

  Returns
  -------
  dict
      Mapping ``token -> column index`` where indices follow the sorted
      order of the tokens. If ``corpus`` is not a list, the string
      ``'pass a list'`` is returned instead (kept for backward compatibility).
  """
  if not isinstance(corpus, list):
    return 'pass a list'

  uniq_words = set() #set of unique words
  for sent in corpus:
    # split() with no argument splits on any whitespace run, so tabs, newlines
    # and repeated spaces never end up inside a vocabulary key.
    for word in sent.split():
      if len(word) < 2:
        continue
      uniq_words.add(word)
  return {word: idx for idx, word in enumerate(sorted(uniq_words))}

In [136]:
def find_idf_Val(word, docs=None):
  """Return the number of documents that contain ``word`` as a whole token.

  This is the document frequency used in the IDF denominator, not the IDF
  value itself.

  Parameters
  ----------
  word : str
      Token to look for. Any string is accepted, including tokens with digits
      or punctuation.
  docs : list of str, optional
      Documents to search. When omitted, the module-level ``corpus`` is used,
      which is what the original one-argument call did.

  Returns
  -------
  int
      Count of documents whose whitespace-split tokens include ``word``.
      If ``word`` is not a string, the message
      ``'passed vales is not string'`` is returned instead.
  """
  if not isinstance(word, str):
    return 'passed vales is not string'
  # Fall back to the notebook-global corpus only when no documents are supplied.
  documents = corpus if docs is None else docs
  return sum(1 for doc in documents if word in doc.split())

In [138]:
def transform(corpus,vocab):
  """Return the L2-normalized sparse TF-IDF matrix of ``corpus``.

  For each document and each of its tokens present in ``vocab``:

  * TF  = (occurrences of the token in the document) / (number of tokens in the document)
  * IDF = 1 + ln((1 + N) / (1 + df)), where N is ``len(corpus)`` and df is the
    number of documents in ``corpus`` containing the token.

  Document frequencies are computed from the ``corpus`` argument itself, in a
  single pass, so the result does not depend on any notebook-global variable
  and tokens containing digits or punctuation are handled like any other token.

  Parameters
  ----------
  corpus : list of str
      Documents to vectorize; tokenized on whitespace.
  vocab : dict
      Mapping ``token -> column index`` (as produced by ``fit``). Tokens that
      are not in ``vocab`` are skipped.

  Returns
  -------
  scipy.sparse matrix
      Shape ``(len(corpus), len(vocab))`` with each row scaled to unit L2 norm.
      If ``corpus`` is not a list, the string ``'pass a list'`` is returned.
  """
  if not isinstance(corpus, list):
    return 'pass a list'

  tokenized = [sent.split() for sent in corpus]

  # Document frequency: each document contributes at most once per token.
  doc_freq = Counter()
  for tokens in tokenized:
    doc_freq.update(set(tokens))

  n_docs = len(corpus)
  row = []
  col = []
  tf_idf_li = []
  for row_idx, tokens in enumerate(tokenized):
    for word, w_freq in Counter(tokens).items():
      col_idx = vocab.get(word, -1)
      if col_idx == -1:
        continue  # token was not learned during fit
      row.append(row_idx)
      col.append(col_idx)
      tf = w_freq/len(tokens) #calculating TF value
      idf = 1 + math.log((1 + n_docs)/(1 + doc_freq[word])) #calculating IDF value
      tf_idf_li.append(tf*idf) #TFIDF values
  tf_idf = csr_matrix((tf_idf_li,(row,col)),shape=(len(corpus),len(vocab)))
  return normalize(tf_idf,norm='l2') #normalizing(l2) sparsematrix of TFIDF

In [140]:
# Learn the vocabulary from the corpus, then vectorize the same corpus with it.
vocab = fit(corpus)
vec = transform(corpus, vocab)
print(vec)

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149


In [142]:
#applying fit and transform from sklearn and comparing with our custom code's output
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

# Make the comparison explicit instead of relying on eyeballing two printouts:
# first confirm both implementations assign the same column to every token,
# then confirm the matrices agree up to floating-point rounding.
assert dict(vectorizer.vocabulary_) == vocab, "vocabularies differ"
numpy.testing.assert_allclose(vec.toarray(), skl_output.toarray())
print(skl_output)

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 5)	0.5386476208856763
  (1, 3)	0.281088674033753
  (1, 1)	0.6876235979836938
  (2, 8)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 0)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045
