# TF-IDF Vectorizer from Scratch

In [1]:
# Toy corpus used throughout the notebook: one string per document.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [2]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
import operator
from sklearn.preprocessing import normalize
import numpy


# Fit function: builds the vocabulary (word -> column index) and the smoothed IDF value of every word in the corpus.
def fit(dataset):
    """Learn the vocabulary and the smoothed IDF weights of a corpus.

    Parameters
    ----------
    dataset : list or tuple of str
        One string per document. Each document is tokenised on runs of
        whitespace; tokens shorter than two characters are ignored.

    Returns
    -------
    vocab : dict
        Maps every retained word to its column index. Words are numbered
        consecutively in sorted order.
    vocab_dict : dict
        Maps every retained word to its IDF value
        ``1 + ln((1 + N) / (1 + df))``, where ``N`` is the number of
        documents and ``df`` is the number of documents containing the word.

    Raises
    ------
    TypeError
        If ``dataset`` is not a list or tuple (a bare string, for example,
        would otherwise be treated as a sequence of one-character documents).
    """
    if not isinstance(dataset, (list, tuple)):
        raise TypeError(
            "dataset must be a list or tuple of strings, got "
            + type(dataset).__name__
        )

    # Tokenise every document exactly once. Using a set per document means a
    # word repeated inside one document still counts as a single document hit.
    tokens_per_doc = [set(row.split()) for row in dataset]

    # Document frequency: in how many documents does each word (len >= 2) occur?
    doc_freq = Counter(
        word
        for tokens in tokens_per_doc
        for word in tokens
        if len(word) >= 2
    )

    # Sorted order fixes a deterministic column index for every word.
    unique_words = sorted(doc_freq)
    vocab = {word: index for index, word in enumerate(unique_words)}

    # The "+ 1" in numerator and denominator is the smoothing term: it acts as
    # one extra document containing every word and prevents division by zero.
    smoothed_doc_count = len(dataset) + 1
    vocab_dict = {
        word: 1 + math.log(smoothed_doc_count / (doc_freq[word] + 1))
        for word in unique_words
    }
    return vocab, vocab_dict

In [3]:
# Learn the word -> column mapping and the per-word IDF weights from the corpus.
vocab, vocab_dict = fit(corpus)
print(vocab_dict)

{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


In [4]:
# Transform function to find Tf-Idf matrix
def transform(dataset, vocab, idf_values=None):
    """Build the L2-normalised TF-IDF matrix of ``dataset``.

    Parameters
    ----------
    dataset : sequence of str
        One string per document; documents are tokenised on whitespace.
    vocab : dict
        Maps each known word to its column index (as returned by ``fit``).
    idf_values : dict, optional
        Maps each known word to its IDF weight (the second value returned by
        ``fit``). When omitted, the module-level ``vocab_dict`` is used, which
        keeps the original two-argument call working.

    Returns
    -------
    scipy.sparse matrix of shape ``(len(dataset), len(vocab))``
        Row ``i`` holds the TF-IDF weights of document ``i`` after being
        passed through ``normalize``.

    Raises
    ------
    KeyError
        If a word is present in ``vocab`` but missing from the IDF table.
    """
    if idf_values is None:
        # Backward compatibility: earlier callers relied on the global IDF
        # table produced by fit() instead of passing it in explicitly.
        idf_values = vocab_dict

    rows = []
    columns = []
    values = []
    for idx, row in enumerate(tqdm(dataset)):  # one pass per document
        tokens = row.split()
        token_count = len(tokens)  # TF denominator: total words in the document
        for word, freq in Counter(tokens).items():
            column = vocab.get(word)
            if column is None:
                # Token was never seen by fit() (or was dropped there for being
                # too short); it has no column, so it cannot contribute.
                continue
            rows.append(idx)
            columns.append(column)
            # TF = occurrences / total words in the document; weight = TF * IDF.
            values.append((freq / token_count) * idf_values[word])

    tfid = csr_matrix(
        (values, (rows, columns)), shape=(len(dataset), len(vocab))
    )
    return normalize(tfid)

In [5]:
# Vectorise the corpus with the fitted vocabulary and inspect the result.
tfidfvec = transform(corpus, vocab)
print(tfidfvec.shape)
print(list(vocab))
print(tfidfvec.toarray())

100%|████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


(4, 9)
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


# Comparing results with TfidfVectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reference implementation: fit scikit-learn's vectorizer on the same corpus
# and print its dense TF-IDF matrix for comparison with the matrix above.
vec = TfidfVectorizer()
feature_matrix_2 = vec.fit(corpus).transform(corpus)
print(feature_matrix_2.toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
