## Here, I will be implementing TF-IDF vectorizer on my own and will be comapring my results to the TF-IDF vectorizer given in sklearn

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import math

In [2]:
corpus = pd.read_csv("cleaned_string.csv")
corpus.head()

Unnamed: 0,Texts
0,slow moving aimless movie distressed drifting ...
1,not sure lost flat characters audience nearly ...
2,attempting artiness black white clever camera ...
3,little music anything speak
4,best scene movie gerardo trying find song keep...


## Implementing 'get_feature_names'

In [3]:
def fit(corpus):
    features = []
    temp = []
    for i in range(len(corpus["Texts"])):
        temp = corpus["Texts"][i].split()
        for word in temp:
            if word not in features:
                features.append(word)
    features.sort()
    return features

In [4]:
features1 = fit(corpus)
print(features1)



Let's find the features using sklearn and check if our result is same or not.

In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus["Texts"])
features2 = vectorizer.get_feature_names()

In [6]:
if features1 == features2:
    print("Our results match")
else:
    print("No, they do not match.")

No, they do not match.


In [7]:
print("{}, {}".format(len(features1), len(features2)))

2897, 2886


Looks like there are some words in my implementation that were excluded by sklearn. Let's look at these words and remove them.

In [8]:
toRemove = []
for word in features1:
    if word not in features2:
        toRemove.append(word)
print(toRemove)

['b', 'e', 'f', 'g', 'n', 'q', 'r', 'u', 'v', 'x', 'z']


In [9]:
for word in toRemove:
    features1.remove(word)
print(len(features1))

2886


Let's check again

In [10]:
if features1 == features2:
    print("Our results match")

Our results match


## Implementing '.idf' function.

In [11]:
def count(corpus, word):
    c = 0
    for i in corpus["Texts"]:
        if word in i:
            c += 1
    return c

def idf(corpus):
    idfValues = []
    features = fit(corpus)
    toRemove = ['b', 'e', 'f', 'g', 'n', 'q', 'r', 'u', 'v', 'x', 'z']
    for word in toRemove:
        features.remove(word)
    for i in features:
        c = count(corpus, i)
        idfValue = 1 + math.log((1+corpus.shape[0])/(1+c))
        idfValues.append(idfValue)
    return np.array(idfValues)

In [12]:
idfValues1 = idf(corpus)

Let's find the idf values using sklearn.

In [13]:
idfValues2 = vectorizer.idf_

In [14]:
t = (idfValues1 == idfValues2)
if False in t:
    print("Our results do not match.")
else:
    print("We got matching results.")

Our results do not match.


### For some reason, I am unable to get the matching values.

In [15]:
vocab = {}
for i in range(len(features1)):
    vocab[features1[i]] = idfValues1[i]
print(len(vocab))

2886


In [16]:
vocab = sorted(vocab.items(), key = lambda kv:(kv[1], kv[0]))
vocab = vocab[:50]
print(len(vocab))

50


In [27]:
print("Following are the top 50 most important words:\n")
print("S.No\tWord\t\tIDF-Score")
for i in range(len(vocab)):
    print(f"{i+1}\t{vocab[i][0]}\t\t{vocab[i][1]}")

Following are the top 50 most important words:

S.No	Word		IDF-Score
1	en		1.88921743754562
2	ed		2.0515447778101237
3	fi		2.369041112972331
4	no		2.390318511419616
5	co		2.406579032291396
6	de		2.406579032291396
7	th		2.4570098859182883
8	la		2.592184664286541
9	movie		2.625632598354081
10	film		2.6532205548729104
11	ho		2.6532205548729104
12	di		2.681591252002126
13	act		2.6960842593046923
14	not		2.7408678619316658
15	us		2.7640349212132
16	im		3.0310977064622455
17	ever		3.0727704028628136
18	one		3.094276608083777
19	go		3.105205678615967
20	see		3.3965574799567104
21	end		3.505191320959506
22	art		3.5217206229107165
23	bad		3.573013917298267
24	real		3.573013917298267
25	ive		3.590713494397668
26	pi		3.590713494397668
27	time		3.6457732715806954
28	man		3.6842395524084917
29	ue		3.6842395524084917
30	like		3.7040421797046714
31	thing		3.7242448870221905
32	character		3.7874237886437223
33	good		3.7874237886437223
34	era		3.854865069439255
35	watch		3.9024931184285094
36	acting		3