# TF-IDF

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from tf_idf import tf, idf, tf_idf, Tfidf

Below we define a corpus `D` (the variable `corpus`) made up of 3 sentences, each treated as one document,
and a term `t` (the variable `term`), here 'ball'.

In [2]:
# Toy corpus: each string is treated as one document.
corpus = [
    "The dog chased the cat around the house", # document 1
    "The cat played with the ball of yarn", # document 2
    "Both the dog and cat sleep in the house" # document 3
]

# Example term whose term frequency is inspected in the next section.
term = "ball"

# TF

We can display the term frequency (TF) of the term in each document.

![image.png](../../../images/tf.png)

We can see that 'ball' does not appear in document 1 or document 3, so its term frequency there is 0, as expected.
However, it appears once among the 8 words of document 2, giving us 1/8 = 0.125.


In [3]:
# Report the term frequency of `term` in every document, numbering documents from 1.
for doc_number, document in enumerate(corpus, start=1):
    score = tf(document, term)
    print(f"TF of '{term}' in document {doc_number} is: {score}")

TF of 'ball' in document 1 is: 0.0
TF of 'ball' in document 2 is: 0.125
TF of 'ball' in document 3 is: 0.0


# Step 1: Build Vocab

We can build a set of all the terms present in the corpus, so that we can systematically go through them
and calculate their IDF scores.

In [4]:
# Collect every distinct lower-cased, whitespace-separated word found in the corpus.
vocab = set()
for document in corpus:
    vocab.update(token.lower() for token in document.split())
print(len(vocab))
vocab

15


{'and',
 'around',
 'ball',
 'both',
 'cat',
 'chased',
 'dog',
 'house',
 'in',
 'of',
 'played',
 'sleep',
 'the',
 'with',
 'yarn'}

# Step 2: IDF

In [5]:
# Inverse document frequency of every vocabulary word, computed over the whole corpus.
# A comprehension keeps its loop variable local, so the notebook-level `term`
# defined earlier is left untouched; a plain `for term in vocab:` loop would
# rebind it and change what the TF cell prints when it is re-run.
idfs = {word: idf(corpus, word) for word in vocab}

# One row per word, with a single 'IDF' column.
idfs_df = pd.DataFrame.from_dict(idfs, orient='index', columns=['IDF'])
idfs_df

Unnamed: 0,IDF
in,2.609438
the,2.466337
ball,2.609438
cat,2.466337
around,2.609438
with,2.609438
and,2.609438
house,2.504077
dog,2.504077
chased,2.609438


# Step 3: TF

In [6]:
# Term frequency of every vocabulary word in every document,
# stored as tfs[document index][word].
# The inner loop variable is `word` rather than `term`: a for-loop variable stays
# bound after the loop finishes, so reusing `term` would overwrite the
# notebook-level `term` defined earlier and alter earlier cells when re-run.
tfs = defaultdict(dict)
for idx, d in enumerate(corpus):
    for word in vocab:
        tfs[idx][word] = tf(d, word)

# Rows are words, columns are document indices.
pd.DataFrame(tfs)

Unnamed: 0,0,1,2
in,0.0,0.0,0.111111
the,0.25,0.125,0.222222
ball,0.0,0.125,0.0
cat,0.125,0.125,0.111111
around,0.125,0.0,0.0
with,0.0,0.125,0.0
and,0.0,0.0,0.111111
house,0.125,0.0,0.111111
dog,0.125,0.0,0.111111
chased,0.125,0.0,0.0


# Step 4: TF * IDF

In [7]:
# Multiply each document's term frequencies by the matching IDF score,
# stored as result[document index][word].
result = defaultdict(dict)
for doc_idx, term_freqs in tfs.items():
    for token in term_freqs:
        if token not in idfs:
            continue  # no IDF score recorded for this word
        result[doc_idx][token] = term_freqs[token] * idfs[token]

pd.DataFrame(result)

Unnamed: 0,0,1,2
in,0.0,0.0,0.289938
the,0.616584,0.308292,0.548075
ball,0.0,0.32618,0.0
cat,0.308292,0.308292,0.274037
around,0.32618,0.0,0.0
with,0.0,0.32618,0.0
and,0.0,0.0,0.289938
house,0.31301,0.0,0.278231
dog,0.31301,0.0,0.278231
chased,0.32618,0.0,0.0


# Step 5: Make Array to represent Text

Now we can build vectors for each document.

In [8]:
# Build one TF-IDF vector per document: rows are documents, columns are vocabulary words.
# Columns follow the alphabetically sorted vocabulary so the layout is the same on every
# run (iteration order of a set of strings can differ between interpreter sessions).
# The loop variable is `word`, not `term`, so the notebook-level `term` is not overwritten.
vocab_sorted = sorted(vocab)
array = np.zeros((len(corpus), len(vocab_sorted)))
for col, word in enumerate(vocab_sorted):
    for row in range(len(corpus)):
        array[row, col] = result[row][word]

# Display the document vectors.
array


## Using TfidfVectorizer from sklearn

In [9]:
# Fit scikit-learn's TfidfVectorizer on the three-document corpus and
# convert the transformed result into a dense array for display.
tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(corpus)
tf_idf_array = tf_idf_vector.toarray()
tf_idf_array

array([[0.        , 0.38792711, 0.        , 0.        , 0.229116  ,
        0.38792711, 0.29502856, 0.29502856, 0.        , 0.        ,
        0.        , 0.        , 0.687348  , 0.        , 0.        ],
       [0.        , 0.        , 0.38506745, 0.        , 0.22742704,
        0.        , 0.        , 0.        , 0.        , 0.38506745,
        0.38506745, 0.        , 0.45485408, 0.38506745, 0.38506745],
       [0.38066768, 0.        , 0.        , 0.38066768, 0.22482846,
        0.        , 0.28950758, 0.28950758, 0.38066768, 0.        ,
        0.        , 0.38066768, 0.44965693, 0.        , 0.        ]])

## Compare a smaller sample
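On this one-document corpus the two implementations give very similar weights for the words that occur once (about 0.262 vs. 0.267). The largest weight (presumably the one for 'the') differs more (about 0.525 vs. 0.802) and appears in a different column, so compare the vectors term by term rather than position by position.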

In [10]:
# Smaller data set: a single document, plus its lower-cased vocabulary.
corpus_2 = ["The dog chased the cat around the house"]
vocab_2 = set()
for sentence in corpus_2:
    vocab_2 |= {token.lower() for token in sentence.split()}

In [12]:
# Custom implementation: the Tfidf class imported from the local tf_idf module,
# fitted on the single-document corpus.
tr_idf_model_1  = Tfidf()
tf_idf_vector_1 = tr_idf_model_1.fit_transform(corpus_2)
tf_idf_vector_1

array([[0.52465307, 0.26232654, 0.26232654, 0.26232654, 0.26232654,
        0.26232654]])

In [13]:
# scikit-learn on the single-document corpus.
# Fit the fresh vectorizer created in this cell, so the model fitted on the
# three-document corpus above keeps its own fitted state.
tr_idf_model_2 = TfidfVectorizer()
tf_idf_vector_2 = tr_idf_model_2.fit_transform(corpus_2)

# Convert the transformed result into a dense array for display.
tf_idf_array_2 = tf_idf_vector_2.toarray()
tf_idf_array_2

array([[0.26726124, 0.26726124, 0.26726124, 0.26726124, 0.26726124,
        0.80178373]])