<a href="https://colab.research.google.com/github/michaelwnau/ai_academy_notebooks/blob/main/TF_IDF_Calculations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import math
import re

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Toy corpus: four short documents, one sentence each.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [7]:
# Term frequency (TF): count how often each token occurs in every document and
# across the whole corpus.
#
# Tokenization follows TfidfVectorizer's defaults (lowercase the text, then keep
# runs of two or more word characters) so the hand-computed scores stay
# comparable with scikit-learn's on any corpus. Stripping only '.'/'?' from the
# ends of a sentence and splitting on single spaces would leave internal
# punctuation glued to words and turn repeated spaces into empty-string tokens.
TOKEN_PATTERN = r"(?u)\b\w\w+\b"

word_freq = {}       # token -> number of occurrences over the whole corpus
sent_word_freq = []  # one {token: count} dict per document, in corpus order

for sentence in corpus:
    sent_freq = {}  # token -> number of occurrences in this document only
    for word in re.findall(TOKEN_PATTERN, sentence.lower()):
        sent_freq[word] = sent_freq.get(word, 0) + 1
        word_freq[word] = word_freq.get(word, 0) + 1
    sent_word_freq.append(sent_freq)

print(word_freq)
print(sent_word_freq)


{'this': 4, 'is': 4, 'the': 4, 'first': 2, 'document': 4, 'second': 1, 'and': 1, 'third': 1, 'one': 1}
[{'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}, {'this': 1, 'document': 2, 'is': 1, 'the': 1, 'second': 1}, {'and': 1, 'this': 1, 'is': 1, 'the': 1, 'third': 1, 'one': 1}, {'is': 1, 'this': 1, 'the': 1, 'first': 1, 'document': 1}]


In [8]:
# Inverse document frequency (IDF), smoothed:
#     idf(t) = ln((1 + N) / (1 + df(t))) + 1
# where N is the number of documents and df(t) is how many documents contain t.
total_documents = len(corpus)

inv_doc_freq = {}  # term -> smoothed IDF
for term in word_freq:
    # Document frequency: a term counts once per document it appears in.
    docs_with_term = sum(1 for doc_counts in sent_word_freq if term in doc_counts)
    inv_doc_freq[term] = math.log((1 + total_documents) / (docs_with_term + 1)) + 1

print(inv_doc_freq)

{'this': 1.0, 'is': 1.0, 'the': 1.0, 'first': 1.5108256237659907, 'document': 1.2231435513142097, 'second': 1.916290731874155, 'and': 1.916290731874155, 'third': 1.916290731874155, 'one': 1.916290731874155}


In [9]:
# Raw (un-normalized) TF-IDF: each term's count in a document multiplied by that
# term's IDF. The result holds one {term: score} dict per document.
un_norm_sent_tfidf = [
    {term: float(count) * inv_doc_freq[term] for term, count in doc_counts.items()}
    for doc_counts in sent_word_freq
]

print(un_norm_sent_tfidf)

[{'this': 1.0, 'is': 1.0, 'the': 1.0, 'first': 1.5108256237659907, 'document': 1.2231435513142097}, {'this': 1.0, 'document': 2.4462871026284194, 'is': 1.0, 'the': 1.0, 'second': 1.916290731874155}, {'and': 1.916290731874155, 'this': 1.0, 'is': 1.0, 'the': 1.0, 'third': 1.916290731874155, 'one': 1.916290731874155}, {'is': 1.0, 'this': 1.0, 'the': 1.0, 'first': 1.5108256237659907, 'document': 1.2231435513142097}]


In [10]:
# Normalize the TF-IDF scores: scale every document's raw score vector to unit
# Euclidean (L2) length.

# Pass 1: the L2 norm of each document's raw score vector.
sentence_SS = []
for raw_scores in un_norm_sent_tfidf:
    sum_of_squares = 0
    for score in raw_scores.values():
        sum_of_squares += score**2
    sentence_SS.append(math.sqrt(sum_of_squares))

# Pass 2: divide each score by the norm of the document it belongs to.
normalized_sent_tfidf = [
    {term: score / norm for term, score in raw_scores.items()}
    for raw_scores, norm in zip(un_norm_sent_tfidf, sentence_SS)
]

print(normalized_sent_tfidf)

[{'this': 0.38408524091481483, 'is': 0.38408524091481483, 'the': 0.38408524091481483, 'first': 0.5802858236844359, 'document': 0.46979138557992045}, {'this': 0.281088674033753, 'document': 0.6876235979836938, 'is': 0.281088674033753, 'the': 0.281088674033753, 'second': 0.5386476208856763}, {'and': 0.511848512707169, 'this': 0.267103787642168, 'is': 0.267103787642168, 'the': 0.267103787642168, 'third': 0.511848512707169, 'one': 0.511848512707169}, {'is': 0.38408524091481483, 'this': 0.38408524091481483, 'the': 0.38408524091481483, 'first': 0.5802858236844359, 'document': 0.46979138557992045}]


In [11]:
# The same TF-IDF computed with scikit-learn.
# TfidfVectorizer's defaults (smooth_idf=True, norm="l2") already apply the
# smoothed IDF and L2 normalization, so the matrix printed here IS normalized
# and should equal the hand-computed normalized scores. The defaults are spelled
# out to make that explicit.
vectorizer = TfidfVectorizer(smooth_idf=True, norm="l2")
X = vectorizer.fit_transform(corpus)
print(X.shape)
np.set_printoptions(precision=4)  # display-only setting; stays in effect for later cells
print(X.toarray())

# Sanity check: rebuild the manual result as a dense matrix in the vectorizer's
# column order (terms missing from a document score 0) and compare.
manual_tfidf = np.array([
    [doc_scores.get(term, 0.0) for term in vectorizer.get_feature_names_out()]
    for doc_scores in normalized_sent_tfidf
])
np.testing.assert_allclose(X.toarray(), manual_tfidf)

(4, 9)
[[0.     0.4698 0.5803 0.3841 0.     0.     0.3841 0.     0.3841]
 [0.     0.6876 0.     0.2811 0.     0.5386 0.2811 0.     0.2811]
 [0.5118 0.     0.     0.2671 0.5118 0.     0.2671 0.5118 0.2671]
 [0.     0.4698 0.5803 0.3841 0.     0.     0.3841 0.     0.3841]]


In [12]:
# Show the vocabulary terms the fitted vectorizer extracted from the corpus.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [13]:
# Pair each vocabulary term with its score in the first document's TF-IDF row.
first_doc_index = 0
print(corpus[first_doc_index])
feature_names = vectorizer.get_feature_names_out()
dict(zip(feature_names, X.toarray()[first_doc_index]))

This is the first document.


{'and': 0.0,
 'document': 0.46979138557992045,
 'first': 0.5802858236844359,
 'is': 0.38408524091481483,
 'one': 0.0,
 'second': 0.0,
 'the': 0.38408524091481483,
 'third': 0.0,
 'this': 0.38408524091481483}