In [30]:
# Lint notebook: load the jupyter_black IPython extension
%load_ext jupyter_black

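For more examples and intuition, see this [Kaggle notebook that builds a TF-IDF model from scratch](https://www.kaggle.com/code/yassinehamdaoui1/creating-tf-idf-model-from-scratch), Capital One's [Understanding TF-IDF](https://www.capitalone.com/tech/machine-learning/understanding-tf-idf/) article, and the [TF-IDF with scikit-learn](https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/03-TF-IDF-Scikit-Learn.html) chapter of Melanie Walsh's *Intro to Cultural Analytics*.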

In [55]:
import math

# Tokenised documents: each quote is written as a plain sentence and split on
# whitespace into its individual words (case and repeated words are preserved)
document_a = (
    "Hold fast to dreams for if dreams die life is a broken-winged bird "
    "that cannot fly"
).split()
document_b = "No bird soars too high if he soars with his own wings".split()

# Vocabulary: every distinct word that occurs in either document
total_corpus = set(document_a) | set(document_b)

In [60]:
# Per-document word counts over the shared vocabulary (0 when a word is absent)
import pandas as pd

dict_a = {term: document_a.count(term) for term in total_corpus}
dict_b = {term: document_b.count(term) for term in total_corpus}

# One row per document, one column per word; transposed for display
frequency = pd.DataFrame([dict_a, dict_b])
frequency.T

Unnamed: 0,0,1
a,1,0
dreams,2,0
No,0,1
Hold,1,0
for,1,0
die,1,0
he,0,1
fast,1,0
his,0,1
life,1,0


In [57]:
def tf(doc_dict: dict[str, int], doc_elements: list[str]) -> dict[str, float]:
    """Return the term frequency of every word in a single document.

    Term frequency is the number of times a word occurs in the document
    divided by the total number of words in that document.

    Args:
        doc_dict: Mapping of word -> raw count for this document.
        doc_elements: The document's tokens; only the length is used, as the
            denominator.

    Returns:
        Mapping of word -> count / len(doc_elements). For an empty document
        every word maps to 0.0 instead of raising ZeroDivisionError.
    """
    total_words = len(doc_elements)
    if total_words == 0:
        # No tokens means no word can have a non-zero frequency
        return dict.fromkeys(doc_dict, 0.0)

    return {word: count / total_words for word, count in doc_dict.items()}


def idf(doc_list: list[dict[str, int]]) -> dict[str, float]:
    """Return the smoothed inverse document frequency of every word in the corpus.

    For N documents and a word that occurs in ``df`` of them, the value is
    ``log10((N + 1) / (df + 1))``. Adding 1 to both counts keeps the ratio
    finite when ``df`` is 0; a word that occurs in every document scores 0.

    Args:
        doc_list: One word -> count mapping per document. The mappings do not
            need identical keys; a word missing from a document is treated as
            a count of 0 for that document.

    Returns:
        Mapping of word -> IDF covering every word seen in any document, in
        first-seen order. An empty ``doc_list`` yields an empty dict.
    """
    n_docs = len(doc_list)

    # Gather the vocabulary from every document rather than only the first one,
    # so words that appear solely in later documents are not dropped
    vocabulary = dict.fromkeys(word for doc in doc_list for word in doc)

    idf_dict = {}
    for word in vocabulary:
        # Document frequency: number of documents with a positive count
        doc_freq = sum(doc.get(word, 0) > 0 for doc in doc_list)
        idf_dict[word] = math.log10((n_docs + 1.0) / (doc_freq + 1.0))

    return idf_dict

# Corpus-wide IDF for every word, derived from both documents' count tables
idfs = idf(doc_list=[dict_a, dict_b])


def tfidf(doc_elements: dict[str, float], idfs: dict[str, float]) -> dict[str, float]:
    """Return the TF-IDF weight of every word in a single document.

    Args:
        doc_elements: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency for the corpus.
            Must contain every word present in ``doc_elements``.

    Returns:
        Mapping of word -> term frequency * inverse document frequency.

    Raises:
        KeyError: If a word in ``doc_elements`` has no entry in ``idfs``.
    """
    return {word: freq * idfs[word] for word, freq in doc_elements.items()}

# Term frequencies, one table per document
tf_a, tf_b = tf(dict_a, document_a), tf(dict_b, document_b)

# Scale each document's term frequencies by the corpus-wide IDF to get TF-IDF
tfidf_a, tfidf_b = (tfidf(term_freqs, idfs) for term_freqs in (tf_a, tf_b))

# Weight of each word in each document relative to the whole corpus
# (one row per document, transposed for display)
document_tfidf = pd.DataFrame([tfidf_a, tfidf_b])
document_tfidf.T

Unnamed: 0,0,1
a,0.018814,0.0
dreams,0.037629,0.0
No,0.0,0.025086
Hold,0.018814,0.0
for,0.018814,0.0
die,0.018814,0.0
he,0.0,0.025086
fast,0.018814,0.0
his,0.0,0.025086
life,0.018814,0.0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The same two quotes as raw sentences, passed to TfidfVectorizer untokenised
corpus = [
    "Hold fast to dreams, for if dreams die, life is a broken-winged bird that cannot fly.",
    "No bird soars too high if he soars with his own wings.",
]

# Row labels for the resulting table, in the same order as `corpus`
text_titles = ["quote_langstonhughes", "quote_william_blake"]

vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(corpus)

# One row per document, one column per term learned by the vectorizer.
# (A previous bare `dict(zip(...))` expression here was evaluated and thrown
# away because it was not the last line of the cell, so it has been dropped.)
tfidf_df = pd.DataFrame(
    vector.toarray(),
    index=text_titles,
    columns=vectorizer.get_feature_names_out(),
)

In [8]:
# Count, per term, how many documents give it a non-zero TF-IDF weight.
# The summary row is excluded before counting so that re-running this cell
# does not treat a previously added "00_Document Frequency" row as a document.
doc_rows = tfidf_df.drop(index="00_Document Frequency", errors="ignore")
tfidf_df.loc["00_Document Frequency"] = (doc_rows > 0).sum()

In [59]:
# Show terms as rows and the table's original rows as columns
tfidf_df.transpose()

Unnamed: 0,dreams_langstonhughes,quote_william_blake,00_Document Frequency
bird,0.172503,0.197242,2.0
broken,0.242447,0.0,1.0
cannot,0.242447,0.0,1.0
die,0.242447,0.0,1.0
dreams,0.484893,0.0,1.0
fast,0.242447,0.0,1.0
fly,0.242447,0.0,1.0
for,0.242447,0.0,1.0
he,0.0,0.277217,1.0
high,0.0,0.277217,1.0
