In [35]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Sample documents: a tiny three-sentence corpus passed to both the manual
# TF-IDF functions and the scikit-learn vectorizers.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]


In [36]:
# --- Manual TF-IDF ---
def compute_tf(doc):
    """Return the relative term frequency of every word in ``doc``.

    The document is lower-cased and split on whitespace; each distinct token
    is mapped to ``occurrences / total number of tokens``.

    Args:
        doc: The document text as a single string.

    Returns:
        dict mapping each token to its frequency as a float. Keys appear in
        order of first occurrence. A document with no tokens yields ``{}``.
    """
    tokens = doc.lower().split()
    n_tokens = len(tokens)

    # First pass: raw occurrence counts.
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # Second pass: normalise by document length. With no tokens, ``counts``
    # is empty and the division is never evaluated.
    return {token: count / n_tokens for token, count in counts.items()}

In [37]:

def compute_idf(corpus):
    """Compute an inverse-document-frequency weight for every term in ``corpus``.

    Each document is lower-cased and split on whitespace. For a term that
    occurs in ``df`` of the ``N`` documents, the weight is
    ``log(N / (1 + df)) + 1`` (natural logarithm).

    Note: this differs from the smoothed formula scikit-learn documents for
    ``TfidfVectorizer(smooth_idf=True)``, ``log((1 + N) / (1 + df)) + 1``,
    so the two sets of weights are not expected to coincide.

    Args:
        corpus: Sequence of document strings.

    Returns:
        dict mapping each distinct term to its IDF weight as a float. Keys
        are inserted in sorted order so the result is reproducible across
        runs. An empty corpus yields ``{}``.
    """
    # Tokenise every document exactly once; a set per document makes the
    # "does this document contain the term?" check O(1).
    doc_token_sets = [set(doc.lower().split()) for doc in corpus]
    N = len(doc_token_sets)

    # Vocabulary = union of all per-document token sets (empty for no docs).
    all_words = set().union(*doc_token_sets)

    idf = {}
    for word in sorted(all_words):
        containing_docs = sum(1 for tokens in doc_token_sets if word in tokens)
        idf[word] = math.log(N / (1 + containing_docs)) + 1  # Smoothed IDF
    return idf

In [38]:
def compute_tfidf(corpus):
    """Compute TF-IDF weights for every document in ``corpus``.

    The IDF table is built once from the whole corpus via ``compute_idf``;
    each document's term frequencies from ``compute_tf`` are then multiplied
    by the matching IDF weight.

    Args:
        corpus: Sequence of document strings.

    Returns:
        A ``(tfidf_docs, idf)`` tuple where ``tfidf_docs`` is a list with one
        ``{term: tf * idf}`` dict per document (in corpus order) and ``idf``
        is the corpus-wide ``{term: idf}`` dict.
    """
    idf_weights = compute_idf(corpus)
    weighted_docs = [
        {term: freq * idf_weights[term] for term, freq in compute_tf(doc).items()}
        for doc in corpus
    ]
    return weighted_docs, idf_weights

In [39]:
# Run the manual pipeline on the sample corpus: one TF-IDF dict per document,
# plus the corpus-wide IDF table returned alongside it.
manual_tfidf, manual_idf = compute_tfidf(corpus)

In [40]:

# scikit-learn reference: raw term counts (CountVectorizer) and TF-IDF
# weights (TfidfVectorizer), both fitted on the same sample corpus with
# default settings.
# NOTE: these numbers are not directly comparable with the manual TF-IDF.
# The printed vocabulary has no single-character token 'a', and each
# TfidfVectorizer row shown below has unit L2 norm, whereas the manual
# version keeps 'a' and does not normalise.
count_vec = CountVectorizer()
X_count = count_vec.fit_transform(corpus)
count_words = count_vec.get_feature_names_out()

tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(corpus)
tfidf_words = tfidf_vec.get_feature_names_out()

In [41]:
# Display results: only the first document (index 0) of the manual TF-IDF
# output is printed, not the whole corpus.
print("Manual TF-IDF (corpus):")
print(manual_tfidf[0])


Manual TF-IDF (corpus):
{'the': 0.14246358550964383, 'sun': 0.2, 'is': 0.2, 'a': 0.2, 'star': 0.2810930216216329}


In [42]:

# Pair each vocabulary term with its count in the first document (row 0 of
# the densified count matrix).
print("\nCountVectorizer (corpus):")
print(dict(zip(count_words, X_count.toarray()[0])))



CountVectorizer (corpus):
{'and': 0, 'are': 0, 'bodies': 0, 'celestial': 0, 'is': 1, 'moon': 0, 'satellite': 0, 'star': 1, 'sun': 1, 'the': 1}


In [43]:
# Pair each vocabulary term with its TF-IDF weight in the first document
# (row 0 of the densified TF-IDF matrix).
print("\nTfidfVectorizer (corpus):")
print(dict(zip(tfidf_words, X_tfidf.toarray()[0])))



TfidfVectorizer (corpus):
{'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.4804583972923858, 'moon': 0.0, 'satellite': 0.0, 'star': 0.6317450542765208, 'sun': 0.4804583972923858, 'the': 0.3731188059313277}
