In [24]:
import math
from collections import defaultdict

def compute_tf(doc):
    """Return the relative frequency of every token in ``doc``.

    Args:
        doc: A sized sequence of hashable tokens (for example a list of
            words).

    Returns:
        A ``defaultdict(float)`` that maps each distinct token, in order of
        first appearance, to its occurrence count divided by ``len(doc)``.
        An empty ``doc`` produces an empty mapping.
    """
    # First pass: raw occurrence counts (stored as floats).
    counts = defaultdict(float)
    for token in doc:
        counts[token] = counts[token] + 1

    # Second pass: scale every count by the document length.
    doc_length = len(doc)
    return defaultdict(
        float,
        ((token, count / doc_length) for token, count in counts.items()),
    )

def compute_idf(corpus, vocab):
    """Compute an inverse document frequency for every word in ``vocab``.

    The weight of a word is ``log(N / (1 + df))``, where ``N`` is the number
    of documents in ``corpus`` and ``df`` is the number of documents that
    contain the word.

    Note:
        Only the denominator is smoothed. As a consequence a word found in
        ``N - 1`` documents gets a weight of exactly 0, and a word found in
        all ``N`` documents gets a negative weight.

    Args:
        corpus: A sized collection of documents, each an iterable of hashable
            tokens (documents must already be tokenized).
        vocab: An iterable of words to compute the IDF for.

    Returns:
        A ``dict`` mapping each word of ``vocab`` to its IDF value.

    Raises:
        ValueError: Raised by ``math.log`` when ``corpus`` is empty and
            ``vocab`` is not, because the argument of the logarithm is 0.
    """
    N = len(corpus)
    # Convert every document to a set once, up front, so each membership test
    # below is a hash lookup instead of a linear scan of the token list.
    doc_token_sets = [set(doc) for doc in corpus]
    idf = {}
    for word in vocab:
        containing_docs = sum(1 for tokens in doc_token_sets if word in tokens)
        idf[word] = math.log(N / (1 + containing_docs))  # add-one in the denominator only
    return idf

def compute_tfidf(tf, idf, vocab):
    """Build the TF-IDF vector of one document, aligned with ``vocab``.

    Args:
        tf: Mapping from word to term frequency for a single document.
        idf: Mapping from word to inverse document frequency.
        vocab: Sequence of words; position ``k`` of the result corresponds to
            ``vocab[k]``.

    Returns:
        A list of ``len(vocab)`` floats. Entry ``k`` is ``tf[w] * idf[w]`` for
        ``w = vocab[k]`` when ``w`` occurs in ``tf`` and ``0.0`` otherwise.

    Raises:
        KeyError: If a word in ``tf`` is missing from ``vocab`` or ``idf``.
    """
    # Start from float zeros so the vector has a single element type; an
    # integer 0 here would leave ints mixed with the float products.
    tfidf = [0.0] * len(vocab)
    word_to_index = {word: idx for idx, word in enumerate(vocab)}
    for word, value in tf.items():
        tfidf[word_to_index[word]] = value * idf[word]
    return tfidf

# Toy corpus: three short sentences
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are friends"
]

# Lower-case each sentence and split it on whitespace
tokenized_docs = [text.lower().split() for text in corpus]

# Vocabulary: alphabetically sorted union of all tokens
vocab = sorted(set().union(*tokenized_docs))

# Term frequencies, one mapping per document
tf_list = list(map(compute_tf, tokenized_docs))

# Inverse document frequencies over the whole corpus
idf = compute_idf(tokenized_docs, vocab)

# One TF-IDF vector per document, aligned with `vocab`
tfidf_vectors = [compute_tfidf(doc_tf, idf, vocab) for doc_tf in tf_list]

# Show the vocabulary, then each document's vector under a 1-based heading
print("Vocabulary:", vocab)
for i, vec in enumerate(tfidf_vectors):
    print(f"Document {i+1} TF-IDF:", vec, sep="\n")


Vocabulary: ['and', 'are', 'cat', 'cats', 'dog', 'dogs', 'friends', 'log', 'mat', 'on', 'sat', 'the']
Document 1 TF-IDF:
[0, 0, 0.06757751801802739, 0, 0, 0, 0, 0, 0.06757751801802739, 0.0, 0.0, 0.0]
Document 2 TF-IDF:
[0, 0, 0, 0, 0.06757751801802739, 0, 0, 0.06757751801802739, 0, 0.0, 0.0, 0.0]
Document 3 TF-IDF:
[0.08109302162163289, 0.08109302162163289, 0, 0.08109302162163289, 0, 0.08109302162163289, 0.08109302162163289, 0, 0, 0, 0, 0]


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer constructed with no arguments, i.e. scikit-learn's defaults
vectorizer = TfidfVectorizer()

# Toy corpus: three short sentences
# NOTE(review): `corpus` and `vocab` are also assigned by the hand-rolled
# TF-IDF cell, so running both cells rebinds these names.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are friends"
]

# Fit on the corpus and compute its TF-IDF matrix in one step
tfidf_matrix = vectorizer.fit_transform(corpus)

# Dense array form of the matrix, for printing
tfidf_array = tfidf_matrix.toarray()

# Feature names reported by the fitted vectorizer
vocab = vectorizer.get_feature_names_out()

# Show the vocabulary and the TF-IDF matrix
print("Vocabulary:", vocab)
print("TF-IDF Vectors:", tfidf_array, sep="\n")


Vocabulary: ['and' 'are' 'cat' 'cats' 'dog' 'dogs' 'friends' 'log' 'mat' 'on' 'sat'
 'the']
TF-IDF Vectors:
[[0.         0.         0.42755362 0.         0.         0.
  0.         0.         0.42755362 0.32516555 0.32516555 0.6503311 ]
 [0.         0.         0.         0.         0.42755362 0.
  0.         0.42755362 0.         0.32516555 0.32516555 0.6503311 ]
 [0.4472136  0.4472136  0.         0.4472136  0.         0.4472136
  0.4472136  0.         0.         0.         0.         0.        ]]
