In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Toy corpus: five short, overlapping sentences, so several terms are shared
# across documents while others occur in just one.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A dog is lazy, but a fox is quick.",
    "The new movie features a quick-witted fox.",
    "Jumping is fun, but a lazy dog likes to sleep.",
    "The quick brown dog is not lazy.",
]

# Report the corpus size up front (the trailing "\n" leaves a blank line).
print(f"Total documents (corpus size): {len(corpus)}\n")

# Build a TF-IDF vectorizer configured with scikit-learn's built-in English
# stop-word list, then learn the vocabulary and vectorize the corpus in one
# fit/transform pass. The result is a sparse document-term matrix.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)

# Terms learned during fitting; used below as the column labels.
feature_names = vectorizer.get_feature_names_out()

# Human-readable row labels ("Document 1", "Document 2", ...) in corpus order.
document_labels = [f'Document {i+1}' for i, _ in enumerate(corpus)]

# Densify the sparse matrix so it can be shown as a labelled DataFrame
# (rows = documents, columns = terms).
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=document_labels,
    columns=feature_names,
)

# Show the per-term IDF weights the fitted vectorizer exposes via `idf_`.
print("\n--- Inverse Document Frequency (IDF) per Term ---")
idf_values = pd.DataFrame({'Term': feature_names, 'IDF': vectorizer.idf_})
print(idf_values)

# Show the full TF-IDF matrix, rounded to three decimals for readability.
print("\n--- TF-IDF Feature Matrix (Document-Term Matrix) ---")
print(tfidf_df.round(3))

# Example: finding the highest-scoring TF-IDF terms for a single document.
# Change `doc_index` (0-based row position) or `TOP_N` to inspect another
# document or a different number of terms; the header below follows along
# instead of being hard-coded to "Document 1" / "3".
doc_index = 0
TOP_N = 3

# Rank the row once and reuse it for both the terms and their scores.
top_terms = tfidf_df.iloc[doc_index].nlargest(TOP_N)
top_indices = top_terms.index.tolist()
top_scores = top_terms.values

# Row label of the selected document, taken from the DataFrame index.
doc_label = tfidf_df.index[doc_index]

print(f"\n--- Top {TOP_N} Key Features in {doc_label} ---")
for word, score in zip(top_indices, top_scores):
    print(f" - '{word}': {score:.3f}")


Total documents (corpus size): 5


--- Inverse Document Frequency (IDF) per Term ---
        Term       IDF
0      brown  1.693147
1        dog  1.182322
2   features  2.098612
3        fox  1.405465
4        fun  2.098612
5    jumping  2.098612
6      jumps  2.098612
7       lazy  1.182322
8      likes  2.098612
9      movie  2.098612
10       new  2.098612
11     quick  1.182322
12     sleep  2.098612
13    witted  2.098612

--- TF-IDF Feature Matrix (Document-Term Matrix) ---
            brown    dog  features    fox    fun  jumping  jumps   lazy  \
Document 1  0.462  0.323     0.000  0.383  0.000    0.000  0.572  0.323   
Document 2  0.000  0.476     0.000  0.566  0.000    0.000  0.000  0.476   
Document 3  0.000  0.000     0.458  0.307  0.000    0.000  0.000  0.000   
Document 4  0.000  0.262     0.000  0.000  0.464    0.464  0.000  0.262   
Document 5  0.637  0.445     0.000  0.000  0.000    0.000  0.000  0.445   

            likes  movie    new  quick  sleep  witted  
Document 