In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short documents held in a single 'text' column
df = pd.DataFrame(
    [
        "Jim and Pam traveled by bus.",
        "The train was not on time.",
        "The flight was full. Traveling by flight is expensive.",
    ],
    columns=['text'],
)

# Echo the corpus, then a 100-character separator rule
print("Input DataFrame:", df, '-' * 100, sep='\n')

# Initialize TfidfVectorizer with key features
vectorizer = TfidfVectorizer(
    # ngram_range=(1, 2),            # Disabled: would emit unigrams and bigrams (no trigrams); unigrams only without it
    stop_words='english',            # Remove common English stop words
    max_features=20,                 # Keep at most the 20 terms with the highest term frequency across the corpus
    norm='l2',                       # Apply L2 normalization to each document vector
    use_idf=True,                    # Use inverse document frequency
    smooth_idf=True,                 # Smooth IDF weights by adding 1 to document frequencies
    sublinear_tf=True                # Apply sublinear scaling (1 + log(tf)) to term frequency
)

# Learn the vocabulary from the corpus and vectorize it in one pass
X = vectorizer.fit_transform(df['text'])

# Vocabulary terms; they double as the column labels of the matrix below
feature_names = vectorizer.get_feature_names_out()
print("Feature Names (Vocabulary):", feature_names, '-' * 100, sep='\n')

# Densify the matrix into a labelled DataFrame so it prints readably
tfidf_df = pd.DataFrame(data=X.toarray(), columns=feature_names)
print("TF-IDF Matrix:", tfidf_df, '-' * 100, sep='\n')

# Score an unseen sentence with the already-fitted vectorizer (no refit)
new_text = ["Traveling by train is more convenient than flight and it is not expensive but it takes time."]
test_vector = vectorizer.transform(new_text)

# Label the resulting row with the fitted vocabulary before printing
test_tfidf_df = pd.DataFrame(data=test_vector.toarray(), columns=feature_names)
print("TF-IDF for New Text:", test_tfidf_df, sep='\n')


Input DataFrame:
                                                text
0                       Jim and Pam traveled by bus.
1                         The train was not on time.
2  The flight was full. Traveling by flight is ex...
----------------------------------------------------------------------------------------------------
Feature Names (Vocabulary):
['bus' 'expensive' 'flight' 'jim' 'pam' 'time' 'train' 'traveled'
 'traveling']
----------------------------------------------------------------------------------------------------
TF-IDF Matrix:
   bus  expensive    flight  jim  pam      time     train  traveled  traveling
0  0.5   0.000000  0.000000  0.5  0.5  0.000000  0.000000       0.5   0.000000
1  0.0   0.000000  0.000000  0.0  0.0  0.707107  0.707107       0.0   0.000000
2  0.0   0.453295  0.767495  0.0  0.0  0.000000  0.000000       0.0   0.453295
----------------------------------------------------------------------------------------------------
TF-IDF for New Text:
   bus  