# Write a program to extract features – TF, TF-IDF score from text

TF-IDF (Term Frequency–Inverse Document Frequency) is a statistical measure used in natural language processing and information retrieval to evaluate how important a word is to a document relative to a larger collection of documents (the corpus).

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: each string is one document and becomes one row of the TF-IDF
# matrix computed from it.
# NOTE(review): the pronoun "i" does not appear among the feature names in the
# printed output; presumably TfidfVectorizer's default tokenization drops
# single-character tokens -- confirm against the scikit-learn documentation.
documents = [
    "i am henry.",
    "i like college.",
    "do henry like college?",
    "i am do i like college?",
    "i do like henry.",
    "do i like henry?",
]

In [3]:
# Learn the vocabulary and IDF weights from `documents`, then encode every
# document as one row of a sparse TF-IDF matrix (one column per term).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()
# Maps 0-based document index -> {term: TF-IDF score} for the terms that have
# a non-zero score in that document.
tfidf_values = {}

print("=== TF-IDF Matrix ===")
# Header row. The widths are hand-tuned to the array rows printed below: four
# leading spaces cover the "D1 [" prefix, and every column is 11 characters
# wide (a 10-character number plus one separating space) except the last one.
# NOTE(review): this alignment assumes single-digit document numbers and terms
# of at most 10 characters; longer values shift the header out of line.
last_column = len(feature_names) - 1
header_cells = [
    f"{feature:{10 if column == last_column else 11}}"
    for column, feature in enumerate(feature_names)
]
print(f"{'':4}" + "".join(header_cells))

# Walk the sparse matrix row by row exactly once: each row is densified a
# single time, printed, and reused to look up the non-zero scores, instead of
# re-slicing the matrix and doing one sparse lookup per term.
for doc_index, sparse_row in enumerate(tfidf_matrix):
    dense_row = sparse_row.toarray()[0]
    print(f"D{doc_index + 1} {dense_row}")
    # Column indices of the stored non-zero entries, in the matrix's own
    # storage order (not necessarily alphabetical); the dict keeps that order.
    term_columns = sparse_row.nonzero()[1]
    tfidf_values[doc_index] = {
        feature_names[column]: dense_row[column] for column in term_columns
    }
print()

# Per-document listing of every term with a non-zero score.
for doc_index, values in tfidf_values.items():
    print(f"Document {doc_index + 1}:")
    for word, tfidf_value in values.items():
        print(f"{word}: {tfidf_value}")
    # "\n" plus print's own newline leaves two blank lines between documents.
    print("\n")

=== TF-IDF Matrix ===
    am         college    do         henry      like      
D1 [0.81019752 0.         0.         0.58615696 0.        ]
D2 [0.         0.80383327 0.         0.         0.59485466]
D3 [0.         0.57579095 0.4934091  0.4934091  0.42609823]
D4 [0.61703105 0.52094001 0.44640601 0.         0.38550729]
D5 [0.         0.         0.60348696 0.60348696 0.52115926]
D6 [0.         0.         0.60348696 0.60348696 0.52115926]

Document 1:
am: 0.8101975203608325
henry: 0.5861569567966913


Document 2:
like: 0.5948546604855911
college: 0.8038332743166161


Document 3:
henry: 0.49340910033240876
like: 0.4260982255952437
college: 0.5757909530054383
do: 0.49340910033240876


Document 4:
am: 0.6170310457438672
like: 0.3855072948390351
college: 0.5209400071446677
do: 0.4464060070946989


Document 5:
henry: 0.6034869604104566
like: 0.5211592628257661
do: 0.6034869604104566


Document 6:
henry: 0.6034869604104566
like: 0.5211592628257661
do: 0.6034869604104566


