In [1]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# Folder containing the text files to vectorise (relative to the working directory)
folder_path = "DM-9-TXTs"

# Collect the .txt file names in a deterministic order.
# os.listdir() makes no ordering guarantee, so the listing is sorted to keep the
# row order of the TF-IDF matrix -- and every result derived from it, including
# seeded clustering -- stable across runs and platforms.
file_names = sorted(
    entry for entry in os.listdir(folder_path) if entry.endswith(".txt")
)

# Read each file's full contents; documents[i] is the text of file_names[i]
documents = []
for file_name in file_names:
    with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as file:
        documents.append(file.read())

# Fail early with a message that names the folder, instead of letting the
# vectorizer raise a less specific error on an empty corpus.
if not documents:
    raise ValueError(f"No .txt files found in {folder_path!r}")

# Compute TF-IDF: one matrix row per document, one column per vocabulary term
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (vocabulary); position j labels column j of the matrix
feature_names = vectorizer.get_feature_names_out()

# Print TF-IDF vectors, one section per file
for i, file_name in enumerate(file_names):
    print(f"TF-IDF for {file_name}:")
    # Row i belongs to file_names[i] because both were built in the same order
    vector = tfidf_matrix[i]
    # Walk the entries stored for this row: column index paired with its weight
    for index, value in zip(vector.indices, vector.data):
        print(f"  {feature_names[index]}: {value}")
    print()


TF-IDF for 1.txt:
  the: 0.4110606835074282
  forest: 0.21081809191887804
  whispers: 0.21081809191887804
  with: 0.13939899588883134
  rustling: 0.21081809191887804
  of: 0.18716944898980267
  leaves: 0.21081809191887804
  and: 0.10276517087685705
  chirping: 0.21081809191887804
  birds: 0.21081809191887804
  morning: 0.21081809191887804
  sunlight: 0.21081809191887804
  filters: 0.21081809191887804
  through: 0.1792146384673949
  trees: 0.21081809191887804
  painting: 0.21081809191887804
  golden: 0.21081809191887804
  paths: 0.21081809191887804
  life: 0.12518817794638443
  thrives: 0.21081809191887804
  in: 0.13939899588883134
  harmony: 0.21081809191887804
  nature: 0.1792146384673949

TF-IDF for 10.txt:
  the: 0.10107374319898528
  of: 0.1840887983785881
  life: 0.12312768656283402
  in: 0.13710460647749625
  nature: 0.1762649173002227
  inspires: 0.1762649173002227
  technology: 0.2073482046737627
  to: 0.2073482046737627
  solve: 0.2073482046737627
  modern: 0.2073482046737627


In [4]:
from sklearn.cluster import KMeans

In [5]:
# Number of groups to partition the documents into
num_clusters = 3

# Fit k-means on the TF-IDF rows; random_state is fixed so reruns use the same seed
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(tfidf_matrix)

# Cluster label assigned to each row; indexed in parallel with file_names below
clusters = kmeans.labels_

# List the member files of every cluster under a 1-based heading
for cluster_num in range(num_clusters):
    print(f"Cluster {cluster_num + 1}:")
    # Positions of the documents whose label equals the current cluster
    member_positions = (
        pos for pos, label in enumerate(clusters) if label == cluster_num
    )
    for pos in member_positions:
        print(f"  {file_names[pos]}")
    print()

Cluster 1:
  1.txt
  2.txt
  3.txt
  4.txt
  7.txt
  9.txt

Cluster 2:
  6.txt

Cluster 3:
  10.txt
  5.txt
  8.txt

