In [1]:
import os
import nltk
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure the necessary NLTK resources are downloaded.
# 'stopwords' backs nltk.corpus.stopwords; nltk.word_tokenize (used by
# preprocess_text) additionally needs the Punkt sentence-tokenizer data.
# Older NLTK releases ship it as 'punkt', newer ones as 'punkt_tab', so both
# are requested. nltk.download() reports an unavailable package and returns
# False rather than raising, so the extra request is harmless.
# NOTE(review): trim to the single name matching the pinned NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Lazily-filled cache of the English stop-word set, shared by every call to
# preprocess_text so the corpus file is read only once per process.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word list as a set, loading it once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Function to preprocess text
def preprocess_text(text):
    """Normalize raw text into a space-separated string of content tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. A token
    is kept only if it is purely alphanumeric (``str.isalnum``) and is not an
    English stop word, so punctuation tokens and tokens containing
    non-alphanumeric characters are dropped.

    Args:
        text: The raw document text.

    Returns:
        The surviving tokens joined by single spaces (an empty string when
        nothing survives).
    """
    stop_words = _english_stop_words()
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        word for word in tokens
        if word.isalnum() and word not in stop_words
    )

# Function to implement single-pass clustering
def single_pass_clustering(documents, threshold=0.2):
    """Cluster documents with a single-pass (incremental) algorithm.

    The documents are vectorized with TF-IDF and then visited exactly once,
    in input order. Each document is compared (cosine similarity) with the
    centroid of every existing cluster, where a centroid is the mean TF-IDF
    vector of the cluster's members. The document joins the most similar
    cluster if that similarity is strictly greater than ``threshold``;
    otherwise it starts a new cluster. Ties go to the earliest cluster.
    Because assignment is greedy, the result depends on document order.

    Args:
        documents: Iterable of document strings (already preprocessed).
        threshold: Minimum (exclusive) cosine similarity to a centroid for a
            document to join that cluster.

    Returns:
        A list of clusters, each a list of indices into ``documents`` in the
        order they were assigned. An empty input yields an empty list.
    """
    documents = list(documents)
    if not documents:
        # TfidfVectorizer cannot be fitted on zero documents; there is
        # simply nothing to cluster.
        return []

    # Vectorize the documents using TF-IDF and work on a dense matrix
    tfidf_matrix_dense = TfidfVectorizer().fit_transform(documents).toarray()

    clusters = []   # clusters[k] is the list of document indices in cluster k
    centroids = []  # centroids[k] is the mean vector of clusters[k]

    # Process each document one by one
    for i, doc_vector in enumerate(tfidf_matrix_dense):
        best_index = None
        # Start below any possible cosine value so that the first cluster is
        # always a candidate; this keeps best_index valid even when every
        # similarity is 0 and the caller passes a negative threshold.
        max_similarity = -np.inf

        for k, centroid in enumerate(centroids):
            similarity = cosine_similarity([doc_vector], [centroid])[0, 0]
            if similarity > max_similarity:
                max_similarity = similarity
                best_index = k

        if best_index is not None and max_similarity > threshold:
            members = clusters[best_index]
            members.append(i)
            # Only the cluster that changed needs its centroid recomputed.
            centroids[best_index] = np.mean(tfidf_matrix_dense[members], axis=0)
        else:
            # No cluster is similar enough (or none exist yet): start a new
            # one whose centroid is the document itself.
            clusters.append([i])
            centroids.append(doc_vector)

    return clusters

# Main function
def main():
    """Cluster the ``.txt`` files in ``textfiles_2/`` and print the clusters.

    Each file is read, preprocessed with ``preprocess_text`` and passed to
    ``single_pass_clustering`` with a threshold of 0.2. The clusters are
    printed as numbered groups of file names.
    """
    # Path to the folder containing the text files
    folder_path = 'textfiles_2/'

    # os.listdir returns entries in arbitrary order and single-pass
    # clustering depends on document order, so sort for reproducible output.
    file_names = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.txt')
    )
    if not file_names:
        print(f"No .txt files found in {folder_path}")
        return

    # Read and preprocess the documents
    documents = []
    for file_name in file_names:
        # Decode explicitly instead of relying on the platform default;
        # undecodable bytes are replaced rather than aborting the run.
        # NOTE(review): assumes the corpus is UTF-8 - confirm.
        with open(os.path.join(folder_path, file_name), 'r',
                  encoding='utf-8', errors='replace') as file:
            documents.append(preprocess_text(file.read()))

    # Apply the single-pass clustering algorithm
    clusters = single_pass_clustering(documents, threshold=0.2)

    # Output the clusters
    for cluster_number, cluster in enumerate(clusters, start=1):
        print(f"Cluster {cluster_number}:")
        for doc_index in cluster:
            print(f" - {file_names[doc_index]}")

# Run the main function only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


Cluster 1:
 - file1.txt
 - file3.txt
Cluster 2:
 - file2.txt
 - file4.txt
 - file5.txt
 - file6.txt


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ameyp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
