In [None]:
!pip install scikit-learn
!pip install nltk
!pip install matplotlib



In [None]:
import os
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster
import shutil

In [None]:
# Root folder of the corpus: one sub-folder per category, each holding .txt documents.
base_folder = 'bbc'

files = []      # raw text of every document
filenames = []  # path of every document; filenames[i] is the source of files[i]

# Search all files in folders.
# Entries are visited in sorted order because os.listdir() returns them in
# arbitrary order, which would make document indices differ between runs.
for sub_folder in sorted(os.listdir(base_folder)):
    sub_folder_path = os.path.join(base_folder, sub_folder)
    # Skip stray files that sit next to the category folders (e.g. a README):
    # calling os.listdir() on them would raise NotADirectoryError.
    if not os.path.isdir(sub_folder_path):
        continue
    for file_name in sorted(os.listdir(sub_folder_path)):
        if not file_name.endswith('.txt'):
            continue
        file_path = os.path.join(sub_folder_path, file_name)
        filenames.append(file_path)
        # errors='ignore' drops bytes that are not valid UTF-8 instead of failing.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as handle:
            files.append(handle.read())

In [None]:
# Sanity check: report how many documents were loaded.
n_documents = len(files)
print(n_documents)

In [None]:
# Tokenize every document after lower-casing it.
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed; if this
# cell raises LookupError, download it first with nltk.download(...) - confirm the
# resource name for the installed NLTK version.
token_files = list(map(word_tokenize, map(str.lower, files)))

# Build the term-frequency (document x term) matrix. The tokens of each document
# are re-joined with single spaces because CountVectorizer expects strings.
vectorizer = CountVectorizer()
tf_matrix = vectorizer.fit_transform(map(' '.join, token_files))

# Terms of the vocabulary, as reported by the fitted vectorizer.
vocabulario = vectorizer.get_feature_names_out()

In [None]:
# Print the TF matrix and the vocabulary.
# Only the first rows are converted to a dense array: densifying the whole
# sparse matrix just to print a truncated view allocates documents x terms cells.
tf_preview_rows = 5
print("Matriz TF:")
print(tf_matrix.shape)
print(tf_matrix[:tf_preview_rows].toarray())
print("\nVocabulario:")
print(vocabulario)

In [None]:
# Re-weight the raw term counts into TF-IDF scores.
# The transformer is fitted on the TF matrix and then applied to that same matrix.
transformer = TfidfTransformer().fit(tf_matrix)
tfidf_matrix = transformer.transform(tf_matrix)

In [None]:
# Print the TF-IDF matrix.
# Only the first rows are converted to a dense array: densifying the whole
# sparse matrix just to print a truncated view allocates documents x terms cells.
tfidf_preview_rows = 5
print("Matriz TF-IDF:")
print(tfidf_matrix.shape)
print(tfidf_matrix[:tfidf_preview_rows].toarray())

In [None]:
# Number of flat clusters to extract from the hierarchy.
num_clusters = 5
# Ward hierarchical clustering over the dense TF-IDF vectors. Despite its name,
# dis_matrix holds the linkage matrix returned by linkage(), not pairwise distances.
dis_matrix = linkage(y=tfidf_matrix.toarray(), method='ward')
# Cut the hierarchy so that no more than num_clusters flat clusters are formed.
clusters = fcluster(Z=dis_matrix, t=num_clusters, criterion='maxclust')

In [None]:
# Show the cluster label assigned to each document (one label per document, in load order).
print("\nAsignaciones de clúster:", clusters, sep="\n")

In [None]:
# Visualize the (truncated) dendrogram with a cut line for num_clusters clusters
plt.figure(figsize=(10, 7))
plt.title('Dendrograma Jerárquico con 5 Grupos')
plt.xlabel('Índice del Documento')
plt.ylabel('Distancia')
# truncate_mode='level' with p=5 draws only the top levels of the tree.
dendrogram(dis_matrix, truncate_mode='level', p=5, leaf_rotation=90., leaf_font_size=8)
# Column 2 of the linkage matrix is the height of each merge, in increasing order.
# The merge in row -num_clusters leaves num_clusters clusters and the merge in
# row -(num_clusters - 1) reduces them to num_clusters - 1, so a cut must lie
# strictly between both heights to cross exactly num_clusters branches. A line drawn
# at the upper height would sit on that merge itself. Requires num_clusters >= 2.
lower_merge_height = dis_matrix[-num_clusters, 2]
upper_merge_height = dis_matrix[-(num_clusters - 1), 2]
cut_height = (lower_merge_height + upper_merge_height) / 2
plt.axhline(y=cut_height, color='r', linestyle='--')  # Horizontal line at cut height
plt.show()

In [None]:
# Split the documents into one list per cluster.
labels = fcluster(dis_matrix, num_clusters, criterion='maxclust')
num_grups = max(labels)

# groups[k] holds the document indices whose label is k + 1 (fcluster labels start at 1).
groups = [
    [doc_index for doc_index, cluster_id in enumerate(labels) if cluster_id == wanted_id]
    for wanted_id in range(1, num_grups + 1)
]
# final_groups[k] holds the file paths of those same documents, in the same order.
final_groups = [[filenames[doc_index] for doc_index in members] for members in groups]

In [None]:
# Reset the output directory: delete the category folders left by a previous run.
new_folder_path = "new_categories/"
# Create the output directory when it does not exist yet; os.listdir() would
# otherwise raise FileNotFoundError the first time the notebook is run.
os.makedirs(new_folder_path, exist_ok=True)
folders = os.listdir(new_folder_path)

for folder in folders:
    final_path = os.path.join(new_folder_path, folder)
    # Only sub-directories are removed; plain files in the output directory are kept.
    if os.path.isdir(final_path):
        try:
            shutil.rmtree(final_path)
        except Exception as e:
            # Best effort: report the failure and keep cleaning the remaining folders.
            print(f'Error deleting {final_path}: {str(e)}')

In [None]:
# Copy the documents of each cluster into its own "category_<n>" folder.

folder_paths = []

# One destination folder per cluster, numbered from 0 in cluster order.
for i in range(len(final_groups)):
    category_path = os.path.join(new_folder_path, "category_" + str(i))
    os.makedirs(category_path, exist_ok=True)
    folder_paths.append(category_path)

for category_path, group_files in zip(folder_paths, final_groups):
    for source_path in group_files:
        # Documents from different source folders can share a base name, and
        # copying them under that name into one folder would silently overwrite
        # the earlier copy. Prefix the name with the source folder to keep them apart.
        source_folder = os.path.basename(os.path.dirname(source_path))
        target_name = source_folder + "_" + os.path.basename(source_path)
        shutil.copy(source_path, os.path.join(category_path, target_name))