In [1]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
file_path = 'data_group.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [3]:
cleaned_lines = [re.sub(r'[^a-zA-Z0-9\s.,!?]', '', line).strip() for line in lines if line.strip()]

In [4]:
cleaned_lines = [line for line in cleaned_lines if len(line.split()) > 2]

In [5]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = vectorizer.fit_transform(cleaned_lines)

In [6]:
# KMeans Clustering
optimal_clusters = range(2, 6)
for n_clusters in optimal_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    silhouette_avg = silhouette_score(X, labels)
    
    print(f"\nClustering dengan {n_clusters} klaster:")
    print(f"Silhouette Score: {silhouette_avg:.4f}")
    
    # Menampilkan kata kunci di setiap klaster
    terms = vectorizer.get_feature_names_out()
    for i in range(n_clusters):
        top_terms = [terms[ind] for ind in kmeans.cluster_centers_.argsort()[i, -5:]]
        print(f"Klaster {i}: {', '.join(top_terms)}")


Clustering dengan 2 klaster:
Silhouette Score: 0.0227
Klaster 0: 11, image, 10, omitted, pm
Klaster 1: yang, mahasiswa, whatsapp, dan, ums

Clustering dengan 3 klaster:
Silhouette Score: 0.0223
Klaster 0: 12, 190822, 11, 10, pm
Klaster 1: informatika, httpschat, mahasiswa, whatsapp, ums
Klaster 2: 11, pm, sticker, image, omitted

Clustering dengan 4 klaster:
Silhouette Score: 0.0176
Klaster 0: robby, yang, 130922, 120922, pm
Klaster 1: untuk, whatsapp, mahasiswa, dan, ums
Klaster 2: 170822, 190822, 210922, omitted, 11
Klaster 3: 12, image, 10, omitted, pm

Clustering dengan 5 klaster:
Silhouette Score: 0.0193
Klaster 0: ada, yang, 130922, 120922, pm
Klaster 1: untuk, whatsapp, dan, mahasiswa, ums
Klaster 2: 170822, 190822, 210922, omitted, 11
Klaster 3: sticker, 12, image, omitted, pm
Klaster 4: 190822, pm, omitted, 300822, 10


In [7]:
# Menyimpan hasil clustering ke file
output_file = 'clustered_data.txt'
with open(output_file, 'w', encoding='utf-8') as file:
    for cluster_id, text in zip(labels, cleaned_lines):
        file.write(f"Cluster {cluster_id}: {text}\n")

print(f"Hasil clustering disimpan di: {output_file}")

Hasil clustering disimpan di: clustered_data.txt
