In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd

# Load the chat export; the message text is read from the 'Pesan' column.
file_path = 'cleaned_data_group.csv'
data = pd.read_csv(file_path)

# TF-IDF vectorization, keeping at most 500 vocabulary terms.
# NOTE(review): stop_words='english' removes English stop words only; if the
# messages are in another language a custom stop-word list may be needed —
# confirm against the data.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
# Missing messages become empty strings; astype(str) additionally guards
# against cells that pandas parsed as numbers, which the vectorizer cannot
# lowercase/tokenize.
tfidf_matrix = tfidf_vectorizer.fit_transform(
    data['Pesan'].fillna("").astype(str)
)

# Perform clustering for 3, 4, and 5 clusters on the same TF-IDF matrix.
for n_clusters in [3, 4, 5]:
    cluster_col = f'Cluster_{n_clusters}'

    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4, so leaving it implicit makes the labels depend on the
    # installed version even with a fixed random_state.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    data[cluster_col] = kmeans.fit_predict(tfidf_matrix)

    # Save the message text together with its label for this cluster count.
    output_path = f'clustered_data_group_{n_clusters}_clusters.csv'
    data[['Pesan', cluster_col]].to_csv(output_path, index=False)
    print(f"Clustered data for {n_clusters} clusters saved to {output_path}")

    # Display 3 data samples for each cluster.
    # NOTE: head(3) yields the first three rows of each cluster in file order;
    # "Top" in the printed labels does not imply any ranking.
    print(f"\nTop 3 data for each cluster in {n_clusters}-cluster result:")
    for cluster in range(n_clusters):
        top_samples = data[data[cluster_col] == cluster].head(3)
        print(f"\nCluster {cluster} (Top 3 messages):")
        print(top_samples[['Pesan', cluster_col]])


Clustered data for 3 clusters saved to clustered_data_group_3_clusters.csv

Top 3 data for each cluster in 3-cluster result:

Cluster 0 (Top 3 messages):
                           Pesan  Cluster_3
0  6283863404275 Intro dulu bang          0
2  6283863404275 intro dulu bang          0
9              info gartic phone          0

Cluster 1 (Top 3 messages):
                                                 Pesan  Cluster_3
196                                         yang bulat          1
382  mung apa yang masuk ke web mbe apa yang dikelu...          1
484  Halo! Sepertinya kamu ingin berbicara dalam ba...          1

Cluster 2 (Top 3 messages):
           Pesan  Cluster_3
1  Media omitted          2
3  Media omitted          2
4  Media omitted          2
Clustered data for 4 clusters saved to clustered_data_group_4_clusters.csv

Top 3 data for each cluster in 4-cluster result:

Cluster 0 (Top 3 messages):
                          Pesan  Cluster_4
79               bisa mabar ga?        