In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load the cleaned chat export into a DataFrame.
file_path = 'data_group_cleaned.csv'
data = pd.read_csv(file_path)

# Step 1: Normalise the "Pesan" column to lowercase text.
# Missing values become empty strings, and every remaining cell is cast to
# str *before* lowercasing: `.str.lower()` yields NaN for non-string cells
# (e.g. a message that pandas parsed as a number), and TfidfVectorizer
# refuses NaN documents.
data['Pesan'] = data['Pesan'].fillna("").astype(str).str.lower()

# Step 2: Convert the "Pesan" column into a TF-IDF matrix, keeping at most
# the 1000 highest-frequency terms.
# NOTE(review): stop_words='english' only removes English function words; the
# sample messages look Indonesian, so this filter probably has little effect.
# Consider passing an explicit Indonesian stop-word list -- confirm with the
# data owner before changing, since it alters the resulting clusters.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data['Pesan'])

# Step 3: Run KMeans on the TF-IDF matrix for 3, 4 and 5 clusters.
# `clusters` maps each cluster count to the label array from fit_predict, and
# the same labels are stored on the DataFrame as column "Cluster_<k>".
clusters = {}
for n_clusters in [3, 4, 5]:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in 1.4 (and emits a FutureWarning on 1.2-1.3 when it is omitted),
    # so leaving it out makes the labels depend on the installed version.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    clusters[n_clusters] = labels
    data[f'Cluster_{n_clusters}'] = labels

# Save the updated dataset (original columns plus the three label columns)
# to a new CSV file, without writing the DataFrame index.
data.to_csv('data_group_clustered.csv', index=False)

# Show the first rows of the updated dataset with cluster labels
print(data.head())


   Tanggal   Waktu          Pengirim                         Pesan  Cluster_3  \
0   100924  5:27pm   Mas Putra Guard               pam bola liga 1          2   
1   100924  5:27pm  62 821-3760-6858               pam bola liga 1          2   
2   100924  5:27pm  62 831-0471-1145               pam bola liga 1          2   
3   100924  5:27pm  62 814-6976-8009               pam bola liga 1          2   
4   100924  5:28pm   Mas Putra Guard  wait tak rapikan dulu, pelan          0   

   Cluster_4  Cluster_5  
0          2          2  
1          2          2  
2          2          2  
3          2          2  
4          0          4  
