In [16]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Cluster the messages in the "Pesan" column with TF-IDF + KMeans and write
# one label column per cluster count (Cluster_3, Cluster_4, Cluster_5) to a
# new CSV file.

# Load the dataset
file_path = 'data_group_cleaned.csv'
data = pd.read_csv(file_path)

# Step 1: Normalise the text. Missing messages become empty strings, and
# astype(str) guarantees every cell is a string so the .str accessor and the
# vectorizer never see numeric or NaN values.
data['Pesan'] = data['Pesan'].fillna("").astype(str).str.lower()

# Step 2: Convert the "Pesan" column into a sparse TF-IDF matrix limited to
# the 1000 highest-frequency terms.
# NOTE(review): stop_words='english' only removes English stop words; the
# messages look Indonesian, so common Indonesian words are presumably kept as
# features -- confirm whether a custom stop-word list should be supplied.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data['Pesan'])

# Step 3: Run KMeans for 3, 4 and 5 clusters and attach each labelling to the
# dataset as its own column.
# n_init is pinned explicitly: the scikit-learn default changed from 10 to
# 'auto' in version 1.4, so leaving it unset makes the labels depend on the
# installed version (and emits a FutureWarning on 1.2/1.3).
clusters = {}
for n_clusters in [3, 4, 5]:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    labels = kmeans.fit_predict(X)
    clusters[n_clusters] = labels
    data[f'Cluster_{n_clusters}'] = labels

# Save the updated dataset to a new CSV file (row index omitted)
data.to_csv('data_group_clustered.csv', index=False)

# Show a sample of the updated dataset with cluster labels
print(data.head())


    Tanggal  Waktu Pengirim  \
0       NaN    NaN      NaN   
1       NaN    NaN      NaN   
2       NaN    NaN      NaN   
3  03/10/24  15.53     Atha   
4       NaN    NaN      NaN   

                                               Pesan  Is_System_Message  \
0  03/10/24 14.31 - pesan dan panggilan dienkrips...               True   
1      03/10/24 14.31 - â€žatha membuat grup "apsih"               True   
2          03/10/24 14.31 - â€žatha menambahkan anda               True   
3                     @6283863404275 intro dulu bang              False   
4          03/10/24 15.52 - â€žatha menambahkan bayu               True   

   Cluster_3  Cluster_4  Cluster_5  
0          1          3          4  
1          1          3          3  
2          1          3          3  
3          1          3          3  
4          1          3          3  
