In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load the dataset
file_path = 'data_group_cleaned.csv'
data = pd.read_csv(file_path)

# Step 1: Preprocess the text data (cleaning the "message" column).
# Fail immediately, naming the column this script actually reads, instead of
# printing a note and then crashing later inside the vectorizer call.
text_column = 'message'
if text_column not in data.columns:
    raise KeyError(
        f"Column '{text_column}' not found in {file_path}. "
        f"Available columns: {list(data.columns)}"
    )

# Fill NaN, coerce any non-string cells to str (otherwise .str.lower() would
# turn them into NaN again), and convert to lowercase.
data[text_column] = data[text_column].fillna("").astype(str).str.lower()

# Step 2: Convert the "message" column into a numerical representation using
# TF-IDF, keeping at most the 1000 highest-frequency terms.
# NOTE(review): stop_words='english' only removes English stop words; if the
# messages are in another language, a matching stop-word list may be
# needed -- confirm.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data[text_column])

# Step 3: Perform KMeans clustering with 3, 4, and 5 clusters.
# random_state is fixed so repeated runs produce the same labels.
clusters = {}
for n_clusters in [3, 4, 5]:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters[n_clusters] = kmeans.fit_predict(X)

# Add cluster labels to the dataset, one "Cluster_<k>" column per KMeans run
for n_clusters, labels in clusters.items():
    data[f'Cluster_{n_clusters}'] = labels

# Save the updated dataset to a new CSV file
data.to_csv('data_group_clustered.csv', index=False)

# Show a sample of the updated dataset with cluster labels
print(data.head())

   date_time            sender  \
0        NaN  62 858-0354-3008   
1        NaN  62 858-0354-3008   
2        NaN  62 812-1526-7243   
3        NaN  62 812-3354-4829   
4        NaN  62 812-1526-7243   

                                             message  Cluster_3  Cluster_4  \
0                             media tidak disertakan          1          1   
1   open recruitment pengurus himatif ums 2022   ...          0          0   
2       halo kawan" ntar malem kumpul nongkrong yokk          0          0   
3                                        dimana tuh?          0          0   
4                                         besti kopi          0          0   

   Cluster_5  
0          1  
1          0  
2          0  
3          0  
4          0  
