In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np

# Load the dataset
file_path = 'data_group_cleaned.csv'
data = pd.read_csv(file_path)

# Step 1: Preprocess the text data (cleaning the "message" column).
# Fail early with an accurate message if the column is missing: merely printing
# and carrying on would only postpone the failure to the vectorizer step below,
# where the resulting KeyError carries no explanation.
if 'message' not in data.columns:
    raise KeyError(
        "Column 'message' not found. Check your CSV file for the correct column name. "
        f"Available columns: {list(data.columns)}"
    )
# Fill NaN, coerce any non-string cells to text (otherwise .str.lower() would
# turn them back into NaN, which TfidfVectorizer cannot process), then lowercase.
data['message'] = data['message'].fillna("").astype(str).str.lower()

# Step 2: Convert the "message" column into a numerical representation using TF-IDF.
# NOTE(review): stop_words='english' only removes English function words; the
# sample output shows Indonesian text, so common Indonesian words are presumably
# kept as features -- confirm whether a custom stop-word list should be passed.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X = vectorizer.fit_transform(data['message'])

# Step 3: Perform KMeans clustering with 3, 4, and 5 clusters and attach each
# run's labels to the dataset as a 'Cluster_<k>' column.
clusters = {}
for n_clusters in [3, 4, 5]:
    # Fixed random_state keeps the cluster assignments repeatable between runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)
    clusters[n_clusters] = labels
    data[f'Cluster_{n_clusters}'] = labels

# Save the updated dataset to a new CSV file
data.to_csv('data_group_clustered.csv', index=False)

# Show a sample of the updated dataset with cluster labels
print(data.head())

   date_time            sender  \
0        NaN  62 858-0354-3008   
1        NaN  62 858-0354-3008   
2        NaN  62 812-1526-7243   
3        NaN  62 812-3354-4829   
4        NaN  62 812-1526-7243   

                                             message  Cluster_3  Cluster_4  \
0                             media tidak disertakan          1          1   
1   open recruitment pengurus himatif ums 2022   ...          0          0   
2       halo kawan" ntar malem kumpul nongkrong yokk          0          0   
3                                        dimana tuh?          0          0   
4                                         besti kopi          0          0   

   Cluster_5  
0          1  
1          0  
2          0  
3          0  
4          0  


In [4]:
import tarfile
import pandas as pd
from collections import Counter
import re

# Read the clustering-result CSV out of a tar archive
def read_clustered_data_from_tar(tar_file_path, csv_file_name):
    """Load one CSV member of a tar archive into a DataFrame.

    Args:
        tar_file_path: Path of the tar archive to open for reading.
        csv_file_name: Name of the CSV member inside the archive.

    Returns:
        pandas.DataFrame parsed from the member's contents.

    Raises:
        FileNotFoundError: If ``csv_file_name`` is not a member of the archive,
            or the member is not something that can be read as a file.
    """
    with tarfile.open(tar_file_path, "r") as tar:
        try:
            extracted_file = tar.extractfile(csv_file_name)
        except KeyError:
            # extractfile() raises KeyError for an unknown member instead of
            # returning None; fold that into the same "not found" path.
            extracted_file = None

        # None is also what extractfile() returns for non-file members
        # (for example directories).
        if extracted_file is None:
            raise FileNotFoundError(f"{csv_file_name} not found in {tar_file_path}")

        # Close the member's file object once pandas has consumed it.
        with extracted_file:
            return pd.read_csv(extracted_file)

# Get the top 3 words of each cluster
def analyze_clusters(data, cluster_column, message_column):
    """Summarise each cluster by its three most frequent words.

    Messages are split on whitespace; every token has all characters other
    than ASCII letters and digits removed and is lowercased. Tokens that are
    shorter than two characters *after* that cleaning are ignored, so
    punctuation-only tokens (which clean down to an empty string) are never
    counted as words.

    Args:
        data: DataFrame holding the messages and their cluster labels.
        cluster_column: Name of the column with the cluster label of each row.
        message_column: Name of the column with the message text.

    Returns:
        DataFrame with one row per distinct cluster label (in order of first
        appearance) and the columns ``"Cluster"`` and ``"Top Words"``; the
        latter is a string formatted as ``"word (count), word (count), ..."``.
        Both columns are present even when ``data`` has no rows.
    """
    analysis = []
    for cluster_id in data[cluster_column].unique():
        cluster_messages = (
            data.loc[data[cluster_column] == cluster_id, message_column]
            .fillna("")
            .astype(str)
        )

        word_counts = Counter()
        for message in cluster_messages:
            for raw_word in message.split():
                cleaned = re.sub(r'[^a-zA-Z0-9]', '', raw_word).lower()
                # Filter on the cleaned token: filtering the raw token would let
                # "!!" or "??" through as an empty string after cleaning.
                if len(cleaned) > 1:
                    word_counts[cleaned] += 1

        most_common_words = word_counts.most_common(3)
        analysis.append({
            "Cluster": cluster_id,
            "Top Words": ", ".join(f"{word} ({count})" for word, count in most_common_words)
        })
    # Name the columns explicitly so an empty input still yields the expected schema.
    return pd.DataFrame(analysis, columns=["Cluster", "Top Words"])

# TAR archive and the name of the clustering-result file inside it.
# NOTE(review): the clustering cell writes 'data_group_clustered.csv' straight to
# the working directory; presumably the archive is built in a separate step -- confirm.
tar_file_path = "data_group.tar"
clustered_file_name = "data_group_clustered.csv"

# Analyse the clustering results.
# Everything runs inside one try block, so a failure in reading, analysing or
# saving is reported as a single printed message instead of a traceback.
try:
    df = read_clustered_data_from_tar(tar_file_path, clustered_file_name)
    # Only the 3-cluster labelling ("Cluster_3") is summarised here.
    cluster_analysis = analyze_clusters(df, cluster_column="Cluster_3", message_column="message")

    # Save the analysis results to CSV
    analysis_csv_path = "cluster_analysis.csv"
    cluster_analysis.to_csv(analysis_csv_path, index=False, encoding="utf-8")
    print(f"Hasil analisis telah disimpan ke {analysis_csv_path}")
    print(cluster_analysis)
except Exception as e:
    # Best-effort reporting: the error text is printed and execution continues.
    print(f"Terjadi kesalahan: {e}")

Hasil analisis telah disimpan ke cluster_analysis.csv
   Cluster                                   Top Words
0        1  media (543), tidak (543), disertakan (543)
1        0            ini (293), yang (231), dan (202)
2        2                cina (3), istri (1), aja (1)
