In [8]:
import pandas as pd

# Load the CSV file to examine its contents
file_path = 'chatgrup.csv'
data = pd.read_csv(file_path)

# Display the first few rows and basic information about the dataset
data.head(), data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 354 entries, 0 to 353
Data columns (total 1 columns):
 #   Column                                                                                                                                                                                             Non-Null Count  Dtype 
---  ------                                                                                                                                                                                             --------------  ----- 
 0   08/04/23 07.31 - Pesan dan panggilan dienkripsi secara end-to-end. Tidak seorang pun di luar chat ini, termasuk WhatsApp, yang dapat membaca atau mendengarkannya. Ketuk untuk info selengkapnya.  310 non-null    object
dtypes: object(1)
memory usage: 2.9+ KB


(  08/04/23 07.31 - Pesan dan panggilan dienkripsi secara end-to-end. Tidak seorang pun di luar chat ini, termasuk WhatsApp, yang dapat membaca atau mendengarkannya. Ketuk untuk info selengkapnya.
 0  24/04/23 08.57 - ‎+62 852-6441-5694 bergabung ...                                                                                                                                               
 1  02/05/23 20.18 - +62 822-3849-2861: <Media tid...                                                                                                                                               
 2  02/05/23 20.18 - +62 822-3849-2861: [COMFEASTS...                                                                                                                                               
 3                                                NaN                                                                                                                                               
 4             

In [9]:
import re

# Define a function to parse and clean the data
def parse_message(row):
    # Pattern to identify timestamp, sender, and message
    pattern = r"(\d{2}/\d{2}/\d{2} \d{2}\.\d{2}) - (.+?): (.+)"
    match = re.match(pattern, row)
    if match:
        return match.groups()
    return None, None, None

# Apply parsing to the dataset
data_cleaned = data.iloc[:, 0].dropna().apply(parse_message)
parsed_data = pd.DataFrame(data_cleaned.tolist(), columns=["Timestamp", "Sender", "Message"])

# Drop rows where parsing failed (empty rows)
parsed_data = parsed_data.dropna().reset_index(drop=True)

# Display the first few rows of the cleaned data
parsed_data.head()


Unnamed: 0,Timestamp,Sender,Message
0,02/05/23 20.18,+62 822-3849-2861,<Media tidak disertakan>
1,02/05/23 20.18,+62 822-3849-2861,[COMFEASTSPORT CHAMPIONSHIP]
2,15/06/23 18.14,+62 819-3531-1585,Inpo mabar lurr
3,15/06/23 18.59,+62 813-9098-7833,Ready 24 jam
4,26/06/23 01.35,+62 895-3272-00111,ada on pubg?


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Step 1: Preprocessing - Convert messages into numerical representation using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=500)  # Limit features to top 500 words for simplicity
X = vectorizer.fit_transform(parsed_data["Message"])

# Step 2: Define a function to perform clustering and analyze results
def perform_clustering(X, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    
    # Top keywords per cluster
    top_keywords = []
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()
    for i in range(n_clusters):
        top_keywords.append([terms[ind] for ind in order_centroids[i, :3]])  # Top 3 words per cluster
    
    return clusters, top_keywords

# Perform clustering for 3, 4, and 5 clusters
results = {}
for n in [3, 4, 5]:
    clusters, keywords = perform_clustering(X, n_clusters=n)
    parsed_data[f"Cluster_{n}"] = clusters  # Add cluster labels to the dataframe
    results[n] = keywords

# Display the top keywords for each clustering result
results


{3: [['pesan', 'dihapus', 'ini'],
  ['yang', 'kalian', 'kami'],
  ['disertakan', 'media', 'tidak']],
 4: [['pesan', 'dihapus', 'ini'],
  ['kami', 'akan', 'kalian'],
  ['disertakan', 'media', 'tidak'],
  ['dana', 'sekarang', 'kaget']],
 5: [['pesan', 'dihapus', 'ini'],
  ['kami', 'akan', 'kalian'],
  ['𝗖𝗹𝗮𝗶𝗺', '𝗕𝘂𝗿𝘂𝗮𝗻', 'beasiswa'],
  ['dana', 'sekarang', 'kaget'],
  ['disertakan', 'media', 'tidak']]}