In [1]:
# Convert the chat text export to CSV and archive the CSV in a tar file
import re
import csv
import tarfile
from collections import defaultdict

# Read the exported chat text
with open("chat.txt", 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Compile the patterns once instead of on every loop iteration
group_created_re = re.compile(r'membuat grup "(.*?)"')
message_re = re.compile(r'^(.*?) - (.*?): (.*)')

group_comments = defaultdict(list)
current_group = None

for line in lines:
    # Detect the creation of a new group; following lines are attributed to it
    group_match = group_created_re.search(line)
    if group_match:
        current_group = group_match.group(1)

    # Store every line that contains a colon under the current group
    if current_group and ':' in line:
        group_comments[current_group].append(line)

# Fail with an explicit message instead of max()'s "arg is an empty sequence"
# when the file contains no group-creation line followed by comments.
if not group_comments:
    raise ValueError('No group with comments was found in "chat.txt".')

# Pick the group with the most collected lines
most_active_group = max(group_comments, key=lambda x: len(group_comments[x]))
most_active_comments = group_comments[most_active_group]

print("Grup yang paling aktif : ",most_active_group)

# Save as CSV.
# NOTE(review): every line of the file that matches the message pattern is
# written, not only the lines of `most_active_group` - confirm this is intended.
csv_file = "data_group.csv"
with open(csv_file, 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Timestamp", "Sender", "Message"])

    for line in lines:
        # Expected shape: "<timestamp> - <sender>: <message>"; other lines are skipped
        match = message_re.match(line)
        if match:
            writer.writerow(match.groups())

# Archive the CSV in an uncompressed tar file
tar_file = "data_group.tar"
with tarfile.open(tar_file, "w") as tar:
    tar.add(csv_file, arcname="data_group.csv")



Grup yang paling aktif :  Peserta Fakultaria 2022


In [2]:
import pandas as pd

def convert_csv_to_xlsx(input_csv: str, output_xlsx: str):
    """
    Load a CSV file with pandas and write its contents to an XLSX file.

    A short progress message is printed after each of the two steps.

    Parameters:
    - input_csv: Path to the CSV file to read.
    - output_xlsx: Path of the XLSX file to write (the index is not written).
    """
    frame = pd.read_csv(input_csv)
    row_count, column_count = frame.shape
    print(f"CSV file '{input_csv}' read successfully with {row_count} rows and {column_count} columns.")

    # index=False keeps the DataFrame's row index out of the spreadsheet
    frame.to_excel(output_xlsx, index=False)
    print(f"Data has been saved to '{output_xlsx}'.")




In [4]:
# Convert the CSV export into an XLSX workbook
convert_csv_to_xlsx(
    input_csv='data_group.csv',
    output_xlsx='data_group.xlsx',
)

CSV file 'data_group.csv' read successfully with 3168 rows and 3 columns.
Data has been saved to 'data_group.xlsx'.


In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def scale_and_cluster_data(input_file, output_file, n_clusters):
    """
    Cluster the messages of an XLSX file using TF-IDF features and KMeans.

    Parameters:
    - input_file: Path to the input XLSX file. It must contain a message
      column named 'message'; the name is matched case-insensitively, so a
      'Message' header is accepted as well.
    - output_file: Path of the XLSX file to write. It holds the input data
      plus a 'cluster' column with the cluster label of each row.
    - n_clusters: Number of clusters passed to KMeans.

    Raises:
    - ValueError: If the input file has no message column.
    """
    # Read data from XLSX file
    data = pd.read_excel(input_file)
    print(f"Data read successfully with {len(data)} rows and {len(data.columns)} columns.")

    # Locate the message column. An exact 'message' match wins; otherwise fall
    # back to a case-insensitive match so headers such as 'Message' also work.
    if 'message' in data.columns:
        message_col = 'message'
    else:
        message_col = next(
            (col for col in data.columns if str(col).strip().lower() == 'message'),
            None,
        )
        if message_col is None:
            raise ValueError("The input file does not have a 'message' column.")

    # Replace missing values with empty strings and force text, because the
    # vectorizer only accepts string documents.
    data[message_col] = data[message_col].fillna("").astype(str)

    # Convert text data to numerical format using TF-IDF.
    # NOTE(review): the stop-word list is English while the top words printed in
    # this notebook look Indonesian - confirm that this setting is intended.
    vectorizer = TfidfVectorizer(stop_words='english')
    vectorized_data = vectorizer.fit_transform(data[message_col])
    print(f"Vectorized data shape: {vectorized_data.shape}")

    # Perform clustering with KMeans (fixed random_state for repeatable labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['cluster'] = kmeans.fit_predict(vectorized_data)
    print(f"Clustering completed with {n_clusters} clusters.")

    # Save clustered data to a new XLSX file
    data.to_excel(output_file, index=False)
    print(f"Clustered data saved to '{output_file}'.")


In [8]:
# Cluster the workbook's messages into five groups
scale_and_cluster_data(
    input_file='data_group.xlsx',
    output_file='clustered_data.xlsx',
    n_clusters=5,
)

Data read successfully with 3168 rows and 3 columns.
Vectorized data shape: (3168, 3504)
Clustering completed with 5 clusters.
Clustered data saved to 'clustered_data.xlsx'.


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def scale_and_cluster_data_with_top_words(input_file, output_file, n_clusters_list, top_n=3):
    """
    Cluster messages for several cluster counts and report each cluster's top words.

    The messages are vectorized once with TF-IDF; KMeans is then fitted once
    per requested cluster count on that same matrix.

    Parameters:
    - input_file: Path to the input XLSX file. It must contain a message
      column named 'message'; the name is matched case-insensitively, so a
      'Message' header is accepted as well.
    - output_file: Path of the XLSX file to write. It holds the input data
      plus one 'cluster_<k>' label column per requested cluster count.
    - n_clusters_list: Iterable of cluster counts to run KMeans with.
    - top_n: Number of highest-weighted words to keep per cluster (default 3).

    Returns:
    - dict mapping each cluster count to a list with one entry per cluster;
      each entry is the list of that cluster's top words, strongest first.

    Raises:
    - ValueError: If the input file has no message column.
    """
    # Read data from XLSX file
    data = pd.read_excel(input_file)
    print(f"Data read successfully with {len(data)} rows and {len(data.columns)} columns.")

    # Locate the message column. An exact 'message' match wins; otherwise fall
    # back to a case-insensitive match so headers such as 'Message' also work.
    if 'message' in data.columns:
        message_col = 'message'
    else:
        message_col = next(
            (col for col in data.columns if str(col).strip().lower() == 'message'),
            None,
        )
        if message_col is None:
            raise ValueError("The input file does not have a 'message' column.")

    # Replace missing values with empty strings and force text, because the
    # vectorizer only accepts string documents.
    data[message_col] = data[message_col].fillna("").astype(str)

    # Convert text data to numerical format using TF-IDF.
    # NOTE(review): the stop-word list is English while the top words printed in
    # this notebook look Indonesian - confirm that this setting is intended.
    vectorizer = TfidfVectorizer(stop_words='english')
    vectorized_data = vectorizer.fit_transform(data[message_col])
    print(f"Vectorized data shape: {vectorized_data.shape}")

    # The vocabulary does not depend on the cluster count, so fetch it once.
    feature_names = vectorizer.get_feature_names_out()

    top_words = {}

    for n_clusters in n_clusters_list:
        # Perform clustering with KMeans (fixed random_state for repeatable labels)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        data[f'cluster_{n_clusters}'] = kmeans.fit_predict(vectorized_data)
        print(f"Clustering completed with {n_clusters} clusters.")

        # For each centroid, take the features with the largest weights.
        # Reversing the ascending argsort first keeps top_n=0 well-defined
        # (an empty list) instead of slicing the whole array with [-0:].
        top_words[n_clusters] = [
            [feature_names[i] for i in cluster_center.argsort()[::-1][:top_n]]
            for cluster_center in kmeans.cluster_centers_
        ]

    # Save clustered data to a new XLSX file
    data.to_excel(output_file, index=False)
    print(f"Clustered data saved to '{output_file}'.")

    return top_words


In [10]:
# Run the clustering for 3, 4 and 5 clusters and collect each cluster's top words
top_words = scale_and_cluster_data_with_top_words(
    'data_group.xlsx',
    'clustered_data.xlsx',
    [3, 4, 5],
)

# Keep the per-k results under their own names
k3, k4, k5 = (top_words[k] for k in (3, 4, 5))

# One (heading, clusters) pair per cluster count, printed in order
report_sections = (
    ("Top words for 3 clusters:", k3),
    ("\nTop words for 4 clusters:", k4),
    ("\nTop words for 5 clusters:", k5),
)

# Print the top 3 words of every cluster under its heading
for heading, clusters in report_sections:
    print(heading)
    for i, words in enumerate(clusters):
        print(f"Cluster {i}: {words[:3]}")


Data read successfully with 3168 rows and 3 columns.
Vectorized data shape: (3168, 3504)
Clustering completed with 3 clusters.
Clustering completed with 4 clusters.
Clustering completed with 5 clusters.
Clustered data saved to 'clustered_data.xlsx'.
Top words for 3 clusters:
Cluster 0: ['azka', 'losss', 'knp']
Cluster 1: ['disertakan', 'media', 'tidak']
Cluster 2: ['kak', 'waalaikumsalam', 'ini']

Top words for 4 clusters:
Cluster 0: ['azka', 'losss', 'knp']
Cluster 1: ['disertakan', 'media', 'tidak']
Cluster 2: ['kak', 'waalaikumsalam', 'ini']
Cluster 3: ['aku', 'ya', 'info']

Top words for 5 clusters:
Cluster 0: ['azka', 'losss', 'knp']
Cluster 1: ['disertakan', 'media', 'tidak']
Cluster 2: ['kak', 'waalaikumsalam', 'ini']
Cluster 3: ['aku', 'ya', 'info']
Cluster 4: ['wa', 'alaikumussalam', 'alaikumsalam']


In [14]:
from metaflow import Flow

# Fetch the results of the most recent *successful* run of the workflow.
# `latest_run` can also return a failed or still-running run, which would not
# expose the `top` artifact read below.
run = Flow('ManyKmeansFlow').latest_successful_run
if run is None:
    raise RuntimeError("Flow 'ManyKmeansFlow' has no successful run to read results from.")

# Fetch the clustering results for 3, 4 and 5 clusters.
# NOTE: this rebinds k3/k4/k5, which an earlier cell assigned from the
# in-notebook clustering.
top = run.data.top
k3 = top[3]
k4 = top[4]
k5 = top[5]

# Show the top 3 words of every cluster
print("Top words for 3 clusters:")
for i in range(3):
    print(k3[i][:3])

print("\nTop words for 4 clusters:")
for i in range(4):
    print(k4[i][:3])

print("\nTop words for 5 clusters:")
for i in range(5):
    print(f"Cluster {i}: {k5[i][:3]}")


Top words for 3 clusters:
['kak', 'malam', 'malam kak']
['pesan', 'ya', 'dihapus']
['disertakan', 'media disertakan', 'media']

Top words for 4 clusters:
['senin', 'tm imm', 'teman²']
['disertakan', 'media disertakan', 'media']
['kak', 'ya', 'pesan']
['malam kak', 'malam', 'waalaikumsalam malam']

Top words for 5 clusters:
Cluster 0: ['ya', 'info', 'ga']
Cluster 1: ['disertakan', 'media disertakan', 'media']
Cluster 2: ['pesan dihapus', 'dihapus', 'pesan']
Cluster 3: ['waalaikumsalam', 'waalaikumsalam malam', 'malam kak']
Cluster 4: ['kak', 'waalaikumussalam', 'malam kak']
