In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
x = '/content/drive/My Drive/ipsd/data_group.txt'

In [7]:
import re
from collections import defaultdict

In [8]:
with open(x, 'r', encoding='utf-8') as file:
    lines = file.readlines()

In [9]:
# Proses komentar berdasarkan grup
group_data = defaultdict(list)
current_group = None

In [10]:
for line in lines:
    if "MENTORING FKI" in line:
        current_group = "MENTORING FKI"
    elif "GROUP NAME XYZ" in line:  # Tambahkan nama grup lainnya
        current_group = "GROUP NAME XYZ"

    if current_group:
        group_data[current_group].append(line)

In [11]:
# Pilih grup dengan jumlah komentar terbanyak
selected_group = max(group_data, key=lambda k: len(group_data[k]))

In [12]:
# Simpan data ke file teks
with open('data_group.txt', 'w', encoding='utf-8') as file:
    file.writelines(group_data[selected_group])

In [14]:
import tarfile

with tarfile.open("data_group.tar", "w") as tar:
    tar.add("data_group.csv")


In [15]:
cleaned_lines = []
for line in lines:
    cleaned_line = re.sub(r'[^a-zA-Z0-9\s.,!?]', '', line)
    cleaned_lines.append(cleaned_line)

# Simpan versi bersih
with open('data_group_cleaned.txt', 'w', encoding='utf-8') as file:
    file.writelines(cleaned_lines)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Membaca data
with open('data_group_cleaned.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()

# TF-IDF
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data)

# KMeans Clustering
for n_clusters in [3, 4, 5]:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(X)

    print(f"Cluster {n_clusters}")
    terms = vectorizer.get_feature_names_out()
    for i in range(n_clusters):
        top_terms = [terms[ind] for ind in kmeans.cluster_centers_.argsort()[0][-3:]]
        print(f"Cluster {i}: {', '.join(top_terms)}")


Cluster 3
Cluster 0: pengecapan, stampel, panitia
Cluster 1: pengecapan, stampel, panitia
Cluster 2: pengecapan, stampel, panitia
Cluster 4
Cluster 0: oleh, panitia, pengecapan
Cluster 1: oleh, panitia, pengecapan
Cluster 2: oleh, panitia, pengecapan
Cluster 3: oleh, panitia, pengecapan
Cluster 5
Cluster 0: oleh, panitia, pengecapan
Cluster 1: oleh, panitia, pengecapan
Cluster 2: oleh, panitia, pengecapan
Cluster 3: oleh, panitia, pengecapan
Cluster 4: oleh, panitia, pengecapan


In [17]:
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Laporan Clustering</title>
    <style>
        body {{ font-family: Arial, sans-serif; margin: 20px; }}
        h1 {{ color: #2c3e50; }}
        .cluster {{ margin-bottom: 20px; }}
    </style>
</head>
<body>
    <h1>Laporan Hasil Clustering</h1>
    <p>Data diproses dan dianalisis dengan metode TF-IDF dan algoritma KMeans untuk membuat 3, 4, dan 5 klaster.</p>
    <h2>Analisis</h2>
    <div>
        <h3>Hasil Clustering (3 Klaster)</h3>
        <p>... Analisis dan kata teratas ...</p>
        <h3>Hasil Clustering (4 Klaster)</h3>
        <p>... Analisis dan kata teratas ...</p>
        <h3>Hasil Clustering (5 Klaster)</h3>
        <p>... Analisis dan kata teratas ...</p>
    </div>
</body>
</html>
"""

report_file = "laporan.html"
with open(report_file, 'w', encoding='utf-8') as file:
    file.write(html_content)

print(f"Laporan HTML tersimpan di: {report_file}")


Laporan HTML tersimpan di: laporan.html
