In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import torch.nn.functional as F
from sklearn.metrics import f1_score, classification_report

import warnings
warnings.filterwarnings('ignore')

# Data Loading

### Mount Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

### Download Dataset from Google Drive

In [None]:
%%capture

# Install gdown; the %%capture magic above hides the installer output.
!pip install gdown

In [None]:
import gdown

# Google Drive id of the raw dataset file.
file_id = '1WFPrMvJ9tVBP3_ulCdJn40NPpHS2r8vA'

# Save into the working directory: the load step reads './dataset.csv', and the
# previous target ('data/dataset.csv') pointed at a folder that is never created
# and at a path that is never read back.
output_filename = 'dataset.csv'

gdown.download(f'https://drive.google.com/uc?id={file_id}', output_filename, quiet=False)

### Load Dataset

In [None]:
# Read the dataset CSV from the current working directory.
df = pd.read_csv('./dataset.csv')

# Preview the first rows.
df.head()

### Keep only the relevant columns

In [None]:
# Index the rows by report id (the 'id' column itself is not kept), then retain
# only the fields used by the rest of the notebook.
df = df.set_index('id', drop=False).loc[
    :, ['content', 'category', 'created_at', 'longitude', 'latitude']
]

df.head()

# Exploratory Data Analysis

In [None]:
# Summarize column dtypes and non-null counts.
df.info()

### Check for Copy-paste Spammer

In [None]:
# Number of rows whose 'content' repeats an earlier row.
df['content'].duplicated().sum()

In [None]:
# Preview a few of the repeated reports.
df[df['content'].duplicated()].head()

Oke, data kita so-far banyak spammer copy-paste. Jadi kita harus filtering spammer-spammer ini.

### Data Distributions

In [None]:
# Horizontal bar chart of the 30 most frequent categories.
fig, ax = plt.subplots(figsize=(10, 6))
category_counts = df['category'].value_counts().head(30)
category_counts.plot(kind='barh', ax=ax)
fig.tight_layout()
plt.show()

Disini distribusi kategori cenderung tidak imbang, sehingga kalau kita pakai metode supervised learning buat generate label based on text, model bakal cenderung bias ke arah label mayoritas. Makanya, perlu approach yang lebih efektif buat handling bias ini, jadi kita bakal pakai metode unsupervised buat generate label bersamaan dengan hidden sub-topic based on text laporan yang dibuat pada kolom 'content'.

### Noise Analysis

- Word Count Analysis of each content (panjang rata-rata laporan warga buat semua kategori)

In [None]:
# Whitespace-separated word count of every (raw) report.
df['word_count'] = [len(str(report).split()) for report in df['content']]

# Histogram with a KDE overlay, limited to the 0-100 word range on the x axis.
plt.figure(figsize=(10, 5))
ax = sns.histplot(df['word_count'], kde=True)
ax.set_title('Distribusi Jumlah Kata per Laporan')
ax.set_xlabel('Jumlah Kata')
ax.set_xlim(0, 100)
plt.show()

- Summary Statistics of Word Count

In [None]:
# Summary statistics of the per-report word count.
df['word_count'].describe().T

Kita bisa liat ada noise di data yang ditunjukkan oleh adanya laporan yang hanya tersusun dari 1 kata aja.

In [None]:
# Inspect the reports that consist of exactly one word.
df[df['word_count'] == 1]['content']

Diatas terlihat bahwa laporan-laporan yang dibuat adalah laporan yang basically bisa kita anggap sebagai noise karena laporan jenis ini tidak akan memiliki `semantics context` yang cukup buat model kita untuk bekerja secara efektif. Jadi, kita harus ngelakuin filtering di bagian data preprocessing terhadap data dengan word count yang sedikit.

# Data Preprocessing

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources named below into the local NLTK data directory.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('words')

### regex

In [None]:
def clean_text(text):
    """Normalize one report into lowercase alphanumeric words.

    Steps, in order: lowercase, strip URLs, strip @mentions and #hashtags,
    replace every character outside ``[a-z0-9]`` and whitespace with a space,
    then collapse runs of whitespace.

    Args:
        text: The raw report. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned string; ``''`` when the input is missing or nothing survives.
    """
    # Missing values would otherwise be stringified into the literal word "nan"
    # (or "none") and leak into the corpus as if it were report text.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove mentions & hashtags
    text = re.sub(r'@\w+|#\w+', '', text)

    # Replace emoji & other symbols with a space
    text = re.sub(r'[^a-z0-9\s]', ' ', text)

    # Collapse repeated whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Replace every report with its clean_text-normalized version.
df['content'] = df['content'].apply(clean_text)

### Delete Copy-Paste Spammer

In [None]:
# Row count before de-duplication.
initial_count = len(df)
# Keep only the first occurrence of each distinct cleaned report.
df = df.drop_duplicates(subset=['content'])

In [None]:
# Sanity check: number of duplicated reports left after drop_duplicates.
df['content'].duplicated().sum()

### Delete Noise (Laporan warga yang kurang dari 4 kata (ga-detailed))

In [None]:
# Recompute the word count on the current (cleaned) text before filtering: the
# existing 'word_count' column was derived from the raw reports, so URLs,
# mentions, hashtags and symbols that cleaning removed were still being counted.
df['word_count'] = df['content'].apply(lambda x: len(str(x).split()))

# Keep only reports with at least 4 words; shorter ones carry too little context.
df = df[df['word_count'] >= 4].copy()

In [None]:
# Word-count statistics of the reports that remain after filtering.
df['word_count'].describe()

### Simpen ke .csv biar cepet kalau mau re-training notebook.

In [None]:
# List the columns currently present in the frame.
df.columns

In [None]:
# Keep only these five columns for the persisted dataset.
df = df[['content', 'category', 'created_at', 'longitude', 'latitude']]

# index=False: the row index is not written to the file.
df.to_csv('cleaned_dataset.csv', index=False)

### Final Results on Cleaned Dataset

In [None]:
# Column dtypes and non-null counts of the cleaned dataset.
df.info()

Setelah Filtering dkk, dataset kita dari 53898 jadi sisa 39816 baris.

# Fine-Tuning (IndoBERT)

In [None]:
%%capture

# Install sentence-transformers; the %%capture magic above hides the installer output.
!pip install sentence-transformers

In [None]:
import torch
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, models, losses, datasets
import gc
import os

# Avoid being prompted for a login during fine-tuning
os.environ["WANDB_MODE"] = "disabled"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
# Run Python garbage collection, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

- Reload cleaned csv

In [None]:
# Reload the cleaned dataset written earlier in the notebook.
df = pd.read_csv('cleaned_dataset.csv')

# Force the text column to str (a missing value would become the string 'nan').
df['content'] = df['content'].astype(str)

df.head()

In [None]:
# Plain Python list of report texts, in row order.
X = df['content'].tolist()

### TSDAE (Transformer-based Sequential Denoising Auto-Encoder)

Denoising AutoEncoder (Model belajar menebak kalimat asli dari kalimat yang dirusak dengan diberi noise)

Ini contoh metode fine-tuning dengan tujuan supaya model kita bisa ubah text kita ke vector embedding yang benar-benar merepresentasikan text tersebut.

- Load Model (IndoBERT)

In [None]:
# Load the pretrained IndoBERT checkpoint as the token-level encoder (max_seq_length=128).
word_embedding_model = models.Transformer('indobenchmark/indobert-base-p1', max_seq_length=128)

- Tambahin Pooling Layer

In [None]:
# Add a pooling layer (required so IndoBERT's token embeddings become one sentence vector)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

In [None]:
# Combine both modules into a single ready-to-use model on the selected device
model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)

- Create the training data (wrap the sentences in a sentence-transformers `DenoisingAutoEncoderDataset`)

In [None]:
# Wrap the report texts in the dataset class used for TSDAE training.
train_dataset = datasets.DenoisingAutoEncoderDataset(X)

- Convert it into DataLoader

In [None]:
# Shuffled mini-batches of 8 examples.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

- Specify Loss Function: TSDAE menggunakan DenoisingAutoEncoderLoss

In [None]:
# Denoising auto-encoder loss for the model; tie_encoder_decoder=True is passed
# and no separate decoder checkpoint is given.
train_loss = losses.DenoisingAutoEncoderLoss(
    model,
    tie_encoder_decoder=True
)

In [None]:
# Start fine-tuning (continue training the pretrained IndoBERT model on our dataset):
# one epoch, constant learning rate 3e-5, no weight decay, mixed precision enabled.
# NOTE(review): fit() may return None, in which case `history` holds no training
# metrics — confirm against the installed sentence-transformers version.
history = model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,
    weight_decay=0,
    scheduler='constantlr',
    optimizer_params={'lr': 3e-5},
    show_progress_bar=True,
    use_amp=True
)

# Text Representation

In [None]:
# Encode every report into an embedding vector, returned as a NumPy array.
# normalize_embeddings=True is requested, so vectors are expected to be unit-length.
embeddings = model.encode(
    X,
    batch_size=32,
    show_progress_bar=True,
    device=device,
    convert_to_numpy=True,
    normalize_embeddings=True
)

- Store Embedding Vector Results

Biar nanti kalau mau ulang training atau runtime T4 Colab abis, gausah retrain lagi, tinggal pakai embedding vector yang udah disimpan.

In [None]:
# Store results: make sure the output folder exists
os.makedirs('../output', exist_ok=True)

# Save the embedding vectors (as a NumPy file) & the report metadata
np.save('../output/embeddings_indobert.npy', embeddings)
df.to_csv('../output/metadata_laporan.csv', index=False)

# Graph Construction

In [None]:
%%capture

# Install the Louvain implementation and networkx; %%capture hides the installer output.
!pip install python-louvain networkx

In [None]:
import networkx as nx
import community.community_louvain as community_louvain

### Data Reloading

In [None]:
# Reload the stored embeddings and the matching report metadata from disk.
embeddings = np.load('../output/embeddings_indobert.npy')
df = pd.read_csv('../output/metadata_laporan.csv')

### Threshold

Logika simpelnya, when similarity hasil dari cosine similarity > threshold, buat edgesnya di graphnya

In [None]:
# Minimum cosine similarity (exclusive) for two reports to be connected by an edge.
THRESHOLD = 0.75

### Graph Initialization

In [None]:
# Start from an edgeless graph holding one node per dataset row (node id = row position)
num_nodes = len(df)
G = nx.Graph()
G.add_nodes_from(range(num_nodes))

### Cosine Similarity

In [None]:
batch_size = 1000
for start in tqdm(range(0, num_nodes, batch_size)):
    stop = min(start + batch_size, num_nodes)

    # Similarity of this batch of rows against the whole dataset
    sim_matrix = cosine_similarity(embeddings[start:stop], embeddings)

    # Positions (batch-local row, global column) whose similarity exceeds the threshold
    local_rows, cols = np.nonzero(sim_matrix > THRESHOLD)
    global_rows = local_rows + start

    # Keep each unordered pair once (row id < column id); this also skips self-pairs
    upper = global_rows < cols
    local_rows, global_rows, cols = local_rows[upper], global_rows[upper], cols[upper]

    # Add the surviving pairs as edges weighted by their similarity
    G.add_weighted_edges_from(zip(global_rows, cols, sim_matrix[local_rows, cols]))

### Final Graph

In [None]:
# Report the size of the final graph (node and edge counts).
print(f"Jumlah Nodes: {G.number_of_nodes()}")
print(f"Jumlah Edges: {G.number_of_edges()}")

### Plot

# Louvain Algorithm

Clustering nodes yang ada di graph. Clustering dilakukan terhadap seberapa rapat node-node saling terhubung di dalam cluster dibanding antar-cluster.

- **Node yang banyak edge similarity tinggi** bakal **masuk cluster yang sama**.

### Graph Clustering

In [None]:
# Louvain community detection on G with resolution 0.5 and a fixed random_state;
# the result is later used as a node -> community id mapping.
partition = community_louvain.best_partition(G, resolution=0.5, random_state=42)

### Mapping

In [None]:
# Look up each row's index label in the partition to get its community id.
df['cluster'] = df.index.map(partition)

In [None]:
# Number of distinct clusters found.
df['cluster'].nunique()

In [None]:
# Persist the reports together with their cluster id (row index not written).
output_file = '../output/laporan_warga_clustered.csv'
df.to_csv(output_file, index=False)

# Topic Extraction (TF-IDF)

- Ambil laporan yang udah clustered

In [None]:
# Reload the clustered reports from disk.
df = pd.read_csv('../output/laporan_warga_clustered.csv')

### Stopwords removal

In [None]:
# Custom Indonesian stop-word list: common function words, courtesy phrases,
# address/administrative terms, URL fragments, time words and a few numbers.
id_stop_words = [
    'dan', 'di', 'yang', 'untuk', 'ini', 'itu', 'dari', 'ke', 'saya', 'mohon',
    'ada', 'tidak', 'sudah', 'akan', 'pada', 'juga', 'dengan', 'karena', 'bisa',
    'tolong', 'terima', 'kasih', 'segera', 'tindak', 'lanjuti', 'atau', 'agar',
    'apakah', 'seperti', 'namun', 'tapi', 'kalo', 'kalau', 'banyak', 'sangat',
    'terdapat', 'kami', 'para', 'adalah', 'sebagai', 'laporan', 'warga', 'jakarta',
    'dki', 'kelurahan', 'kecamatan', 'melalui', 'kepada', 'yth', 'bapak', 'ibu',
    'masalah', 'terkait', 'sebuah', 'satu', 'dua', 'tiga', 'hari', 'saat', 'jam',
    'lokasi', 'depan', 'belakang', 'samping', 'jalan', 'jl', 'rt', 'rw', 'jk',
    'https', 'http', 'www', 'com', 'id', 'co', 'html', 'lebih', 'kurang', 'hal', 'ternyata',
    'tahun', 'thn', 'th', 'tanggal', 'tgl', 'bulan', 'bln',
    'wib', 'wit', 'wita', 'pukul',
    '00', '01', '02', '03', '2023', '2024', '2022'
]

### Data Grouping (by Cluster)

In [None]:
# Ensure every report is a str so the texts can be joined per cluster.
df['content'] = df['content'].astype(str)

In [None]:
# One row per cluster: all of its reports concatenated into one space-separated document.
docs_per_class = df.groupby(['cluster'], as_index=False).agg({'content': ' '.join})

In [None]:
# Preview the per-cluster documents.
docs_per_class.head()

### TF-IDF (Keyword Extractor)

In [None]:
# TF-IDF over the per-cluster documents: vocabulary capped at 1000 features,
# with the custom stop words excluded.
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words=id_stop_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(docs_per_class['content'])
# Vocabulary terms, positionally aligned with the columns of tfidf_matrix.
feature_names = tfidf_vectorizer.get_feature_names_out()

### Top-5 Keywords Extractions

Dari banyaknya keyword yang di-extract, ambil 5 keyword teratas aja untuk tiap cluster.

In [None]:
top_n_words = 5 # keep the top 5 as a safety margin (if the top-1 keyword is unusable, the next one can be used)
# Filled later with cluster id -> list of top keywords.
cluster_labels = {}

In [None]:
# Label every cluster with its highest-weighted TF-IDF terms.
for i in range(len(docs_per_class)):
    cluster_id = docs_per_class.iloc[i]['cluster']

    # Dense TF-IDF weights of this cluster's document, one entry per vocabulary term
    row = tfidf_matrix[i].toarray().flatten()

    # Indices of the top_n_words largest weights, highest first
    top_indices = row.argsort()[-top_n_words:][::-1]

    # Skip zero-weight terms: they do not occur in this cluster at all, so for
    # small clusters they would otherwise show up as arbitrary filler keywords.
    # The label may therefore hold fewer than top_n_words entries (possibly none).
    cluster_labels[cluster_id] = [
        feature_names[idx] for idx in top_indices if row[idx] > 0
    ]

### Mapping

In [None]:
# Attach each row's cluster keyword list as its topic label.
df['topic_label'] = df['cluster'].map(cluster_labels)

### Simpen lagi ke .csv

In [None]:
# Persist the labeled reports (row index not written).
output_file = '../output/final_laporan_warga_labeled.csv'
df.to_csv(output_file, index=False)

# Zero-Shot Classification

In [None]:
# TODO: add zero-shot classification to reduce the 5 keyword labels to a single label.

# Evaluation

### F1-Score (Alignment Analysis)

Buat ngukur seberapa selaras topik-topik baru yang terbentuk (Cluster) align dengan Kategori lama.

Contoh: Kalau Cluster 0 isinya 90% kategori "Jalan", maka Cluster 0 = "Jalan"

Interpretasi: 
- F1 Tinggi = Kluster sangat selaras dengan kategori asli (Discovery rendah).
- F1 Rendah = BAGUS. Artinya model menemukan topik baru yang lebih spesifik/lintas kategori.

In [None]:
# Reload the labeled reports from disk.
df = pd.read_csv('../output/final_laporan_warga_labeled.csv')

In [None]:
# For each cluster, take the most frequent original category (first mode on ties)...
majority_by_cluster = df.groupby('cluster')['category'].agg(lambda x: x.mode()[0])
cluster_to_category = majority_by_cluster.to_dict()

# ...and assign it to every row of that cluster as the baseline prediction.
df['predicted_category_baseline'] = df['cluster'].map(cluster_to_category)

In [None]:
# Ground truth = original category; prediction = the cluster's majority category.
y_true = df['category']
y_pred = df['predicted_category_baseline']

In [None]:
# Macro-averaged F1: the unweighted mean of the per-category F1 scores.
f1 = f1_score(y_true, y_pred, average='macro')
f1

In [None]:
# Per-category precision / recall / F1 breakdown.
print(classification_report(y_true, y_pred))

### Modularity Score

Interpretasinya:
- Modularity Score > 0.3 dianggap struktur clustering yang baik/solid.

In [None]:
# Modularity of the Louvain partition on graph G.
# NOTE(review): this uses the in-memory `partition` and `G` from the clustering
# section; neither is reloaded from disk in this section.
modularity = community_louvain.modularity(partition, G)
modularity