# Setting Environment

In [13]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [14]:
# Set the working folder (current working directory) for the notebook.
import os
# NOTE(review): machine-specific absolute path; cfg_path in the LLM-parameter cell is built from `cwd`.
cwd = "/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita"
# Google Colab alternative, kept for reference:
# cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

# MAIN CODE

## EDIT PARAMETER UNTUK LLM

In [19]:
# %% Save LLM params to config.json (OpenAI / DeepSeek)
import json, os, tempfile, shutil
from pathlib import Path
# config.json lives directly inside the working folder; keep both the str and the Path form.
cfg_path = cwd + "/config.json"
CONFIG_PATH = Path(cfg_path)

# Keys merged into config.json when the OpenAI provider is selected.
OPENAI_PARAMS = {
    "enable_llm": True,
    "enable_llm_cluster_label": True,
    "llm_provider": "openai",
    "llm_model": "gpt-4o-mini",
    "llm_temperature": 0.2,
    "llm_max_tokens": 200,
    "filter_to_three_issues": True
}

# Keys merged into config.json when the DeepSeek provider is selected.
DEEPSEEK_PARAMS = {
    "enable_llm": True,
    "enable_llm_cluster_label": True,
    "llm_provider": "deepseek",         # switch provider
    "llm_model": "deepseek-chat",       # example; adjust to the model you use (e.g. "deepseek-reasoner")
    "llm_temperature": 0.2,
    "llm_max_tokens": 200,
    "filter_to_three_issues": True
}

def load_config_safe(path: Path) -> dict:
    """Load the JSON config at ``path`` and always hand back a dict.

    Args:
        path: Location of the config file.

    Returns:
        The parsed JSON object. An empty dict is returned when the file does
        not exist (its parent folder is created), when it cannot be read or
        parsed, or when its top-level JSON value is not an object.

    Raises:
        OSError: If the unusable file cannot be copied to its backup.
    """
    if not path.exists():
        path.parent.mkdir(parents=True, exist_ok=True)
        return {}
    try:
        with path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError):
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        data = None
    if isinstance(data, dict):
        return data
    # Unusable content (broken JSON, or valid JSON that is not an object, which
    # would break the callers' dict.update): keep a copy and start fresh.
    backup = path.with_suffix(path.suffix + ".corrupt.bak")
    shutil.copy2(path, backup)
    return {}

def atomic_write_json(path: Path, data: dict):
    """Write ``data`` as indented UTF-8 JSON to ``path`` via a temp file.

    The JSON is first written to a scratch file in the same folder. If
    ``path`` already exists it is copied to ``<name>.bak``, then the scratch
    file is moved over ``path`` with ``os.replace``.
    """
    handle, scratch = tempfile.mkstemp(
        dir=str(path.parent), prefix="cfg_", suffix=".json"
    )
    # Only the name is needed; the file is reopened in text mode below.
    os.close(handle)
    try:
        with open(scratch, "w", encoding="utf-8") as out:
            json.dump(data, out, ensure_ascii=False, indent=2)
        # Keep the previous version next to the config.
        if path.exists():
            previous = path.with_suffix(path.suffix + ".bak")
            shutil.copy2(path, previous)
        # Swap the new file into place.
        os.replace(scratch, path)
    finally:
        # The scratch file only survives when something above failed.
        leftover = os.path.exists(scratch)
        if leftover:
            try:
                os.remove(scratch)
            except Exception:
                pass

def update_llm_params(config_path: Path, new_params: dict) -> dict:
    """Merge ``new_params`` into the stored config and persist the result.

    Only the keys present in ``new_params`` are overwritten; every other key
    already in the file is kept. Returns the merged config.
    """
    merged = load_config_safe(config_path)
    merged.update(new_params)
    atomic_write_json(config_path, merged)
    return merged

# === Pick one provider ===
# Read the current config.json; its "AI_name" entry selects which parameter set is saved.
with open(cfg_path, 'r', encoding="utf-8") as f:
    config = json.load(f)

ai_name = config.get("AI_name")

# Map each supported provider name to its parameter set.
PROVIDER_PARAMS = {"openai": OPENAI_PARAMS, "deepseek": DEEPSEEK_PARAMS}
selected_params = PROVIDER_PARAMS.get(ai_name) if isinstance(ai_name, str) else None
if selected_params is None:
    # Fail here with a clear message instead of a NameError on cfg_after below.
    raise ValueError(
        f"'AI_name' di {CONFIG_PATH} harus salah satu dari {sorted(PROVIDER_PARAMS)}, ditemukan: {ai_name!r}"
    )
cfg_after = update_llm_params(CONFIG_PATH, selected_params)

print("Config saved to:", CONFIG_PATH)
print("Provider:", cfg_after.get("llm_provider"))
print("Model   :", cfg_after.get("llm_model"))

# --- Catatan kredensial (JANGAN simpan API key di config.json) ---
# Untuk OpenAI:
# import os
# os.environ["OPENAI_API_KEY"] = "sk-..."  # set di environment/secret manager

# Untuk DeepSeek:
# import os
# os.environ["DEEPSEEK_API_KEY"] = "sk-..."  # set di environment/secret manager


Config saved to: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/config.json
Provider: deepseek
Model   : deepseek-chat


## PROSES ANALISIS

In [20]:
# %% [markdown]
# === Pipeline: Baca Konten -> Embeddings -> Clustering -> Isu -> LLM Cluster Label ===
# Output: seluruh kolom awal + cluster_id + kategori_isu + cluster_label
# Simpan CSV + update config.json (last_output_path)

# %% [code]
!pip -q install sentence-transformers hdbscan umap-learn scikit-learn tqdm python-dateutil orjson tenacity requests

# %% [code]
import os, re, json, orjson, warnings, tempfile, shutil, math
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from dateutil import parser as dateparser

from sentence_transformers import SentenceTransformer
import hdbscan
import umap

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer

import requests
from tenacity import retry, wait_exponential, stop_after_attempt

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Enable tqdm's pandas integration.
tqdm.pandas()

# =============================================================================
# 0) KONFIGURASI
# =============================================================================

def load_config_safe(path: Path) -> dict:
    """Return the parsed config at ``path``, or ``{}`` when the file is absent.

    When the file is absent its parent folder is created. Parsing uses orjson
    and parse errors are not caught here.
    """
    if path.exists():
        return orjson.loads(path.read_bytes())
    path.parent.mkdir(parents=True, exist_ok=True)
    return {}

def save_config_safe(path: Path, data: dict):
    """Persist ``data`` to ``path`` as indented JSON, replacing the file in one step.

    The payload is written to a temp file in the same folder. An existing
    ``path`` is copied to ``<name>.bak`` first, then the temp file is moved
    over ``path`` with ``os.replace``.

    Args:
        path: Destination config file.
        data: JSON-serialisable mapping to store.
    """
    fd, tmp_name = tempfile.mkstemp(prefix="cfg_", suffix=".json", dir=path.parent.as_posix())
    tmp = Path(tmp_name)
    try:
        # Write through the descriptor mkstemp already opened so that it is
        # closed exactly once (previously it was dropped and leaked).
        with os.fdopen(fd, "wb") as f:
            f.write(orjson.dumps(data, option=orjson.OPT_INDENT_2))
        if path.exists():
            shutil.copy2(path, path.with_suffix(path.suffix + ".bak"))
        os.replace(tmp, path)
    finally:
        # The temp file only still exists when a step above failed.
        if tmp.exists():
            try:
                tmp.unlink()
            except OSError:
                pass

# CONFIG_PATH is defined in the LLM-parameter cell above; that cell must run first.
config = load_config_safe(CONFIG_PATH)

# Input CSV (last_output_path takes priority, OUTPUT_CSV is the fallback)
INPUT_CSV = config.get("last_output_path") or config.get("OUTPUT_CSV")
if not INPUT_CSV or not Path(INPUT_CSV).exists():
    raise FileNotFoundError(
        f"Input CSV tidak ditemukan. Isi 'last_output_path' atau 'OUTPUT_CSV' di {CONFIG_PATH}"
    )

# Output CSV: fixed file name inside the input file's folder
out_dir = Path(INPUT_CSV).parent
OUTPUT_CSV = str(out_dir / "news_clustered_labeled.csv")

# Options read from config.json (editable there)
# NOTE(review): bool() on a non-empty string such as "false" is True — confirm these values are stored as JSON booleans.
EMB_MODEL_NAME = config.get("embedding_model", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
USE_UMAP = bool(config.get("use_umap", True))     # automatically disabled for N<50 (see below)
FILTER_TO_THREE_ISSUES = bool(config.get("filter_to_three_issues", True))
ENABLE_LLM = bool(config.get("enable_llm", False))
# LLM cluster labelling needs both its own flag and the global LLM flag.
ENABLE_LLM_CLUSTER_LABEL = bool(config.get("enable_llm_cluster_label", True)) and ENABLE_LLM
RESET_CLUSTER_LABEL_CACHE = bool(config.get("reset_cluster_label_cache", False))

@dataclass
class LLMConfig:
    """Provider, model and sampling settings passed to the cluster-label LLM call."""
    provider: str = "openai"     # "openai" or "deepseek"
    model: str = "gpt-4o-mini"
    temperature: float = 0.2
    max_tokens: int = 200

# Build the LLM settings from config.json, falling back to the dataclass defaults.
llm_cfg = LLMConfig(
    provider = config.get("llm_provider", "openai"),
    model = config.get("llm_model", "gpt-4o-mini"),
    temperature = float(config.get("llm_temperature", 0.2)),
    max_tokens = int(config.get("llm_max_tokens", 200)),
)

# Echo the effective settings so the run is traceable from the cell output.
print("CONFIG OK")
print("INPUT_CSV :", INPUT_CSV)
print("OUTPUT_CSV:", OUTPUT_CSV)
print("LLM:", {"enabled": ENABLE_LLM, "cluster_label_via_llm": ENABLE_LLM_CLUSTER_LABEL, "provider": llm_cfg.provider, "model": llm_cfg.model})

# =============================================================================
# 1) UTIL
# =============================================================================
def normalize_whitespace(s: str) -> str:
    """Collapse every whitespace run in ``s`` to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()

def final_cleanup(text: str) -> str:
    """Tidy article text: squeeze spaces/tabs and drop lines of two characters or fewer.

    Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    squeezed = re.sub(r"[ \t]+", " ", text)
    kept = []
    for raw_line in squeezed.splitlines():
        line = raw_line.strip()
        if len(line) > 2:
            kept.append(line)
    return "\n".join(kept).strip()

def safe_parse_dt(x) -> Optional[pd.Timestamp]:
    """Parse ``x`` (via its string form) into a pandas timestamp.

    Returns ``None`` when parsing raises for any reason.
    """
    try:
        parsed = dateparser.parse(str(x))
        return pd.to_datetime(parsed, errors="coerce")
    except Exception:
        return None

# =============================================================================
# 2) LOAD DATA
# =============================================================================
# Read the scraped articles and make sure the columns the pipeline relies on exist.
df = pd.read_csv(INPUT_CSV)
expected_cols = {"judul_berita", "url_berita", "artikel_berita"}
missing = expected_cols - set(df.columns)
if missing:
    raise KeyError(f"Kolom wajib hilang: {missing}. Pastikan CSV punya {expected_cols}")

# tanggal_berita_dt column (optional): derived only when the raw date column exists
if "tanggal_berita" in df.columns and "tanggal_berita_dt" not in df.columns:
    df["tanggal_berita_dt"] = df["tanggal_berita"].apply(safe_parse_dt)

# Remember the column order at this point; the final CSV puts these columns first.
initial_columns = df.columns.tolist()

# Clean the article body, then drop rows whose cleaned body is shorter than 60 characters.
df["artikel_berita"] = df["artikel_berita"].fillna("").map(final_cleanup)
df["is_empty"] = df["artikel_berita"].str.len().fillna(0).lt(60)
df = df.loc[~df["is_empty"]].drop(columns=["is_empty"]).reset_index(drop=True)
print("Records:", len(df))

# =============================================================================
# 3) EMBEDDINGS
# =============================================================================
# Load the sentence-embedding model named by EMB_MODEL_NAME.
emb_model = SentenceTransformer(EMB_MODEL_NAME)

def doc_text(row) -> str:
    """Build the text to embed for one row: title, blank line, article body, whitespace-normalised."""
    title = row.get('judul_berita','')
    body = row.get('artikel_berita','')
    return normalize_whitespace(f"{title}\n\n{body}")

# One embedding input string per row, in row order.
texts = df.apply(doc_text, axis=1).tolist()

# Encode in batches of 64 and stack the per-batch arrays into one matrix.
# NOTE(review): with zero rows the list stays empty and np.vstack raises ValueError.
embeddings = []
for i in tqdm(range(0, len(texts), 64), desc="Embedding"):
    batch = texts[i:i+64]
    vec = emb_model.encode(batch, show_progress_bar=False, normalize_embeddings=True)
    embeddings.append(vec)
embeddings = np.vstack(embeddings)
print("Embeddings:", embeddings.shape)

# =============================================================================
# 4) CLUSTERING ADAPTIF (HDBSCAN -> fallback KMeans)
# =============================================================================
N = len(df)

# UMAP is applied only when enabled in the config and there are at least 50 rows.
use_umap = (USE_UMAP and N >= 50)
X = embeddings
# Distance metric handed to HDBSCAN; read as a global by run_hdbscan_try.
metric = "cosine"

if use_umap:
    reducer = umap.UMAP(
        n_neighbors=15, n_components=15, min_dist=0.0,
        metric="cosine", random_state=42
    )
    X = reducer.fit_transform(embeddings)
    # Cluster the reduced coordinates with euclidean distance.
    metric = "euclidean"

def run_hdbscan_try(X, min_cluster_size, min_samples=1, eps=0.0, method="leaf"):
    """Fit HDBSCAN on ``X`` with the given settings and return the predicted labels.

    The distance metric comes from the module-level ``metric`` variable.
    """
    settings = {
        "min_cluster_size": min_cluster_size,
        "min_samples": min_samples,
        "metric": metric,
        "cluster_selection_epsilon": eps,
        "cluster_selection_method": method,
        "prediction_data": True,
    }
    clusterer = hdbscan.HDBSCAN(**settings)
    return clusterer.fit_predict(X)

# Adaptive min_cluster_size heuristic: three candidates for N < 30, two size-proportional ones otherwise.
mcs_candidates = [max(2, int(round(N*0.2))), 3, 2] if N < 30 else [max(5, int(round(N*0.03))), max(5, int(round(N*0.02)))]

# Start with everything marked as noise; keep the first setting that yields at least one non-noise label.
labels = np.full(N, -1)
used_params = None
for mcs in mcs_candidates:
    for eps in [0.0, 0.05, 0.1]:
        for method in ["leaf", "eom"]:
            try:
                lab = run_hdbscan_try(X, min_cluster_size=mcs, min_samples=1, eps=eps, method=method)
                if (lab != -1).any():
                    labels = lab
                    used_params = {"algo": "hdbscan", "min_cluster_size": mcs, "eps": eps, "method": method, "umap": use_umap}
                    break
            except Exception:
                # A failing parameter combination is skipped silently; the next one is tried.
                pass
        # Propagate the inner break outwards once a usable labelling was found.
        if (labels != -1).any(): break
    if (labels != -1).any(): break

# KMeans fallback: used only when HDBSCAN left every point as noise.
if not (labels != -1).any():
    if N <= 2:
        # Too few rows to choose k: put everything in cluster 0.
        km_labels = np.zeros(N, dtype=int)
    else:
        # Try k = 2..min(5, N-1) and keep the labelling with the best silhouette score.
        k_min, k_max = 2, min(5, N-1)
        best = (-1.0, None)
        for k in range(k_min, k_max+1):
            km = KMeans(n_clusters=k, n_init=10, random_state=42)
            pred = km.fit_predict(X)
            if len(set(pred)) > 1:
                try:
                    sc = silhouette_score(X, pred, metric="euclidean" if use_umap else "cosine")
                except Exception:
                    sc = -1.0
            else:
                sc = -1.0
            # Strict comparison: a score of exactly -1.0 never replaces the initial (-1.0, None).
            if sc > best[0]:
                best = (sc, pred)
        km_labels = best[1] if best[1] is not None else np.zeros(N, dtype=int)
    labels = km_labels
    used_params = {"algo": "kmeans", "umap": use_umap}

# Attach the labels to the frame; -1 marks noise and is not counted as a cluster.
df["cluster_id"] = labels
n_clusters = len(set(labels)) - (1 if -1 in set(labels) else 0)
print(f"[Clustering] Params: {used_params} | unique labels: {sorted(set(labels))} | noise: {(labels==-1).sum()}")

# =============================================================================
# 5) EKSTRAKSI TOP-TERMS PER CLUSTER (anti stopwords/boilerplate)
# =============================================================================
# Compact Indonesian stopword list; extend as needed
STOPWORDS_ID = {
    "yang","dan","di","ke","dari","pada","untuk","dengan","ini","itu","atau","juga","karena","sebagai",
    "dalam","oleh","akan","telah","sudah","adalah","bagi","para","sebuah","tersebut","hingga","antara",
    "kami","kita","mereka","ia","dia","sehingga","agar","yakni","yaitu","selain","bahwa","saat","ketika",
    "setelah","sebelum","lebih","masih","namun","tapi","serta","pun","tiap","setiap","jadi","ingat",
    "bisa","dapat","maupun","kepada","atas","tahun","bulan","hari","kemarin","besok","senin","selasa",
    "rabu","kamis","jumat","sabtu","minggu","jakarta","indonesia"
}
# News-portal names that should not surface as topic terms.
PORTAL_WORDS = {"liputan6","liputan6.com","detik","detikcom","cnbc","cnbcindonesia","okezone","kompas","cnn","cnnindonesia"}
# Tokens that are always kept by the analyzer, even if they appear in the lists above.
# NOTE(review): TOKEN_RE below only yields runs of 3+ ASCII letters, so entries that are
# two letters long, contain digits ("g20") or contain spaces can never match a token.
KEEP_TOKENS = {
    "apbn","apbd","pnbp","kemenkeu","djp","djbc","djkn","djppr","sbn","bea","cukai","pajak",
    "mk","dpr","kpk","polri","tni","imf","asean","g20","apec","pbb","oecd","ue","wto",
    "rupiah","usd","ekspor","impor","bansos","bea cukai","surat berharga negara"
}
# A token is a run of at least three ASCII letters.
TOKEN_RE = re.compile(r"[a-zA-Z]{3,}")

def _normalize_portal(text: str) -> str:
    text = re.sub(r"(?i)\b(liputan6\.com|detikcom|cnnindonesia|cnbcindonesia|okezone|kompas)\b", " ", text)
    text = re.sub(r"^[A-Za-z\. ]+,\s*[A-Za-z ]+\s*-\s*", " ", text)  # "Liputan6.com, Jakarta - "
    return text

def make_custom_analyzer():
    """Return an analyzer callable that turns a document into unigrams plus adjacent bigrams.

    Tokens come from TOKEN_RE on the lowercased, portal-stripped text. A token
    is kept when it is in KEEP_TOKENS or in neither STOPWORDS_ID nor
    PORTAL_WORDS, contains no digit, and is not one of "rp", "idr", "000".
    """
    def _is_kept(tok: str) -> bool:
        allowed = (tok in KEEP_TOKENS) or (tok not in STOPWORDS_ID and tok not in PORTAL_WORDS)
        if not allowed:
            return False
        if any(ch.isdigit() for ch in tok):
            return False
        return tok not in {"rp","idr","000"}

    def analyzer(doc: str):
        cleaned = _normalize_portal(doc.lower())
        unigrams = [tok for tok in TOKEN_RE.findall(cleaned) if _is_kept(tok)]
        # Pair each kept token with its successor, skipping pairs that touch a stopword.
        pairs = [
            f"{left} {right}"
            for left, right in zip(unigrams, unigrams[1:])
            if left not in STOPWORDS_ID and right not in STOPWORDS_ID
        ]
        return unigrams + pairs

    return analyzer

def extract_cluster_top_terms(docs: List[str], labels: np.ndarray, topk: int = 20) -> Dict[int, List[Tuple[str, float]]]:
    """Rank the most characteristic terms of each cluster with TF-IDF.

    All documents of a cluster are concatenated into one "cluster document",
    and TF-IDF is computed across those cluster documents.

    Args:
        docs: Document texts, aligned with ``labels``.
        labels: Cluster label per document; ``-1`` (noise) is ignored.
        topk: Maximum number of terms returned per cluster.

    Returns:
        Mapping of cluster label to a list of ``(term, tfidf_weight)`` pairs
        in descending weight order. Clusters whose TF-IDF row is all zeros are
        omitted. An empty dict is returned when there is no non-noise cluster.
    """
    clusters: Dict[int, List[str]] = {}
    for text, lab in zip(docs, labels):
        if lab == -1:   # ignore noise
            continue
        clusters.setdefault(lab, []).append(text)

    if not clusters:
        return {}

    cluster_ids = list(clusters)
    cluster_docs = [" ".join(clusters[lab]) for lab in cluster_ids]
    n_docs = len(cluster_docs)

    min_df_val = 1 if n_docs < 30 else 2
    # With a single cluster document, max_df=0.80 resolves to fewer documents
    # than min_df and scikit-learn raises ValueError; disable the cap then.
    max_df_val = 0.80 if n_docs > 1 else 1.0
    vectorizer = TfidfVectorizer(
        analyzer=make_custom_analyzer(),
        # ngram_range, lowercase and strip_accents are documented as applying
        # only to non-callable analyzers; the custom analyzer builds its own bigrams.
        ngram_range=(1,2),
        max_df=max_df_val,
        min_df=min_df_val,
        max_features=30000,
        lowercase=True,
        strip_accents="unicode",
        sublinear_tf=True
    )
    tfidf = vectorizer.fit_transform(cluster_docs)
    vocab = np.array(vectorizer.get_feature_names_out())

    top_terms: Dict[int, List[Tuple[str, float]]] = {}
    for i, lab in enumerate(cluster_ids):
        row = tfidf.getrow(i).toarray().ravel()
        if not row.any():
            continue
        # Indices of the topk largest weights, largest first.
        idx = np.argsort(-row)[:topk]
        top_terms[lab] = [(vocab[j], float(row[j])) for j in idx]
    return top_terms

# Top 20 TF-IDF terms per cluster; read as a global by the issue-mapping and labelling steps below.
cluster_top_terms = extract_cluster_top_terms(texts, labels, topk=20)

# =============================================================================
# 6) MAPPING ISU (Kemenkeu / Nasional / Internasional)
# =============================================================================
# Keyword vocabularies for the three issue categories; matched as whole words/phrases, case-insensitively.
# Terms counted towards the "Kemenkeu" category.
KEMENKEU_TERMS = {
    "kemenkeu","kementerian keuangan","sri mulyani","apbn","apbd","pajak","djp","bea cukai","djbc",
    "pnbp","djkn","lkpp","djppr","lelang","surat berharga negara","sbn","penerimaan negara",
    "bea masuk","cukai","insentif fiskal","tax","perpajakan"
}
# Terms counted towards the "Nasional" category.
NASIONAL_TERMS = {
    "presiden","wakil presiden","sekretariat negara","istana","kemendagri","kemenaker","kemenhub","kemenkes",
    "kemendikbud","kemenkumham","kemenlu","kemendag","kemenperin","dpr","mk","kpk","polri","tni",
    "pemilu","pilkada","perppu","permen","perpres","perda","menteri"
}
# Terms counted towards the "Internasional" category.
INTERNASIONAL_TERMS = {
    "asean","g20","apec","imf","world bank","pbb","un","oecd","who","wto","fta","perjanjian dagang",
    "bilateral","multilateral","kunjungan kenegaraan","duta besar","kedutaan","ekspor","impor",
    "china","tiongkok","amerika serikat","usa","jepang","korea","australia","singapura","malaysia",
    "eropa","uni eropa","ue","timur tengah","india","rusia","ukraina","palestina","israel"
}

def compile_terms(terms):
    """Compile ``terms`` into one case-insensitive, word-bounded alternation regex.

    Longer terms are placed first so that a phrase wins over a shorter term it starts with.
    """
    ordered = list(terms)
    ordered.sort(key=len, reverse=True)
    alternatives = "|".join(re.escape(term) for term in ordered)
    return re.compile(r"\b(" + alternatives + r")\b", flags=re.IGNORECASE)

# One compiled matcher per issue category, built from the term sets above.
RE_KEMENKEU = compile_terms(KEMENKEU_TERMS)
RE_NASIONAL = compile_terms(NASIONAL_TERMS)
RE_INTERNASIONAL = compile_terms(INTERNASIONAL_TERMS)

def infer_cluster_issue(lab: int, top_terms: Dict[int, List[Tuple[str,float]]]) -> Optional[str]:
    """Pick the issue category whose terms occur most often among a cluster's top terms.

    Returns ``None`` for noise (``-1``), for clusters without top terms, or
    when no category term matches. Ties go to the category listed first.
    """
    if lab == -1 or lab not in top_terms:
        return None
    joined = " ".join(term for term, _ in top_terms[lab]).lower()
    matchers = (
        ("Kemenkeu", RE_KEMENKEU),
        ("Nasional", RE_NASIONAL),
        ("Internasional", RE_INTERNASIONAL),
    )
    counts = {}
    for issue, pattern in matchers:
        counts[issue] = len(pattern.findall(joined))
    winner = max(counts, key=counts.get)
    if counts[winner] > 0:
        return winner
    return None

def score_text_issue(text: str) -> Dict[str, int]:
    """Count, per issue category, how many category terms occur in ``text`` (case-insensitive)."""
    lowered = text.lower()
    matchers = (
        ("Kemenkeu", RE_KEMENKEU),
        ("Nasional", RE_NASIONAL),
        ("Internasional", RE_INTERNASIONAL),
    )
    return {issue: len(pattern.findall(lowered)) for issue, pattern in matchers}

def infer_issue_for_row(row) -> Optional[str]:
    """Assign an issue category to one article row.

    The cluster-level category (from the cluster's top terms) is preferred.
    If there is none, the article's own title and body are scored instead.
    Returns ``None`` when nothing matches.
    """
    # 1) issue inferred from the cluster's top terms
    from_cluster = infer_cluster_issue(row["cluster_id"], cluster_top_terms)
    if from_cluster:
        return from_cluster
    # 2) fallback: score this article's own text
    content = f"{row.get('judul_berita','')}\n{row.get('artikel_berita','')}"
    scores = score_text_issue(content)
    winner = max(scores, key=scores.get)
    if scores[winner] > 0:
        return winner
    return None

# Categorise every row (cluster-level first, per-article fallback).
df["kategori_isu"] = df.apply(infer_issue_for_row, axis=1)

# Optionally keep only rows that fell into one of the three categories.
# NOTE(review): after this filter `df` no longer lines up row-for-row with `texts`, `labels` and `embeddings`.
if FILTER_TO_THREE_ISSUES:
    df = df[df["kategori_isu"].isin(["Kemenkeu","Nasional","Internasional"])].copy().reset_index(drop=True)

# =========================
# 7) LLM-BASED CLUSTER LABEL (kalimat manusiawi) + CACHE
# =========================
import re, json, unicodedata

# 7.1 Normalisasi/ekspansi istilah agar LLM paham konteks (anti "keywordy")
# Regex pattern -> replacement used to expand or re-case abbreviations before they go into the LLM prompt.
ABBR_MAP = {
    r"\bmenkeu\b": "Menteri Keuangan",
    r"\bdjp\b": "Direktorat Jenderal Pajak",
    r"\bdjbc\b": "Direktorat Jenderal Bea dan Cukai",
    r"\bdjkn\b": "Direktorat Jenderal Kekayaan Negara",
    r"\bdjppr\b": "Direktorat Jenderal Pengelolaan Pembiayaan dan Risiko",
    r"\bsbn\b": "Surat Berharga Negara",
    r"\bapbn\b": "APBN",
    r"\boecd\b": "OECD",
    r"\bimf\b": "IMF",
    r"\bpbb\b": "PBB",
}

def expand_terms_for_prompt(terms: list[str]) -> list[str]:
    """Expand abbreviations in each term and order the result for the prompt.

    Each term is lowercased, run through every ABBR_MAP substitution, and has
    its whitespace collapsed. Multi-word entries come first, shorter before
    longer within each group.
    """
    def _expand(term: str) -> str:
        text = term.lower()
        for pattern, replacement in ABBR_MAP.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        # tidy up whitespace
        return re.sub(r"\s+", " ", text).strip()

    def _rank(entry: str):
        # phrases (containing a space) first, so the context reads more complete
        return (0 if " " in entry else 1, len(entry))

    expanded = [_expand(term) for term in terms]
    expanded.sort(key=_rank)
    return expanded

def term_has_stopword(term: str) -> bool:
    """Tell whether any word of ``term`` is a stopword or portal name not protected by KEEP_TOKENS."""
    for part in term.lower().split():
        if part in KEEP_TOKENS:
            continue
        if part in STOPWORDS_ID or part in PORTAL_WORDS:
            return True
    return False

def top_keywords_for_prompt(cid: int, k: int = 8) -> list[str]:
    """Return up to ``k`` prompt-ready keywords for cluster ``cid``.

    Top terms containing a stopword are dropped, the rest are expanded and
    ordered by ``expand_terms_for_prompt``, then overly generic words are removed.
    """
    # words that are too generic when they stand alone
    generic = {"kebijakan","pemerintah","nasional","internasional","ekonomi"}
    candidates = []
    for term, _weight in cluster_top_terms.get(cid, []):
        if not term_has_stopword(term):
            candidates.append(term)
    expanded = expand_terms_for_prompt(candidates)
    specific = [term for term in expanded if term not in generic]
    return specific[:k]

# 7.2 Ambil 2–3 sampel (judul + potongan isi) untuk memberi konteks
def pick_cluster_samples(df_in: pd.DataFrame, cluster_id: int, n: int = 3, excerpt_len: int = 700) -> str:
    """Format the first ``n`` articles of a cluster as a bullet block for the prompt.

    Each entry holds the whitespace-normalised title and the first
    ``excerpt_len`` characters of the whitespace-normalised body.
    """
    def _format(row) -> str:
        title = normalize_whitespace(str(row.get("judul_berita","")))
        body = normalize_whitespace(str(row.get("artikel_berita","")))
        excerpt = body[:excerpt_len]
        return f"- {title}\n  Cuplikan: {excerpt}"

    members = df_in[df_in["cluster_id"] == cluster_id].head(n)
    entries = [_format(row) for _, row in members.iterrows()]
    return "\n".join(entries).strip()

# 7.3 Sistem & prompt: wajib keluaran JSON + kalimat manusiawi (6–14 kata)
# System prompt (in Indonesian) for cluster labelling: asks for one short 6-14 word sentence
# with subject + action + object, lists forbidden formats, and requires a JSON reply
# with "label" and "confidence".
CLUSTER_LABEL_SYSTEM = (
    "Kamu adalah asisten pelabelan topik berita.\n"
    "Tugas: Buat SATU KALIMAT PENDEK (6–14 kata), bahasa Indonesia, mudah dipahami manusia.\n"
    "Syarat kalimat: mengandung subjek + aksi + objek/konteks (mis. 'Menteri Keuangan umumkan aturan pajak baru untuk UMKM').\n"
    "Larangan: jangan pakai pemisah pipa (|), jangan hanya daftar kata, jangan akronim mentah (gunakan bentuk lengkap jika mungkin), jangan pakai tanda kutip.\n"
    "Gaya: faktual, ringkas, huruf kecil kecuali nama/jabatan/lembaga.\n"
    "Keluaran WAJIB JSON valid (response_format JSON): {\"label\":\"...\",\"confidence\":0-1}."
)

def build_cluster_label_prompt(keywords: list[str], samples_block: str) -> list[dict]:
    """Assemble the system + user chat messages for labelling one cluster.

    At most eight keywords are listed; "-" is used when there are none.
    """
    if keywords:
        kw_str = ", ".join(keywords[:8])
    else:
        kw_str = "-"
    user_parts = [
        f"Kata kunci utama cluster:\n{kw_str}\n\n",
        f"Sampel artikel (judul + cuplikan):\n{samples_block}\n\n",
        "Hasilkan SATU kalimat (6–14 kata) yang merangkum tema utama cluster.\n",
        "Berikan HANYA JSON valid: {\"label\":\"...\",\"confidence\":0.xx}",
    ]
    system_message = {"role":"system","content":CLUSTER_LABEL_SYSTEM}
    user_message = {"role":"user","content":"".join(user_parts)}
    return [system_message, user_message]

# 7.4 Panggil OpenAI/Deepseek dengan JSON mode (tetap kompatibel dgn kode kamu)
def _openai_chat(model, messages, temperature, max_tokens):
    """Send ``messages`` to the OpenAI chat-completions API in JSON mode and return the reply text."""
    import openai

    request = {
        "model": model,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "messages": messages,
        "response_format": {"type":"json_object"},  # JSON mode
    }
    completion = openai.OpenAI().chat.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.message.content

def _deepseek_chat(model, messages, temperature, max_tokens):
    """POST ``messages`` to the DeepSeek chat-completions endpoint and return the reply text.

    The bearer token is read from the DEEPSEEK_API_KEY environment variable.
    HTTP error statuses raise through ``raise_for_status``.
    """
    api_key = os.environ.get('DEEPSEEK_API_KEY','')
    response = requests.post(
        "https://api.deepseek.com/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        json={
            "model": model,
            "messages": messages,
            "temperature": float(temperature),
            "max_tokens": int(max_tokens),
        },
        timeout=60,
    )
    response.raise_for_status()
    body = response.json()
    # No JSON mode is requested here; the caller parses a JSON object out of the text if one is present.
    return body["choices"][0]["message"]["content"]

@retry(wait=wait_exponential(multiplier=1, min=2, max=20), stop=stop_after_attempt(3))
def call_llm_cluster_label(keywords: list[str], samples_block: str, cfg: LLMConfig) -> dict:
    """Ask the configured LLM for a one-sentence cluster label.

    Args:
        keywords: Prompt keywords for the cluster.
        samples_block: Formatted sample articles for the cluster.
        cfg: Provider, model and sampling settings.

    Returns:
        ``{"label": str, "confidence": float}``. The label is empty when the
        reply contains no usable JSON object or no label; confidence falls
        back to 0.6 when it is missing or not numeric.

    Raises:
        ValueError: If ``cfg.provider`` is neither "openai" nor "deepseek".
        Provider/network errors propagate; the decorator retries the call up
        to three times with exponential back-off.
    """
    messages = build_cluster_label_prompt(keywords, samples_block)
    if cfg.provider == "openai":
        out = _openai_chat(cfg.model, messages, cfg.temperature, cfg.max_tokens)
    elif cfg.provider == "deepseek":
        out = _deepseek_chat(cfg.model, messages, cfg.temperature, cfg.max_tokens)
    else:
        raise ValueError(f"Provider LLM tidak dikenali: {cfg.provider}")
    # A reply without content is treated as empty text rather than crashing the regex fallback.
    if out is None:
        out = ""
    # Parse JSON: first the whole reply, then the outermost {...} span inside it.
    try:
        data = orjson.loads(out)
    except Exception:
        m = re.search(r"\{.*\}", out, flags=re.DOTALL)
        data = orjson.loads(m.group(0)) if m else {}
    # Valid JSON that is not an object (list, string, number) carries no label.
    if not isinstance(data, dict):
        data = {}
    raw_label = data.get("label")
    # A JSON null must not turn into the literal text "None".
    label = "" if raw_label is None else str(raw_label).strip()
    try:
        confidence = float(data.get("confidence", 0.6))
    except (TypeError, ValueError):
        confidence = 0.6
    return {"label": label, "confidence": confidence}

# 7.5 Pasca-proses agar selalu “kalimat manusiawi”
# (single-word label, replacement phrase) pairs used by humanize_sentence when a label is one bare word.
GENERIC_FILLER = [
    ("kebijakan", "kebijakan baru"),
    ("regulasi", "regulasi baru"),
    ("aturan", "aturan terbaru"),
]

def humanize_sentence(label: str) -> str:
    """Post-process a raw label into a short, sentence-like string.

    Pipes become commas, the text is NFKC-normalised and whitespace-collapsed,
    a bare generic word is swapped for its GENERIC_FILLER phrase, the first
    character is upper-cased, a final period is added when no ``.``, ``!`` or
    ``?`` ends the text, and anything beyond 22 words is cut off.
    Non-string or blank input yields an empty string.
    """
    if not isinstance(label, str) or not label.strip():
        return ""

    # drop pipe separators and odd characters, then squeeze whitespace
    text = unicodedata.normalize("NFKC", label.strip().replace("|", ","))
    text = re.sub(r"\s+", " ", text).strip()

    # a single bare word is very keyword-like: swap in a minimal phrase when one is defined
    if " " not in text:
        lowered = text.lower()
        replacement = next((phrase for word, phrase in GENERIC_FILLER if lowered == word), None)
        if replacement is not None:
            text = replacement

    # sentence case: only the first character is forced to upper case
    if text:
        text = text[0].upper() + text[1:]

    # make sure the text ends with sentence punctuation
    if re.search(r"[.!?]$", text) is None:
        text += "."

    # keep it concise: at most 22 words
    tokens = text.split()
    if len(tokens) > 22:
        text = " ".join(tokens[:22]) + "."

    return text

# 7.6 Orkestrasi + cache
# The label cache is a JSON file stored next to the output CSV.
CACHE_PATH = Path(OUTPUT_CSV).with_name("cluster_labels_llm.json")
if RESET_CLUSTER_LABEL_CACHE and CACHE_PATH.exists():
    CACHE_PATH.unlink()

# Rule-based fallback label per cluster: its first five top terms joined with " / ".
rule_labels = {cid: " / ".join([t for t,_ in cluster_top_terms.get(cid, [])][:5]) or f"Cluster {cid}"
               for cid in cluster_top_terms.keys()}

# Load previously stored labels; an unreadable cache file is treated as empty.
llm_label_cache: dict[str,str] = {}
if CACHE_PATH.exists():
    try:
        llm_label_cache = json.loads(CACHE_PATH.read_text(encoding="utf-8"))
    except Exception:
        llm_label_cache = {}

# Non-noise cluster ids still present in df (after the optional issue filter), in ascending order.
cluster_ids = sorted([cid for cid in set(df["cluster_id"]) if cid != -1])
cluster_labels_final: dict[int, str] = {}

import hashlib

def _cluster_cache_key(df_in: pd.DataFrame, cid) -> str:
    """Build a cache key from the cluster id plus a digest of the cluster's article URLs.

    Cluster ids are only meaningful within one clustering run, so the id alone
    would let a label cached for a different dataset be reused for an
    unrelated cluster.
    """
    member_urls = sorted(df_in.loc[df_in["cluster_id"] == cid, "url_berita"].astype(str))
    digest = hashlib.sha256("\n".join(member_urls).encode("utf-8")).hexdigest()[:16]
    return f"{cid}:{digest}"

# A cache file whose top-level JSON value is not an object cannot be used as a mapping.
if not isinstance(llm_label_cache, dict):
    llm_label_cache = {}

print(f"Menamai {len(cluster_ids)} cluster | via LLM={ENABLE_LLM_CLUSTER_LABEL} (fallback rule-based)")
for cid in tqdm(cluster_ids):
    fallback_label = humanize_sentence(rule_labels.get(cid, f"Cluster {cid}"))

    # Reuse a stored label only when it belongs to exactly this set of articles.
    cache_key = _cluster_cache_key(df, cid)
    cached_label = llm_label_cache.get(cache_key)
    if cached_label:
        cluster_labels_final[cid] = humanize_sentence(cached_label)
        continue

    top_kw = top_keywords_for_prompt(cid, k=8)
    samples = pick_cluster_samples(df, cid, n=3, excerpt_len=700)
    if not top_kw or not samples or not ENABLE_LLM_CLUSTER_LABEL:
        cluster_labels_final[cid] = fallback_label
        continue

    try:
        resp = call_llm_cluster_label(top_kw, samples, llm_cfg)
        label = humanize_sentence(resp.get("label",""))
    except Exception as exc:
        # Keep the run going, but say why the LLM label is missing.
        tqdm.write(f"[LLM label] cluster {cid}: gagal ({exc!r}); pakai label rule-based")
        label = ""

    if not label:
        # Rule-based fallbacks are not cached, so a later run can try the LLM again.
        cluster_labels_final[cid] = fallback_label
        continue

    cluster_labels_final[cid] = label
    llm_label_cache[cache_key] = label
    try:
        CACHE_PATH.write_text(json.dumps(llm_label_cache, ensure_ascii=False, indent=2), encoding="utf-8")
    except OSError as exc:
        # A cache write failure must not discard a label that was already obtained.
        tqdm.write(f"[LLM label] cache tidak tersimpan: {exc!r}")
# Map labels onto df; noise rows (cluster_id == -1) get a fixed label.
df["cluster_label"] = df["cluster_id"].map(cluster_labels_final)
df.loc[df["cluster_id"] == -1, "cluster_label"] = "Campuran/Umum (noise)."


# =============================================================================
# 8) SUSUN KOLOM & SIMPAN CSV + UPDATE CONFIG
# =============================================================================
# Columns produced by this pipeline; create any that are somehow missing.
new_cols = ["cluster_id", "kategori_isu", "cluster_label"]
for c in new_cols:
    if c not in df.columns:
        df[c] = None

# Original columns first (in their original order), then the new ones not already among them.
ordered_cols = [c for c in initial_columns if c in df.columns] + [c for c in new_cols if c not in initial_columns]
df = df[ordered_cols]

df.to_csv(OUTPUT_CSV, index=False)

# update last_output_path
# NOTE(review): INPUT_CSV prefers last_output_path, so a re-run of this cell reads its own previous output — confirm that is intended.
config = load_config_safe(CONFIG_PATH)
config["last_output_path"] = OUTPUT_CSV
save_config_safe(CONFIG_PATH, config)

print("=== DONE ===")
print("Saved CSV:", OUTPUT_CSV)
print("Updated config.last_output_path:", config.get("last_output_path"))
print("Cluster label cache:", str(CACHE_PATH))

# -----------------------------------------------------------------------------
# Catatan kredensial (set di environment, JANGAN di config.json)
# OpenAI   : os.environ["OPENAI_API_KEY"] = "sk-..."
# DeepSeek : os.environ["DEEPSEEK_API_KEY"] = "sk-..."
# -----------------------------------------------------------------------------


CONFIG OK
INPUT_CSV : /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/hasil_baca_berita/hasil_scraping_artikel_20250926_084047.csv
OUTPUT_CSV: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/hasil_baca_berita/news_clustered_labeled.csv
LLM: {'enabled': True, 'cluster_label_via_llm': True, 'provider': 'deepseek', 'model': 'deepseek-chat'}
Records: 100


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding:   0%|          | 0/2 [00:00<?, ?it/s]

Embeddings: (100, 384)
[Clustering] Params: {'algo': 'hdbscan', 'min_cluster_size': 5, 'eps': 0.0, 'method': 'leaf', 'umap': True} | unique labels: [-1, 0, 1, 2, 3, 4, 5, 6] | noise: 10
Menamai 7 cluster | via LLM=True (fallback rule-based)


  0%|          | 0/7 [00:00<?, ?it/s]

=== DONE ===
Saved CSV: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/hasil_baca_berita/news_clustered_labeled.csv
Updated config.last_output_path: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/hasil_baca_berita/news_clustered_labeled.csv
Cluster label cache: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/hasil_baca_berita/cluster_labels_llm.json


In [21]:
df



Unnamed: 0,judul_berita,url_berita,tanggal_berita,source_domain,artikel_berita,status,error,tanggal_berita_dt,cluster_id,kategori_isu,cluster_label
0,Potret Menkeu Sri Mulyani ke Sekolah Rakyat: M...,https://www.detik.com/edu/foto/d-8052823/potre...,2025-08-09T14:15:14+07:00,www.detik.com,Jakarta - Menteri Keuangan (Menkeu) Sri Mulyan...,ok_trafilatura,,2025-08-09 14:15:14+07:00,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
1,"Budi Gunawan dan Hendi Terpental dari Kabinet,...",https://www.tribunnews.com/nasional/7733131/bu...,2025-09-24T21:27:00+07:00,www.tribunnews.com,"TRIBUNNEWS.COM, JAKARTA - Perombakan kabinet b...",ok_site_specific,,2025-09-24 21:27:00+07:00,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
2,"Purbaya Bakal Kejar 200 Penunggak Pajak Jumbo,...",https://www.cnnindonesia.com/ekonomi/202509221...,2025-09-22T15:37:09+07:00,www.cnnindonesia.com,"CNN Indonesia\nSenin, 22 Sep 2025 15:37 WIB\nJ...",ok_trafilatura,,2025-09-22 15:37:09+07:00,0,Kemenkeu,Pemerintah kejar penunggak pajak besar dan kaj...
3,12:40 VIDEO: Air Mata dan Nyanyian Pegawai Kem...,https://www.cnnindonesia.com/tv/20250909165335...,2025-09-09T17:03:02+07:00,www.cnnindonesia.com,"CNN Indonesia TV | CNN Indonesia\nSelasa, 09 S...",ok_trafilatura,,2025-09-09 17:03:02+07:00,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
4,Mengurai Stagnansi Rasio Pajak Indonesia,https://finance.detik.com/foto-bisnis/d-792203...,2025-05-19T19:15:12+07:00,finance.detik.com,Jakarta - Ikatan Konsultan Pajak Indonesia men...,ok_trafilatura,,2025-05-19 19:15:12+07:00,0,Kemenkeu,Pemerintah kejar penunggak pajak besar dan kaj...
...,...,...,...,...,...,...,...,...,...,...,...
81,Cerita Prabowo Begadang Pelajari 9.000 Halaman...,https://news.okezone.com/read/2025/09/26/337/3...,"Nasional | Jum'at, 26 September 2025 06:55 WIB...",news.okezone.com,Presiden Prabowo Subianto dan PM Kanada Mark C...,ok_trafilatura,,NaT,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
82,Rocky Gerung Sebut Prabowo Mulai Tampak Milite...,https://www.tribunnews.com/nasional/7733148/ro...,2025-09-24T22:01:00+07:00,www.tribunnews.com,TRIBUNNEWS.COM - Pengamat politik Rocky Gerung...,ok_site_specific,,2025-09-24 22:01:00+07:00,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
83,"Waka DPR Buka Peluang Kodifikasi UU Pemilu, Pi...",https://news.detik.com/berita/d-8129011/waka-d...,2025-09-24T22:58:00+07:00,news.detik.com,Wakil Ketua DPR RI Saan Mustopa mengatakan pih...,ok_site_specific,,2025-09-24 22:58:00+07:00,5,Nasional,DPR setujui anggaran triliunan rupiah untuk be...
84,Rencana Dasco Bentuk Badan Pelaksana Reforma A...,https://www.tribunnews.com/nasional/7733130/re...,2025-09-24T21:21:00+07:00,www.tribunnews.com,"TRIBUNNEWS.COM, JAKARTA – Ketua Umum Pimpinan ...",ok_site_specific,,2025-09-24 21:21:00+07:00,3,Internasional,Sri Mulyani mengakhiri jabatan sebagai Menteri...
