# SETTING ENVIRONMENT


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set the working folder (current working directory) to the project folder.
import os

cwd = '/content/drive/MyDrive/Monitoring Berita'
# Relative paths used later (e.g. "config.json") resolve against this folder.
os.chdir(cwd)

In [None]:
# =========================================================
# Title: Contextual relevance analysis of news titles vs. keywords
# Description: Reads config.json (last_output_path, topic_keywords),
#            loads the news CSV, then scores each title's relevance
#            to the keyword list with a hybrid approach:
#            - Lexical match (regex),
#            - Fuzzy similarity (RapidFuzz/difflib),
#            - Semantic similarity (embeddings - SentenceTransformers).
# Output: DataFrame with score columns and a relevance label.
# =========================================================

import os
import json
import re
import logging
from typing import List, Tuple, Dict, Any, Optional

import pandas as pd
import numpy as np

# Fuzzy library (dengan fallback)
try:
    from rapidfuzz import fuzz, process as rf_process
    _USE_RAPIDFUZZ = True
except Exception:
    import difflib
    _USE_RAPIDFUZZ = False

# Embeddings
try:
    from sentence_transformers import SentenceTransformer, util
    _EMBEDDINGS_AVAILABLE = True
except Exception:
    _EMBEDDINGS_AVAILABLE = False


# -----------------------------
# Konfigurasi logging
# -----------------------------
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() silently does nothing when the root logger
# is already configured (this cell being re-run, or a host that installed its
# own handler), so the INFO messages emitted below would never be shown.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    force=True,
)


# -----------------------------
# Util: Baca config.json dengan aman
# -----------------------------
def load_config(path: str = "config.json") -> Dict[str, Any]:
    """
    Load the JSON configuration file and check its mandatory keys.

    Args:
        path: Location of the JSON config file.

    Returns:
        The parsed configuration object.

    Raises:
        FileNotFoundError: ``path`` does not exist.
        ValueError: the file content is not valid JSON.
        KeyError: 'last_output_path' or 'topic_keywords' is missing.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"config.json tidak ditemukan di: {os.path.abspath(path)}")

    try:
        with open(path, "r", encoding="utf-8") as handle:
            config = json.load(handle)
    except json.JSONDecodeError as err:
        raise ValueError(f"Gagal parse JSON pada {path}: {err}") from err

    # 'last_output_path' is checked first, so it is the key reported when both are absent.
    for required_key in ("last_output_path", "topic_keywords"):
        if required_key not in config:
            raise KeyError(f"config.json tidak memiliki kunci '{required_key}'")

    return config


# -----------------------------
# Util: Normalisasi keywords
# -----------------------------
def normalize_keywords(kws: Any) -> List[str]:
    """
    Normalise a keyword specification into a list of non-empty, stripped strings.

    Supported inputs:
      - a single string: "purbaya"
      - a comma-separated string: "ekonomi, investasi, rupiah"
      - a list or tuple: ["ekonomi", "investasi"]
      - any other scalar, which is converted with ``str()``

    Args:
        kws: Raw keyword value, e.g. as read from the JSON config.

    Returns:
        Keywords in their original order; empty or whitespace-only entries
        are dropped. ``None`` (top-level or as an element) yields no keyword.
    """
    if kws is None:
        return []

    if isinstance(kws, str):
        # split(",") on a string without commas returns the string itself,
        # so no separate single-string branch is needed.
        candidates = kws.split(",")
    elif isinstance(kws, (list, tuple)):
        # Skip None elements (JSON null); str(None) would otherwise produce
        # the bogus keyword "None".
        candidates = [item for item in kws if item is not None]
    else:
        candidates = [kws]

    stripped = (str(item).strip() for item in candidates)
    return [text for text in stripped if text]


# -----------------------------
# Util: Baca CSV dengan aman
# -----------------------------
def read_news_csv(csv_path: str, required_cols: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Load the news CSV, verify its required columns, and drop rows without a title.

    Args:
        csv_path: Path of the CSV file to read.
        required_cols: Columns that must be present. Defaults to
            'judul_berita', 'penulis_berita', 'url_berita' and 'media_online'.

    Returns:
        A copy of the data without rows whose 'judul_berita' is missing, with
        'judul_berita' converted to ``str``. The original index is kept.

    Raises:
        FileNotFoundError: ``csv_path`` does not exist.
        RuntimeError: the first read attempt failed for a reason other than
            a decoding error.
        KeyError: one or more required columns are absent.
    """
    title_col = "judul_berita"
    if required_cols is None:
        required_cols = [title_col, "penulis_berita", "url_berita", "media_online"]

    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV tidak ditemukan di: {os.path.abspath(csv_path)}")

    try:
        frame = pd.read_csv(csv_path)
    except UnicodeDecodeError:
        # Retry once with a common fallback encoding.
        frame = pd.read_csv(csv_path, encoding="latin-1")
    except Exception as err:
        raise RuntimeError(f"Gagal membaca CSV: {err}") from err

    # Report every missing required column together with what is available.
    available = list(frame.columns)
    absent = [col for col in required_cols if col not in frame.columns]
    if absent:
        raise KeyError(f"Kolom wajib hilang di CSV: {absent}. Kolom tersedia: {available}")

    # Only warn when no title is usable; the caller still gets an (empty) frame.
    if frame[title_col].isna().all():
        logging.warning("Semua nilai 'judul_berita' adalah NaN. Hasil analisis bisa kosong.")

    with_titles = frame.dropna(subset=[title_col]).copy()
    with_titles[title_col] = with_titles[title_col].astype(str)
    return with_titles


# -----------------------------
# Lexical match (regex contains)
# -----------------------------
def lexical_match_score(title: str, keyword: str) -> int:
    """
    Return 1 if ``keyword`` occurs in ``title`` as a whole token, else 0.

    Matching is case-insensitive. The keyword must not be directly preceded
    or followed by a word character, so "apbn" does not match inside "RAPBN".
    Words of a multi-word keyword may be separated by any run of whitespace
    in the title.

    Args:
        title: News title to search in.
        keyword: Keyword or phrase to look for.

    Returns:
        1 on a match, 0 otherwise (including when either argument is empty
        or the keyword is whitespace only).
    """
    if not title or not keyword:
        return 0

    words = keyword.split()
    if not words:
        return 0

    # Lookarounds instead of \b: \b needs a word character on one side, so
    # r"\bc\+\+\b" can never match "C++ " and r"\b5%\b" can never match "5% ".
    # For keywords that begin and end with word characters both forms are
    # equivalent.
    phrase = r"\s+".join(re.escape(word) for word in words)
    pattern = r"(?<!\w)" + phrase + r"(?!\w)"
    return 1 if re.search(pattern, title, flags=re.IGNORECASE) else 0


# -----------------------------
# Fuzzy similarity
# -----------------------------
def fuzzy_similarity(title: str, keyword: str) -> float:
    """
    Return a case-insensitive fuzzy similarity between ``title`` and ``keyword``.

    Uses RapidFuzz ``token_set_ratio`` (scaled from 0..100 to 0..1) when the
    module-level flag ``_USE_RAPIDFUZZ`` is set, otherwise
    ``difflib.SequenceMatcher.ratio``.

    Args:
        title: News title.
        keyword: Keyword or phrase.

    Returns:
        Similarity as a float; 0.0 when either argument is empty.
    """
    if not title or not keyword:
        return 0.0

    # Lowercase once for both backends. The difflib branch always compared
    # lowercased text, but the RapidFuzz scorers apply no preprocessing by
    # default since RapidFuzz 3.0, so "APBN" and "apbn" would score as
    # different tokens there.
    title_norm = title.lower()
    keyword_norm = keyword.lower()

    if _USE_RAPIDFUZZ:
        # token_set_ratio is reasonably robust for short phrases.
        score = fuzz.token_set_ratio(title_norm, keyword_norm) / 100.0
    else:
        score = difflib.SequenceMatcher(None, title_norm, keyword_norm).ratio()
    return float(score)


# -----------------------------
# Embedding-based cosine similarity
# -----------------------------
class Embedder:
    """
    Optional, lazily loaded wrapper around a SentenceTransformer model.

    The model is only instantiated on the first call that needs it, so
    constructing an Embedder is cheap and never touches the model library.
    """
    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = None  # filled in by ensure_model() on first use

    def ensure_model(self):
        """
        Instantiate the model if that has not happened yet.

        Raises:
            RuntimeError: the sentence-transformers import failed at load time.
        """
        if not _EMBEDDINGS_AVAILABLE:
            raise RuntimeError(
                "Paket 'sentence-transformers' tidak tersedia. "
                "Silakan install dengan: pip install sentence-transformers"
            )
        if self.model is not None:
            return
        self.model = SentenceTransformer(self.model_name)

    def encode(self, texts: List[str]):
        """Encode ``texts`` with the model, requesting tensor output."""
        self.ensure_model()
        encoder = self.model
        return encoder.encode(texts, convert_to_tensor=True)

    @staticmethod
    def cosine_sim_matrix(emb_titles, emb_keywords) -> np.ndarray:
        """
        Return the cosine-similarity matrix (n_titles x n_keywords) as a
        NumPy array.
        """
        # util.cos_sim presumably returns a tensor (it is moved to CPU before
        # the NumPy conversion) - confirm against the library docs.
        similarity = util.cos_sim(emb_titles, emb_keywords)
        return similarity.cpu().numpy()


# -----------------------------
# Pipeline Analisis Relevansi
# -----------------------------
def analyze_relevance(
    df: pd.DataFrame,
    keywords: List[str],
    embedder: Optional["Embedder"] = None,
    w_lexical: float = 0.20,
    w_fuzzy: float = 0.30,
    w_embed: float = 0.50,
    relevance_threshold: float = 0.55,
) -> pd.DataFrame:
    """
    Score how relevant each news title is to a list of keywords.

    For every (title, keyword) pair three signals are computed:
      - lexical: 0/1 from ``lexical_match_score``
      - fuzzy: value from ``fuzzy_similarity``
      - embed: cosine similarity of the embeddings, mapped from [-1, 1]
        to [0, 1]; all zeros when no embedder is given or encoding fails

    One "top" keyword is picked per title using a fixed blend of the three
    signals (0.2 lexical, 0.4 fuzzy, 0.4 embed). The final score is then
    ``w_lexical*lex + w_fuzzy*fuzzy + w_embed*embed`` for that top keyword.

    When embeddings are not used, ``w_lexical`` and ``w_fuzzy`` are scaled up
    so that together they carry the full weight. Otherwise the score could
    never exceed ``w_lexical + w_fuzzy`` (0.5 with the defaults), which is
    below the default threshold, and no title could ever be flagged relevant.

    Args:
        df: News data; must contain a 'judul_berita' column.
        keywords: Non-empty list of keywords.
        embedder: Optional embedder providing ``encode``; ``None`` disables
            the semantic signal.
        w_lexical: Weight of the lexical signal in the final score.
        w_fuzzy: Weight of the fuzzy signal in the final score.
        w_embed: Weight of the embedding signal in the final score.
        relevance_threshold: Minimum score for ``is_relevant`` to be True.

    Returns:
        A copy of ``df`` with added columns 'top_keyword', 'lex_match',
        'fuzz_score', 'embed_sim', 'relevance_score' (each rounded to 3
        decimals where numeric) and 'is_relevant', sorted by 'is_relevant'
        then 'relevance_score', both descending, with a fresh index.

    Raises:
        ValueError: ``keywords`` is empty.
    """
    if not keywords:
        # The config key is 'topic_keywords'; name it correctly in the hint.
        raise ValueError("Daftar keywords kosong. Pastikan 'topic_keywords' di config.json terisi.")

    # fillna must run before astype(str): astype(str) turns NaN into the
    # literal string "nan", after which fillna has nothing left to replace.
    titles = df["judul_berita"].fillna("").astype(str).tolist()
    n = len(titles)
    k = len(keywords)
    logging.info(f"Jumlah judul: {n} | Jumlah keywords: {k}")

    # --- Lexical and fuzzy score for every (title, keyword) pair
    lex_scores = np.zeros((n, k), dtype=float)
    fuzz_scores = np.zeros((n, k), dtype=float)
    for j, kw in enumerate(keywords):
        for i, t in enumerate(titles):
            lex_scores[i, j] = lexical_match_score(t, kw)
            fuzz_scores[i, j] = fuzzy_similarity(t, kw)

    # --- Embedding similarity (vectorised), only when an embedder is given
    embed_sims = np.zeros((n, k), dtype=float)
    embeddings_used = False
    if embedder is not None and n > 0:
        try:
            emb_titles = embedder.encode(titles)
            emb_keywords = embedder.encode(keywords)
            raw_sims = Embedder.cosine_sim_matrix(emb_titles, emb_keywords)  # shape (n, k)
            # Cosine similarity lies in [-1, 1]; rescale linearly to [0, 1].
            embed_sims = (raw_sims + 1.0) / 2.0
            embeddings_used = True
        except Exception as e:
            logging.error(f"Gagal menghitung embeddings, akan lanjut tanpa embeddings. Detail: {e}")

    # --- Without embeddings, let lexical + fuzzy carry the whole weight so the
    #     score stays comparable with relevance_threshold.
    if not embeddings_used:
        lexical_fuzzy_weight = w_lexical + w_fuzzy
        if lexical_fuzzy_weight > 0 and w_embed > 0:
            scale = (lexical_fuzzy_weight + w_embed) / lexical_fuzzy_weight
            w_lexical = w_lexical * scale
            w_fuzzy = w_fuzzy * scale
            w_embed = 0.0
            logging.warning(
                f"Embeddings tidak dipakai; bobot diskalakan ulang: "
                f"lexical={w_lexical:.3f}, fuzzy={w_fuzzy:.3f}"
            )

    # --- Pick the top keyword per title with a fixed blend of the three
    #     signals (independent of the caller's final-score weights).
    combo_for_top = (0.2 * lex_scores) + (0.4 * fuzz_scores) + (0.4 * embed_sims)
    top_idx = combo_for_top.argmax(axis=1)

    rows = np.arange(n)
    top_keywords = [keywords[idx] for idx in top_idx]
    top_lex = lex_scores[rows, top_idx]
    top_fuzz = fuzz_scores[rows, top_idx]
    top_embed = embed_sims[rows, top_idx]

    # --- Final combined score and relevance label
    relevance_score = (w_lexical * top_lex) + (w_fuzzy * top_fuzz) + (w_embed * top_embed)
    is_relevant = relevance_score >= relevance_threshold

    # --- Assemble the output
    out = df.copy()
    out["top_keyword"] = top_keywords
    out["lex_match"] = top_lex.round(3)
    out["fuzz_score"] = top_fuzz.round(3)
    out["embed_sim"] = top_embed.round(3)
    out["relevance_score"] = relevance_score.round(3)
    out["is_relevant"] = is_relevant

    # Relevant rows first, then by score (both descending)
    out = out.sort_values(by=["is_relevant", "relevance_score"], ascending=[False, False]).reset_index(drop=True)
    return out


# =========================================================
# ================  MAIN (untuk Notebook)  ================
# =========================================================
if __name__ == "__main__":
    # Notebook driver: config -> keywords -> CSV -> relevance scoring -> preview.
    try:
        # 1) Read the config
        cfg = load_config("config.json")
        csv_path = cfg.get("last_output_path", "")
        keywords_raw = cfg.get("topic_keywords", [])

        # 2) Normalise the keywords
        keywords = normalize_keywords(keywords_raw)
        if not keywords:
            raise ValueError("Keywords di config.json kosong. Isi dengan string atau list, contoh: 'ekonomi, rupiah'.")

        logging.info(f"CSV path: {csv_path}")
        logging.info(f"Keywords: {keywords}")

        # 3) Read the CSV
        df_news = read_news_csv(csv_path)

        # 4) Prepare the embedder (optional). The analysis still runs without it.
        embedder = Embedder("all-MiniLM-L6-v2") if _EMBEDDINGS_AVAILABLE else None
        if embedder is None:
            logging.warning(
                "Embeddings tidak tersedia. Relevansi hanya memakai lexical + fuzzy. "
                "Untuk hasil kontekstual yang lebih baik: pip install sentence-transformers"
            )

        # 5) Relevance analysis
        result = analyze_relevance(
            df_news,
            keywords=keywords,
            embedder=embedder,
            # Weights can be tuned to taste:
            w_lexical=0.10,
            w_fuzzy=0.30,
            w_embed=0.60,
            relevance_threshold=0.55  # Relevance cut-off; adjust per use case
        )

        # 6) Short preview
        logging.info(f"Total artikel: {len(result)} | Relevan: {result['is_relevant'].sum()}")
        display_cols = [
            "judul_berita", "top_keyword", "lex_match", "fuzz_score",
            "embed_sim", "relevance_score", "is_relevant",
            "penulis_berita", "url_berita", "media_online", "tanggal_berita_dt"
        ]
        # Only show columns that exist: 'tanggal_berita_dt' is not among the
        # columns read_news_csv validates, so a CSV without it must not make
        # the preview (and with it the whole run) fail with a KeyError.
        shown_cols = [col for col in display_cols if col in result.columns]
        display(result[shown_cols].head(20))

    except Exception as e:
        # logging.exception keeps the traceback, so a failure is diagnosable
        # without editing the cell.
        logging.exception(f"Gagal menjalankan analisis: {e}")


Unnamed: 0,judul_berita,top_keyword,lex_match,fuzz_score,embed_sim,relevance_score,is_relevant,penulis_berita,url_berita,media_online,tanggal_berita_dt
0,Kopdes Merah Putih Kecipratan Dana Rp 200 Tril...,suntikan dana rp200 triliun,0.0,0.598,0.833,0.679,True,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6162609/k...,liputan6,2025-09-18 14:00:04+07:00
1,"APBN 2026 Tembus Rp3.842 T, Kemenkeu: Rp2.070 ...",apbn 2026,1.0,0.191,0.779,0.625,True,Tidak Diketahui,https://www.cnbcindonesia.com/news/20250918171...,cnbc,2025-09-19 00:00:00+07:00
2,"Defisit APBN 2026 Bengkak, Purbaya: Tak Usah T...",apbn 2026,1.0,0.189,0.766,0.616,True,Tidak Diketahui,https://www.cnbcindonesia.com/news/20250918152...,cnbc,2025-09-19 00:00:00+07:00
3,Defisit RAPBN 2026 Disepakati Melebar Jadi Rp ...,suntikan dana rp200 triliun,0.0,0.419,0.814,0.614,True,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6163011/d...,liputan6,2025-09-18 20:45:26+07:00
4,"Defisit APBN 2026 Naik Jadi 2,68 Persen, Menke...",apbn 2026,1.0,0.186,0.755,0.609,True,Tidak Diketahui,https://nasional.kompas.com/read/2025/09/18/21...,kompas,2025-09-18 00:00:00+07:00
5,Banggar DPR Desak Menkeu Atur Dana Rp 200 Tril...,suntikan dana rp200 triliun,0.0,0.372,0.821,0.604,True,Tidak Diketahui,https://www.liputan6.com/news/read/6162643/ban...,liputan6,2025-09-18 14:10:33+07:00
6,Banggar Minta Menkeu Terbitkan PMK Atur Penggu...,suntikan dana rp200 triliun,0.0,0.383,0.8,0.595,True,Amir Faisol,https://www.idntimes.com/business/economy/bang...,idn_times,2025-09-18 12:20:00+07:00
7,Defisit APBN 2026 Melebar Demi Pertumbuhan Eko...,apbn 2026,1.0,0.171,0.735,0.593,True,Tidak Diketahui,https://money.kompas.com/read/2025/09/19/06000...,kompas,2025-09-19 00:00:00+07:00
8,"RAPBN 2026 Dibawa ke Paripurna 23 September, N...",suntikan dana rp200 triliun,0.0,0.379,0.799,0.593,True,Tidak Diketahui,https://www.liputan6.com/news/read/6163142/rap...,liputan6,2025-09-18 23:50:07+07:00
9,"BRI Sambut Kepercayaan Pemerintah, Dana Rp55 T...",suntikan dana rp200 triliun,0.0,0.333,0.799,0.579,True,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6163130/b...,liputan6,2025-09-18 23:31:50+07:00


### SIMPAN PATH OUTPUT KE CONFIG.JSON

In [None]:
# %%
# Persist the analysis as CSV and Excel for downstream integration.
DATE_TAG = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
OUTPUT_CSV = f"/content/drive/MyDrive/Monitoring Berita/analisis_relevansi/analisis_relevansi_{DATE_TAG}.csv"
OUTPUT_XLSX = f"/content/drive/MyDrive/Monitoring Berita/analisis_relevansi/analisis_relevansi_{DATE_TAG}.xlsx"

try:
    # Create the target folder first; writing into a missing folder fails.
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
    result.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    result.to_excel(OUTPUT_XLSX, index=False)
    # Use the logging module directly, as the rest of the notebook does; no
    # `logger` object is defined anywhere, so `logger.info` raised NameError.
    logging.info(f"Tersimpan: {OUTPUT_CSV} & {OUTPUT_XLSX}")
except Exception as e:
    logging.error(f"Gagal menyimpan output: {e}")

result.tail(3)


In [None]:
# %%
import json
from pathlib import Path

# Absolute location of the shared config file that update_config() rewrites.
CONFIG_PATH = Path("/content/drive/MyDrive/Monitoring Berita/config.json")

def update_config(path: Path, new_values: dict):
    """
    Merge ``new_values`` into the JSON config at ``path``, keeping other keys.

    If the file does not exist, cannot be parsed, or does not hold a JSON
    object, the update starts from an empty mapping (a warning is logged in
    the latter two cases). Failures while serialising or writing are logged,
    not raised.

    Args:
        path: Location of the config file.
        new_values: Keys to add or overwrite.

    Returns:
        None.
    """
    data = {}
    if path.exists():
        try:
            with open(path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except Exception as e:
            logging.warning(f"Gagal membaca config lama: {e}")
        else:
            if isinstance(loaded, dict):
                data = loaded
            else:
                # A JSON list/scalar has no .update(); treat it like an
                # unreadable config instead of crashing below.
                logging.warning(
                    f"Gagal membaca config lama: isi bukan objek JSON ({type(loaded).__name__})"
                )

    # Only the supplied keys are changed
    data.update(new_values)

    try:
        # Serialise before opening the file: opening with "w" truncates it, so
        # a value that cannot be serialised must fail before that point or
        # the existing config would be left empty or half-written.
        payload = json.dumps(data, indent=4, ensure_ascii=False)
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)
        logging.info(f"Berhasil update config.json di {path}")
    except Exception as e:
        logging.error(f"Gagal menyimpan config.json: {e}")

# Record the CSV and Excel output paths in the config under explicit key names.
output_paths = {
    "relevansi_output_csv": OUTPUT_CSV,
    "relevansi_output_xlsx": OUTPUT_XLSX,
}
update_config(CONFIG_PATH, output_paths)
