In [1]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [None]:
# set folder tempat kerja (current working directory)
import os
cwd = '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita'
# cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

In [3]:
# =======================
# Sel 1: Imports & Utils
# =======================
import re
import time
import json
import math
import logging
import unicodedata
from typing import List, Dict, Tuple, Optional

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin

# Logging set-up
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)

SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0 Safari/537.36"
    )
})

def http_get(url: str, timeout: int = 20, max_retries: int = 3, backoff: float = 1.5) -> Optional[requests.Response]:
    """GET dengan retry & backoff sederhana."""
    for attempt in range(1, max_retries + 1):
        try:
            resp = SESSION.get(url, timeout=timeout)
            if 200 <= resp.status_code < 300:
                return resp
            else:
                logging.warning("Non-2xx status %s untuk %s (percobaan %d)", resp.status_code, url, attempt)
        except requests.RequestException as e:
            logging.warning("Request error untuk %s (percobaan %d): %s", url, attempt, e)
        time.sleep(backoff ** attempt)
    logging.error("Gagal GET setelah %d percobaan: %s", max_retries, url)
    return None

def normalize_text(s: str) -> str:
    """Lowercase + strip + NFC normalize + collapse whitespace."""
    if not isinstance(s, str):
        return ""
    s = unicodedata.normalize("NFKC", s).strip().lower()
    s = re.sub(r"\s+", " ", s)
    return s

def to_dd_mm_yyyy(date_yyyy_mm_dd: str) -> str:
    """Convert 'YYYY-MM-DD' -> 'DD-MM-YYYY' (string-safe)."""
    m = re.fullmatch(r"(\d{4})-(\d{2})-(\d{2})", date_yyyy_mm_dd.strip())
    if not m:
        raise ValueError(f"Format tanggal tidak valid: {date_yyyy_mm_dd} (harus YYYY-MM-DD)")
    yyyy, mm, dd = m.groups()
    return f"{dd}-{mm}-{yyyy}"

def keyword_patterns(keywords: List[str]) -> List[re.Pattern]:
    """Buat regex word-boundary untuk setiap keyword/frasa."""
    pats = []
    for kw in keywords:
        kw_norm = re.escape(normalize_text(kw))
        # Izinkan spasi fleksibel untuk frasa multi kata
        kw_norm = kw_norm.replace("\\ ", r"\s+")
        # Word boundary di awal/akhir, tetapi toleransi untuk unicode letters/digits
        pat = re.compile(rf"(^|\W){kw_norm}(\W|$)", flags=re.IGNORECASE)
        pats.append(pat)
    return pats

def title_relevance(title: str, kw_patterns: List[re.Pattern]) -> Tuple[bool, List[str]]:
    """Cek relevansi judul terhadap keywords. Return (relevan, keywords_found)."""
    t = normalize_text(title)
    found = []
    for pat in kw_patterns:
        if pat.search(t):
            # Ambil kembali bentuk tekstual dari pola untuk pelaporan (approx)
            # Kita simpan pola original via pat.pattern (ekstrak kata di tengah)
            core = pat.pattern
            # Upayakan ekstrak isi di antara boundary (^|\W) ... (\W|$)
            core_kw = re.sub(r"^\(\^\|\\W\)|\(\?i\)|\(\W\|\$\)$", "", core)
            found.append(core_kw)
    # Rapikan daftar found agar menampilkan keyword 'asli' (tanpa regex)
    found_clean = list({re.sub(r"\(\?:\^\|\W\)|\(\?:\W\|\$\)", "", f).strip() for f in found})
    return (len(found_clean) > 0, found_clean)

def unique_keep_order(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            out.append(x)
            seen.add(x)
    return out


In [None]:
# --- Sel 2: Parameter (mudah diubah) ---

import json

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Kata kunci topik untuk analisis relevansi judul
topic_keywords = config["keywords"]

# Daftar tanggal (YYYY-MM-DD). Akan di-convert ke DD-MM-YYYY untuk pencocokan di halaman.
dates = config["search_date"]

# Maksimum halaman per tanggal (akan berhenti lebih awal jika halaman kosong)
max_pages_per_date = config["max_page_length"]*3

# Jeda antar request agar lebih ramah
pause_between_requests = 0.6  # detik

# Timeout per request
timeout = 20

# Base pattern URL Tribunnews index harian
BASE_URL = "https://www.tribunnews.com/index-news/?date={ddmmyyyy}&page={page}"

In [5]:
# ===========================
# Sel 3: Parser & Scraper
# ===========================

def parse_index_items(html: str) -> List[Dict]:
    """
    Parse HTML index Tribunnews dan ambil list item artikel.
    Struktur Tribunnews kadang berubah; kita gunakan selector fleksibel:
    - Cari blok artikel pada <li> atau <div> yang punya <h3> atau <h4> berisi <a>.
    - Ambil judul dari a tag.
    - Ambil URL absolute.
    - Ambil tanggal bila muncul di index (seringnya tidak lengkap), kita tetap simpan string mentah jika ada.
    - Penulis biasanya tidak muncul di index: set None.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Kandidat container item; beberapa variasi class yang sering muncul
    candidates = []
    # Pola umum list di halaman index:
    candidates += soup.select("ul.lsi > li")
    candidates += soup.select("div.pos_rel > ul > li")
    candidates += soup.select("li.ptb15")
    candidates += soup.select("li")  # fallback cukup hati2, nanti difilter

    items = []
    for li in unique_keep_order(candidates):
        # Cari anchor judul
        a = li.select_one("h3 a") or li.select_one("h4 a") or li.select_one("a.f20") or li.select_one("a.f18") or li.select_one("a")
        if not a:
            continue
        title = (a.get_text(strip=True) or "").strip()
        href = a.get("href") or ""
        # Filter: abaikan yang bukan artikel Tribunnews (mis. ads, anchor kosong)
        if not href or not href.startswith("http"):
            continue
        if "tribunnews.com" not in href:
            continue
        # Heuristic: abaikan link index sendiri
        if "/index-news" in href:
            continue

        # Tanggal (opsional) — di index kadang ada <time>, <div class="grey">, atau <span class="time">
        date_text = None
        time_tag = li.select_one("time") or li.select_one("span.time") or li.select_one(".grey, .grey2, .grey3")
        if time_tag:
            date_text = time_tag.get_text(" ", strip=True)

        items.append({
            "judul_berita": title,
            "tanggal_berita": date_text,  # ini string mentah dari index (kalau ada)
            "penulis_berita": None,       # umumnya tidak tersedia di index
            "url_berita": href
        })
    return items

def scrape_tribun_by_date(date_yyyy_mm_dd: str, max_pages: int = 20, timeout: int = 20, pause: float = 0.6) -> List[Dict]:
    """Scrape Tribunnews index untuk satu tanggal (YYYY-MM-DD) hingga max_pages, berhenti saat tidak ada artikel baru."""
    ddmmyyyy = to_dd_mm_yyyy(date_yyyy_mm_dd)
    logging.info("Mulai scrape Tribunnews untuk tanggal %s (format index: %s)", date_yyyy_mm_dd, ddmmyyyy)

    all_rows = []
    seen_urls = set()
    for page in range(1, max_pages + 1):
        url = BASE_URL.format(ddmmyyyy=ddmmyyyy, page=page)
        logging.info("Fetching page %d: %s", page, url)
        resp = http_get(url, timeout=timeout)
        if resp is None:
            logging.warning("Lewati page %d (gagal request)", page)
            break

        items = parse_index_items(resp.text)
        logging.info("Ditemukan %d kandidat item pada page %d", len(items), page)

        # Filter duplikat dan kosong
        new_count = 0
        for it in items:
            href = it["url_berita"]
            if href in seen_urls:
                continue
            # Filter link yang jelas bukan artikel (heuristic)
            if any(p in href for p in ["/video/", "/foto/"]):
                continue
            seen_urls.add(href)
            all_rows.append(it)
            new_count += 1

        logging.info("Item baru ditambahkan dari page %d: %d", page, new_count)
        if new_count == 0:
            logging.info("Tidak ada item baru, hentikan iterasi tanggal ini.")
            break

        time.sleep(pause)

    return all_rows


In [6]:
# ====================================
# Sel 4: Analisis relevansi & Pipeline
# ====================================

def analyze_relevance(df: pd.DataFrame, keywords: List[str]) -> pd.DataFrame:
    pats = keyword_patterns(keywords)
    relevan_list = []
    found_list = []

    for title in df["judul_berita"].fillna("").astype(str).tolist():
        is_rel, found = title_relevance(title, pats)
        relevan_list.append(bool(is_rel))
        # Rapikan keywords_found sebagai string dipisah koma
        found_list.append(", ".join(sorted(set(found))) if found else "")

    df["relevan"] = relevan_list
    df["keywords_found"] = found_list
    return df

def run_pipeline(
    topic_keywords: List[str],
    dates: List[str],
    max_pages_per_date: int = 20,
    timeout: int = 20,
    pause_between_requests: float = 0.6
) -> pd.DataFrame:
    rows = []
    for d in dates:
        try:
            data = scrape_tribun_by_date(d, max_pages=max_pages_per_date, timeout=timeout, pause=pause_between_requests)
            logging.info("Total item terkumpul untuk %s: %d", d, len(data))
            rows.extend(data)
        except Exception as e:
            logging.exception("Gagal scrape untuk tanggal %s: %s", d, e)

    if not rows:
        logging.warning("Tidak ada data terkumpul dari seluruh tanggal.")
        df = pd.DataFrame(columns=[
            "judul_berita", "tanggal_berita", "penulis_berita", "url_berita",
            "media_online", "_source", "relevan", "keywords_found"
        ])
        return df

    df = pd.DataFrame(rows)

    # Lengkapi kolom standar
    df["media_online"] = "tribunnews"
    df["_source"] = "index-harian"

    # Analisis relevansi
    df = analyze_relevance(df, topic_keywords)

    # Dedup by url_berita
    if "url_berita" in df.columns:
        df = df.drop_duplicates(subset=["url_berita"], keep="first").reset_index(drop=True)

    return df


In [7]:
# ==========================
# Sel 5: Eksekusi & Output
# ==========================

df_tribun = run_pipeline(
    topic_keywords=topic_keywords,
    dates=dates,
    max_pages_per_date=max_pages_per_date,
    timeout=timeout,
    pause_between_requests=pause_between_requests
)

# Tampilkan ringkasan
total = len(df_tribun)
rel_count = int(df_tribun["relevan"].sum()) if "relevan" in df_tribun else 0
logging.info("--- Ringkasan ---")
logging.info("Total artikel: %s | Artikel relevan: %s", total, rel_count)

# Urutkan opsional: relevan dulu
if not df_tribun.empty and "relevan" in df_tribun:
    df_tribun = df_tribun.sort_values(by=["relevan"], ascending=[False]).reset_index(drop=True)

df_tribun.head(20)


2025-09-26 07:56:35,437 | INFO | Mulai scrape Tribunnews untuk tanggal 2025-09-24 (format index: 24-09-2025)
2025-09-26 07:56:35,438 | INFO | Fetching page 1: https://www.tribunnews.com/index-news/?date=24-09-2025&page=1
2025-09-26 07:56:35,669 | INFO | Ditemukan 88 kandidat item pada page 1
2025-09-26 07:56:35,670 | INFO | Item baru ditambahkan dari page 1: 87
2025-09-26 07:56:36,275 | INFO | Fetching page 2: https://www.tribunnews.com/index-news/?date=24-09-2025&page=2
2025-09-26 07:56:36,431 | INFO | Ditemukan 88 kandidat item pada page 2
2025-09-26 07:56:36,431 | INFO | Item baru ditambahkan dari page 2: 20
2025-09-26 07:56:37,036 | INFO | Fetching page 3: https://www.tribunnews.com/index-news/?date=24-09-2025&page=3
2025-09-26 07:56:37,202 | INFO | Ditemukan 88 kandidat item pada page 3
2025-09-26 07:56:37,203 | INFO | Item baru ditambahkan dari page 3: 20
2025-09-26 07:56:37,808 | INFO | Total item terkumpul untuk 2025-09-24: 127
2025-09-26 07:56:37,819 | INFO | --- Ringkasan ---

Unnamed: 0,judul_berita,tanggal_berita,penulis_berita,url_berita,media_online,_source,relevan,keywords_found
0,"Eks Wamenlu: Jokowi Selalu Absen Sidang PBB, M...","Rabu, 24 September 2025 23:51 WIB",,https://www.tribunnews.com/nasional/7733163/ek...,tribunnews,index-harian,False,
1,Kesehatan,,,https://www.tribunnews.com/kesehatan,tribunnews,index-harian,False,
2,Pelatih Malut United Ungkap Kunci Penting untu...,"Rabu, 24 September 2025 21:36 WIB",,https://www.tribunnews.com/superskor/7733137/p...,tribunnews,index-harian,False,
3,Kabareskrim Respons Tokoh GNB Tuntut Delpedro ...,"Rabu, 24 September 2025 21:41 WIB",,https://www.tribunnews.com/nasional/7733138/ka...,tribunnews,index-harian,False,
4,Zaskia Adya Mecca Tahu Identitas Terduga Penga...,"Rabu, 24 September 2025 21:42 WIB",,https://www.tribunnews.com/metropolitan/773313...,tribunnews,index-harian,False,
5,"Prakiraan Cuaca Kota Kendari Kamis, 25 Septemb...","Rabu, 24 September 2025 21:43 WIB",,https://www.tribunnews.com/regional/7733140/pr...,tribunnews,index-harian,False,
6,Sarwendah Batasi Hubungannya dengan Giorgio An...,"Rabu, 24 September 2025 21:44 WIB",,https://www.tribunnews.com/seleb/7733141/sarwe...,tribunnews,index-harian,False,
7,Indonesia Disebut Bisa Usulkan SBY jadi Sekjen...,"Rabu, 24 September 2025 21:47 WIB",,https://www.tribunnews.com/nasional/7733142/in...,tribunnews,index-harian,False,
8,"Jangan Stres, Ini yang Dilakukan Nikita Willy ...","Rabu, 24 September 2025 21:55 WIB",,https://www.tribunnews.com/seleb/7733143/janga...,tribunnews,index-harian,False,
9,Indeks Tag,,,https://www.tribunnews.com/tag,tribunnews,index-harian,False,


In [8]:
# simpan data artikel relevan ke excel
df_tribun.to_excel(cwd + '/daftar_berita/hasil_tribun.xlsx', index=False)