In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# notebook can read and write files stored on Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Set the working folder (current working directory) for this notebook.
# `cwd` is reused by the final cell to build the Excel output path.
import os
cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

In [3]:
import re
import time
import random
import traceback
from urllib.parse import quote_plus, urlparse
from typing import Optional, List, Dict, Tuple
from dataclasses import dataclass

import requests
from bs4 import BeautifulSoup
import pandas as pd


# =========================
# General configuration
# =========================
# Browser-like headers sent with every HTTP request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "id,en;q=0.9",
}
BASE_SEARCH = "https://search.kompas.com/search?q="
REQUEST_TIMEOUT = 12  # passed as `timeout=` to requests (seconds)
MAX_RETRIES = 3       # maximum GET attempts per URL
BACKOFF_BASE = 1.7    # multiplier applied to the wait between attempts

# Valid Kompas hosts: kompas.com itself or any subdomain of it.
VALID_HOST = re.compile(r"(?:^|\.)kompas\.com$", re.I)
# Kompas articles generally have /read/ in their path.
ARTICLE_PATH = re.compile(r"/read/", re.I)
# Exclude common non-article sections. The keyword must be a complete path
# segment (followed by "/" or end of path); a plain `\b` would also reject
# article slugs that merely start with one of these words, such as
# ".../read/2025/09/08/123/video-viral-..." or ".../foto-...".
EXCLUDE_PATH = re.compile(
    r"/(search|plus|apps|video|foto|vik|parapuan|play|indeks|inside|kgnow|kabarpalmerah|jobs|pasangiklan|tag)(?=/|$)",
    re.I,
)

# =========================
# Utilitas
# =========================
def _sleep(min_s=0.6, max_s=1.2):
    time.sleep(random.uniform(min_s, max_s))

def http_get(url: str, session: Optional[requests.Session] = None) -> Optional[requests.Response]:
    """
    GET `url` with the shared HEADERS and REQUEST_TIMEOUT, retrying on failure.

    Args:
        url: Address to fetch.
        session: Optional session to reuse. When omitted, a temporary session
            is created for this call and closed before returning.

    Returns:
        The successful response, or None when every attempt failed (an
        [ERROR] line is printed in that case).

    Up to MAX_RETRIES attempts are made; the wait between attempts starts at
    1 second and is multiplied by BACKOFF_BASE each time. Client errors
    (4xx other than 408/429) are treated as permanent and are not retried.
    """
    owns_session = session is None
    sess = requests.Session() if owns_session else session
    backoff = 1.0
    try:
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                resp = sess.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                resp.raise_for_status()
                return resp
            except requests.RequestException as e:
                # Do not test `e.response` for truthiness: a Response is
                # falsy when its status is an error, which is exactly the
                # case being inspected here.
                status = getattr(getattr(e, "response", None), "status_code", None)
                permanent = (
                    status is not None
                    and 400 <= status < 500
                    and status not in (408, 429)
                )
                if permanent or attempt == MAX_RETRIES:
                    print(f"[ERROR] GET gagal setelah {attempt}x: {url}\n  -> {repr(e)}")
                    return None
                time.sleep(backoff)
                backoff *= BACKOFF_BASE
        # Only reached when MAX_RETRIES < 1 (the loop never ran).
        return None
    finally:
        # Release the connection pool of a session created by this call;
        # a caller-supplied session stays open for reuse.
        if owns_session:
            sess.close()

def build_search_url(topic: str, page: int = 1) -> str:
    """
    Build the Kompas search URL for `topic`.

    The topic is stripped and URL-encoded. A `&page=N` parameter is appended
    only for pages after the first.
    """
    encoded = quote_plus(topic.strip())
    url = f"{BASE_SEARCH}{encoded}"
    if page > 1:
        url = f"{url}&page={page}"
    return url

def normalize_text(s: str) -> str:
    """Return `s` stripped, lower-cased and with whitespace runs collapsed to one space.

    A falsy input (None or "") yields an empty string.
    """
    cleaned = (s or "").strip().lower()
    return re.sub(r"\s+", " ", cleaned)

def split_terms(topic: str) -> Tuple[List[str], List[str]]:
    """
    Split a search topic into single-word terms and quoted phrases.

    Text inside double quotes is treated as one phrase; everything else is
    split on whitespace into terms.

    Args:
        topic: Raw topic string, e.g. 'harga "subsidi bbm" naik'.

    Returns:
        A `(terms, phrases)` tuple of two lists of strings.
    """
    topic = (topic or "").strip()

    # Collapse inner whitespace so a phrase can match text whose whitespace
    # has been normalised to single spaces; drop phrases that end up empty.
    phrases = []
    for raw in re.findall(r'"([^"]+)"', topic):
        phrase = " ".join(raw.split())
        if phrase:
            phrases.append(phrase)

    # Remove the quoted phrases, then turn any leftover (unbalanced) quote
    # into a space so it does not stay glued to a term and block matching.
    no_quotes = re.sub(r'"[^"]+"', " ", topic).replace('"', " ")
    terms = no_quotes.split()

    return terms, phrases

def term_hits(text: str, terms: List[str]) -> int:
    """
    Count how many of `terms` occur in `text` as whole tokens (case-insensitive).

    Each term contributes at most 1, however often it appears. Empty terms
    are skipped.

    Args:
        text: Text to search.
        terms: Terms to look for.

    Returns:
        Number of terms found.
    """
    text = text or ""
    score = 0
    for t in terms:
        if not t:
            continue
        # Lookarounds instead of `\b`: `\b` needs a word character on one
        # side, so a term starting or ending with punctuation ("c++", "rp.")
        # could never match. For purely alphanumeric terms this is identical.
        pattern = rf"(?<!\w){re.escape(t)}(?!\w)"
        if re.search(pattern, text, flags=re.I):
            score += 1
    return score

def phrase_hits(text: str, phrases: List[str]) -> int:
    """Count the non-empty phrases whose lower-cased form is a substring of `text`."""
    return sum(1 for phrase in phrases if phrase and phrase.lower() in text)

def relevance_score(topic: str, title: str, desc: str, body: str,
                    require_all_terms: bool, require_phrase: bool) -> Tuple[int, Dict[str, int]]:
    """
    Score how relevant an article is to `topic`.

    Per section (title, description, body) the hit count is the number of
    matching terms plus 2 for each matching quoted phrase. The sections are
    then weighted 3x (title), 2x (description) and 1x (body).

    Args:
        topic: Search topic; double-quoted parts are treated as phrases.
        title: Article title.
        desc: Article meta description.
        body: Article body text (or an excerpt of it).
        require_all_terms: If True, the score is forced to 0 unless every
            term appears somewhere in title + description + body.
        require_phrase: If True and the topic contains phrases, the score is
            forced to 0 unless at least one phrase appears.

    Returns:
        `(score, hits)` where `hits` has the keys "title_hits", "desc_hits"
        and "body_hits".
    """
    terms, phrases = split_terms(topic)
    terms = [term for term in terms if term]
    # Normalise phrases exactly like the text they are compared against;
    # otherwise a phrase with a double space or tab can never match.
    phrases = [ph for ph in (normalize_text(p) for p in phrases) if ph]

    sections = [normalize_text(part) for part in (title, desc, body)]

    def _section_hits(section_text: str) -> int:
        # A phrase match counts double compared with a single term match.
        return term_hits(section_text, terms) + 2 * phrase_hits(section_text, phrases)

    th, dh, bh = (_section_hits(section) for section in sections)

    # Weights: title 3x, description 2x, body 1x.
    score = 3 * th + 2 * dh + 1 * bh

    combined = " ".join(sections)

    # Optional strict rules. The all-terms check reuses term_hits so that
    # scoring and filtering always agree on what counts as a match.
    if require_all_terms and terms and term_hits(combined, terms) < len(terms):
        score = 0
    if require_phrase and phrases and phrase_hits(combined, phrases) == 0:
        score = 0

    return score, {"title_hits": th, "desc_hits": dh, "body_hits": bh}

# =========================
# Parsing halaman
# =========================
def _clean_article_href(raw_href) -> Optional[str]:
    """Return the stripped href if it is an absolute Kompas article URL, else None."""
    href = (raw_href or "").strip()
    if not href.startswith("http"):
        return None
    try:
        parsed = urlparse(href)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        return None
    path = parsed.path or ""
    if not VALID_HOST.search(host):
        return None
    if EXCLUDE_PATH.search(path):
        return None
    if not ARTICLE_PATH.search(path):
        return None
    return href


def extract_search_links(soup: BeautifulSoup) -> List[str]:
    """
    Collect candidate article links from a search-result page.

    Strategy:
    - First look inside known result-card containers and take the first link
      of each container.
    - If that yields nothing, fall back to scanning every <a href> on the page.

    In both passes a link is kept only if it is an absolute URL on a Kompas
    host, its path contains /read/ and it does not match EXCLUDE_PATH.

    Returns:
        Unique URLs in first-seen order.
    """
    # 1) Result-card selectors. The page structure may change, so several
    #    candidate selectors are combined.
    card_selectors = (
        "div.article__list, div.article__grid, div.search__result",
        "div.gsc-webResult, div.gsc-result",
        "article.search-result, li.search-result",
    )
    blocks = []
    for selector in card_selectors:
        blocks.extend(soup.select(selector))

    links = []
    for block in blocks:
        anchor = block.select_one("a[href]")
        if anchor is None:
            continue
        href = _clean_article_href(anchor.get("href"))
        if href:
            links.append(href)

    # 2) Fallback: scan all anchors, with the same article-only filter.
    if not links:
        for anchor in soup.select("a[href]"):
            href = _clean_article_href(anchor.get("href"))
            if href:
                links.append(href)

    # De-duplicate while preserving order.
    return list(dict.fromkeys(links))

def parse_article(html: str) -> Dict[str, str]:
    """
    Extract title, publication date, author, description and a body excerpt
    from an article's HTML.

    Returns:
        A dict with the keys "title", "tanggal", "penulis", "desc" and
        "body"; any value that could not be found is an empty string.
    """
    doc = BeautifulSoup(html, "lxml")

    # --- Title: social meta tags first, then <h1>, then <title> ---
    headline = None
    title_selectors = (
        "meta[property='og:title']",
        "meta[name='og:title']",
        "meta[name='twitter:title']",
    )
    for css in title_selectors:
        tag = doc.select_one(css)
        if tag and tag.get("content"):
            headline = tag["content"].strip()
            break
    for fallback_name in ("h1", "title"):
        if headline:
            break
        node = doc.find(fallback_name)
        if node and node.get_text(strip=True):
            headline = node.get_text(strip=True)

    # --- Date: meta tags / <time datetime>, then visible <time> text ---
    published = None
    date_selectors = (
        "meta[property='article:published_time']",
        "meta[name='article:published_time']",
        "meta[itemprop='datePublished']",
        "time[datetime]",
    )
    for css in date_selectors:
        tag = doc.select_one(css)
        if not tag:
            continue
        published = tag.get("content") or tag.get("datetime")
        if published:
            published = published.strip()
            break
    if not published:
        time_tag = doc.find("time")
        if time_tag and time_tag.get_text(strip=True):
            published = time_tag.get_text(strip=True)

    # --- Author: meta tags, then microdata, then author-like classes ---
    author = None
    author_selectors = (
        "meta[name='author']",
        "meta[property='article:author']",
        "[itemprop='author'] [itemprop='name']",
        ".author, .article__author, span[class*='author']",
    )
    for css in author_selectors:
        tag = doc.select_one(css)
        if not tag:
            continue
        author = tag.get("content") or tag.get_text(" ", strip=True)
        if author:
            author = author.strip()
            break

    # --- Meta description (used for scoring) ---
    desc_tag = doc.select_one("meta[name='description'], meta[property='og:description']")
    if desc_tag and desc_tag.get("content"):
        summary = desc_tag["content"].strip()
    else:
        summary = ""

    # --- Opening paragraphs of the article (used for scoring) ---
    lead = ""
    container_selectors = (
        "[itemprop='articleBody']",
        "div.read__content",
        "div#read__content",
        "article",
    )
    for css in container_selectors:
        container = doc.select_one(css)
        if not container:
            continue
        paragraphs = container.select("p")
        if paragraphs:
            # Only the first three paragraphs, to keep the text short.
            lead = " ".join(par.get_text(" ", strip=True) for par in paragraphs[:3])
            break
    if not lead:
        # Fallback: the first <p> anywhere in the document.
        first_par = doc.find("p")
        if first_par:
            lead = first_par.get_text(" ", strip=True)

    return {
        "title": headline or "",
        "tanggal": published or "",
        "penulis": author or "",
        "desc": summary,
        "body": lead,
    }

# =========================
# Main scraper
# =========================
@dataclass
class Row:
    """One scraped candidate article; every field defaults to None."""
    # Article title; scrape_kompas fills it only for articles that pass the
    # relevance filter.
    judul_berita: Optional[str] = None
    # Publication date string as extracted from the page.
    tanggal_berita: Optional[str] = None
    # Author string as extracted from the page.
    penulis_berita: Optional[str] = None
    # Article URL.
    url_berita: Optional[str] = None
    # Note explaining why the row was skipped (load failure, filtered out,
    # or an exception while parsing).
    keterangan: Optional[str] = None
    _score: Optional[int] = None   # internal column (debug): relevance score
    _hits: Optional[dict] = None   # internal column (debug): per-section hit counts

def scrape_kompas(
    topic: str,
    max_pages: int = 3,
    min_score: int = 3,             # relevance score threshold
    require_all_terms: bool = True, # every term must appear in the combined text
    require_phrase: bool = False,   # if the topic has a "phrase", it must appear
) -> pd.DataFrame:
    """
    Search Kompas for `topic`, fetch each result and keep the relevant ones.

    Args:
        topic: Search topic; double-quoted parts are treated as phrases.
        max_pages: Number of search-result pages to read.
        min_score: Minimum relevance score an article needs to be kept.
        require_all_terms: Passed to relevance_score.
        require_phrase: Passed to relevance_score.

    Returns:
        DataFrame with the columns judul_berita, tanggal_berita,
        penulis_berita and url_berita, sorted by relevance score
        (highest first; ties keep search-result order). The frame is empty,
        but still has these columns, when nothing was found.

    Raises:
        ValueError: If `topic` is empty.
    """
    topic = (topic or "").strip()
    if not topic:
        raise ValueError("Parameter 'topic' kosong.")

    all_columns = [
        "judul_berita", "tanggal_berita", "penulis_berita",
        "url_berita", "keterangan", "_score", "_hits",
    ]
    rows: List[Row] = []

    # The context manager closes the session's connection pool on exit.
    with requests.Session() as session:
        # --- 1) Collect candidate URLs from the search pages ---
        candidate_urls: List[str] = []
        for page in range(1, max_pages + 1):
            search_url = build_search_url(topic, page)
            resp = http_get(search_url, session=session)
            if resp is None:
                print(f"[WARN] Melewati halaman {page} (gagal load).")
                continue
            soup = BeautifulSoup(resp.text, "lxml")
            page_links = extract_search_links(soup)
            if not page_links:
                print(f"[INFO] Tidak ada link kandidat pada halaman {page}.")
                # An empty first page means there is nothing to paginate.
                if page == 1:
                    break
            candidate_urls.extend(page_links)
            _sleep(0.8, 1.5)

        # De-duplicate while preserving search-result order.
        urls = list(dict.fromkeys(candidate_urls))

        # --- 2) Fetch and score every candidate article ---
        for u in urls:
            r = Row(url_berita=u)
            try:
                _sleep(0.6, 1.2)
                art = http_get(u, session=session)
                if art is None:
                    r.keterangan = "Gagal memuat halaman artikel."
                else:
                    meta = parse_article(art.text)
                    score, hits = relevance_score(
                        topic=topic,
                        title=meta["title"],
                        desc=meta["desc"],
                        body=meta["body"],
                        require_all_terms=require_all_terms,
                        require_phrase=require_phrase,
                    )
                    r._score = score
                    r._hits = hits
                    if score >= min_score:
                        r.judul_berita = meta["title"]
                        r.tanggal_berita = meta["tanggal"]
                        r.penulis_berita = meta["penulis"]
                    else:
                        r.keterangan = f"Terfilter (skor {score} < min_score {min_score})"
            except Exception as e:
                # Best effort: one broken article must not abort the whole run.
                r.keterangan = f"Exception saat parse artikel: {repr(e)}"
            rows.append(r)

    # --- 3) Build the frame. Passing `columns` keeps the schema even when
    # `rows` is empty; without it the column lookups below raise KeyError.
    df_full = pd.DataFrame(
        [{col: getattr(r, col) for col in all_columns} for r in rows],
        columns=all_columns,
    )

    # Keep rows that passed the filter (judul_berita is set), one per URL,
    # ordered by score. mergesort is stable, so equal scores keep the order
    # in which the search returned them.
    df = (
        df_full[df_full["judul_berita"].notna()]
        .drop_duplicates(subset=["url_berita"], ignore_index=True)
        .sort_values(by="_score", ascending=False, kind="mergesort", ignore_index=True)
    )

    # Return only the requested columns; audit/debug columns are dropped.
    return df[["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"]]





In [5]:
if __name__ == "__main__":
    # === USAGE EXAMPLE ===
    # Plain words and quoted phrases are both supported, e.g. '"subsidi bbm"'.
    TOPIC = 'purbaya yudhi'
    MAX_PAGES = 3
    date = time.strftime("%Y%m%d")
    try:
        df = scrape_kompas(
            topic=TOPIC,
            max_pages=MAX_PAGES,
            min_score=2,             # raise for a stricter filter (e.g. 5-7)
            require_all_terms=True,  # every unquoted word must be present
            require_phrase=False,    # set True if a phrase in "..." must be present
        )
        print(df.head(20))

        # Save the result. Create the output folder if it is missing, and
        # replace characters that are invalid in file names (a quoted phrase
        # or "/" in TOPIC would otherwise break the path).
        out_dir = os.path.join(cwd, "daftar_berita", "kompas")
        os.makedirs(out_dir, exist_ok=True)
        safe_topic = re.sub(r'[\\/:*?"<>|]+', "_", TOPIC).strip() or "topic"
        df.to_excel(os.path.join(out_dir, f"{safe_topic}_{date}.xlsx"), index=False)
    except Exception:
        print("[FATAL] Proses scraping gagal:")
        traceback.print_exc()

                                         judul_berita  \
0   Bisakah Ekonomi Indonesia Membaik di Tangan Me...   
1   Menteri Keuangan Baru Purbaya Yudhi Sadewa: Ko...   
2   Sah, Purbaya Yudhi Sadewa Gantikan Sri Mulyani...   
3   Profil Purbaya Yudhi Sadewa, Menteri Keuangan ...   
4   Profil Pendidikan Purbaya Yudhi Sadewa, Menkeu...   
5   Purbaya Yudhi Sadewo, Menkeu Baru Lulusan Elek...   
6   Baru 3 Hari Jadi Menteri Keuangan, Purbaya Yud...   
7   Profil Menkeu Purbaya Yudhi Sadewa, Bos LPS ya...   
8   Profil dan Daftar Kekayaan Purbaya Yudhi Sadew...   
9   Siapa Purbaya Yudhi Sadewa, Menkeu Baru Pengga...   
10  Jejak Karier Purbaya Yudhi Sadewa, Menteri Keu...   
11  Profil Purbaya Yudhi Sadewa, Menteri Keuangan ...   
12  Prabowo Ganti Sri Mulyani dengan Purbaya Yudhi...   
13  Profil Purbaya Yudhi Sadewa, Menteri Keuangan ...   
14  Perjalanan Karier Purbaya Yudhi Sadewa, Menter...   
15  Anak Sebut Sri Mulyani Agen CIA, Menkeu Purbay...   
16  Klarifikasi Menkeu Purbaya 