In [None]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

Mounted at /content/drive


In [1]:
# Set the notebook's working folder (current working directory).
import os

# cwd = '/content/drive/MyDrive/Monitoring Berita'

# The project folder can be overridden without editing the notebook by setting the
# MONITORING_BERITA_DIR environment variable; when it is unset or empty, the original
# local path is used, so existing runs behave exactly as before.
cwd = os.environ.get("MONITORING_BERITA_DIR") or (
    '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita'
)

# Relative paths opened later (e.g. "config.json") are resolved against this folder.
os.chdir(cwd)

In [2]:
import re
import json
import time
import random
import unicodedata
from urllib.parse import urljoin, urlparse
import requests
from requests.adapters import HTTPAdapter, Retry
from bs4 import BeautifulSoup
import pandas as pd

# =========================
# Basic configuration
# =========================
# Base URL of Detik tag pages; the tag slug is appended directly to it.
TAG_BASE = "https://www.detik.com/tag/"

# Hostnames accepted as Detik article hosts: the bare domain plus each listed subdomain.
ARTICLE_HOSTS = {"detik.com"} | {
    f"{subdomain}.detik.com"
    for subdomain in (
        "www", "news", "finance", "inet", "hot", "health", "food",
        "travel", "oto", "sport", "sepakbola", "wolipop", "english",
    )
}

# Pool of browser User-Agent strings; headers() picks one at random.
UAS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
]

def headers():
    """Build the HTTP headers for one request, with a User-Agent drawn at random from UAS."""
    request_headers = {"User-Agent": random.choice(UAS)}
    request_headers.update({
        "Accept-Language": "id,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Connection": "keep-alive",
    })
    return request_headers

def make_session():
    """Create a requests.Session whose http:// and https:// adapters retry GET requests.

    The retry policy allows 5 retries in total with a backoff factor of 0.5 and is
    triggered by the status codes 429, 500, 502, 503 and 504. One adapter instance
    (pool size 10) is mounted for both schemes.
    """
    retry_policy = Retry(
        total=5,
        backoff_factor=0.5,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset(["GET"]),
    )
    retrying_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=10,
        pool_maxsize=10,
    )

    session = requests.Session()
    for scheme in ("http://", "https://"):
        session.mount(scheme, retrying_adapter)
    return session

def slugify_tag(s: str) -> str:
    """Turn a free-text topic into a lowercase, dash-separated ASCII slug.

    Example: "Kementerian Keuangan" -> "kementerian-keuangan".
    Accented characters are decomposed (NFKD) and reduced to their ASCII base;
    anything that is not a letter or digit acts as a separator. When no
    alphanumeric character survives, the fallback slug "berita" is returned.
    """
    ascii_text = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    # Splitting on separator runs and re-joining yields single dashes with none at the ends.
    words = [word for word in re.split(r"[^a-zA-Z0-9]+", ascii_text.lower()) if word]
    if not words:
        return "berita"
    return "-".join(words)

def fetch_html(session, url, timeout=15):
    """Download ``url`` through ``session`` and return the response body as text.

    Args:
        session: object with a requests-style ``get(url, headers=..., timeout=...)``.
        url: address to fetch.
        timeout: timeout passed to ``session.get`` (default 15).

    Returns:
        The response's ``text``.

    Raises:
        RuntimeError: when the request fails or the response has an HTTP error
            status. The underlying requests exception is attached as ``__cause__``.
    """
    try:
        r = session.get(url, headers=headers(), timeout=timeout)
        r.raise_for_status()
        return r.text
    except requests.exceptions.RequestException as e:
        # Chain explicitly so the traceback shows the network error as the direct cause.
        raise RuntimeError(f"Gagal fetch {url}: {e}") from e

# =========================
# Ekstraksi detail artikel
# =========================
def parse_ld_json(soup):
    """Collect the fields of every article-typed JSON-LD object on the page.

    Every ``<script type="application/ld+json">`` element is decoded. A top-level
    list is treated as several blocks, anything else as one block. Each dict block
    whose ``@type`` (a string, or a list of strings) contains "article"
    case-insensitively is merged into the result; later blocks overwrite earlier
    keys.

    Script tags with invalid JSON are skipped. Entries that are not dicts are
    skipped individually, so one odd entry no longer discards the article blocks
    that share its script tag.

    Args:
        soup: parsed document exposing ``find_all``.

    Returns:
        dict with the merged article fields; empty when nothing matched.
    """
    best = {}
    for tag in soup.find_all("script", type="application/ld+json"):
        text = tag.string or tag.text or ""
        try:
            data = json.loads(text)
        except (ValueError, TypeError):
            # json.JSONDecodeError is a ValueError: malformed or empty script content.
            continue

        blocks = data if isinstance(data, list) else [data]
        for b in blocks:
            if not isinstance(b, dict):
                continue
            raw_type = b.get("@type")
            # "@type" may be a single string or a list of strings.
            type_names = raw_type if isinstance(raw_type, list) else [raw_type]
            # "newsarticle" contains "article", so one substring check covers both.
            if any(isinstance(t, str) and "article" in t.lower() for t in type_names):
                best.update(b)
    return best

def extract_author_from_ld(ld):
    """Read the author name from a JSON-LD dict.

    ``ld["author"]`` may be a string, a dict with a "name" key, or a non-empty list
    whose first element is one of those two. A string is returned stripped (possibly
    empty); a dict yields its stripped name, or None when that is empty. Every
    other shape yields None.
    """
    raw = ld.get("author")
    # For a non-empty list only the first entry is considered.
    node = raw[0] if isinstance(raw, list) and raw else raw

    if isinstance(node, str):
        return node.strip()
    if isinstance(node, dict):
        name = (node.get("name") or "").strip()
        return name or None
    return None

def extract_date_from_ld(ld):
    """Return the first non-empty date field of a JSON-LD dict as a stripped string.

    Fields are tried in the order datePublished, dateCreated, dateModified.
    Returns None when none of them holds a truthy value.
    """
    date_keys = ("datePublished", "dateCreated", "dateModified")
    found = next((ld[key] for key in date_keys if ld.get(key)), None)
    if found is None:
        return None
    return str(found).strip()

def extract_author_meta(soup):
    """Find the author in the page's meta tags, falling back to a byline element.

    For each key (author, dc.creator, og:article:author, article:author) the meta
    tag is looked up by ``name`` first and by ``property`` second; the first one
    with non-empty content wins. Otherwise the text of the first element matching
    the byline selectors is used with a leading "By "/"by " removed. Returns None
    when nothing usable is found.
    """
    for meta_key in ("author", "dc.creator", "og:article:author", "article:author"):
        node = None
        for attr in ("name", "property"):
            node = soup.find("meta", attrs={attr: meta_key})
            if node:
                break
        if node and node.get("content"):
            return node["content"].strip()

    # Detik byline element
    byline = soup.select_one(".author, .author__name, .detail__author, .author-name")
    if not byline:
        return None
    byline_text = byline.get_text(" ", strip=True)
    cleaned = re.sub(r"^[Bb]y\s+", "", byline_text).strip()
    return cleaned or None

def extract_date_meta(soup):
    """Find the publication date in meta tags, a <time> element, or a date element.

    Lookup order:
      1. meta tags keyed article:published_time, og:published_time, pubdate, date,
         each looked up by ``property`` first and ``name`` second; the first with
         non-empty content is returned stripped.
      2. the first <time> element: its ``datetime`` attribute, else its text. If
         the element exists but both are empty, None is returned without trying
         step 3.
      3. the text of the first element matching the date selectors.
    Returns None when nothing is found.
    """
    for meta_key in ("article:published_time", "og:published_time", "pubdate", "date"):
        node = None
        for attr in ("property", "name"):
            node = soup.find("meta", attrs={attr: meta_key})
            if node:
                break
        if node and node.get("content"):
            return node["content"].strip()

    time_tag = soup.find("time")
    if time_tag:
        stamp = time_tag.get("datetime") or time_tag.get_text(" ", strip=True) or ""
        return stamp.strip() or None

    date_node = soup.select_one(".date, .detail__date, .date-time, .author__date, .media__date")
    return date_node.get_text(" ", strip=True) if date_node else None

def parse_article_detail(session, url):
    """Fetch one article page and return its ``(author, date)`` pair, best effort.

    JSON-LD data is preferred for each value; meta tags and page elements are the
    fallback. Whitespace runs in both strings are collapsed to single spaces. Any
    exception during fetching or parsing yields ``("", "")`` instead of propagating.
    """
    def collapse_whitespace(value):
        return re.sub(r"\s+", " ", value).strip()

    try:
        soup = BeautifulSoup(fetch_html(session, url), "html.parser")
        ld = parse_ld_json(soup)
        author = extract_author_from_ld(ld) or extract_author_meta(soup) or ""
        date = extract_date_from_ld(ld) or extract_date_meta(soup) or ""
        return collapse_whitespace(author), collapse_whitespace(date)
    except Exception:
        # Deliberate best effort: a broken article page must not abort the whole scrape.
        return "", ""

# =========================
# Deteksi URL artikel Detik
# =========================
def looks_like_article(url):
    """Tell whether ``url`` points at a Detik article.

    The URL's network location (lower-cased) must be one of ARTICLE_HOSTS and the
    URL must contain one of the common Detik article shapes:
      - /d-<digits>            (very common)
      - /<yyyy>/<mm>/<dd>/...
    Any exception raised while checking results in False.
    """
    try:
        host = urlparse(url).netloc.lower()
        if host in ARTICLE_HOSTS:
            article_patterns = (r"/d-\d+", r"/\d{4}/\d{2}/\d{2}/")
            return any(re.search(pattern, url) for pattern in article_patterns)
        return False
    except Exception:
        return False

# =========================
# Parser halaman TAG Detik
# =========================
def parse_tag_list(html):
    """Extract article entries from the HTML of a Detik tag page.

    Returns a list of ``{"judul_berita": title, "url_berita": url}`` dicts in page
    order, without repeated (title, url) pairs. Links starting with "/" are resolved
    against https://www.detik.com/. Only URLs accepted by looks_like_article are kept.

    The primary source is every ``a.media__link`` anchor carrying an href. Only
    when that yields nothing are ``article`` / ``.list-content__item`` cards
    scanned instead, taking the first link of each card and preferring the card's
    h2/h3 text as the title.
    """
    base_url = "https://www.detik.com/"
    soup = BeautifulSoup(html, "html.parser")
    items, seen = [], set()

    def absolutize(link):
        # Resolve site-relative links; everything else is kept untouched.
        return urljoin(base_url, link) if link.startswith("/") else link

    # Most stable selector on Detik list pages: the title link is a.media__link
    for anchor in soup.select("a.media__link[href]"):
        title = anchor.get_text(" ", strip=True)
        href = anchor["href"].strip()
        if title and href:
            href = absolutize(href)
            key = (title, href)
            if key not in seen:
                # The pair is remembered even when it turns out not to be an article.
                seen.add(key)
                if looks_like_article(href):
                    items.append({"judul_berita": title, "url_berita": href})

    if items:
        return items

    # Fallback: some older layouts use .list-content__item
    for card in soup.select("article, .list-content__item"):
        anchor = card.find("a", href=True)
        if not anchor:
            continue
        heading = card.find(["h2", "h3"])
        title = (heading if heading else anchor).get_text(" ", strip=True)
        href = absolutize(anchor["href"].strip())
        if not (title and href and looks_like_article(href)):
            continue
        key = (title, href)
        if key not in seen:
            seen.add(key)
            items.append({"judul_berita": title, "url_berita": href})

    return items

# =========================
# Main
# =========================
def scrape_detik_by_tag(topic, max_pages=3, delay_range=(0.8, 1.6)):
    """Collect articles from the Detik TAG pages for ``topic``.

    Example topic: "Kementerian Keuangan" -> /tag/kementerian-keuangan

    Args:
        topic: free-text topic; it is slugified to form the tag URL.
        max_pages: number of tag pages to request, starting at page 1.
        delay_range: ``(min, max)`` seconds for the random pause after each
            article and after each page.

    Returns:
        pandas.DataFrame with the columns judul_berita, tanggal_berita,
        penulis_berita and url_berita, holding one row per distinct article URL.
        Pages that fail to download are reported and skipped. An empty page is
        reported, and the remaining pages are still requested.
    """
    slug = slugify_tag(topic)
    results = []
    fetched_urls = set()

    # The context manager closes the session (and its pooled connections) on exit,
    # including when an unexpected exception escapes the loop.
    with make_session() as session:
        for page in range(1, max_pages + 1):
            url = f"{TAG_BASE}{slug}"
            if page > 1:
                url = f"{url}?page={page}"

            try:
                html = fetch_html(session, url)
            except RuntimeError as e:
                print(f"[PERINGATAN] Melewati page={page}: {e}")
                continue

            items = parse_tag_list(html)
            if not items:
                # Deliberately no early exit: a later page may still have entries.
                print(f"[INFO] Tidak ada item pada tag page={page} ({url}).")

            for it in items:
                href = it["url_berita"]
                # Skip URLs already scraped: the row would be dropped as a duplicate
                # below anyway, so this saves one request and one delay.
                if href in fetched_urls:
                    continue
                fetched_urls.add(href)

                penulis, tanggal = parse_article_detail(session, href)
                results.append({
                    "judul_berita": it["judul_berita"],
                    "tanggal_berita": tanggal,
                    "penulis_berita": penulis,
                    "url_berita": href,
                })
                time.sleep(random.uniform(*delay_range))
            time.sleep(random.uniform(*delay_range))

    df = pd.DataFrame(results, columns=["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"])
    if not df.empty:
        df = df.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
    return df


In [3]:
# Read the search parameters from config.json (resolved against the current working directory).

import json

with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.loads(config_file.read())

# Topic keywords (original note: used for title relevance analysis).
topic_keywords = config["keywords"]

# List of dates (YYYY-MM-DD).
# NOTE(review): the original note says these are converted to DD-MM-YYYY for matching
# on the page; no such conversion appears in the cells visible here - confirm it is still used.
dates = config["search_date"]

# Maximum number of pages per date (original note: stops early when a page is empty).
# NOTE(review): not referenced by the cells visible here - confirm it is still used.
max_pages_per_date = config["max_page_length"]


In [5]:
# ===== Example usage =====
if __name__ == "__main__":
    output_columns = ["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"]

    # Scrape one result frame per configured topic keyword.
    df_list = []
    for topic in topic_keywords:
        # NOTE(review): max_pages is fixed at 1 here; the configured
        # max_pages_per_date is not applied - confirm this is intended.
        df_list.append(scrape_detik_by_tag(topic, max_pages=1))

    # pd.concat raises ValueError on an empty list, so an empty keyword list
    # produces an empty frame with the expected columns instead of a crash.
    if df_list:
        df = pd.concat(df_list, ignore_index=True)
    else:
        df = pd.DataFrame(columns=output_columns)

    # Create the output folder first so the export cannot fail after the scrape has run.
    output_dir = os.path.join(cwd, "daftar_berita")
    os.makedirs(output_dir, exist_ok=True)
    df.to_excel(os.path.join(output_dir, "detik_.xlsx"), index=False)