In [2]:
# Mount Google Drive when the notebook runs on Google Colab.
# NOTE: the two statements below are wrapped in a triple-quoted string, so
# they are not executed; the cell only evaluates (and echoes) the string.
"""from google.colab import drive
drive.mount('/content/drive')"""

"from google.colab import drive\ndrive.mount('/content/drive')"

In [3]:
# Set the notebook's working folder (current working directory).
import os

# Colab alternative used previously:
# cwd = '/content/drive/MyDrive/Monitoring Berita'
#
# The project folder can be overridden without editing the notebook by setting
# the MONITORING_BERITA_CWD environment variable. When the variable is unset or
# empty, the original local path is used, so existing runs behave as before.
cwd = os.environ.get("MONITORING_BERITA_CWD") or (
    '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita'
)
os.chdir(cwd)


In [4]:
# -*- coding: utf-8 -*-
"""
Scrape daftar pemberitaan IDN Times dari halaman pencarian:
https://www.idntimes.com/search?q=<topik>

Keluaran: pandas DataFrame dengan kolom:
['judul_berita', 'tanggal_berita', 'penulis_berita', 'url_berita']

Prasyarat:
pip install requests beautifulsoup4 pandas lxml
"""

import re
import time
import logging
from typing import List, Dict, Optional
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup
import pandas as pd

# =======================
# Konfigurasi umum
# =======================
# Headers sent with every HTTP request issued by this notebook.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/127.0 Safari/537.36",
        )
    ),
    "Accept-Language": "id,en;q=0.9",
}

# Per-request timeout, in seconds.
TIMEOUT = 12
# Number of retries per request (in addition to the first attempt).
RETRY = 2
# Pause between consecutive requests, in seconds, to reduce server load.
SLEEP_BETWEEN_REQUESTS = 0.5

# Matches a visible timestamp such as "09 Sep 2025, 22:13 WIB": 1-2 digit day,
# Indonesian month abbreviation, 4-digit year, HH:MM and the literal "WIB".
# Matching is case-insensitive; group 1 holds the whole timestamp.
DATE_REGEX = re.compile(
    r"\b(\d{1,2}\s(?:Jan|Feb|Mar|Apr|Mei|Jun|Jul|Agu|Sep|Okt|Nov|Des)\s\d{4},\s\d{2}:\d{2}\sWIB)\b",
    re.IGNORECASE,
)

# =======================
# Utilitas HTTP
# =======================
def http_get(url: str) -> Optional[requests.Response]:
    """Fetch ``url`` with a timeout and a bounded number of retries.

    The request is sent with the module-level ``HEADERS`` and ``TIMEOUT``.
    At most ``RETRY + 1`` attempts are made, and only a response whose status
    code is exactly 200 counts as success. After a failed attempt the function
    waits ``0.6 * attempt`` seconds before trying again; no wait is performed
    after the final attempt, since nothing follows it.

    Args:
        url: Absolute URL to request.

    Returns:
        The successful ``requests.Response``, or ``None`` when every attempt
        failed (a warning with the last error is logged in that case).
    """
    total_attempts = RETRY + 1
    last_err = None
    for attempt in range(1, total_attempts + 1):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
            # Original author's note: IDN sometimes redirects to an app/banner
            # page; only a plain 200 is accepted as a usable response.
            if resp.status_code == 200:
                return resp
            last_err = f"HTTP {resp.status_code}"
        except requests.RequestException as e:
            last_err = str(e)
        # Back off before the next attempt, but do not delay the failure
        # report once the attempts are exhausted.
        if attempt < total_attempts:
            time.sleep(0.6 * attempt)
    logging.warning("Gagal GET %s: %s", url, last_err)
    return None

# =======================
# Parsing hasil pencarian
# =======================
def extract_article_links_from_search(html: str) -> List[str]:
    """Collect article links from the HTML of a search-results page.

    A link is kept when all of the following hold:
      * its ``href`` starts with ``https://www.idntimes.com/``;
      * it contains none of the non-article markers ``/search``, ``/tag/``,
        ``/sitemap``, ``/index``, ``/about``, ``/contact``;
      * the part after the domain contains at least three ``/`` characters
        (heuristic for ``/<category>/<subcategory>/<slug>``).

    Links are returned de-duplicated and in the order they first appear in
    the document, so callers that truncate the list get a deterministic
    subset rather than an arbitrary one.

    Args:
        html: Raw HTML of the search page.

    Returns:
        Unique article URLs in document order.
    """
    soup = BeautifulSoup(html, "lxml")

    non_article_markers = ("/search", "/tag/", "/sitemap", "/index", "/about", "/contact")
    links: List[str] = []
    seen = set()  # membership check for de-duplication; `links` keeps the order

    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Skip relative links and anything outside the main domain.
        if not href.startswith("https://www.idntimes.com/"):
            continue
        # Drop pages that are clearly not articles.
        if any(bad in href for bad in non_article_markers):
            continue
        # Require a category-style path, e.g. /business/economy/<slug>.
        path = href.replace("https://www.idntimes.com", "")
        if path.count("/") >= 3 and href not in seen:
            seen.add(href)
            links.append(href)

    return links

def paginate_search_urls(query: str, max_pages: int = 1) -> List[str]:
    """Build the search-page URLs for ``query``.

    The query is stripped and encoded with ``quote_plus``. The first URL is
    ``https://www.idntimes.com/search?q=<encoded>``; each further page ``n``
    from 2 up to ``max_pages`` appends ``&page=<n>``. When ``max_pages`` is
    below 2, only the first-page URL is returned.

    NOTE(review): the ``&page=<n>`` parameter is the original author's
    assumption about the site ("usually supported"); it is not verified here.
    """
    first_page = "https://www.idntimes.com/search?q=" + quote_plus(query.strip())
    return [
        first_page,
        *(f"{first_page}&page={number}" for number in range(2, max_pages + 1)),
    ]

# =======================
# Parsing halaman artikel
# =======================
def _jsonld_author_name(author_field):
    """Return the author name from a JSON-LD ``author`` value, or ``None``.

    Handles a dict with a ``name`` key, a plain string, or a list whose first
    element is one of those.
    """
    if isinstance(author_field, list) and author_field:
        first = author_field[0]
    else:
        first = author_field
    if isinstance(first, dict):
        return first.get("name") or None
    if isinstance(first, str):
        return first.strip() or None
    return None


def _iter_jsonld_objects(soup):
    """Yield every dict found in the page's ``application/ld+json`` scripts.

    Each script may hold a single object or a list of objects; objects nested
    in a top-level ``@graph`` list are yielded right after their container.
    Scripts whose content is not valid JSON are skipped.
    """
    import json  # local import: this cell does not import json at the top

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "{}")
        except (ValueError, TypeError):
            # Invalid JSON-LD is ignored on purpose (best-effort fallback).
            continue
        for obj in (payload if isinstance(payload, list) else [payload]):
            if not isinstance(obj, dict):
                continue
            yield obj
            graph = obj.get("@graph")
            if isinstance(graph, list):
                for node in graph:
                    if isinstance(node, dict):
                        yield node


def parse_article_page(url: str) -> Dict[str, Optional[str]]:
    """Download an article page and extract its title, date and author.

    Extraction order:
      * Title: text of the first ``<h1>``.
      * Date: first match of ``DATE_REGEX`` in the page's visible text; then
        JSON-LD ``datePublished``/``dateCreated``; then the
        ``article:published_time`` meta tag.
      * Author: ``<meta name="author">``; then the first ``<a>``/``<span>``
        near the ``<h1>`` (or anywhere, when there is no ``<h1>`` parent)
        whose text looks like a 2-5 word personal name; then JSON-LD
        ``author``.

    Args:
        url: Article URL to fetch via ``http_get``.

    Returns:
        A dict with keys ``judul_berita``, ``tanggal_berita``,
        ``penulis_berita``, ``url_berita`` and ``error``. Missing fields are
        ``None``. ``error`` is set when the page could not be loaded or no
        title was found, and is ``None`` otherwise.
    """
    data = {
        "judul_berita": None,
        "tanggal_berita": None,
        "penulis_berita": None,
        "url_berita": url,
        "error": None,  # kept for logging; callers drop it from the final DataFrame
    }

    resp = http_get(url)
    if not resp:
        data["error"] = "Gagal memuat artikel"
        return data

    soup = BeautifulSoup(resp.text, "lxml")

    # --- Title (first H1) ---
    h1 = soup.find("h1")
    title = h1.get_text(strip=True) if h1 else ""
    if title:
        data["judul_berita"] = title

    # --- Date (visible text) ---
    # Takes the first timestamp-looking string anywhere in the page text.
    m = DATE_REGEX.search(soup.get_text(" ", strip=True))
    if m:
        data["tanggal_berita"] = m.group(1)

    # --- Author ---
    author = None

    # 1) <meta name="author">
    meta_author = soup.find("meta", attrs={"name": "author"})
    if meta_author and meta_author.get("content"):
        author = meta_author["content"].strip()

    # 2) Heuristic: a short, name-like text in an anchor/span near the title.
    if not author:
        header_block = h1.find_parent() if h1 else None
        search_scope = header_block if header_block else soup
        candidates = search_scope.find_all(["a", "span"], string=True)

        for el in candidates:
            text = el.get_text(strip=True)
            # 2-5 words made only of letters and common name punctuation.
            # Single-word labels (category names such as "News") are already
            # rejected by the two-word minimum.
            if 2 <= len(text.split()) <= 5 and re.match(r"^[A-Za-zÀ-ÖØ-öø-ÿ'.\-\s]+$", text):
                author = text
                break

    # 3) Fallback: schema.org JSON-LD, when author or date is still missing.
    if not author or not data["tanggal_berita"]:
        for obj in _iter_jsonld_objects(soup):
            if not author:
                author = _jsonld_author_name(obj.get("author")) or author
            if not data["tanggal_berita"]:
                dp = obj.get("datePublished") or obj.get("dateCreated")
                if dp:
                    data["tanggal_berita"] = dp
            if author and data["tanggal_berita"]:
                break

    # 4) Last resort for the date: OpenGraph-style publish-time meta tag.
    if not data["tanggal_berita"]:
        og_pub = soup.find("meta", attrs={"property": "article:published_time"})
        if og_pub and og_pub.get("content"):
            data["tanggal_berita"] = og_pub["content"].strip()

    data["penulis_berita"] = author
    # Minimal validation: an article without a title is flagged.
    if not data["judul_berita"]:
        data["error"] = "Judul tidak ditemukan"
    return data

# =======================
# Pipeline utama
# =======================
def scrape_idntimes_search(topic: str, max_pages: int = 1, limit: Optional[int] = None) -> pd.DataFrame:
    """Scrape IDN Times search results for ``topic`` into a DataFrame.

    Search pages are fetched one by one, article links are gathered without
    duplicates in first-seen order, optionally truncated to ``limit``, and
    every article is parsed with ``parse_article_page``. A pause of
    ``SLEEP_BETWEEN_REQUESTS`` seconds follows each successfully fetched
    search page and each processed article.

    Args:
        topic: Search term; must contain at least one non-blank character.
        max_pages: Number of search pages to scan (default 1).
        limit: Maximum number of articles to process (``None`` = all).

    Returns:
        DataFrame with columns ``judul_berita``, ``tanggal_berita``,
        ``penulis_berita`` and ``url_berita``; an article that fails to parse
        still yields a row carrying only its URL.

    Raises:
        ValueError: If ``topic`` is empty or blank.
    """
    if not (topic and topic.strip()):
        raise ValueError("Parameter 'topic' tidak boleh kosong.")

    output_columns = ["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"]

    # Dict keys act as an insertion-ordered set of article URLs.
    unique_links: Dict[str, None] = {}
    for search_url in paginate_search_urls(topic, max_pages=max_pages):
        search_resp = http_get(search_url)
        if not search_resp:
            logging.warning("Lewati halaman pencarian (gagal dimuat): %s", search_url)
            continue
        for href in extract_article_links_from_search(search_resp.text):
            unique_links.setdefault(href, None)
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    article_urls: List[str] = list(unique_links)
    if not article_urls:
        logging.warning("Tidak ada link artikel ditemukan untuk topik '%s'.", topic)

    if limit is not None:
        article_urls = article_urls[:max(0, int(limit))]

    records: List[Dict] = []
    for position, article_url in enumerate(article_urls, 1):
        # Placeholder row used when parsing raises: only the URL is known.
        record = dict.fromkeys(output_columns)
        record["url_berita"] = article_url
        try:
            parsed = parse_article_page(article_url)
            if parsed.get("error"):
                logging.warning("Artikel bermasalah (%s): %s", parsed["error"], article_url)
            record = {column: parsed.get(column) for column in output_columns}
        except Exception as exc:
            logging.exception("Gagal memproses artikel #%d: %s | err=%s", position, article_url, exc)
        records.append(record)
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    return pd.DataFrame(records, columns=output_columns)



In [5]:
# --- Cell 2: Parameters (easy to change) ---
# Reads run parameters from "config.json", resolved against the current
# working directory.

import json

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Topic keywords; the scraping cell iterates over them as search terms.
# (Original note: intended for title-relevance analysis.)
topic_keywords = config["keywords"]

# List of dates (YYYY-MM-DD, per the original note). The original comment says
# they are converted to DD-MM-YYYY for matching on the page.
# NOTE(review): no such conversion, nor any use of `dates`, is visible in this
# part of the notebook — confirm whether it is still needed.
dates = config["search_date"]

# Maximum pages per date (original note: stops early when a page is empty).
# NOTE(review): not referenced by the cells visible here; the scraping cell
# passes a literal max_pages value instead — confirm intent.
max_pages_per_date = config["max_page_length"]


In [6]:
if __name__ == "__main__":
    # Scrape every configured keyword and write the combined article list to
    # <cwd>/daftar_berita/idn_times.xlsx.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
    )

    df_list = []
    for topik in topic_keywords:
        df1 = scrape_idntimes_search(topik, max_pages=3, limit=50)
        df_list.append(df1)

    if df_list:
        # The same article can match several keywords; keep its first row only
        # so the exported list has one line per URL.
        df = (
            pd.concat(df_list, ignore_index=True)
            .drop_duplicates(subset="url_berita")
            .reset_index(drop=True)
        )
    else:
        # pd.concat raises on an empty list; export an empty, correctly shaped
        # sheet instead when no keywords are configured.
        df = pd.DataFrame(columns=["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"])

    # Run-date stamp; not used in this cell, kept in case other cells rely on it.
    date = pd.Timestamp.now().strftime("%Y%m%d")

    output_dir = os.path.join(cwd, "daftar_berita")
    os.makedirs(output_dir, exist_ok=True)  # avoid failing on a missing folder
    df.to_excel(os.path.join(output_dir, "idn_times.xlsx"), index=False)
    # Save as CSV if needed:
    # df.to_csv(f"idntimes_{topik}.csv", index=False)
