In [3]:
# Disabled Colab setup: the Google Drive mount code below is wrapped in a string
# literal, so it is never executed; the cell only evaluates to (and echoes) that string.
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [None]:
# Set the working directory (current working directory) for the notebook.
#
# The location can be overridden without editing this cell by setting the
# MONITORING_BERITA_DIR environment variable before starting the kernel;
# otherwise the original local OneDrive path is used.

import os

# Colab alternative:
# cwd = '/content/drive/MyDrive/Monitoring Berita'

DEFAULT_CWD = '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita'

# `cwd` stays a plain string because later cells build paths with `cwd + "..."`.
cwd = os.environ.get("MONITORING_BERITA_DIR", DEFAULT_CWD)
os.chdir(cwd)

In [5]:
# === Sel 1: Import & Logging Config ===

# %%
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import logging
from urllib.parse import urljoin
from datetime import datetime

# Logging configuration: INFO level, "HH:MM:SS | LEVEL | message" line layout.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
)

# Logger used by the scraper defined in this notebook.
logger = logging.getLogger("detik_scraper")


In [None]:
# --- Cell 2: Parameters (easy to change) ---

import json

# All tunable values are read from config.json in the working directory.
with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# topic_keywords: topic keywords used for the title relevance analysis.
# dates: list of dates (YYYY-MM-DD); the scraper formats each one as MM/DD/YYYY
# when building the index URL query.
topic_keywords, dates = config["keywords"], config["search_date"]

# Maximum number of pages per date (iteration stops earlier when a page is empty).
# NOTE(review): the factor 3 is not explained anywhere in this notebook — confirm intent.
max_pages_per_date = 3 * config["max_page_length"]


In [7]:
# === Sel 3 (REVISI): Kelas DetikNewsScraper dengan stop-on-no-new ===

# %%
class DetikNewsScraper:
    """
    Scraper indeks harian Detik News.
    Contoh indeks:
      https://news.detik.com/berita/indeks?page=2&date=09/17/2025
    Pola:
      https://news.detik.com/indeks?page={page}&date={MM}/{DD}/{YYYY}
    """

    BASE_INDEX = "https://news.detik.com/berita/indeks"
    BASE_DOMAIN = "https://news.detik.com/"

    def __init__(self, timeout=10):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/125.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
        })
        self.timeout = timeout

    # -------------------- Util & HTTP --------------------
    def _safe_get(self, url, max_retries=3):
        delay = 1.0
        last_exc = None
        for attempt in range(1, max_retries + 1):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                resp.raise_for_status()
                return resp
            except (requests.exceptions.Timeout, requests.exceptions.HTTPError) as e:
                status = getattr(e.response, "status_code", None)
                if isinstance(e, requests.exceptions.Timeout) or (status and 500 <= status < 600):
                    logger.warning(f"[{attempt}/{max_retries}] Error {type(e).__name__} untuk {url}. Retry dalam {int(delay)}s.")
                    last_exc = e
                    time.sleep(delay)
                    delay *= 2
                    continue
                else:
                    logger.error(f"HTTP error non-retryable untuk {url}: {e}")
                    raise
            except requests.exceptions.RequestException as e:
                logger.error(f"RequestException untuk {url}: {e}")
                last_exc = e
                break
        if last_exc:
            raise last_exc

    @staticmethod
    def _norm_text(s):
        if not s:
            return ""
        s = s.lower()
        s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
        s = re.sub(r"\s+", " ", s).strip()
        return s

    # -------------------- URL Builder --------------------
    def _build_index_url(self, date_str: str, page: int) -> str:
        try:
            dt = datetime.strptime(date_str, "%Y-%m-%d")
        except ValueError:
            raise ValueError(f"Tanggal tidak valid: {date_str}. Gunakan format YYYY-MM-DD.")
        date_for_query = dt.strftime("%m/%d/%Y")
        return f"{self.BASE_INDEX}?page={page}&date={date_for_query}"

    # -------------------- Parsing List Page --------------------
    def _parse_list_page(self, html: str):
        soup = BeautifulSoup(html, "html.parser")
        candidates = []
        candidates.extend(soup.select("article.list-content__item"))
        candidates.extend(soup.select("div.list-content__item"))
        candidates.extend(soup.select("li"))

        uniq = []
        seen = set()
        for tag in candidates:
            key = id(tag)
            if key not in seen:
                seen.add(key)
                uniq.append(tag)

        items = []
        for tag in uniq:
            a = tag.select_one("a[href]")
            title_node = (
                tag.select_one(".media__title") or
                tag.select_one(".title") or
                tag.select_one("h3") or
                (a and a)
            )
            if a and title_node:
                items.append(tag)
        return items

    # -------------------- Ekstrak Data per Item --------------------
    def _extract_article_data(self, tag, fallback_date: str, topic_keywords: list[str]) -> dict | None:
        try:
            a = tag.select_one("a[href]")
            if not a:
                return None

            url = a.get("href", "").strip()
            if not url:
                return None
            url = urljoin(self.BASE_DOMAIN, url)

            title_node = (
                tag.select_one(".media__title") or
                tag.select_one(".title") or
                tag.select_one("h3") or
                a
            )
            title = title_node.get_text(strip=True) if title_node else ""
            if not title:
                return None

            date_text = ""
            date_node = (
                tag.select_one(".media__date") or
                tag.select_one(".list-content__date") or
                tag.select_one(".date") or
                tag.select_one("time")
            )
            if date_node:
                date_text = date_node.get_text(" ", strip=True)

            tanggal_berita = date_text if date_text else fallback_date

            penulis = "Tidak Diketahui"
            author_node = (
                tag.select_one(".media__author") or
                tag.select_one(".author") or
                tag.select_one("[rel='author']")
            )
            if author_node:
                pn = author_node.get_text(" ", strip=True)
                if pn:
                    penulis = pn

            norm_title = self._norm_text(title)
            found = []
            for kw in topic_keywords:
                if not kw:
                    continue
                kw_norm = self._norm_text(kw)
                if kw_norm and re.search(rf"\b{re.escape(kw_norm)}\b", norm_title):
                    found.append(kw)

            return {
                "judul_berita": title,
                "tanggal_berita": tanggal_berita,
                "penulis_berita": penulis,
                "url_berita": url,
                "relevan": bool(found),
                "keywords_found": ", ".join(found)
            }
        except Exception as e:
            logger.warning(f"Gagal parse 1 item: {e}")
            return None

    # -------------------- Scrape per tanggal --------------------
    def scrape_date(self, date_str: str, max_pages: int, topic_keywords: list[str]) -> pd.DataFrame:
        all_rows = []
        seen_urls = set()  # track URL unik per tanggal
        fallback_date_iso = date_str

        for page in range(1, max_pages + 1):
            url = self._build_index_url(date_str, page)
            try:
                logger.info(f"Proses tanggal {date_str} | Halaman {page} | URL: {url}")
                resp = self._safe_get(url)
                html = resp.text
            except Exception as e:
                logger.error(f"Gagal memuat halaman indeks: {e}. Lewati halaman ini.")
                continue

            items = self._parse_list_page(html)
            logger.info(f"  Ditemukan {len(items)} item artikel pada halaman ini.")

            if not items:
                logger.info("  Halaman tidak memuat artikel lagi. Hentikan iterasi tanggal ini.")
                break

            before = len(seen_urls)
            added_this_page = 0
            dup_this_page = 0

            for tag in items:
                row = self._extract_article_data(tag, fallback_date_iso, topic_keywords)
                if not row:
                    continue
                u = row["url_berita"]
                if u in seen_urls:
                    dup_this_page += 1
                    continue
                seen_urls.add(u)
                all_rows.append(row)
                added_this_page += 1

            after = len(seen_urls)
            logger.info(f"  Baru: {added_this_page} | Duplikat: {dup_this_page} | Total unik {after}.")

            # === KUNCI: break jika halaman ini tidak menambah apa-apa (indikasi batas nyata tercapai) ===
            if added_this_page == 0:
                logger.info("  Tidak ada URL baru pada halaman ini (konten berulang). Stop iterasi tanggal ini.")
                break

            time.sleep(1)

        if not all_rows:
            logger.warning(f"Tidak ada data untuk tanggal {date_str}")
            return pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])

        df = pd.DataFrame(all_rows, columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])
        return df

    # -------------------- Scrape banyak tanggal --------------------
    def scrape_many(self, dates: list[str], max_pages: int, topic_keywords: list[str]) -> pd.DataFrame:
        frames = []
        for d in dates:
            try:
                df_d = self.scrape_date(d, max_pages, topic_keywords)
                frames.append(df_d)
            except Exception as e:
                logger.error(f"Gagal memproses tanggal {d}: {e}")
        if not frames:
            return pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])
        out = pd.concat(frames, ignore_index=True)
        # Tetap drop duplikat antar tanggal (kadang lintas kanal sama)
        out = out.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
        return out


In [8]:
# === Cell 4: Run the scrape & print a summary ===

# %%
scraper = DetikNewsScraper(timeout=10)
df = scraper.scrape_many(dates=dates, max_pages=max_pages_per_date, topic_keywords=topic_keywords)

# Show the DataFrame (columns must match the specification exactly).
expected_cols = ["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"]
if df.empty:
    df = pd.DataFrame(columns=expected_cols)
else:
    df = df.reindex(columns=expected_cols)
display(df)

# Summary: total rows and how many titles matched at least one keyword.
total = len(df)
relevant_count = 0 if df.empty else int(df["relevan"].sum())
print("\n--- Ringkasan ---")
print(f"Total artikel: {total}")
print(f"Artikel relevan: {relevant_count}")

if relevant_count <= 0:
    print("Tidak ada judul relevan untuk ditampilkan.")
else:
    # Up to five relevant titles, in DataFrame order.
    contoh = df.loc[df["relevan"], "judul_berita"].head(5).tolist()
    print("5 contoh judul relevan:")
    for nomor, judul in enumerate(contoh, start=1):
        print(f"{nomor}. {judul}")


07:53:07 | INFO | Proses tanggal 2025-09-24 | Halaman 1 | URL: https://news.detik.com/berita/indeks?page=1&date=09/24/2025
07:53:08 | INFO |   Ditemukan 168 item artikel pada halaman ini.
07:53:08 | INFO |   Baru: 141 | Duplikat: 26 | Total unik 141.
07:53:09 | INFO | Proses tanggal 2025-09-24 | Halaman 2 | URL: https://news.detik.com/berita/indeks?page=2&date=09/24/2025
07:53:09 | INFO |   Ditemukan 168 item artikel pada halaman ini.
07:53:09 | INFO |   Baru: 0 | Duplikat: 167 | Total unik 141.
07:53:09 | INFO |   Tidak ada URL baru pada halaman ini (konten berulang). Stop iterasi tanggal ini.


Unnamed: 0,judul_berita,tanggal_berita,penulis_berita,url_berita,relevan,keywords_found
0,"Prabowo Bertemu Presiden FIFA di AS, Perkuat K...","Rabu, 24 Sep 2025 23:44 WIB",Tidak Diketahui,https://news.detik.com/berita/d-8129051/prabow...,False,
1,Wawalkot Tangsel Jawab Kritikan Leony soal Ang...,"Rabu, 24 Sep 2025 23:40 WIB",Tidak Diketahui,https://news.detik.com/berita/d-8129050/wawalk...,False,
2,Penjelasan Jasa Marga soal Macet Parah di Jaka...,"Rabu, 24 Sep 2025 23:24 WIB",Tidak Diketahui,https://news.detik.com/berita/d-8129043/penjel...,False,
3,Mendes Minta 2 Desa di Bogor Tak Dilelang: Mer...,"Rabu, 24 Sep 2025 23:13 WIB",Tidak Diketahui,https://news.detik.com/berita/d-8129026/mendes...,False,
4,"Buka IBF 2025, Fadli Zon Tegaskan Pentingnya L...","Rabu, 24 Sep 2025 23:03 WIB",Tidak Diketahui,https://news.detik.com/berita/d-8129016/buka-i...,False,
...,...,...,...,...,...,...
136,Disclaimer,2025-09-24,Tidak Diketahui,https://www.detik.com/disclaimer,False,
137,Insertlive,2025-09-24,Tidak Diketahui,https://www.insertlive.com/,False,
138,Beautynesia,2025-09-24,Tidak Diketahui,https://beautynesia.id,False,
139,Female Daily,2025-09-24,Tidak Diketahui,https://femaledaily.com,False,



--- Ringkasan ---
Total artikel: 141
Artikel relevan: 0
Tidak ada judul relevan untuk ditampilkan.


In [10]:
# Export the scraped index to Excel under <cwd>/daftar_berita/.
# The folder is created when missing so the export does not fail on a fresh checkout.
output_dir = os.path.join(cwd, "daftar_berita")
os.makedirs(output_dir, exist_ok=True)
df.to_excel(os.path.join(output_dir, "detik_index.xlsx"), index=False)