In [1]:
# Disabled Colab setup: the Google Drive mount code below is wrapped in a
# string literal, so it is never executed. Because the string is the last
# expression of the cell, it is only echoed back as the cell output.
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [2]:
# Set the working folder (current working directory).
# The folder can be overridden without editing the notebook by setting the
# MONITORING_BERITA_DIR environment variable before starting the kernel;
# when it is not set, the original local path is used unchanged.
import os
cwd = os.environ.get(
    "MONITORING_BERITA_DIR",
    '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita',
)
# Alternative location when running on Google Colab:
#cwd = '/content/drive/MyDrive/Monitoring Berita'

# Fail with an explicit message instead of the bare error from os.chdir.
if not os.path.isdir(cwd):
    raise FileNotFoundError(
        f"Working directory not found: {cwd!r}. "
        "Set MONITORING_BERITA_DIR or edit `cwd` in this cell."
    )
os.chdir(cwd)

In [3]:
# --- Sel 1: Import & Logging Setup ---

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import logging
from urllib.parse import urljoin
from datetime import datetime
from requests.exceptions import RequestException, HTTPError, Timeout

# Logging format.
# force=True removes any handlers already attached to the root logger before
# applying this configuration. Without it, basicConfig() silently does nothing
# when the root logger already has a handler (for example when this cell is
# re-run with a changed format, or when the host environment pre-configured
# logging), and the INFO messages below would not be shown as intended.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)

logging.info("Environment ready. Libraries loaded.")


07:54:30 | INFO | Environment ready. Libraries loaded.


In [4]:
# --- Cell 2: Parameters (easy to change) ---

import json

with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Fail early with a readable message instead of a bare KeyError further down.
_missing_keys = [
    key for key in ("keywords", "search_date", "max_page_length") if key not in config
]
if _missing_keys:
    raise KeyError(f"config.json is missing required key(s): {', '.join(_missing_keys)}")

# Topic keywords used to judge the relevance of article titles.
# A single string is wrapped in a list; iterating a bare string would
# otherwise treat every character as its own keyword.
topic_keywords = config["keywords"]
if isinstance(topic_keywords, str):
    topic_keywords = [topic_keywords]

# List of dates (YYYY-MM-DD). They are placed into the index URL as-is.
# A single date string is accepted as well.
dates = config["search_date"]
if isinstance(dates, str):
    dates = [dates]

# Maximum number of pages per date (scraping stops earlier on an empty page).
# int() lets the config hold the value either as a number or as a numeric string.
max_pages_per_date = int(config["max_page_length"])

site_param = "all"

In [5]:
# --- Sel 3: KompasIndexScraper Class ---

class KompasIndexScraper:
    BASE = "https://indeks.kompas.com"

    def __init__(self, timeout: int = 10):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            )
        })
        self.timeout = timeout

    def _build_index_url(self, date_str: str, page: int, site: str) -> str:
        """
        Pola indeks Kompas:
        https://indeks.kompas.com/?site={site}&date=YYYY-MM-DD&page=PAGE
        """
        return f"{self.BASE}/?site={site}&date={date_str}&page={page}"

    def _request_with_retry(self, url: str, max_retries: int = 3, backoff_base: int = 1):
        """Retry untuk timeout / 5xx errors."""
        for attempt in range(max_retries):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                resp.raise_for_status()
                return resp
            except Timeout as e:
                logging.warning(f"Timeout on {url} (attempt {attempt+1}/{max_retries}): {e}")
            except HTTPError as e:
                code = getattr(e.response, "status_code", 0)
                if 500 <= code < 600:
                    logging.warning(f"HTTP {code} on {url} (attempt {attempt+1}/{max_retries})")
                else:
                    logging.error(f"HTTP error {code} on {url}, not retrying.")
                    return None
            except RequestException as e:
                logging.error(f"RequestException on {url}: {e}")
                return None

            time.sleep(backoff_base * (2 ** attempt))
        logging.error(f"Failed after {max_retries} attempts: {url}")
        return None

    def _parse_list_page(self, html: str | bytes):
        """
        Mengambil elemen-artikel dari halaman indeks Kompas.
        Struktur Kompas indeks: biasanya ada <h3 class="article__title"> <a> … </a> … </h3>,
        atau elemen dengan class article__list, article__title dll.
        """
        soup = BeautifulSoup(html, "html.parser")
        # Coba selector yang umum dipakai di Kompas indeks
        items = soup.select("h3.article__title > a")
        if not items:
            items = soup.select("h3.article__title.article__title--medium > a")
        if not items:
            items = soup.select("div.article__list a.article__link")
        if not items:
            # fallback generik: semua <a> yang punya /read/ dan di dalam listing
            items = soup.select("a[href*='/read/']")
        return items

    @staticmethod
    def _normalize_text(s: str) -> str:
        s = s.lower()
        s = re.sub(r"[^0-9a-zA-Z\s]", " ", s)
        s = re.sub(r"\s+", " ", s).strip()
        return s

    def _extract_article_data(self, a_tag, fallback_date: str, topic_keywords: list[str]) -> dict | None:
        """
        Dari <a> tag atau elemen link, ekstrak title, tanggal, penulis jika ada, url.
        Tanggal & penulis mungkin tidak ada di indeks -> fallback.
        """
        try:
            judul = a_tag.get_text(strip=True)
            href = a_tag.get("href", "").strip()
            if not judul or not href:
                return None
            url_abs = urljoin(self.BASE, href)

            tanggal_berita = fallback_date
            penulis = "Tidak Diketahui"

            # Ada elemen tanggal di sebelah link / dalam kontainer
            # Contoh: span.time, div.date, time[datetime]
            parent = a_tag.parent
            date_el = None
            # naik ke atas beberapa level
            for up in [a_tag, parent, parent.parent]:
                if up is None:
                    continue
                date_el = up.select_one("time[datetime]") or up.select_one("span.date") or up.select_one("div.date") or up.select_one("span.article__date")
                if date_el:
                    break
            if date_el:
                if date_el.name == "time" and date_el.has_attr("datetime"):
                    tanggal_berita = date_el["datetime"].strip()
                else:
                    tanggal_berita = date_el.get_text(strip=True)

            # Penulis: elemen author/byline jika ada
            author_el = None
            for up in [a_tag, parent, parent.parent]:
                if up is None:
                    continue
                author_el = up.select_one(".author") or up.select_one(".article__author") or up.select_one(".read__author") or up.select_one("span.author")
                if author_el:
                    break
            if author_el:
                penulis_txt = author_el.get_text(" ", strip=True)
                penulis_txt = re.sub(r"^(penulis\s*:|oleh\s*:?)\s*", "", penulis_txt, flags=re.I)
                if penulis_txt:
                    penulis = penulis_txt

            # Relevansi berdasarkan keyword
            norm_title = self._normalize_text(judul)
            found = [kw for kw in topic_keywords if re.search(rf"\b{re.escape(kw.lower())}\b", norm_title)]
            relevan = bool(found)
            keywords_found = ", ".join(found)

            return {
                "judul_berita": judul,
                "tanggal_berita": tanggal_berita,
                "penulis_berita": penulis,
                "url_berita": url_abs,
                "relevan": relevan,
                "keywords_found": keywords_found
            }
        except Exception as e:
            logging.warning(f"Gagal parsing satu artikel: {e}")
            return None

    def scrape_date(self, date_str: str, max_pages: int, topic_keywords: list[str], site: str) -> pd.DataFrame:
        all_rows = []
        logging.info(f"Memproses tanggal: {date_str}, site: {site}")

        for page in range(1, max_pages + 1):
            url = self._build_index_url(date_str, page, site)
            logging.info(f"Fetch: {url}")

            resp = self._request_with_retry(url)
            if resp is None:
                logging.error(f"Skip halaman (gagal ambil): {url}")
                break

            items = self._parse_list_page(resp.text)
            logging.info(f"Kandidat link artikel di page {page}: {len(items)}")

            if not items:
                logging.info(f"Tidak ada artikel di page {page}. Stop iterasi untuk tanggal {date_str}.")
                break

            count_ok = 0
            for a_tag in items:
                data = self._extract_article_data(a_tag, fallback_date=date_str, topic_keywords=topic_keywords)
                if data:
                    all_rows.append(data)
                    count_ok += 1

            logging.info(f"Artikel berhasil diparse dari page {page}: {count_ok}")

            # Sleep antar halaman
            time.sleep(1)

        if not all_rows:
            logging.warning(f"Tidak ada data untuk tanggal {date_str}, site={site}")

        df = pd.DataFrame(all_rows, columns=[
            "judul_berita",
            "tanggal_berita",
            "penulis_berita",
            "url_berita",
            "relevan",
            "keywords_found"
        ])
        return df

    def scrape_many(self, dates: list[str], max_pages: int, topic_keywords: list[str], site: str) -> pd.DataFrame:
        frames = []
        for ds in dates:
            try:
                datetime.strptime(ds, "%Y-%m-%d")
            except ValueError:
                logging.error(f"Format tanggal salah (harus YYYY-MM-DD): {ds}")
                continue
            df = self.scrape_date(ds, max_pages, topic_keywords, site)
            if not df.empty:
                frames.append(df)

        if frames:
            big = pd.concat(frames, ignore_index=True)
            big = big.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
        else:
            big = pd.DataFrame(columns=[
                "judul_berita",
                "tanggal_berita",
                "penulis_berita",
                "url_berita",
                "relevan",
                "keywords_found"
            ])
        return big


In [6]:
# --- Cell 4: Run & Summary ---

scraper = KompasIndexScraper()
df = scraper.scrape_many(
    dates=dates,
    max_pages=max_pages_per_date,
    topic_keywords=topic_keywords,
    site=site_param,
)

# Put the columns in the order required by the specification.
expected_cols = [
    "judul_berita",
    "tanggal_berita",
    "penulis_berita",
    "url_berita",
    "relevan",
    "keywords_found",
]
df = df.reindex(columns=expected_cols)

display(df)

# Summary: total rows, number of relevant rows, and up to five relevant titles.
total = len(df)
if total:
    relevant = int(df["relevan"].sum())
else:
    relevant = 0
samples = df.loc[df["relevan"], "judul_berita"].head(5).tolist()

print(
    "\n--- Ringkasan ---",
    f"Total artikel: {total}",
    f"Artikel relevan: {relevant}",
    sep="\n",
)
if not samples:
    print("Tidak ada judul relevan untuk ditampilkan.")
else:
    print("Contoh 5 judul relevan:")
    for i, s in enumerate(samples, start=1):
        print(f"{i}. {s}")


07:54:30 | INFO | Memproses tanggal: 2025-09-24, site: all
07:54:30 | INFO | Fetch: https://indeks.kompas.com/?site=all&date=2025-09-24&page=1
07:54:30 | INFO | Kandidat link artikel di page 1: 38
07:54:30 | INFO | Artikel berhasil diparse dari page 1: 37
07:54:31 | INFO | Fetch: https://indeks.kompas.com/?site=all&date=2025-09-24&page=2
07:54:32 | INFO | Kandidat link artikel di page 2: 40
07:54:32 | INFO | Artikel berhasil diparse dari page 2: 39
07:54:33 | INFO | Fetch: https://indeks.kompas.com/?site=all&date=2025-09-24&page=3
07:54:33 | INFO | Kandidat link artikel di page 3: 41
07:54:33 | INFO | Artikel berhasil diparse dari page 3: 40


Unnamed: 0,judul_berita,tanggal_berita,penulis_berita,url_berita,relevan,keywords_found
0,Kisah di Balik Macet Parah di Jakarta Rabu Mal...,2025-09-24,Tidak Diketahui,https://www.kompas.com/banten/read/2025/09/24/...,False,
1,Nenek di Padang Pariaman Kritis Dianiaya Saat ...,2025-09-24,Tidak Diketahui,https://regional.kompas.com/read/2025/09/24/23...,False,
2,Kepala SPPG Bangka 2 Dorong Relawan Baca Berit...,2025-09-24,Tidak Diketahui,https://megapolitan.kompas.com/read/2025/09/24...,False,
3,"Tiap Tahun Dibongkar, Warga Harap Revitalisasi...",2025-09-24,Tidak Diketahui,https://megapolitan.kompas.com/read/2025/09/24...,False,
4,Beredar Kabar Keracunan MBG di SDN 07 Pulogeba...,2025-09-24,Tidak Diketahui,https://megapolitan.kompas.com/read/2025/09/24...,False,
...,...,...,...,...,...,...
111,Bus Mogok di Flyover Slipi Perparah Macet di G...,2025-09-24,Tidak Diketahui,https://megapolitan.kompas.com/read/2025/09/24...,False,
112,Tujuh Siswa yang Diduga Keracunan MBG di Jakut...,2025-09-24,Tidak Diketahui,https://megapolitan.kompas.com/read/2025/09/24...,False,
113,"12 Tahun Bersembunyi, Akhirnya DPO Korupsi Hib...",2025-09-24,Tidak Diketahui,https://surabaya.kompas.com/read/2025/09/24/20...,False,
114,Marsha Timothy Stres Syuting Film Tukar Takdir...,2025-09-24,Tidak Diketahui,https://www.kompas.com/hype/read/2025/09/24/20...,False,



--- Ringkasan ---
Total artikel: 116
Artikel relevan: 0
Tidak ada judul relevan untuk ditampilkan.


In [7]:
# Save the relevant articles to an Excel file.
# The `relevan` flag itself is dropped from the export because every exported
# row is relevant by construction.
output_columns = ['judul_berita', 'tanggal_berita', 'penulis_berita', 'url_berita', 'keywords_found']
df_output = df.loc[df['relevan'].eq(True), output_columns]

# Create the output folder when it is missing; to_excel() does not create
# parent directories and would fail on a fresh working directory.
output_dir = os.path.join(cwd, 'daftar_berita')
os.makedirs(output_dir, exist_ok=True)
df_output.to_excel(os.path.join(output_dir, 'hasil_kompas.xlsx'), index=False)