In [1]:
# Disabled Google Colab setup, kept as a bare string literal instead of live code.
# Nothing here is executed; because the string is the cell's last expression,
# the notebook simply echoes it back as the cell output.
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [None]:
# set folder tempat kerja (current working directory)
import os
# Project working directory. Set the MONITORING_BERITA_DIR environment variable
# to run the notebook on another machine; the original local path stays the default.
cwd = os.environ.get(
    "MONITORING_BERITA_DIR",
    '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita',
)
# cwd = '/content/drive/MyDrive/Monitoring Berita'
# Relative paths used later (e.g. "config.json") are resolved against this directory.
os.chdir(cwd)

In [3]:
# Sel 1: Import & konfigurasi logging

# %%
import logging
import time
import re
from datetime import datetime
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
import pandas as pd
from requests.exceptions import RequestException, HTTPError, Timeout

# Logging configuration.
# force=True replaces any handlers already attached to the root logger; without
# it basicConfig() is a silent no-op when the notebook host has configured
# logging first, and the INFO progress messages never appear.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)
logger = logging.getLogger("Liputan6Scraper")


In [4]:
# --- Sel 2: Parameter (mudah diubah) ---

import json

# Load the run parameters from config.json in the current working directory.
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Topic keywords used to judge headline relevance.
# A single string is wrapped in a list so it is not iterated character by character.
topic_keywords = config["keywords"]
if isinstance(topic_keywords, str):
    topic_keywords = [topic_keywords]

# Dates to scrape, formatted YYYY-MM-DD; they are used to build the
# /YYYY/MM/DD index URLs. A single date string is accepted as well.
dates = config["search_date"]
if isinstance(dates, str):
    dates = [dates]

# Maximum number of index pages per date (scraping stops earlier on an empty page).
# int() lets the config hold the value as a number or a numeric string.
max_pages_per_date = int(config["max_page_length"])

# Also scrape the dated Bisnis index in addition to News.
include_bisnis = True


In [5]:
# Sel 3: Kelas Liputan6Scraper

# %%
class Liputan6Scraper:
    BASE = "https://www.liputan6.com"
    NEWS_INDEX_BASE = "https://www.liputan6.com/news/indeks"
    BISNIS_INDEX_BASE = "https://www.liputan6.com/bisnis/indeks"

    def __init__(self, timeout: int = 10, max_retries: int = 3, sleep_between_pages: float = 1.0):
        self.timeout = timeout
        self.max_retries = max_retries
        self.sleep_between_pages = sleep_between_pages

        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/124.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "id,en;q=0.9",
            "Connection": "keep-alive",
        })

    # ---------- HTTP GET dengan retry & exponential backoff ----------
    def _get(self, url: str) -> requests.Response | None:
        delay = 1.0
        last_exc = None
        for attempt in range(1, self.max_retries + 1):
            try:
                resp = self.session.get(url, timeout=self.timeout)
                if 500 <= resp.status_code < 600:
                    raise HTTPError(f"Server error {resp.status_code} untuk {url}")
                if resp.status_code == 429:
                    raise HTTPError("429 Too Many Requests")
                if 400 <= resp.status_code < 500:
                    logger.error(f"HTTP {resp.status_code} pada {url}")
                    return None
                return resp
            except (Timeout, HTTPError) as e:
                last_exc = e
                if attempt < self.max_retries:
                    logger.warning(f"Percobaan {attempt}/{self.max_retries} gagal: {e} | retry dalam {int(delay)}s | {url}")
                    time.sleep(delay)
                    delay *= 2
                else:
                    logger.error(f"Gagal GET setelah retry: {url} | error: {e}")
                    return None
            except RequestException as e:
                logger.error(f"RequestException: {e} | {url}")
                return None
        return None

    # ---------- Helpers URL ----------
    def _build_news_url(self, date_str: str, page: int) -> str:
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        return f"{self.NEWS_INDEX_BASE}/{dt.year:04d}/{dt.month:02d}/{dt.day:02d}?page={page}"

    def _build_bisnis_url(self, date_str: str, page: int) -> str:
        # Sesuai permintaan: kanal Bisnis juga bertanggal
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        return f"{self.BISNIS_INDEX_BASE}/{dt.year:04d}/{dt.month:02d}/{dt.day:02d}?page={page}"

    # ---------- Normalisasi & parsing list ----------
    def _normalize_text(self, s: str) -> str:
        s = s.lower()
        s = re.sub(r"[^\w\s]", " ", s, flags=re.UNICODE)
        s = re.sub(r"\s+", " ", s).strip()
        return s

    def _parse_list_page(self, html: str) -> list[dict]:
        """
        Ambil semua tautan artikel dari halaman indeks.
        Strategi robust: cari semua <a> dengan href berisi '/read/' lalu deduplikasi.
        Return: list dict { 'a': tag, 'url': full_url, 'title': title }
        """
        soup = BeautifulSoup(html, "html.parser")
        anchors = soup.select('a[href*="/read/"]')

        items, seen = [], set()
        for a in anchors:
            href = (a.get("href") or "").strip()
            title = (a.get_text(strip=True) or "").strip()
            if not href or not title:
                continue
            full_url = urljoin(self.BASE, href)
            if "liputan6.com" not in full_url or "/read/" not in full_url:
                continue
            key = (full_url, title.lower())
            if key in seen:
                continue
            seen.add(key)
            items.append({"a": a, "url": full_url, "title": title})
        return items

    # ---------- Ekstraksi item + analisis relevansi ----------
    def _extract_article_data(self, item: dict, fallback_date: str, topic_keywords: list[str]) -> dict | None:
        try:
            title = item.get("title", "").strip()
            url = item.get("url", "").strip()
            if not title or not url:
                return None

            # Cari tanggal di sekitar anchor (listing). Jika tak ada, fallback ke tanggal indeks.
            tanggal = ""
            a = item.get("a")
            container = a.find_parent(["li", "article", "div"]) if a else None
            time_tag = container.find("time") if container else None
            if not time_tag and a:
                time_tag = a.find_next("time")
            if time_tag:
                tanggal = (time_tag.get("datetime") or time_tag.get_text(strip=True) or "").strip()
            if not tanggal:
                tanggal = fallback_date

            penulis = "Tidak Diketahui"

            # Relevansi judul
            norm_title = self._normalize_text(title)
            norm_keys = [self._normalize_text(k) for k in topic_keywords if k.strip()]
            found = [kw for kw in norm_keys if kw and kw in norm_title]
            relevan = bool(found)
            keywords_found = ", ".join(found)

            return {
                "judul_berita": title,
                "tanggal_berita": tanggal,
                "penulis_berita": penulis,
                "url_berita": url,
                "relevan": relevan,
                "keywords_found": keywords_found
            }
        except Exception as e:
            logger.error(f"Gagal parsing artikel: {e}")
            return None

    # ---------- Scrape per tanggal (News & Bisnis) ----------
    def _scrape_generic_by_date(self, build_url_fn, date_str: str, max_pages: int, topic_keywords: list[str], label: str) -> pd.DataFrame:
        logger.info(f"Memproses tanggal [{label}]: {date_str}")
        rows = []
        for page in range(1, max_pages + 1):
            url = build_url_fn(date_str, page)
            logger.info(f"GET {label}: {url}")
            resp = self._get(url)
            if not resp:
                logger.warning(f"Gagal memuat halaman {label}, hentikan iterasi tanggal {date_str}")
                break

            items = self._parse_list_page(resp.text)
            logger.info(f"{label} halaman {page}: ditemukan {len(items)} artikel")
            if not items:
                logger.info(f"Halaman kosong/pola tidak cocok ({label}), stop {date_str}")
                break

            for it in items:
                data = self._extract_article_data(it, fallback_date=date_str, topic_keywords=topic_keywords)
                if data:
                    rows.append(data)

            time.sleep(self.sleep_between_pages)

        if rows:
            df = pd.DataFrame(rows).drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
        else:
            df = pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])
        return df

    def scrape_news_date(self, date_str: str, max_pages: int, topic_keywords: list[str]) -> pd.DataFrame:
        return self._scrape_generic_by_date(self._build_news_url, date_str, max_pages, topic_keywords, label="News")

    def scrape_bisnis_date(self, date_str: str, max_pages: int, topic_keywords: list[str]) -> pd.DataFrame:
        return self._scrape_generic_by_date(self._build_bisnis_url, date_str, max_pages, topic_keywords, label="Bisnis")

    # ---------- Orkestrasi ----------
    def scrape_many(self, dates: list[str], max_pages: int, topic_keywords: list[str], include_bisnis: bool=True) -> pd.DataFrame:
        frames = []

        for d in dates:
            try:
                frames.append(self.scrape_news_date(d, max_pages, topic_keywords))
            except Exception as e:
                logger.error(f"Gagal scraping News {d}: {e}")

        if include_bisnis:
            for d in dates:
                try:
                    frames.append(self.scrape_bisnis_date(d, max_pages, topic_keywords))
                except Exception as e:
                    logger.error(f"Gagal scraping Bisnis {d}: {e}")

        if not frames:
            return pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])

        df_all = pd.concat(frames, ignore_index=True)
        df_all = df_all.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
        df_all = df_all[["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"]]
        return df_all


In [6]:
# Cell 4: run the scraper and print a summary

# %%
scraper = Liputan6Scraper(timeout=10, max_retries=3, sleep_between_pages=1.0)

df = scraper.scrape_many(
    dates=dates,
    max_pages=max_pages_per_date,
    topic_keywords=topic_keywords,
    include_bisnis=include_bisnis
)

# Show the DataFrame with the columns in the specified order
expected_cols = ["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"]
df = df.reindex(columns=expected_cols)
display(df)

# Summary
# Build an explicit boolean mask: indexing with the raw column breaks on an
# empty result, where the column has object dtype and is not used as a mask.
relevant_mask = df["relevan"].eq(True)
total_artikel = len(df)
total_relevan = int(relevant_mask.sum())
contoh_relevan = df.loc[relevant_mask, "judul_berita"].head(5).tolist()

print("\n==== RINGKASAN ====")
print(f"Total artikel: {total_artikel}")
print(f"Artikel relevan: {total_relevan}")
if contoh_relevan:
    print("Contoh 5 judul relevan:")
    for i, j in enumerate(contoh_relevan, 1):
        print(f"{i}. {j}")
else:
    print("Tidak ada contoh judul relevan.")


07:55:19 | INFO | Memproses tanggal [News]: 2025-09-24
07:55:19 | INFO | GET News: https://www.liputan6.com/news/indeks/2025/09/24?page=1
07:55:19 | INFO | News halaman 1: ditemukan 24 artikel
07:55:20 | INFO | GET News: https://www.liputan6.com/news/indeks/2025/09/24?page=2
07:55:20 | INFO | News halaman 2: ditemukan 24 artikel
07:55:21 | INFO | GET News: https://www.liputan6.com/news/indeks/2025/09/24?page=3
07:55:22 | INFO | News halaman 3: ditemukan 24 artikel
07:55:23 | INFO | Memproses tanggal [Bisnis]: 2025-09-24
07:55:23 | INFO | GET Bisnis: https://www.liputan6.com/bisnis/indeks/2025/09/24?page=1
07:55:23 | INFO | Bisnis halaman 1: ditemukan 24 artikel
07:55:24 | INFO | GET Bisnis: https://www.liputan6.com/bisnis/indeks/2025/09/24?page=2
07:55:24 | INFO | Bisnis halaman 2: ditemukan 24 artikel
07:55:25 | INFO | GET Bisnis: https://www.liputan6.com/bisnis/indeks/2025/09/24?page=3
07:55:25 | INFO | Bisnis halaman 3: ditemukan 24 artikel


Unnamed: 0,judul_berita,tanggal_berita,penulis_berita,url_berita,relevan,keywords_found
0,Wilayah Terdampak Gempa Banyuwangi 25 Septembe...,2025-09-25T21:00:22+07:00,Tidak Diketahui,https://www.liputan6.com/cek-fakta/read/616889...,False,
1,Simak Jadwal Lengkap Hari Libur Nasional dan C...,2025-09-25T19:00:00+07:00,Tidak Diketahui,https://www.liputan6.com/cek-fakta/read/616840...,False,
2,Pertamina Patra Niaga Ajak Konsumen Pastikan K...,2025-09-25T17:36:59+07:00,Tidak Diketahui,https://www.liputan6.com/cek-fakta/read/616876...,False,
3,Cek Fakta: Hoaks Uang Pecahan Baru Rp 300 Ribu...,2025-09-25T15:00:00+07:00,Tidak Diketahui,https://www.liputan6.com/cek-fakta/read/616830...,False,
4,KPK Tangkap Seorang Pengusaha Terkait Kasus TP...,2025-09-24T23:25:41+07:00,Tidak Diketahui,https://www.liputan6.com/news/read/6168036/kpk...,False,
...,...,...,...,...,...,...
119,Seleksi Ketat CPNS Bisa Lahirkan PNS Anti Koru...,2025-09-24T11:00:10+07:00,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6167113/s...,False,
120,OECD Revisi Pertumbuhan Ekonomi Indonesia Naik...,2025-09-24T10:50:31+07:00,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6167310/o...,False,
121,Saat Prabowo Pamer Produksi Beras Indonesia Ce...,2025-09-24T10:30:10+07:00,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6167170/s...,False,
122,Kapan Gaji PNS Naik? Ini Bocorannya,2025-09-24T10:15:02+07:00,Tidak Diketahui,https://www.liputan6.com/bisnis/read/6167117/k...,False,



==== RINGKASAN ====
Total artikel: 124
Artikel relevan: 5
Contoh 5 judul relevan:
1. Soal Gaya 'Koboi' Menkeu Purbaya, Begini Komentar Luhut Pandjaitan
2. Defisit APBN Agustus Capai Rp 321,6 Triliun, Ini Dampaknya ke Stabilitas Fiskal
3. Transfer Rp 200 Triliun ke Bank Himbara Ala Menkeu Purbaya Jadi Terobosan Penting
4. Hotman Paris Protes Bunga Deposito Turun, Ini Jawaban Menohok Menkeu Purbaya
5. Jurus Rahasia Menkeu Purbaya Tekan Utang Indonesia


In [7]:
# Keep only the relevant articles; the `relevan` flag itself is not exported.
df_out = df.loc[
    df['relevan'].eq(True),
    ['judul_berita', 'tanggal_berita', 'penulis_berita', 'url_berita', 'keywords_found'],
]

# Create the output folder first: to_excel does not create missing directories.
output_dir = os.path.join(cwd, "daftar_berita")
os.makedirs(output_dir, exist_ok=True)
df_out.to_excel(os.path.join(output_dir, "liputan6.xlsx"), index=False)