In [1]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Set the working folder (current working directory) to the project folder on
# the mounted Drive. `cwd` is also used later to build the output file path.
import os

cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

In [9]:
import re
import time
import math
import logging
from typing import List, Dict, Optional
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd

# Configure the root logger to emit INFO and above as "LEVEL: message".
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers — confirm that INFO lines actually show up in this runtime.
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Search endpoint; the query and page number are sent as URL parameters.
CNBC_SEARCH_URL = "https://www.cnbcindonesia.com/search"

def _build_session(total_retries: int = 5, backoff: float = 0.5) -> requests.Session:
    """Return a ``requests.Session`` that retries GET requests and sends browser-like headers.

    Args:
        total_retries: Retry budget; the same value is used for the total as
            well as for the connect, read and status retry counters.
        backoff: Value passed to ``Retry`` as ``backoff_factor``.

    Returns:
        A session whose ``http://`` and ``https://`` traffic goes through one
        shared retrying adapter.
    """
    default_headers = {
        # Realistic User-Agent so requests are not blocked outright
        "User-Agent": ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                       "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15"),
        "Accept-Language": "id,en;q=0.9",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Cache-Control": "no-cache",
    }

    # Only GET is retried; with raise_on_status=False the last response is
    # returned to the caller instead of raising once retries are exhausted.
    retry_policy = Retry(
        total=total_retries,
        connect=total_retries,
        read=total_retries,
        status=total_retries,
        backoff_factor=backoff,
        status_forcelist=(429, 500, 502, 503, 504),
        allowed_methods=frozenset({"GET"}),
        raise_on_status=False,
    )
    retrying_adapter = HTTPAdapter(
        max_retries=retry_policy,
        pool_connections=20,
        pool_maxsize=20,
    )

    http = requests.Session()
    for scheme in ("http://", "https://"):
        http.mount(scheme, retrying_adapter)
    http.headers.update(default_headers)
    return http

def _safe_get_text(el) -> str:
    return (el.get_text(strip=True) if el else "").strip()

def _first(soup: BeautifulSoup, selectors: List[str]):
    for sel in selectors:
        found = soup.select_one(sel)
        if found:
            return found
    return None

def _extract_cards(soup: BeautifulSoup) -> List[BeautifulSoup]:
    """
    Kumpulkan node 'kartu' hasil dari berbagai kemungkinan struktur.
    """
    candidates = []
    # Pola umum daftar hasil (ul > li > article / div)
    candidates += soup.select("ul li article")
    candidates += soup.select("ul li div")
    candidates += soup.select("div.list, div.list-content, div.media, article")
    # Dedup by id() pointer
    uniq = []
    seen = set()
    for c in candidates:
        if id(c) not in seen:
            uniq.append(c)
            seen.add(id(c))
    return uniq

def _guess_anchor(card: BeautifulSoup):
    """
    Cari anchor (link) utama ke artikel dari sebuah 'card'.
    """
    # Cari <a> dengan href domain cnbcindonesia & path mengandung /news/ atau kategori lain
    anchors = card.select("a[href]")
    for a in anchors:
        href = a.get("href", "")
        if "cnbcindonesia.com" in href and re.search(r"/(news|market|tech|syariah|lifestyle|investment|entrepreneur|opini)/", href):
            return a
    # fallback: ambil anchor pertama yang mengandung judul
    title_like = card.select_one("a[title], h2 a, h3 a")
    return title_like or (anchors[0] if anchors else None)

def _parse_date_author_from_detail(session: requests.Session, url: str, timeout: int = 12) -> Dict[str, str]:
    """Open an article page and read its publication date and author.

    Args:
        session: Session used to issue the GET request.
        url: Article URL to fetch.
        timeout: Request timeout passed to ``session.get``.

    Returns:
        A dict with keys ``tanggal_berita`` and ``penulis_berita``. A field is
        left as ``""`` when it is not found. Request and parsing failures are
        logged as warnings instead of being raised.
    """
    # Tried in order; the first selector that matches wins.
    date_selectors = [
        'meta[property="article:published_time"]',
        'meta[name="date"]',
        'meta[itemprop="datePublished"]',
        'time[datetime]'
    ]
    author_selectors = [
        'meta[name="author"]',
        'meta[property="article:author"]',
        '[itemprop="author"]',
        '.author, .byline, .reporter'
    ]

    result = {"tanggal_berita": "", "penulis_berita": ""}
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        page = BeautifulSoup(response.text, "lxml")

        date_node = _first(page, date_selectors)
        if date_node:
            # Prefer attribute values; fall back to the element's visible text.
            result["tanggal_berita"] = (
                date_node.get("content")
                or date_node.get("datetime")
                or _safe_get_text(date_node)
            )

        author_node = _first(page, author_selectors)
        if author_node:
            result["penulis_berita"] = (
                author_node.get("content") or _safe_get_text(author_node)
            )
    except requests.RequestException as err:
        logging.warning(f"Gagal ambil detail: {url} -> {err}")
    except Exception as err:
        logging.warning(f"Parse detail error: {url} -> {err}")
    return result

def scrape_cnbc_search(
    query: str = "kemenkeu",
    max_pages: int = 2,
    delay_sec: float = 0.6,
    fetch_detail: bool = True,
    stop_if_no_results: bool = True,
) -> pd.DataFrame:
    """Scrape CNBC Indonesia search results for ``query`` into a DataFrame.

    Args:
        query: Search keyword.
        max_pages: Number of search result pages to scrape.
        delay_sec: Pause between consecutive search-page requests.
        fetch_detail: If True, open the article page whenever the listing did
            not provide both a date and an author.
        stop_if_no_results: Stop early when a search page cannot be fetched or
            yields no articles.

    Returns:
        pandas.DataFrame with columns ``judul_berita``, ``tanggal_berita``,
        ``penulis_berita`` and ``url_berita``, one row per unique article URL
        (the first occurrence wins).
    """
    columns = ["judul_berita", "tanggal_berita", "penulis_berita", "url_berita"]
    rows: List[Dict[str, str]] = []
    # URLs already turned into a row, across all pages. Checked BEFORE the
    # detail fetch: the card selectors overlap (nested nodes resolve to the
    # same link), and each repeat would otherwise cost one more HTTP request.
    collected_urls = set()

    # Context manager so the session's pooled connections are closed on exit.
    with _build_session() as session:
        for page in range(1, max_pages + 1):
            params = {"query": query}
            if page > 1:
                params["page"] = page

            try:
                resp = session.get(CNBC_SEARCH_URL, params=params, timeout=15)
                # Error statuses do not raise here; the body is still parsed.
                if resp.status_code >= 400:
                    logging.warning(f"Halaman {page} -> HTTP {resp.status_code}, mencoba lanjut...")
                html = resp.text
            except requests.RequestException as e:
                logging.error(f"Gagal meminta halaman pencarian (page {page}): {e}")
                if stop_if_no_results:
                    break
                continue

            soup = BeautifulSoup(html, "lxml")

            page_urls = set()   # unique article URLs seen on this page
            duplicates = 0      # repeated URLs within this page
            for card in _extract_cards(soup):
                try:
                    a = _guess_anchor(card)
                    if not a:
                        continue
                    href = a.get("href", "").strip()
                    title = (a.get("title") or _safe_get_text(a)).strip()
                    if not href or not title:
                        continue
                    # Normalize protocol-relative and root-relative URLs to absolute.
                    if href.startswith("//"):
                        href = "https:" + href
                    elif href.startswith("/"):
                        href = "https://www.cnbcindonesia.com" + href

                    if href in page_urls:
                        duplicates += 1
                        continue
                    if href in collected_urls:
                        # Already collected from an earlier page: it still counts
                        # as a result of this page, but is not fetched again.
                        page_urls.add(href)
                        continue

                    # Date/author from the listing itself, when present (fallback values).
                    date_el = _first(card, [".date", ".tanggal", "time", ".publish-date"])
                    author_el = _first(card, [".author", ".byline", ".reporter", "[itemprop='author']"])
                    tanggal = _safe_get_text(date_el)
                    penulis = _safe_get_text(author_el)

                    # Open the detail page only when fetch_detail is enabled AND
                    # the listing lacked the date or the author.
                    if fetch_detail and (not tanggal or not penulis):
                        detail = _parse_date_author_from_detail(session, href)
                        tanggal = detail.get("tanggal_berita") or tanggal
                        penulis = detail.get("penulis_berita") or penulis

                    rows.append({
                        "judul_berita": title,
                        "tanggal_berita": tanggal,
                        "penulis_berita": penulis,
                        "url_berita": href
                    })
                    # Mark as seen only after the row was stored, so a card that
                    # failed midway does not block a later card for the same URL.
                    page_urls.add(href)
                    collected_urls.add(href)
                except Exception as e:
                    # Per-card guard so one broken card does not abort the page.
                    logging.warning(f"Error parsing card di page {page}: {e}")
                    continue

            if duplicates:
                logging.info(f"Page {page}: {duplicates} duplikat dihapus.")
            logging.info(f"Page {page}: {len(page_urls)} artikel ditemukan.")

            # Stop when the page appears to be empty.
            if stop_if_no_results and not page_urls:
                logging.info("Tidak ada hasil di halaman ini. Stop lebih awal.")
                break

            # No need to wait after the last page.
            if page < max_pages:
                time.sleep(delay_sec)

    # Rows are already unique by URL; drop_duplicates is kept as a safety net.
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(subset=["url_berita"])
    return df

In [10]:
if __name__ == "__main__":
    TOPIC = "kemenkeu"
    df = scrape_cnbc_search(query=TOPIC, max_pages=2, delay_sec=0.8, fetch_detail=True)
    date = time.strftime("%Y%m%d")
    # Create the output folder first: to_excel() does not create missing parent
    # directories, and failing here would discard the finished scrape.
    out_dir = os.path.join(cwd, "daftar_berita", "cnbc")
    os.makedirs(out_dir, exist_ok=True)
    df.to_excel(os.path.join(out_dir, f"{TOPIC}_{date}.xlsx"), index=False)
    # Save as CSV if needed:
    # df.to_csv("cnbc_kemenkeu.csv", index=False, encoding="utf-8")