# SETTING ENVIRONMENT


In [25]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [26]:
# Set the working folder (current working directory) for the whole notebook.
import os

# Default project location on the original author's machine. Set the
# MONITORING_BERITA_DIR environment variable to run the notebook from another
# checkout without editing this cell.
cwd = os.environ.get(
    "MONITORING_BERITA_DIR",
    "/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita",
)
# cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

# MAIN CODE

## 1

In [27]:
# # Scraping Artikel Berita Multisumber (Fleksibel)
# - Membaca list URL dari Excel
# - Ekstraksi teks artikel bersih (tanpa iklan/link terkait)
# - Error handling kuat, retry, timeout
# - Site-specific extractor + fallback readability/trafilatura + heuristik generik
# - Output dataframe: url_berita, artikel_berita, source_domain, status, error

# %%
!pip -q install beautifulsoup4 lxml html5lib requests-html readability-lxml trafilatura tqdm pandas openpyxl regex > /dev/null

# --- Sel 1 (tambahkan paket) ---
!pip -q install cloudscraper httpx > /dev/null


# %%
import re
import regex as reg
import sys
import time
import json
import logging
import warnings
from dataclasses import dataclass
from typing import Optional, List, Tuple, Callable, Dict

import pandas as pd
import requests
from bs4 import BeautifulSoup, NavigableString, Tag
from urllib.parse import urlparse, urljoin
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# Optional imports (fallback parsers)
try:
    from readability import Document
    HAS_READABILITY = True
except Exception:
    HAS_READABILITY = False

try:
    import trafilatura
    HAS_TRAFILATURA = True
except Exception:
    HAS_TRAFILATURA = False

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger("news-scraper")


## 2

In [28]:
# %%
# ======== PARAMETER YANG MUDAH DIUBAH ========

# config path
from pathlib import Path

# Colab alternative:
# CONFIG_PATH = Path("/content/drive/MyDrive/Monitoring Berita/config.json")

# config.json is read from the working directory chosen at the top of the notebook.
CONFIG_PATH = Path(cwd) / "config.json"

cfg = json.loads(CONFIG_PATH.read_text(encoding="utf-8"))

# INPUT_EXCEL_PATH = cfg["labelled_data_xlsx"]
# Path of the CSV to read and the column in it that holds the article URLs.
INPUT_URL_COLUMN = "url_berita"
INPUT_CSV_PATH = cfg["last_konsolidasi_path"]

# Cap on the number of URLs processed during a test run (None processes all of them).
MAX_URLS: Optional[int] = None

# Timeout and retry settings (applied per attempt).
REQUEST_TIMEOUT = 8  # reduced so one attempt does not use up the whole 10-second budget
MAX_RETRIES = 2
RETRY_SLEEP = 1.0

# Total time budget for extracting a single URL (seconds).
PER_ARTICLE_TIMEOUT = 10

# Custom User-Agent so requests are not blocked too easily.
DEFAULT_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    )
}

# Regex patterns for boilerplate to strip from article text (ads, clickbait,
# related-article links). Each pattern removes from the trigger phrase to the
# end of that line.
#
# "Video:" and "Editor:" must not be followed by \b: ':' and the space after it
# are both non-word characters, so a trailing \b only matched "Editor:Name"
# (no space) and never the usual "Editor: Name".
#
# NOTE(review): patterns are applied case-insensitively by remove_cleanup_patterns,
# so r"\bIklan\b.*" also truncates ordinary sentences that contain the word
# "iklan" — confirm that this is intended.
CLEANUP_PATTERNS = [
    r"\bBaca juga\b.*", r"\bSimak juga\b.*", r"\bADVERTISEMENT\b.*",
    r"\bIklan\b.*", r"\bIklan Layanan\b.*", r"\bInfografis\b.*",
    r"\bTonton juga\b.*", r"\bVideo:.*", r"\b[Gg]rafis\b.*",
    r"\b[Gg]allery\b.*", r"\bArtikel ini telah\b.*", r"\bEditor:.*",
    r"^\s*—\s*$", r"^\s*-\s*$", r"\b[ \t]*[•\-\*]\s*$"
]

# CSS selectors marking non-content elements; clean_soup() removes every match.
# NOTE(review): the [class*=...] / [id*=...] forms are substring matches, so for
# example "[class*=ads]" also matches a class named "heads" and "[class*=tag]"
# matches "stage" — confirm this breadth is intended.
JUNK_SELECTORS = [
    "[class*=iklan]", "[class*=ads]", "[id*=ads]", "[id*=banner]", "[class*=related]",
    "[class*=tag]", "[class*=breadcrumb]", "script", "style", "noscript", "iframe",
    "[class*=share]", "[class*=social]", "[class*=recommend]", "[class*=promo]",
    "[class*=copyright]", "[class*=author]", "[class*=metadata]"
]


## 3

In [29]:
# %%
# Shared HTTP session preloaded with DEFAULT_HEADERS; used by fetch() below.
# NOTE(review): a later cell rebinds `session` to the result of make_session().
session = requests.Session()
session.headers.update(DEFAULT_HEADERS)

def fetch(url: str, timeout: int = REQUEST_TIMEOUT, max_retries: int = MAX_RETRIES) -> Optional[requests.Response]:
    """
    Fetch ``url`` through the shared session with a simple retry loop.

    NOTE(review): a later cell defines another ``fetch`` with a ``deadline``
    parameter, which replaces this one once that cell has run.

    Args:
        url: Address to request.
        timeout: Timeout in seconds passed to each request.
        max_retries: Maximum number of attempts.

    Returns:
        The first response with a 2xx status, or None when every attempt
        failed (non-2xx status or an exception).
    """
    last_err = None
    for attempt in range(1, max_retries + 1):
        try:
            resp = session.get(url, timeout=timeout, allow_redirects=True)
            if 200 <= resp.status_code < 300:
                return resp
            last_err = RuntimeError(f"HTTP {resp.status_code}")
            logger.warning(f"Attempt {attempt}: {url} -> {resp.status_code}")
        except Exception as e:
            last_err = e
            logger.warning(f"Attempt {attempt} failed for {url}: {e}")
        # Back off linearly between attempts; sleeping after the final attempt
        # would only delay the failure report.
        if attempt < max_retries:
            time.sleep(RETRY_SLEEP * attempt)
    logger.error(f"Failed to fetch {url}: {last_err}")
    return None

def normalize_whitespace(text: str) -> str:
    """Collapse runs of spaces/tabs to one space and 3+ newlines to a blank line, then strip."""
    text = reg.sub(r"[ \t]+", " ", text)
    text = reg.sub(r"\n{3,}", "\n\n", text)
    return text.strip()

def remove_cleanup_patterns(text: str, patterns: List[str]) -> str:
    """
    Remove every match of ``patterns`` from ``text`` and tidy what is left.

    Patterns are applied case-insensitively and in multi-line mode, so ``^`` and
    ``$`` anchor at line boundaries. Without multi-line mode a pattern such as
    ``^\\s*-\\s*$`` could only match when the whole text was a single dash.

    Args:
        text: Article text, one paragraph per line.
        patterns: Regular expressions (``re`` syntax) to delete.

    Returns:
        The text with matches removed, blank lines dropped, and whitespace
        normalized.
    """
    out = text
    for pat in patterns:
        out = re.sub(pat, "", out, flags=re.IGNORECASE | re.MULTILINE)
    # Drop the empty lines left behind by removed matches.
    out = "\n".join(ln.strip() for ln in out.splitlines() if ln.strip())
    return normalize_whitespace(out)


In [30]:
# --- Sel 3 (ganti sesi & fetch) ---
import random
import httpx
import cloudscraper
from urllib.parse import urlparse
import time as _time

# Browser-like request headers (plus a few network-hint headers).
BROWSER_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}

# Extra headers merged in by fetch() for any domain containing "detik.com".
# NOTE(review): "Authority" is fixed to www.detik.com even for other detik
# subdomains, and advertising "br" assumes a Brotli decoder is installed for
# the HTTP client — confirm both.
DETIK_EXTRA_HEADERS = {
    "Authority": "www.detik.com",
    "Accept-Encoding": "gzip, deflate, br",
}

# User-Agent strings; fetch() picks one at random for every attempt.
UA_POOL = [
    BROWSER_HEADERS["User-Agent"],
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64; rv:130.0) Gecko/20100101 Firefox/130.0",
]


def make_session():
    """
    Build the HTTP session used by the fetch helpers.

    Prefers a cloudscraper session (per the original author's note it usually
    gets past 403 responses on sites with anti-bot protection) and falls back
    to a plain ``requests.Session`` when cloudscraper cannot be set up. Both
    variants carry BROWSER_HEADERS.

    Returns:
        A session object exposing ``get`` and ``headers``.
    """
    try:
        s = cloudscraper.create_scraper(
            browser={"browser": "chrome", "platform": "windows", "mobile": False}
        )
        s.headers.update(BROWSER_HEADERS)
        return s
    except Exception as exc:
        # Make the downgrade visible instead of silently losing the anti-bot layer.
        logger.warning(f"cloudscraper session unavailable, using requests.Session: {exc}")
        s = requests.Session()
        s.headers.update(BROWSER_HEADERS)
        return s


session = make_session()


def jitter_sleep(base=REQUEST_TIMEOUT * 0.05):
    """Sleep for ``base`` seconds plus a random extra of up to half a second."""
    extra = random.random() * 0.5
    _time.sleep(base + extra)


def as_amp(url: str) -> str:
    """
    Build the AMP variant of an article URL by appending ``/amp`` to its path.

    The query string and fragment are dropped and trailing slashes removed
    first. A URL whose path already ends in ``/amp`` is returned without a
    second suffix, so the function is idempotent.

    Args:
        url: Article URL.

    Returns:
        The URL with ``/amp`` as its final path segment.
    """
    # Strip the fragment before the query so "a#x?y" and "a?y#x" both reduce to "a".
    base = url.split("#", 1)[0].split("?", 1)[0].rstrip("/")
    if base.endswith("/amp"):
        return base
    return base + "/amp"


def _attempt_requests(url, headers, timeout):
    """Issue one GET through the shared ``session``, following redirects."""
    request_options = {
        "headers": headers,
        "timeout": timeout,
        "allow_redirects": True,
    }
    return session.get(url, **request_options)


def _attempt_httpx(url, headers, timeout):
    """
    Issue one GET with httpx, preferring HTTP/2, as an alternative client.

    ``http2=True`` needs the optional ``h2`` package; when it is missing httpx
    raises ImportError, which previously disabled this fallback without any
    trace. The request is now retried over HTTP/1.1 in that case.

    Args:
        url: Address to request.
        headers: Request headers.
        timeout: Timeout in seconds for the client.

    Returns:
        The httpx response, or None when the request failed.
    """
    def _get(use_http2):
        with httpx.Client(follow_redirects=True, timeout=timeout, headers=headers, http2=use_http2) as client:
            return client.get(url)

    try:
        try:
            return _get(True)
        except ImportError:
            # HTTP/2 support is not installed; plain HTTP/1.1 still works.
            return _get(False)
    except Exception as exc:
        logger.debug(f"httpx attempt failed for {url}: {exc}")
        return None


def fetch(
    url: str,
    timeout: int = REQUEST_TIMEOUT,
    max_retries: int = MAX_RETRIES,
    deadline: Optional[float] = None,
) -> Optional[requests.Response]:
    """
    Fetch ``url`` with retries and browser-like headers, honoring an absolute deadline.

    Per attempt: a random User-Agent from UA_POOL and a Google referer are set.
    For detik.com domains a 403/503 triggers one extra try through httpx; for
    tribunnews.com domains a 401/403 triggers one try against the AMP URL.

    Args:
        url: Address to request.
        timeout: Upper bound in seconds for a single request.
        max_retries: Maximum number of attempts.
        deadline: Absolute epoch time in seconds; no new attempt starts after
            it. A falsy value (None or 0) disables the deadline.

    Returns:
        A response-like object with ``text``, ``status_code``, ``url`` and
        ``headers`` on a 2xx status, otherwise None.
    """
    class SimpleResp:
        """Minimal stand-in exposing the response attributes callers read."""

        def __init__(self, r):
            self.text = r.text
            self.status_code = r.status_code
            self.url = str(r.url)
            self.headers = dict(r.headers)

    last_err = None
    parsed = urlparse(url)
    domain = parsed.netloc.lower()
    is_tribun = "tribunnews.com" in domain
    is_detik = "detik.com" in domain

    for attempt in range(1, max_retries + 1):
        if deadline and _time.time() > deadline:
            last_err = TimeoutError("global deadline exceeded before attempt")
            break
        try:
            headers = BROWSER_HEADERS.copy()
            headers["User-Agent"] = random.choice(UA_POOL)
            headers["Referer"] = "https://www.google.com/"
            if is_detik:
                headers.update(DETIK_EXTRA_HEADERS)

            # Never wait longer than the time left, but keep a 2-second floor.
            if deadline:
                per_attempt_timeout = min(timeout, max(2, int(deadline - _time.time())))
            else:
                per_attempt_timeout = timeout
            resp = _attempt_requests(url, headers=headers, timeout=per_attempt_timeout)
            if resp is not None and 200 <= resp.status_code < 300:
                return resp

            status = resp.status_code if resp is not None else "NO_RESP"

            # detik.com: retry once through httpx when blocked or unavailable.
            if is_detik and (resp is None or status in (403, 503)) and (not deadline or _time.time() < deadline):
                per_attempt_timeout = min(per_attempt_timeout, 5)
                resp_hx = _attempt_httpx(url, headers=headers, timeout=per_attempt_timeout)
                if resp_hx is not None and 200 <= resp_hx.status_code < 300:
                    return SimpleResp(resp_hx)  # type: ignore

            # tribunnews.com: try the AMP page when the regular page is refused.
            if is_tribun and resp is not None and resp.status_code in (401, 403) and (not deadline or _time.time() < deadline):
                amp_url = as_amp(url)
                try:
                    per_attempt_timeout = min(per_attempt_timeout, 5)
                    resp2 = _attempt_requests(amp_url, headers=headers, timeout=per_attempt_timeout)
                    if resp2 is not None and 200 <= resp2.status_code < 300:
                        # Report the original URL to the caller.
                        resp2.url = url
                        return resp2
                    else:
                        # Compare against None explicitly: a requests Response is
                        # falsy for 4xx/5xx, which used to hide the real status.
                        last_err = RuntimeError(f"AMP HTTP {resp2.status_code if resp2 is not None else 'none'}")
                except Exception as e2:
                    last_err = e2
            else:
                last_err = RuntimeError(f"HTTP {status}")
        except Exception as e:
            last_err = e
            logger.warning(f"Attempt {attempt} failed for {url}: {e}")

        # Pause before the next attempt only if one is left and time allows.
        if attempt < max_retries and (not deadline or _time.time() + 0.5 < deadline):
            jitter_sleep(0.4 + attempt * 0.2)

    if last_err:
        logger.error(f"Failed to fetch {url}: {last_err}")
    return None

## 4

In [31]:
# %%
# CSS selectors for likely article containers. generic_extract() tries them in
# this order and returns the first node whose joined text exceeds 300 characters.
# NOTE(review): the broad "[class*=content]" comes before the more specific
# selectors below it, so those are reached only when no broad match qualifies.
CANDIDATE_CONTENT_SELECTORS = [
    "article",
    "[class*=content]",
    "[class*=article]",
    "[class*=read__content]",
    "[class*=detail__body]",
    "[class*=entry-content]",
    "[id*=content]", "[id*=article]",
    "[itemprop='articleBody']",
]

def clean_soup(soup: BeautifulSoup) -> None:
    """Remove, in place, every element of ``soup`` matching one of JUNK_SELECTORS."""
    for selector in JUNK_SELECTORS:
        junk_nodes = soup.select(selector)
        for junk in junk_nodes:
            junk.decompose()

def join_block_text(container: Tag) -> str:
    """
    Join the text of the block elements inside ``container``, one per line.

    Collects ``<p>``, ``<h2>``, ``<h3>``, ``<h4>`` and ``<li>`` descendants.
    A block nested inside another collected block (for example ``<li><p>``)
    is skipped, because the outer element's text already contains it; before,
    such text was emitted twice.

    Args:
        container: Element (or whole document) to read from.

    Returns:
        Newline-joined text of the outermost block elements whose text is
        longer than two characters; an empty string when there are none.
    """
    block_tags = {"p", "h2", "h3", "h4", "li"}
    parts: List[str] = []
    for el in container.descendants:
        # Bare strings are ignored; their text is taken via the enclosing block.
        if not isinstance(el, Tag) or el.name not in block_tags:
            continue
        # Look for a block-level ancestor between this element and the container.
        nested = False
        for ancestor in el.parents:
            if ancestor is container:
                break
            if ancestor.name in block_tags:
                nested = True
                break
        if nested:
            continue
        txt = el.get_text(separator=" ", strip=True)
        if txt and len(txt) > 2:
            parts.append(txt)
    return "\n".join(parts)

def generic_extract(html: str) -> str:
    """
    Heuristic article extraction for pages without a dedicated extractor.

    Parses ``html``, removes junk elements, then returns the first of:
    the first candidate container with more than 300 characters of text,
    the longest text among the first 50 article/div/section elements,
    or the text of the whole body.
    """
    soup = BeautifulSoup(html, "lxml")
    clean_soup(soup)

    # Pass 1: known content containers, first sufficiently long one wins.
    for selector in CANDIDATE_CONTENT_SELECTORS:
        for node in soup.select(selector):
            block = join_block_text(node)
            if block and len(block) > 300:  # minimum length for an article body
                return block

    # Pass 2: longest text among a limited set of generic containers.
    generic_nodes = soup.find_all(["article", "div", "section"], limit=50)
    blocks = [block for block in map(join_block_text, generic_nodes) if block]
    if blocks:
        # max() keeps the earliest block on ties, like a stable descending sort.
        return max(blocks, key=len)

    # Pass 3: whole document (p, h2, li, ...).
    return join_block_text(soup.body or soup)


## 5

In [32]:
# %%
# Improved detik.com extraction + existing other extractors

def extract_detik(soup: BeautifulSoup) -> Optional[str]:
    """Site-specific extractor for detik.com with several fallback layers.

    Strategy, in order:
    1. Remove ad/junk elements (clean_soup).
    2. Try a list of specific body selectors.
    3. Merge the text of broad candidate containers.
    4. Parse the ``__NEXT_DATA__`` script, if present, and collect
       content-like fields.
    5. Join every sufficiently long ``<p>`` in the document.

    Args:
        soup: Parsed page; it is modified in place by the junk removal.

    Returns:
        Article text longer than 200 characters, or None when no layer
        produced enough text.
    """
    clean_soup(soup)

    candidate_selectors = [
        "article.detail__article",
        "div.detail__body-text.itp_bodycontent",
        "div.detail__body-text",
        "div.itp_bodycontent",
        "div.detail__body",  # used on some pages
        "[class*=detail__body]",
        "[class*=itp_bodycontent]",
        "div#detikdetailtext",
    ]

    for sel in candidate_selectors:
        for node in soup.select(sel):
            txt = join_block_text(node)
            if txt and len(txt) > 200:
                return txt

    # Fallback: gather paragraphs from several large candidate containers.
    containers = soup.select("article, div#detikdetailtext, div.detail__body-text, div.itp_bodycontent")
    gathered: List[str] = []
    for c in containers:
        t = join_block_text(c)
        if t and len(t) > 40:
            gathered.append(t)
    if gathered:
        merged = "\n".join(gathered)
        if len(merged) > 200:
            return merged

    # Fallback: parse __NEXT_DATA__ when the page embeds it.
    next_data = soup.find("script", id="__NEXT_DATA__")
    if next_data and next_data.string:
        try:
            data = json.loads(next_data.string)

            # Recursively yield values stored under body-like keys.
            def walk(o):
                if isinstance(o, dict):
                    for k, v in o.items():
                        kl = k.lower()
                        if kl in {"content", "body", "articlebody", "article_body"} and isinstance(v, (str, list)):
                            yield v
                        else:
                            yield from walk(v)
                elif isinstance(o, list):
                    for it in o:
                        yield from walk(it)

            texts = []
            for val in walk(data):
                if isinstance(val, str) and len(val) > 100:
                    # Strip basic HTML tags if the field contains markup.
                    cleaned = BeautifulSoup(val, "lxml").get_text(" ", strip=True)
                    texts.append(cleaned)
                elif isinstance(val, list):
                    joined = " ".join([str(x) for x in val])
                    if len(joined) > 100:
                        texts.append(joined)
            if texts:
                # Drop repeated fragments while keeping first-seen order.
                uniq = list(dict.fromkeys(texts))
                merged = "\n".join(uniq)
                if len(merged) > 200:
                    return merged
        except Exception as exc:
            # Best effort: malformed embedded JSON must not stop the last fallback.
            logger.debug(f"__NEXT_DATA__ parsing failed: {exc}")

    # Last fallback: every long paragraph in the document.
    paragraphs = [p.get_text(" ", strip=True) for p in soup.find_all("p")]
    paragraphs = [p for p in paragraphs if len(p) > 40 and not p.lower().startswith("advertorial")]  # skip advertorials
    if paragraphs:
        merged = "\n".join(paragraphs)
        if len(merged) > 200:
            return merged
    return None

def extract_cnbcindo(soup: BeautifulSoup) -> Optional[str]:
    """Extractor for cnbcindonesia.com: first container with more than 200 characters of text, else None."""
    clean_soup(soup)
    selectors = ("article", "div.detail_text", "[class*=detail__body]")
    # Lazy generator: selectors and nodes are visited in order, stopping at the first hit.
    texts = (
        join_block_text(node)
        for selector in selectors
        for node in soup.select(selector)
    )
    return next((text for text in texts if text and len(text) > 200), None)

def extract_kompas(soup: BeautifulSoup) -> Optional[str]:
    """Extractor for kompas.com: first container with more than 200 characters of text, else None."""
    clean_soup(soup)
    selectors = ("div.read__content", "article", "[class*=read__content]")
    # Lazy generator: selectors and nodes are visited in order, stopping at the first hit.
    texts = (
        join_block_text(node)
        for selector in selectors
        for node in soup.select(selector)
    )
    return next((text for text in texts if text and len(text) > 200), None)

def extract_liputan6(soup: BeautifulSoup) -> Optional[str]:
    """
    Extractor for liputan6.com.

    For each selector in turn, the text of all matching nodes is merged; the
    first merged text longer than 200 characters is returned, else None.
    """
    clean_soup(soup)
    selectors = (
        "div.article-content-body__item-content",  # the individual paragraphs
        "div.article-content-body",
        "article",
    )
    for selector in selectors:
        matched = soup.select(selector)
        if not matched:
            continue
        # Combine the paragraphs spread over several item-content nodes.
        pieces = [piece for piece in map(join_block_text, matched) if piece]
        combined = "\n".join(pieces)
        if combined and len(combined) > 200:
            return combined
    return None

def extract_tribun(soup: BeautifulSoup) -> Optional[str]:
    """
    Extractor for tribunnews.com (first version).

    NOTE(review): a later cell defines ``extract_tribun`` again and registers
    that version in DOMAIN_EXTRACTORS.

    Returns the first container text longer than 200 characters, then falls
    back to joining every ``<p>`` that is a direct child of a ``<div>``;
    None when neither yields enough text.
    """
    clean_soup(soup)
    for sel in [
        # Both classes on the same div; the previous "div.side-article txt-article"
        # looked for a <txt-article> element and could never match.
        "div.side-article.txt-article",
        "div.txt-article",
        "div#articlebody",
        "div#article"
    ]:
        nodes = soup.select(sel)
        for n in nodes:
            txt = join_block_text(n)
            if txt and len(txt) > 200:
                return txt
    # Otherwise: paragraphs directly under any div.
    nodes = soup.select("div > p")
    if nodes:
        txt = "\n".join([n.get_text(" ", strip=True) for n in nodes])
        if len(txt) > 200:
            return txt
    return None

def extract_merdeka(soup: BeautifulSoup) -> Optional[str]:
    """Extractor for merdeka.com: first container with more than 200 characters of text, else None."""
    clean_soup(soup)
    selectors = ("div.article-content", "div.kanal-content", "article")
    # Lazy generator: selectors and nodes are visited in order, stopping at the first hit.
    texts = (
        join_block_text(node)
        for selector in selectors
        for node in soup.select(selector)
    )
    return next((text for text in texts if text and len(text) > 200), None)

def extract_antaranews(soup: BeautifulSoup) -> Optional[str]:
    """Extractor for antaranews.com: first container with more than 200 characters of text, else None."""
    clean_soup(soup)
    selectors = ("div.post-content", "div#content", "article")
    # Lazy generator: selectors and nodes are visited in order, stopping at the first hit.
    texts = (
        join_block_text(node)
        for selector in selectors
        for node in soup.select(selector)
    )
    return next((text for text in texts if text and len(text) > 200), None)

# Maps a domain fragment to its site-specific extractor. extract_with_layers()
# tests each key with `key in domain`, so subdomains of these sites match too.
DOMAIN_EXTRACTORS: Dict[str, Callable[[BeautifulSoup], Optional[str]]] = {
    "detik.com": extract_detik,
    "cnbcindonesia.com": extract_cnbcindo,
    "kompas.com": extract_kompas,
    "liputan6.com": extract_liputan6,
    "tribunnews.com": extract_tribun,
    "merdeka.com": extract_merdeka,
    "antaranews.com": extract_antaranews,
}


In [33]:
def extract_tribun(soup: BeautifulSoup) -> Optional[str]:
    """
    Extractor for tribunnews.com desktop pages (old and new layouts).

    Returns the first container text longer than 200 characters; otherwise
    the joined text of every ``<p>`` under an article or div if that exceeds
    200 characters; otherwise None.
    """
    clean_soup(soup)
    container_selectors = (
        "div.txt-article", "div#article",
        "div#articlebody", "div.side-article.txt-article",
        "article", "[class*=read__content]", "[class*=detail__body]",
    )
    container_texts = (
        join_block_text(node)
        for selector in container_selectors
        for node in soup.select(selector)
    )
    hit = next((text for text in container_texts if text and len(text) > 200), None)
    if hit is not None:
        return hit
    # Paragraph fallback.
    paragraphs = soup.select("article p, div p")
    if paragraphs:
        joined = "\n".join(para.get_text(" ", strip=True) for para in paragraphs)
        if len(joined) > 200:
            return joined
    return None

def extract_tribun_amp(soup: BeautifulSoup) -> Optional[str]:
    """
    Extractor for tribunnews.com AMP pages (usually cleaner, per the original note).

    Returns the first container text longer than 200 characters; otherwise
    the joined text of every ``<p>`` under an article or div if that exceeds
    200 characters; otherwise None.
    """
    clean_soup(soup)
    container_selectors = (
        "div.read__content", "div.read__content--body",
        "article", "[itemprop='articleBody']",
    )
    container_texts = (
        join_block_text(node)
        for selector in container_selectors
        for node in soup.select(selector)
    )
    hit = next((text for text in container_texts if text and len(text) > 200), None)
    if hit is not None:
        return hit
    # Paragraph fallback.
    paragraphs = soup.select("article p, div p")
    if paragraphs:
        joined = "\n".join(para.get_text(' ', strip=True) for para in paragraphs)
        if len(joined) > 200:
            return joined
    return None


In [34]:
# Point the tribunnews.com entry at the extract_tribun defined in the cell above,
# replacing the function object stored when the registry was first built.
DOMAIN_EXTRACTORS.update({
    "tribunnews.com": extract_tribun,  # desktop
})


## 6

In [35]:
# %%
@dataclass
class ExtractResult:
    """Outcome of one article extraction attempt, as built by extract_with_layers."""
    url: str  # URL that was requested
    domain: str  # lower-cased network location parsed from the URL
    text: Optional[str]  # cleaned article text, or None when nothing usable was extracted
    status: str  # outcome label, e.g. "ok_site_specific", "fetch_failed" or "timeout_*"
    error: Optional[str]  # error detail when one is available, otherwise None

def extract_with_layers(url: str, per_article_timeout: int = PER_ARTICLE_TIMEOUT) -> ExtractResult:
    """
    Extract the article text of ``url`` through a stack of extraction layers.

    Layers, tried in order until one yields enough text:
    site-specific extractor, Tribun AMP, trafilatura, readability, generic
    heuristic. The whole process shares one deadline of
    ``per_article_timeout`` seconds; a layer that finishes after the deadline
    reports a ``timeout_*`` status and its text is discarded.

    The trafilatura layer first works on the HTML that was already fetched and
    downloads the page again only if that yields nothing. Before, it always
    downloaded a second time, and that download does not honor the deadline.

    Args:
        url: Article URL.
        per_article_timeout: Total time budget in seconds for this URL.

    Returns:
        An ExtractResult whose ``status`` names the layer that succeeded
        (``ok_*``) or the reason for failure.
    """
    start = time.time()
    deadline = start + per_article_timeout

    parsed = urlparse(url)
    domain = parsed.netloc.lower()

    def remaining() -> float:
        """Seconds left before the deadline, never negative."""
        return max(0, deadline - time.time())

    def trafilatura_attempt(markup) -> Optional[ExtractResult]:
        """Run trafilatura on ``markup``; return a final result or None to keep trying."""
        t_text = trafilatura.extract(
            markup,
            include_comments=False,
            include_tables=False,
            favor_recall=True,
        )
        if time.time() > deadline:
            return ExtractResult(url, domain, None, "timeout_trafilatura", None)
        if t_text and len(t_text) > 150:
            t_text = remove_cleanup_patterns(t_text, CLEANUP_PATTERNS)
            return ExtractResult(url, domain, t_text, "ok_trafilatura", None)
        return None

    # 1) Fetch the HTML (honors the deadline).
    resp = fetch(url, timeout=REQUEST_TIMEOUT, max_retries=MAX_RETRIES, deadline=deadline)
    if resp is None:
        if time.time() > deadline:
            return ExtractResult(url, domain, None, "timeout_fetch", "Exceeded per-article timeout during fetch")
        return ExtractResult(url, domain, None, "fetch_failed", "Request failed")

    html = resp.text

    # 2) Site-specific extractor.
    try:
        if time.time() > deadline:
            return ExtractResult(url, domain, None, "timeout_before_site_specific", None)
        for known in DOMAIN_EXTRACTORS:
            if known in domain:
                soup = BeautifulSoup(html, "lxml")
                text = DOMAIN_EXTRACTORS[known](soup)
                if time.time() > deadline:
                    return ExtractResult(url, domain, None, "timeout_site_specific", None)
                if text and len(text) > 150:
                    text = remove_cleanup_patterns(text, CLEANUP_PATTERNS)
                    return ExtractResult(url, domain, text, "ok_site_specific", None)
    except Exception as e:
        logger.warning(f"Site-specific extractor error for {url}: {e}")

    # 3) Tribun AMP special case.
    if "tribunnews.com" in domain and time.time() < deadline:
        try:
            amp_html = None
            if "/amp" in resp.url or "amp" in (resp.headers.get("content-location", "") or "").lower():
                amp_html = html
            else:
                rem = remaining()
                if rem > 1.5:  # still worth another request
                    amp_resp = fetch(as_amp(url), timeout=min(REQUEST_TIMEOUT, int(rem)), max_retries=1, deadline=deadline)
                    if amp_resp is not None and 200 <= amp_resp.status_code < 300:
                        amp_html = amp_resp.text
            if amp_html and time.time() < deadline:
                soup_amp = BeautifulSoup(amp_html, "lxml")
                text_amp = extract_tribun_amp(soup_amp)
                if time.time() > deadline:
                    return ExtractResult(url, domain, None, "timeout_tribun_amp", None)
                if text_amp and len(text_amp) > 150:
                    text_amp = remove_cleanup_patterns(text_amp, CLEANUP_PATTERNS)
                    return ExtractResult(url, domain, text_amp, "ok_tribun_amp", None)
        except Exception as e:
            logger.warning(f"Tribun AMP extractor error for {url}: {e}")

    # 4) Trafilatura.
    if HAS_TRAFILATURA and time.time() < deadline:
        try:
            if remaining() > 1.5:
                # Cheap path: reuse the HTML from step 1, no network involved.
                outcome = trafilatura_attempt(html)
                rem = remaining()
                if outcome is None and rem > 1.5:
                    # Second chance: let trafilatura download the page itself.
                    downloaded = None
                    try:
                        import inspect
                        if "timeout" in inspect.signature(trafilatura.fetch_url).parameters:
                            downloaded = trafilatura.fetch_url(url, timeout=min(REQUEST_TIMEOUT, int(rem)))
                        else:
                            downloaded = trafilatura.fetch_url(url)
                    except Exception:
                        downloaded = trafilatura.fetch_url(url)
                    if downloaded and time.time() < deadline:
                        outcome = trafilatura_attempt(downloaded)
                if outcome is not None:
                    return outcome
        except Exception as e:
            logger.warning(f"Trafilatura extractor error for {url}: {e}")

    # 5) Readability.
    if HAS_READABILITY and time.time() < deadline:
        try:
            rem = remaining()
            if rem > 1.0:
                doc = Document(html)
                readable_html = doc.summary(html_partial=True)
                soup = BeautifulSoup(readable_html, "lxml")
                clean_soup(soup)
                r_text = join_block_text(soup)
                if time.time() > deadline:
                    return ExtractResult(url, domain, None, "timeout_readability", None)
                if r_text and len(r_text) > 150:
                    r_text = remove_cleanup_patterns(r_text, CLEANUP_PATTERNS)
                    return ExtractResult(url, domain, r_text, "ok_readability", None)
        except Exception as e:
            logger.warning(f"Readability extractor error for {url}: {e}")

    # 6) Generic heuristic.
    if time.time() < deadline:
        try:
            g_text = generic_extract(html)
            if time.time() > deadline:
                return ExtractResult(url, domain, None, "timeout_generic", None)
            g_text = remove_cleanup_patterns(g_text, CLEANUP_PATTERNS)
            if g_text and len(g_text) > 100:
                return ExtractResult(url, domain, g_text, "ok_generic", None)
            else:
                return ExtractResult(url, domain, None, "empty_after_generic", None)
        except Exception as e:
            if time.time() > deadline:
                return ExtractResult(url, domain, None, "timeout_generic_exception", str(e))
            return ExtractResult(url, domain, None, "generic_failed", str(e))

    # Every layer ran out of time.
    return ExtractResult(url, domain, None, "timeout_no_content", None)

## 7

In [36]:
# %%
# === Read the list of URLs from the input CSV ===
try:
    df_in = pd.read_csv(INPUT_CSV_PATH)
except Exception as e:
    # The input is a CSV file; keep the original exception chained for debugging.
    raise RuntimeError(f"Gagal membaca CSV input: {INPUT_CSV_PATH} -> {e}") from e

if INPUT_URL_COLUMN not in df_in.columns:
    raise ValueError(f"Kolom '{INPUT_URL_COLUMN}' tidak ditemukan di file CSV.")

# Unique, stripped, non-empty URLs in first-seen order.
urls = df_in[INPUT_URL_COLUMN].dropna().astype(str).str.strip().unique().tolist()
urls = [u for u in urls if u]
if MAX_URLS is not None:
    urls = urls[:MAX_URLS]

logger.info(f"Total URL untuk diproses: {len(urls)} (limit={MAX_URLS})")

# Scrape every URL; a failure on one URL is recorded and never stops the run.
results: List[ExtractResult] = []
for url in tqdm(urls, desc="Scraping artikel"):
    try:
        res = extract_with_layers(url, per_article_timeout=PER_ARTICLE_TIMEOUT)
        results.append(res)
    except Exception as e:
        # urlparse itself may be what failed (malformed URL), so guard the
        # second call instead of letting it abort the whole loop.
        try:
            failed_domain = urlparse(url).netloc.lower()
        except ValueError:
            failed_domain = ""
        results.append(ExtractResult(url, failed_domain, None, "fatal_error", str(e)))

# === Build the dataframe of scraping results ===
# Explicit columns keep the merge key present even when nothing was scraped.
df_scraped = pd.DataFrame(
    [{
        INPUT_URL_COLUMN: r.url,
        "source_domain": r.domain,
        "artikel_berita": r.text if r.text else "",
        "status": r.status,
        "error": r.error if r.error else "",
    } for r in results],
    columns=[INPUT_URL_COLUMN, "source_domain", "artikel_berita", "status", "error"],
)

# === Merge with df_in, adding the new columns ===
# The scraped URLs were stripped of surrounding whitespace, so the join key in
# df_in is normalized the same way; otherwise such rows would not match.
url_key = df_in[INPUT_URL_COLUMN]
df_in_keyed = df_in.assign(
    **{INPUT_URL_COLUMN: url_key.astype(str).str.strip().where(url_key.notna())}
)
df_output = df_in_keyed.merge(df_scraped, on=INPUT_URL_COLUMN, how="left")

# === Bersihkan artikel_berita ===
def final_cleanup(text: str) -> str:
    """Normalize whitespace and drop lines of two characters or fewer; non-strings become ""."""
    if not isinstance(text, str):
        return ""
    normalized = normalize_whitespace(text)
    kept_lines = filter(lambda line: len(line.strip()) > 2, normalized.splitlines())
    return "\n".join(kept_lines).strip()

df_output["artikel_berita"] = df_output["artikel_berita"].apply(final_cleanup)
df_output["is_empty"] = df_output["artikel_berita"].str.len().fillna(0).lt(60)

# Rows without a scrape result have a missing status; na=False keeps the flag
# strictly boolean so it can be used as a mask below.
df_output["elapsed_flag"] = df_output["status"].str.startswith("timeout", na=False)

# === Summarize the results ===
summary = df_output["status"].value_counts(dropna=False)
logger.info(f"\nSummary status:\n{summary}")

# Keep a few timed-out rows at hand for quick inspection.
timeout_preview = df_output[df_output["elapsed_flag"]].head(5)
logger.info(f"Timeout sample rows: {len(timeout_preview)}")

df_output.head(5)

2025-10-01 20:41:56,845 | INFO | Total URL untuk diproses: 46 (limit=None)


Scraping artikel:   0%|          | 0/46 [00:00<?, ?it/s]

2025-10-01 20:42:02,580 | ERROR | not a 200 response: 502 for URL https://kumparan.com/kumparanbisnis/purbaya-diminta-gempur-rokok-ilegal-bea-cukai-kejar-sampai-ke-e-commerce-25wTqv6LRis
2025-10-01 20:42:07,329 | ERROR | not a 200 response: 502 for URL https://kumparan.com/kumparanbisnis/ketua-komisi-xi-dpr-minta-purbaya-gempur-praktik-rokok-ilegal-25wOoSLlaNg
2025-10-01 20:42:11,431 | ERROR | not a 200 response: 403 for URL https://kaltimpost.jawapos.com/nasional/2386640196/purbaya-tegas-larang-rokok-ilegal-mulai-1-oktober-2025-warung-hingga-tokopedia-wajib-patuh
2025-10-01 20:42:11,666 | ERROR | not a 200 response: 502 for URL https://kumparan.com/kumparanbisnis/purbaya-pastikan-sidak-rokok-impor-ilegal-tak-ganggu-operasional-25vaWNIlzKQ
2025-10-01 20:42:55,904 | ERROR | not a 200 response: 404 for URL https://rri.co.id/info-pemda/1865499/pedagang-diedukasi-rokok-ilegal-diamankan
2025-10-01 20:43:03,577 | INFO | 
Summary status:
status
ok_trafilatura         34
ok_site_specific      

Unnamed: 0,judul_berita,url_berita,tanggal_berita,source_domain,artikel_berita,status,error,is_empty,elapsed_flag
0,"Purbaya Bakal Sikat Rokok Ilegal, Seberapa Par...",https://ekbis.sindonews.com/read/1626051/34/pu...,2025-09-28T16:00:00+07:00,ekbis.sindonews.com,"Purbaya Bakal Sikat Rokok Ilegal, Seberapa Par...",ok_trafilatura,,False,False
1,Kiai Jatim Dukung Menkeu Purbaya Berantas Roko...,https://surabaya.kompas.com/read/2025/10/01/11...,2025-10-01T11:13:00+07:00,surabaya.kompas.com,"TUBAN, KOMPAS.com - Pengasuh Pondok Pesantren ...",ok_site_specific,,False,False
2,Rencana Menkeu Purbaya Buat Para Penjual Rokok...,https://pasardana.id/news/2025/9/29/rencana-me...,2025-09-29T00:19:00+07:00,pasardana.id,Rencana Menkeu Purbaya Buat Para Penjual Rokok...,ok_trafilatura,,False,False
3,Penindakan Rokok Ilegal Bakal Semakin Diperket...,https://www.pajak.com/pajak/penindakan-rokok-i...,2025-09-29T14:00:01+07:00,www.pajak.com,Penindakan Rokok Ilegal Bakal Semakin Diperket...,ok_trafilatura,,False,False
4,"Lawan Produk Ilegal, Menkeu Purbaya Tak Naikka...",https://www.metrotvnews.com/play/KYVC4EaR-lawa...,2025-09-30T20:35:45+07:00,www.metrotvnews.com,30 September 2025 20:35\nMenteri Keuangan (Men...,ok_trafilatura,,False,False


## 8

In [37]:
# %%
# Save as CSV for downstream integration.
DATE_TAG = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")

OUTPUT_CSV = f"{cwd}/hasil_baca_berita/hasil_scraping_artikel_{DATE_TAG}.csv"
# OUTPUT_XLSX = f"{cwd}/hasil_baca_berita/hasil_scraping_artikel_{DATE_TAG}.xlsx"

try:
    # Create the output folder on first use; to_csv does not create it.
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)
    df_output.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    # df_output.to_excel(OUTPUT_XLSX, index=False)
    logger.info(f"Tersimpan: {OUTPUT_CSV}")
except Exception as e:
    logger.error(f"Gagal menyimpan output: {e}")

df_output.tail(3)


2025-10-01 20:43:03,600 | INFO | Tersimpan: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita/hasil_baca_berita/hasil_scraping_artikel_20251001_204303.csv


Unnamed: 0,judul_berita,url_berita,tanggal_berita,source_domain,artikel_berita,status,error,is_empty,elapsed_flag
43,Ulama Jatim Dukung Menkeu Berantas Rokok Ilega...,https://www.detik.com/jatim/berita/d-8137673/u...,2025-09-30T15:10:55+07:00,www.detik.com,"Pengasuh Pondok Pesantren Langitan Tuban, KH M...",ok_site_specific,,False,False
44,Bea Cukai Labuan Bajo gencarkan penindakan rok...,https://kupang.antaranews.com/berita/170337/be...,2025-09-30T19:12:10+07:00,kupang.antaranews.com,"Labuan Bajo (ANTARA) - Bea Cukai Labuan Bajo, ...",ok_site_specific,,False,False
45,Pemkab Pasuruan Stop Peredaran Rokok Ilegal - ...,https://kabarbaik.co/pemkab-pasuruan-stop-pere...,2025-09-29T14:22:22+07:00,kabarbaik.co,Cek Berita dan Artikel kabarbaik.co yang lain ...,ok_trafilatura,,False,False


In [38]:
# %%
import json
from pathlib import Path


def update_config(path: Path, new_values: dict):
    """
    Update selected keys of the JSON config at ``path``, keeping all other keys.

    If the existing file cannot be read as a JSON object, its bytes are copied
    to ``<name>.bak`` before a fresh config holding only ``new_values`` is
    written, so the old content stays recoverable. The new content is
    serialized before the file is opened for writing, so a value that cannot
    be serialized no longer truncates the existing config.

    Args:
        path: Location of the config file.
        new_values: Keys and values to set.

    Returns:
        None. Failures are logged rather than raised.
    """
    data = {}
    if path.exists():
        try:
            with open(path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
            if not isinstance(loaded, dict):
                raise ValueError("config root is not a JSON object")
            data = loaded
        except Exception as e:
            logger.warning(f"Gagal membaca config lama: {e}")
            data = {}
            # Preserve the unreadable file before it gets overwritten.
            try:
                backup = path.with_name(path.name + ".bak")
                backup.write_bytes(path.read_bytes())
            except Exception as backup_err:
                logger.warning(f"Gagal membuat backup config lama: {backup_err}")

    # Only the given keys are changed.
    data.update(new_values)

    try:
        payload = json.dumps(data, indent=4, ensure_ascii=False)
        with open(path, "w", encoding="utf-8") as f:
            f.write(payload)
        logger.info(f"Berhasil update config.json di {path}")
    except Exception as e:
        logger.error(f"Gagal menyimpan config.json: {e}")

# Record the path of the CSV written above in the config under a descriptive key.
update_config(CONFIG_PATH, {
    "last_baca_berita_path": OUTPUT_CSV,
})


2025-10-01 20:43:03,611 | INFO | Berhasil update config.json di /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita/config.json
