# SETTING ENVIRONMENT


In [5]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [6]:
# set folder tempat kerja (current working directory)
import os
# cwd = '/content/drive/MyDrive/Monitoring Berita'
cwd = '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita'
os.chdir(cwd)

# MAIN

In [7]:
# CNN Indonesia - Scraper (Tag-pages first, Search via Selenium fallback)
# - Reads keywords from config.json['keywords']
# - For each keyword: try TAG mode (recommended). If you insist on search, set FORCE_SEARCH=True.
# - Saves Excel to .../daftar_berita/cnn (auto-detected project root)

# %%
import os, re, json, time, random, logging
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Tuple, Optional
import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- Auto-detect project root containing config.json ---
def locate_project_root(target_file: str = "config.json", max_up: int = 4) -> Optional[str]:
    # Priority 1: Colab path
    colab_root = "/content/drive/MyDrive/Monitoring Berita"
    if os.path.exists(os.path.join(colab_root, target_file)):
        return colab_root
    # Priority 2: current working directory or its parents
    cwd = os.getcwd()
    cur = cwd
    for _ in range(max_up):
        if os.path.exists(os.path.join(cur, target_file)):
            return cur
        parent = os.path.dirname(cur)
        if parent == cur:
            break
        cur = parent
    return None

PROJECT_ROOT = locate_project_root() or os.getcwd()
CONFIG_PATH = os.path.join(PROJECT_ROOT, "config.json")
OUTPUT_DIR  = os.path.join(PROJECT_ROOT, "daftar_berita")

MAX_PAGES   = 3      # batas iterasi halaman
FORCE_SEARCH = False # True = pakai search ?query=... via Selenium fallback

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger("cnn-fix")

UA_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.6 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Linux; Android 13; SM-S908E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Mobile Safari/537.36",
]

def pick_ua(): return random.choice(UA_POOL)

def sess():
    s = requests.Session()
    s.headers.update({
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "id,en-US;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "DNT": "1",
        "Upgrade-Insecure-Requests": "1",
    })
    return s

def http_get(s: requests.Session, url: str, tries=3, backoff=1.5, timeout=25):
    last = None
    for i in range(1, tries+1):
        try:
            s.headers["User-Agent"] = pick_ua()
            r = s.get(url, timeout=timeout)
            if r.status_code == 200 and len(r.text) > 1000:
                return r
            last = f"HTTP {r.status_code}, len={len(r.text)}"
        except Exception as e:
            last = str(e)
        time.sleep(backoff ** i)
    logger.warning(f"GET fail {url} -> {last}")
    return None

# ---------- Helpers ----------
def load_keywords(path: str) -> List[str]:
    try:
        with open(path, "r", encoding="utf-8") as f:
            cfg = json.load(f)
        ks = cfg.get("keywords") or []
        ks = [str(k).strip() for k in ks if str(k).strip()]
        if ks: return ks
    except Exception as e:
        logger.warning(f"load_keywords: {e}")
    return ["menkeu"]

def slugify_for_tag(term: str) -> str:
    # cnn tag uses lowercase, spaces -> hyphen, strip non-word except hyphen
    t = term.strip().lower()
    t = re.sub(r"\s+", "-", t)
    t = re.sub(r"[^a-z0-9\-]+", "", t)
    return t

# ---------- Published date extraction ----------
def extract_published_at_from_html(html: str) -> Optional[pd.Timestamp]:
    """
    Try multiple strategies to get article publish datetime:
    1) JSON-LD (NewsArticle) datePublished
    2) <meta property="article:published_time" content="...">
    3) <meta itemprop="datePublished" content="..."> or <time datetime="...">
    Returns timezone-aware Asia/Jakarta timestamp when possible.
    """
    soup = BeautifulSoup(html, "lxml")

    # 1) JSON-LD
    for sc in soup.find_all("script", attrs={"type": "application/ld+json"}):
        txt = sc.string or sc.get_text(strip=True) or ""
        if not txt: continue
        try:
            data = json.loads(txt)
        except Exception:
            continue

        def find_date(obj):
            if isinstance(obj, dict):
                at = obj.get("@type")
                if at in ("NewsArticle", "Article", "BlogPosting"):
                    dp = obj.get("datePublished") or obj.get("dateCreated")
                    if dp:
                        return dp
                # search nested
                for v in obj.values():
                    found = find_date(v)
                    if found:
                        return found
            elif isinstance(obj, list):
                for v in obj:
                    found = find_date(v)
                    if found:
                        return found
            return None

        ds = find_date(data)
        if ds:
            ts = _to_wib_timestamp(ds)
            if ts is not None:
                return ts

    # 2) Meta tags
    meta_candidates = [
        ("meta", {"property": "article:published_time"}, "content"),
        ("meta", {"name": "pubdate"}, "content"),
        ("meta", {"itemprop": "datePublished"}, "content"),
        ("time", {"itemprop": "datePublished"}, "datetime"),
        ("time", {}, "datetime"),
    ]
    for tag, attrs, attrname in meta_candidates:
        el = soup.find(tag, attrs=attrs)
        if el and el.get(attrname):
            ts = _to_wib_timestamp(el.get(attrname).strip())
            if ts is not None:
                return ts

    # 3) Fallback: common date containers (best-effort)
    for sel in [".media__date", ".date", ".article__date", "time"]:
        for e in soup.select(sel):
            raw = (e.get("datetime") or e.get_text(" ", strip=True) or "").strip()
            if not raw:
                continue
            # handle WIB hint
            if "WIB" in raw.upper():
                raw = raw.replace("WIB", "+07:00")
            ts = _to_wib_timestamp(raw)
            if ts is not None:
                return ts

    return None

def _to_wib_timestamp(val: str) -> Optional[pd.Timestamp]:
    try:
        ts = pd.to_datetime(val, utc=True, errors="raise")
        # If parsed with timezone, convert to Asia/Jakarta
        return ts.tz_convert("Asia/Jakarta")
    except Exception:
        # Try non-UTC parse
        ts2 = pd.to_datetime(val, errors="coerce", dayfirst=True)
        if ts2 is pd.NaT:
            return None
        # Assume WIB if no tz info
        try:
            return ts2.tz_localize("Asia/Jakarta")
        except Exception:
            return None

def fetch_published_at(s: requests.Session, url: str) -> Optional[pd.Timestamp]:
    r = http_get(s, url, tries=3)
    if not r:
        return None
    return extract_published_at_from_html(r.text)

def parse_tag_page(html: str) -> List[Dict]:
    soup = BeautifulSoup(html, "lxml")
    items = []
    # Grab all anchors that look like article links beneath the tag listing.
    # Tag pages list cards; keep it heuristic but strict on domain and path depth
    for a in soup.select("a[href]"):
        href = a.get("href", "").strip()
        if not href: continue
        # absolute-ize
        if href.startswith("//"): href = "https:" + href
        elif href.startswith("/"): href = urljoin("https://www.cnnindonesia.com", href)
        # Filter to article-like paths, ruling out pure tag/section/search
        p = urlparse(href).path
        if "/tag/" in p or "/search" in p or p == "/" or p.count("/") < 2:
            continue
        title = a.get("title") or a.get_text(" ", strip=True)
        if title and len(title) > 5:
            items.append({"judul_berita": title, "url_berita": href})
    # dedup by url
    seen, uniq = set(), []
    for it in items:
        if it["url_berita"] in seen: continue
        seen.add(it["url_berita"])
        uniq.append(it)
    return uniq

def crawl_tag(term: str, max_pages=3) -> pd.DataFrame:
    s = sess()
    slug = slugify_for_tag(term)
    rows = []
    for page in range(1, max_pages+1):
        url = f"https://www.cnnindonesia.com/tag/{slug}"
        # some tag pages support ?page=2, some infinite-scroll; we try param
        if page > 1:
            url = f"{url}?page={page}"
        r = http_get(s, url)
        if not r:
            logger.warning(f"[tag:{term}] no response page {page}")
            continue
        items = parse_tag_page(r.text)
        for it in items:
            it["keyword"] = term
            it["page"] = page
            # Fetch published_at from article page (instead of scrape time)
            pub = fetch_published_at(s, it["url_berita"]) or pd.NaT
            it["published_at"] = pub
            # Keep actual scrape timestamp separately
            it["scraped_at"] = pd.Timestamp.now(tz="Asia/Jakarta")
            rows.append(it)
        logger.info(f"[tag:{term}] page {page} -> {len(items)} items")
        time.sleep(random.uniform(0.8, 1.6))
    cols = ["judul_berita","url_berita","keyword","page","published_at","scraped_at"]
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols)
    if not df.empty:
        df = df.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
    return df

# ---------- Optional Search Fallback (Selenium) ----------
def crawl_search_selenium(term: str, max_pages=3) -> pd.DataFrame:
    """
    Only use if you truly need the search ?query=… endpoint.
    Uses undetected-chromedriver to load pages like a real browser.
    """
    try:
        import undetected_chromedriver as uc
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC
    except Exception:
        # Install on the fly (Colab-friendly)
        import sys, subprocess
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "undetected-chromedriver", "selenium"])
        import undetected_chromedriver as uc
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.support import expected_conditions as EC

    opts = uc.ChromeOptions()
    opts.add_argument("--headless=new")
    opts.add_argument("--disable-gpu")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--window-size=1366,768")
    opts.add_argument(f"--user-agent={pick_ua()}")

    driver = uc.Chrome(options=opts)
    rows = []
    try:
        for page in range(1, max_pages+1):
            url = f"https://www.cnnindonesia.com/search?query={term}&result_type=latest&page={page}"
            driver.get(url)
            # wait minimal content
            WebDriverWait(driver, 15).until(lambda d: len(d.page_source) > 20000)
            html = driver.page_source
            items = parse_tag_page(html)  # reuse same robust anchor collector
            # We'll fetch published_at with requests session for each item
            s = sess()
            for it in items:
                it["keyword"] = term
                it["page"] = page
                pub = fetch_published_at(s, it["url_berita"]) or pd.NaT
                it["published_at"] = pub
                it["scraped_at"] = pd.Timestamp.now(tz="Asia/Jakarta")
                rows.append(it)
            logger.info(f"[search:{term}] page {page} -> {len(items)} items")
            time.sleep(random.uniform(1.0, 2.0))
    finally:
        driver.quit()
    cols = ["judul_berita","url_berita","keyword","page","published_at","scraped_at"]
    df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=cols)
    if not df.empty:
        df = df.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
    return df

# ---------- Run ----------
def save_excel(df: pd.DataFrame, out_dir: str) -> str:
    os.makedirs(out_dir, exist_ok=True)
    fpath = os.path.join(out_dir, f"cnn_links.csv")
    df.to_csv(fpath, index=False)
    return fpath

def main():
    keywords = load_keywords(CONFIG_PATH)
    logger.info(f"Project root: {PROJECT_ROOT}")
    logger.info(f"Keywords: {keywords}")
    dfs = []
    for kw in keywords:
        if FORCE_SEARCH:
            logger.info(f"Using SEARCH mode for: {kw}")
            df = crawl_search_selenium(kw, MAX_PAGES)
        else:
            logger.info(f"Using TAG mode for: {kw}")
            df = crawl_tag(kw, MAX_PAGES)
        dfs.append(df)
    default_cols = ["judul_berita","url_berita","keyword","page","published_at","scraped_at"]
    df_all = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame(columns=default_cols)
    df_all = df_all.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
    logger.info(f"Total unique links: {len(df_all)}")
    out = save_excel(df_all, OUTPUT_DIR)
    logger.info(f"Saved: {out}")
    try:
        display(df_all.head(10))
    except Exception:
        pass
    return out

if __name__ == "__main__":
    path = main()
    print("Output Excel:", path)


2025-09-26 07:45:15,071 | INFO | Project root: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita
2025-09-26 07:45:15,071 | INFO | Keywords: ['purbaya', 'menkeu', 'banggar', 'apbn']
2025-09-26 07:45:15,072 | INFO | Using TAG mode for: purbaya
2025-09-26 07:45:15,071 | INFO | Keywords: ['purbaya', 'menkeu', 'banggar', 'apbn']
2025-09-26 07:45:15,072 | INFO | Using TAG mode for: purbaya
2025-09-26 07:45:19,885 | INFO | [tag:purbaya] page 1 -> 36 items
2025-09-26 07:45:19,885 | INFO | [tag:purbaya] page 1 -> 36 items
2025-09-26 07:45:24,642 | INFO | [tag:purbaya] page 2 -> 34 items
2025-09-26 07:45:24,642 | INFO | [tag:purbaya] page 2 -> 34 items
2025-09-26 07:45:29,090 | INFO | [tag:purbaya] page 3 -> 28 items
2025-09-26 07:45:29,090 | INFO | [tag:purbaya] page 3 -> 28 items
2025-09-26 07:45:30,677 | INFO | Using TAG mode for: menkeu
2025-09-26 07:45:30,677 | INFO | Using TAG mode for: menkeu
2025-09-26 07:45:34,558 | INFO | [tag:menkeu

Unnamed: 0,judul_berita,url_berita,keyword,page,published_at,scraped_at
0,Keluar,https://connect.detik.com/oauth/signout?redire...,purbaya,1,NaT,2025-09-26 07:45:15.469785+07:00
1,DAFTAR,https://connect.detik.com/accounts/register?cl...,purbaya,1,NaT,2025-09-26 07:45:15.509931+07:00
2,Politik,https://www.cnnindonesia.com/nasional/politik,purbaya,1,NaT,2025-09-26 07:45:15.656043+07:00
3,Hukum & Kriminal,https://www.cnnindonesia.com/nasional/hukum-kr...,purbaya,1,NaT,2025-09-26 07:45:15.745989+07:00
4,Peristiwa,https://www.cnnindonesia.com/nasional/peristiwa,purbaya,1,NaT,2025-09-26 07:45:15.910123+07:00
5,Pemilu,https://www.cnnindonesia.com/nasional/pemilu,purbaya,1,NaT,2025-09-26 07:45:16.041971+07:00
6,Info Politik,https://www.cnnindonesia.com/nasional/info-pol...,purbaya,1,NaT,2025-09-26 07:45:16.176766+07:00
7,Asia Pasifik,https://www.cnnindonesia.com/internasional/asi...,purbaya,1,NaT,2025-09-26 07:45:16.280720+07:00
8,Timur Tengah,https://www.cnnindonesia.com/internasional/tim...,purbaya,1,NaT,2025-09-26 07:45:16.348669+07:00
9,Eropa Amerika,https://www.cnnindonesia.com/internasional/ero...,purbaya,1,NaT,2025-09-26 07:45:16.480125+07:00


Output Excel: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/daftar_berita/cnn_links.csv


In [8]:
df = pd.read_csv(path)
# Prefer published_at (from article page). If missing, fallback to scraped_at.
df["tanggal_berita"] = pd.to_datetime(df.get("published_at", pd.NaT), errors="coerce").fillna(
    pd.to_datetime(df.get("scraped_at", pd.NaT), errors="coerce")
)

# Ensure timezone-naive for Excel (Excel cannot handle tz-aware datetimes)
if pd.api.types.is_datetime64_any_dtype(df["tanggal_berita"]):
    # If any tz-aware, strip tz info after converting to Asia/Jakarta
    try:
        # Convert any timezone-aware values to Asia/Jakarta then drop tz
        s = pd.to_datetime(df["tanggal_berita"], errors="coerce")
        # If some are tz-aware, convert; otherwise this is a no-op
        if getattr(s.dt.tz, "zone", None) is not None or s.dt.tz is not None:
            s = s.dt.tz_convert("Asia/Jakarta")
        # Drop tz info
        df["tanggal_berita"] = s.dt.tz_localize(None)
    except Exception:
        # Fallback: force naive by localizing-none where possible
        try:
            df["tanggal_berita"] = pd.to_datetime(df["tanggal_berita"], errors="coerce").dt.tz_localize(None)
        except Exception:
            pass

# Keep columns tidy
cols_order = [
    c for c in ["judul_berita","url_berita","keyword","page","tanggal_berita","published_at","scraped_at"]
    if c in df.columns
]
cols_order += [c for c in df.columns if c not in cols_order]
df = df[cols_order]

df.to_csv(path, index=False)
df.to_excel(path.replace(".csv", ".xlsx"), index=False)
