# SETTING ENVIRONMENT


In [1]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [None]:
# set folder tempat kerja (current working directory)
import os
cwd = '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita'
# cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

# MAIN CODE

In [3]:
# === Sel 1: Import & Logging Config ===
import logging
import time
import re
from datetime import datetime
from typing import List, Optional, Tuple
from urllib.parse import urljoin

import requests
from requests.exceptions import RequestException, HTTPError, Timeout
from bs4 import BeautifulSoup
import pandas as pd

# Konfigurasi logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)

# Konstanta umum
DEFAULT_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/127.0.0.0 Safari/537.36"
)
REQUEST_TIMEOUT = 10  # detik
MAX_RETRIES = 3
BACKOFF_SCHEDULE = [1, 2, 4]  # detik
BASE_DOMAIN = "https://index.okezone.com/"


In [4]:
# === Sel 2: Parameter Mudah Diubah (baca dari config.json) ===
import json
from pathlib import Path

CONFIG_PATH = Path("/content/drive/MyDrive/Monitoring Berita/config.json")

DEFAULT_TOPIC_KEYWORDS = ["ekonomi","bursa","saham","investasi","keuangan","bank","rupiah","inflasi","bisnis"]
DEFAULT_DATES = [datetime.now().strftime("%Y-%m-%d")]
DEFAULT_MAX_PAGES = 5  # bisa diubah

def load_parameters_from_config(config_path: Path):
    topic_keywords = DEFAULT_TOPIC_KEYWORDS
    dates = DEFAULT_DATES
    max_pages_per_date = DEFAULT_MAX_PAGES

    if config_path.exists():
        try:
            with open(config_path, "r", encoding="utf-8") as f:
                cfg = json.load(f)

            if isinstance(cfg.get("keywords"), list) and cfg["keywords"]:
                topic_keywords = [str(k).strip() for k in cfg["keywords"] if str(k).strip()]

            if isinstance(cfg.get("search_data"), list) and cfg["search_data"]:
                dates_tmp = []
                for d in cfg["search_data"]:
                    ds = str(d).strip()
                    try:
                        datetime.strptime(ds, "%Y-%m-%d")
                        dates_tmp.append(ds)
                    except ValueError:
                        logging.warning(f"Ignored invalid date in config.search_data: {ds}")
                if dates_tmp:
                    dates = dates_tmp

            if cfg.get("max_page_length") is not None:
                try:
                    m = int(cfg["max_page_length"])
                    if m > 0:
                        max_pages_per_date = m
                except (TypeError, ValueError):
                    logging.warning("Invalid max_page_length; using default.")
        except Exception as e:
            logging.error(f"Gagal membaca config.json: {e}. Menggunakan default.")
    else:
        logging.warning(f"Config tidak ditemukan di {config_path}. Menggunakan default.")

    # Safety cap
    max_pages_per_date = min(max_pages_per_date, 50)
    return topic_keywords, dates, max_pages_per_date

topic_keywords, dates, max_pages_per_date = load_parameters_from_config(CONFIG_PATH)

logging.info(f"topic_keywords: {topic_keywords}")
logging.info(f"dates: {dates}")
logging.info(f"max_pages_per_date: {max_pages_per_date}")


2025-09-26 07:55:44,517 | INFO | topic_keywords: ['ekonomi', 'bursa', 'saham', 'investasi', 'keuangan', 'bank', 'rupiah', 'inflasi', 'bisnis']
2025-09-26 07:55:44,518 | INFO | dates: ['2025-09-26']
2025-09-26 07:55:44,518 | INFO | max_pages_per_date: 5


In [5]:
# === Sel 3: Kelas Scraper (OkezoneIndexScraper) ===
class OkezoneIndexScraper:
    """
    Halaman pertama per tanggal:
      https://index.okezone.com/bydate/index/YYYY/MM/DD/0/
    Halaman selanjutnya (pagination):
      https://index.okezone.com/home/index/{offset}/
    offset = (page-1) * 15
    """

    def __init__(self, user_agent: str = DEFAULT_USER_AGENT):
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": user_agent,
            "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7"
        })
        self.base = BASE_DOMAIN

    def _build_index_urls(self, date_str: str, page: int) -> List[str]:
        """
        Page 1: bydate/index/YYYY/MM/DD/0/
        Page 2+: home/index/{offset}/
        """
        dt = datetime.strptime(date_str, "%Y-%m-%d")
        year, month, day = dt.strftime("%Y"), dt.strftime("%m"), dt.strftime("%d")
        offset = (page - 1) * 15

        if page == 1:
            return [f"https://index.okezone.com/bydate/index/{year}/{month}/{day}/0/"]
        else:
            return [f"https://index.okezone.com/home/index/{offset}/"]

    def _get_with_retry(self, url: str) -> Optional[requests.Response]:
        for attempt in range(MAX_RETRIES):
            try:
                resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
                resp.raise_for_status()
                return resp
            except Timeout:
                logging.warning(f"Timeout {url} (try {attempt+1}/{MAX_RETRIES})")
            except HTTPError as he:
                status = getattr(he.response, "status_code", None)
                if status and 500 <= status < 600 and attempt < MAX_RETRIES - 1:
                    logging.warning(f"HTTP {status} {url} (retry {attempt+1}/{MAX_RETRIES})")
                else:
                    logging.error(f"HTTP error {status} pada {url}. Stop URL ini.")
                    return None
            except RequestException as e:
                logging.error(f"RequestException {url}: {e}")
                return None
            if attempt < len(BACKOFF_SCHEDULE):
                time.sleep(BACKOFF_SCHEDULE[attempt])
        return None

    @staticmethod
    def _select_article_nodes(soup: BeautifulSoup) -> List:
        """
        Struktur Okezone indeks:
        <ul class="list-berita"> <li class="idx-list"> ... <h4><a>Judul</a></h4> <time>...</time> ...
        """
        nodes = soup.select("ul.list-berita li.idx-list")
        if nodes:
            return nodes

        # Fallbacks
        for css in [
            "ul.list-berita li",
            "div.news-content li.idx-list",
            "article",
            "li",
        ]:
            cand = soup.select(css)
            if cand:
                return cand
        return []

    @staticmethod
    def _normalize_text(s: str) -> str:
        s = s.lower().strip()
        s = re.sub(r"[^\w\s-]", " ", s, flags=re.UNICODE)
        s = re.sub(r"\s+", " ", s)
        return s

    @staticmethod
    def _find_keywords(title: str, topic_keywords: List[str]) -> Tuple[bool, str]:
        norm_title = OkezoneIndexScraper._normalize_text(title)
        found = []
        for kw in topic_keywords:
            kw_norm = OkezoneIndexScraper._normalize_text(str(kw))
            if kw_norm and re.search(rf"\b{re.escape(kw_norm)}\b", norm_title):
                found.append(kw)
        return (len(found) > 0, ", ".join(found))

    def _extract_article_data(self, node, fallback_date_iso: str, topic_keywords: List[str]) -> Optional[dict]:
        try:
            a = node.find("a", href=True) or node.select_one("a[href]")
            if not a:
                return None

            url = a.get("href", "").strip()
            if not url:
                return None
            if not url.startswith("http"):
                url = urljoin(self.base, url)

            # Judul
            title = ""
            h4a = node.select_one("h4 a")
            if h4a:
                title = h4a.get_text(" ", strip=True)
            if not title:
                title = a.get_text(" ", strip=True)
            if not title:
                return None

            # Tanggal di listing (jika ada)
            tanggal = None
            time_tag = node.find("time")
            if time_tag:
                tanggal = (time_tag.get("datetime") or
                           time_tag.get_text(" ", strip=True))

            if not tanggal:
                tanggal = fallback_date_iso

            # Penulis jarang ada di listing
            penulis = "Tidak Diketahui"

            relevan, keywords_found = self._find_keywords(title, topic_keywords)

            return {
                "judul_berita": title,
                "tanggal_berita": tanggal,
                "penulis_berita": penulis,
                "url_berita": url,
                "relevan": bool(relevan),
                "keywords_found": keywords_found,
            }
        except Exception as e:
            logging.warning(f"Gagal parsing artikel: {e}")
            return None

    def _parse_list_page(self, html: str) -> List:
        soup = BeautifulSoup(html, "html.parser")
        return self._select_article_nodes(soup)

    def scrape_date(self, date_str: str, max_pages: int, topic_keywords: List[str]) -> pd.DataFrame:
        logging.info(f"Memproses tanggal: {date_str}")
        all_rows = []

        for page in range(1, max_pages + 1):
            url = self._build_index_urls(date_str, page)[0]
            resp = self._get_with_retry(url)
            if not resp or not resp.text:
                logging.warning(f"Gagal memuat / tidak ada konten: {url}. Stop tanggal ini.")
                break

            nodes = self._parse_list_page(resp.text)
            logging.info(f"[{date_str}] URL: {url} | Artikel ditemukan: {len(nodes)}")

            if not nodes:
                logging.info(f"Halaman kosong untuk {date_str} page {page}. Stop iterasi tanggal.")
                break

            ok_count = 0
            for n in nodes:
                row = self._extract_article_data(n, fallback_date_iso=date_str, topic_keywords=topic_keywords)
                if row:
                    all_rows.append(row)
                    ok_count += 1

            logging.info(f"Artikel diparsing (page {page}): {ok_count}")
            time.sleep(1)  # pengendalian beban

        if not all_rows:
            logging.warning(f"Tidak ada data pada tanggal {date_str}.")
            return pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])

        df = pd.DataFrame(all_rows, columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])
        return df

    def scrape_many(self, dates: List[str], max_pages: int, topic_keywords: List[str]) -> pd.DataFrame:
        frames = []
        for d in dates:
            try:
                datetime.strptime(d, "%Y-%m-%d")
            except ValueError:
                logging.error(f"Format tanggal tidak valid: {d} (YYYY-MM-DD). Lewati.")
                continue

            df_day = self.scrape_date(d, max_pages=max(1, min(int(max_pages), 100)), topic_keywords=topic_keywords)
            if not df_day.empty:
                frames.append(df_day)

        if frames:
            df_all = pd.concat(frames, ignore_index=True)
            df_all = df_all.drop_duplicates(subset=["url_berita"]).reset_index(drop=True)
        else:
            df_all = pd.DataFrame(columns=["judul_berita","tanggal_berita","penulis_berita","url_berita","relevan","keywords_found"])
        return df_all


In [6]:
# === Sel 4: Eksekusi & Ringkasan ===
scraper = OkezoneIndexScraper()
df_all = scraper.scrape_many(dates=dates, max_pages=max_pages_per_date, topic_keywords=topic_keywords)

display(df_all)

total_artikel = len(df_all)
total_relevan = int(df_all["relevan"].sum()) if not df_all.empty else 0

print("\n--- Ringkasan ---")
print(f"Total artikel: {total_artikel}")
print(f"Artikel relevan: {total_relevan}")

if total_relevan > 0:
    contoh = df_all[df_all["relevan"]].head(5)[["judul_berita","keywords_found","url_berita"]]
    print("\nContoh 5 judul relevan:")
    for _, row in contoh.iterrows():
        print(f"- {row['judul_berita']}  |  keywords: {row['keywords_found']}  |  {row['url_berita']}")
else:
    print("Tidak ada judul relevan untuk ditampilkan.")


2025-09-26 07:55:44,535 | INFO | Memproses tanggal: 2025-09-26
2025-09-26 07:55:45,582 | INFO | [2025-09-26] URL: https://index.okezone.com/bydate/index/2025/09/26/0/ | Artikel ditemukan: 15
2025-09-26 07:55:45,584 | INFO | Artikel diparsing (page 1): 15
2025-09-26 07:55:48,399 | INFO | [2025-09-26] URL: https://index.okezone.com/home/index/15/ | Artikel ditemukan: 15
2025-09-26 07:55:48,401 | INFO | Artikel diparsing (page 2): 15
2025-09-26 07:55:52,116 | INFO | [2025-09-26] URL: https://index.okezone.com/home/index/30/ | Artikel ditemukan: 5
2025-09-26 07:55:52,117 | INFO | Artikel diparsing (page 3): 5
2025-09-26 07:55:54,152 | INFO | [2025-09-26] URL: https://index.okezone.com/home/index/45/ | Artikel ditemukan: 75
2025-09-26 07:55:54,158 | INFO | Artikel diparsing (page 4): 74
2025-09-26 07:55:57,704 | INFO | [2025-09-26] URL: https://index.okezone.com/home/index/60/ | Artikel ditemukan: 75
2025-09-26 07:55:57,710 | INFO | Artikel diparsing (page 5): 74


Unnamed: 0,judul_berita,tanggal_berita,penulis_berita,url_berita,relevan,keywords_found
0,Terungkap! Ini Biang Kerok Macet Horor di Jakarta,"Hot Issue | Jum'at, 26 September 2025 07:47 WI...",Tidak Diketahui,https://economy.okezone.com/read/2025/09/26/32...,False,
1,Calvin Verdonk Catat Statistik Apik saat LOSC ...,"Sepakbola Dunia | Jum'at, 26 September 2025 07...",Tidak Diketahui,https://bola.okezone.com/read/2025/09/26/51/31...,False,
2,"Gunung Lewotobi Laki-Laki Meletus, Waspada Abu...","Nusantara | Jum'at, 26 September 2025 07:41 WI...",Tidak Diketahui,https://news.okezone.com/read/2025/09/26/340/3...,False,
3,"KKB Bantai 5 Warga Sipil di Yahukimo Papua, DP...","Nasional | Jum'at, 26 September 2025 07:32 WIB...",Tidak Diketahui,https://news.okezone.com/read/2025/09/26/337/3...,False,
4,"Soal Korupsi Kuota Haji, Pakar Soroti Kewenang...","Nasional | Jum'at, 26 September 2025 07:27 WIB...",Tidak Diketahui,https://news.okezone.com/read/2025/09/26/337/3...,False,
...,...,...,...,...,...,...
84,Redaksi,2025-09-26,Tidak Diketahui,https://management.okezone.com/redaksi,False,
85,Kotak Pos,2025-09-26,Tidak Diketahui,https://management.okezone.com/pos,False,
86,Karier,2025-09-26,Tidak Diketahui,https://career.okezone.com,False,
87,Info Iklan,2025-09-26,Tidak Diketahui,https://client.okezone.com/advertising,False,



--- Ringkasan ---
Total artikel: 89
Artikel relevan: 1

Contoh 5 judul relevan:
- Polisi Tangkap 2 Admin Situs Judi Online Beromzet Ratusan Juta Rupiah di Kalideres  |  keywords: rupiah  |  https://news.okezone.com/read/2025/09/25/338/3172523/polisi-tangkap-2-admin-situs-judi-online-beromzet-ratusan-juta-rupiah-di-kalideres


In [7]:
# simpan data ke folder tersendiri
df_all.to_excel(cwd + f"/daftar_berita/okezone.xlsx", index=False)