In [16]:
# Guard against running the notebook twice in the same kernel session.
import sys

if not hasattr(sys.modules[__name__], '_already_executed'):
    # First run: leave a marker attribute on the currently executing module.
    setattr(sys.modules[__name__], '_already_executed', True)
else:
    # Marker already present: only warn, nothing else is changed.
    print("Mencegah eksekusi ganda - gunakan restart kernel jika diperlukan")

In [17]:
# Switch the process into the project folder (current working directory) so
# that later relative lookups resolve against it.
import os

# Local (macOS / OneDrive) location of the project.
cwd = (
    '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/'
    'Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita'
)
# Alternative location kept for reference:
# cwd = '/content/drive/MyDrive/Monitoring Berita'
os.chdir(cwd)

In [20]:
# Check and install required libraries only if not already installed
import importlib
import subprocess
import sys

def check_and_install_package(package_name, import_name=None):
    """Make sure a package is importable, installing it with pip if it is not.

    Args:
        package_name: Name passed to ``pip install``.
        import_name: Module name used for the import check. Defaults to
            ``package_name`` when omitted.

    Returns:
        True if the module was already importable or pip finished
        successfully, False if the pip invocation failed.
    """
    if import_name is None:
        import_name = package_name

    try:
        importlib.import_module(import_name)
    except ImportError:
        print(f"⚠ {package_name} belum terinstall, menginstall...")
    else:
        print(f"✓ {package_name} sudah terinstall")
        return True

    try:
        # Use the running interpreter's pip so the package lands in this environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
    except subprocess.CalledProcessError:
        print(f"✗ Gagal menginstall {package_name}")
        return False

    # The finders cache directory listings; without this the freshly installed
    # package may not be importable until the interpreter restarts.
    importlib.invalidate_caches()
    print(f"✓ {package_name} berhasil diinstall")
    return True

# Dependencies of this notebook as (pip package name, importable module name) pairs.
packages_to_check = [
    ("googlenewsdecoder", "googlenewsdecoder"),
    ("tqdm", "tqdm")
]

print("Mengecek dependencies...")
all_installed = True
for package, import_name in packages_to_check:
    # The checker is called first so every package is processed even after an
    # earlier one has failed.
    all_installed = check_and_install_package(package, import_name) and all_installed

print(
    "\n🎉 Semua dependencies siap!"
    if all_installed
    else "\n❌ Ada masalah dengan instalasi dependencies"
)

Mengecek dependencies...
✓ googlenewsdecoder sudah terinstall
✓ tqdm sudah terinstall

🎉 Semua dependencies siap!


In [19]:
"""Tarik daftar berita via Google News RSS.

Menghasilkan DataFrame dengan kolom sama seperti scraper lain:
    query, judul_berita, tanggal_berita, penulis_berita, url_berita

Sumber: Google News RSS (hl=id, gl=ID)
Catatan:
 - Google News tidak selalu menyediakan penulis, hanya sumber (media). Itu kita mapping ke penulis_berita.
 - Tanggal di <pubDate> adalah GMT. Kita konversi ke zona Asia/Jakarta dan format "%Y-%m-%d %H:%M:%S".
 - Kembali ditambahkan filter tanggal: hanya tanggal (YYYY-MM-DD) yang ada di config['search_date'] yang diikutkan jika daftar itu tidak kosong.
 - Pembatas jumlah item per query diterapkan SETELAH filter tanggal (agar slot diisi item relevan tanggal target).

Pemakaian:
    python list_berita_google_news_rss.py  # hasil akan tersimpan ke daftar_berita/google_news_rss.xlsx

Opsi lingkungan (opsional melalui variabel environment):
    GNEWS_TIME_WINDOW_DAYS  (default 7)  -> batas pencarian relatif (when:7d) agar cakupan feed cukup.
"""

from __future__ import annotations

import os
import json
import time
import random
from typing import List, Dict, Optional
import datetime as dt
import zoneinfo
import re
import html
import urllib.parse as urlparse
import xml.etree.ElementTree as ET

import pandas as pd
import requests
from tqdm import tqdm

from googlenewsdecoder import gnewsdecoder

# --------------------------------------------------
# General parameters
# --------------------------------------------------
# Time zone that parsed publication dates are converted to.
JAKARTA_TZ = zoneinfo.ZoneInfo("Asia/Jakarta")
# Pool of browser User-Agent strings; fetch_url picks one at random per attempt.
USER_AGENTS = [
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:125.0) Gecko/20100101 Firefox/125.0",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36",
]
# (min, max) bounds in seconds of the random pause taken after a query is scraped.
REQUEST_DELAY_RANGE = (0.8, 1.6)
# Number of attempts fetch_url makes before giving up.
RETRY_TOTAL = 3
# Search window in days, appended to every query as "when:<N>d". Read from the
# GNEWS_TIME_WINDOW_DAYS environment variable, falling back to 7.
TIME_WINDOW_DAYS = int(os.environ.get("GNEWS_TIME_WINDOW_DAYS", "7"))

# --------------------------------------------------
# Util: HTTP fetch dengan retry sederhana
# --------------------------------------------------

def fetch_url(url: str, timeout: float = 15.0) -> str:
    """Download ``url`` and return the response body as text.

    Up to ``RETRY_TOTAL`` attempts are made, each with a randomly chosen
    User-Agent. Between attempts the function waits ``0.5 * attempt`` seconds.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        RuntimeError: If every attempt failed (exception raised or HTTP
            status >= 400). The last underlying error is attached as the cause.
    """
    last_err: Optional[Exception] = None
    for attempt in range(1, RETRY_TOTAL + 1):
        try:
            headers = {"User-Agent": random.choice(USER_AGENTS)}
            r = requests.get(url, headers=headers, timeout=timeout)
            if r.status_code >= 400:
                raise RuntimeError(f"Status {r.status_code}")
            return r.text
        except Exception as e:  # noqa: BLE001
            last_err = e
        # Back off only when another attempt follows; sleeping after the final
        # failure would just delay the error.
        if attempt < RETRY_TOTAL:
            time.sleep(0.5 * attempt)
    raise RuntimeError(f"Gagal fetch setelah {RETRY_TOTAL} percobaan: {last_err}") from last_err

# --------------------------------------------------
# Build URL Google News RSS
# --------------------------------------------------

def build_google_news_rss_url(query: str, time_window_days: int = TIME_WINDOW_DAYS) -> str:
    """Return the Google News RSS search URL for ``query``.

    The trimmed query gets a ``when:<time_window_days>d`` term appended, is
    URL-encoded with ``quote_plus`` and placed in the ``q`` parameter next to
    the fixed ``hl=id&gl=ID&ceid=ID:id`` parameters.
    """
    search_terms = f"{query.strip()} when:{time_window_days}d"
    encoded_terms = urlparse.quote_plus(search_terms)
    return "https://news.google.com/rss/search" + f"?q={encoded_terms}&hl=id&gl=ID&ceid=ID:id"

# --------------------------------------------------
# Parse RSS XML ke list item
# --------------------------------------------------

def parse_rss_items(xml_text: str) -> List[Dict[str, str]]:
    """Turn RSS XML text into a list of item dictionaries.

    Control characters other than tab, newline and carriage return are removed
    before parsing. Each ``<item>`` under ``<channel>`` yields a dict with the
    keys ``judul_berita`` (HTML-unescaped title), ``url_berita`` (link passed
    through ``resolve_final_article_url``), ``pub_raw`` (pubDate text) and
    ``penulis_berita`` (source text). A missing or empty element yields "".

    Returns:
        The list of item dicts, or an empty list when there is no ``<channel>``.
    """

    def _child_text(parent, tag):
        # Stripped text of the first matching child, "" if absent or empty.
        node = parent.find(tag)
        if node is not None and node.text:
            return node.text.strip()
        return ""

    sanitized = re.sub(r"[\x00-\x08\x0b-\x0c\x0e-\x1f]", "", xml_text)
    channel = ET.fromstring(sanitized).find("channel")
    if channel is None:
        return []

    return [
        {
            "judul_berita": html.unescape(_child_text(entry, "title")),
            "url_berita": resolve_final_article_url(_child_text(entry, "link")),
            "pub_raw": _child_text(entry, "pubDate"),
            "penulis_berita": _child_text(entry, "source"),
        }
        for entry in channel.findall("item")
    ]

# --------------------------------------------------
# Resolve final URL dari link news.google.com jika ada parameter url=...
# --------------------------------------------------

def resolve_final_article_url(link: str) -> str:
    """Return the target of a news.google.com link carrying a ``url`` parameter.

    When ``link`` contains both "news.google.com" and "url=", and its query
    string has a non-empty ``url`` parameter, the first value of that
    parameter is returned. In every other case, including any error while
    inspecting the link, ``link`` is returned unchanged.
    """
    if link:
        try:
            looks_like_redirect = "news.google.com" in link and "url=" in link
            if looks_like_redirect:
                query_string = urlparse.urlparse(link).query
                targets = urlparse.parse_qs(query_string).get("url")
                if targets:
                    return targets[0]
        except Exception:  # noqa: BLE001
            # Best effort: on any failure fall through to the original link.
            pass
    return link

# --------------------------------------------------
# Convert pubDate -> datetime lokal & format string
# --------------------------------------------------
# strptime patterns kept as a fallback for parse_pubdate (and for any other
# code that may reference this list).
RFC_PARSE_FORMATS = [
    "%a, %d %b %Y %H:%M:%S %Z",
    "%a, %d %b %Y %H:%M:%S %z",
]


def parse_pubdate(pub_raw: str) -> Optional[dt.datetime]:
    """Parse an RSS ``pubDate`` string into an aware datetime in ``JAKARTA_TZ``.

    The value is parsed with ``email.utils.parsedate_to_datetime`` first. That
    parser does not depend on the process locale, whereas ``strptime`` with
    ``%a``/``%b`` only recognises English day and month names under an English
    ``LC_TIME``. The ``RFC_PARSE_FORMATS`` patterns are tried only if that
    parser rejects the value. A result without time zone information is
    treated as UTC.

    Args:
        pub_raw: Raw date text, e.g. "Wed, 01 Oct 2025 10:00:00 GMT".

    Returns:
        The datetime converted to ``JAKARTA_TZ``, or None when the input is
        empty, not a string, or cannot be parsed or converted.
    """
    # Imported locally because the file's import block does not provide it.
    from email.utils import parsedate_to_datetime

    if not pub_raw or not isinstance(pub_raw, str):
        return None

    parsed: Optional[dt.datetime] = None
    try:
        parsed = parsedate_to_datetime(pub_raw)
    except (TypeError, ValueError):
        # TypeError is what Python < 3.10 raises for unparseable input.
        for fmt in RFC_PARSE_FORMATS:
            try:
                parsed = dt.datetime.strptime(pub_raw, fmt)
            except ValueError:
                continue
            break

    if parsed is None:
        return None
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    try:
        return parsed.astimezone(JAKARTA_TZ)
    except (OverflowError, ValueError):
        # Dates at the very edge of the supported range cannot be shifted.
        return None

# --------------------------------------------------
# Ambil berita untuk satu query (filter tanggal + batasi max item)
# --------------------------------------------------

def scrape_google_news_query(query: str, max_items: int, date_filters: List[str], delay_range=REQUEST_DELAY_RANGE, pbar=None) -> pd.DataFrame:
    """Fetch the Google News RSS feed for one query and return it as a DataFrame.

    Items are kept in feed order, filtered by date, de-duplicated by URL and
    only then limited to ``max_items``, so duplicates never use up a slot.

    Args:
        query: Search keyword.
        max_items: Maximum number of rows to return; values <= 0 disable the limit.
        date_filters: Dates as "YYYY-MM-DD". When the list is non-empty, only
            items whose publication date (after conversion by ``parse_pubdate``)
            is in it are kept; items without a parseable date are dropped.
        delay_range: (min, max) seconds of the random pause taken after a
            successful scrape.
        pbar: Optional progress bar; only ``set_description`` is called on it.

    Returns:
        DataFrame with columns query, judul_berita, tanggal_berita,
        penulis_berita, url_berita. It is empty when the feed could not be
        fetched or parsed; that failure is printed instead of raised so the
        remaining queries still run.
    """
    columns = ["query", "judul_berita", "tanggal_berita", "penulis_berita", "url_berita"]
    url = build_google_news_rss_url(query)
    # Compare against None explicitly: a progress bar's truth value is not a
    # reliable "was one supplied" signal.
    if pbar is not None:
        pbar.set_description(f"Fetching: {query}")

    try:
        xml_text = fetch_url(url)
        # Parsing belongs in the same guard: one malformed feed must not abort
        # the whole run.
        raw_items = parse_rss_items(xml_text)
    except (RuntimeError, ET.ParseError) as e:
        print(f"⚠ Query '{query}' dilewati: {e}")
        return pd.DataFrame(columns=columns)

    # Transform + filter by date when filters were supplied
    date_set = {d.strip() for d in date_filters if d.strip()} if date_filters else None
    out_items: List[Dict[str, str]] = []
    seen_urls = set()
    for it in raw_items:
        pub_dt = parse_pubdate(it.get("pub_raw", ""))
        tanggal_fmt = pub_dt.strftime("%Y-%m-%d %H:%M:%S") if pub_dt else ""
        date_only = tanggal_fmt[:10] if tanggal_fmt else None
        if date_set is not None and date_only not in date_set:
            continue

        # Keep only the first occurrence of each URL.
        url_berita = it.get("url_berita", "")
        if url_berita in seen_urls:
            continue
        seen_urls.add(url_berita)

        out_items.append({
            "query": query,
            "judul_berita": it.get("judul_berita", ""),
            "tanggal_berita": tanggal_fmt,
            "penulis_berita": it.get("penulis_berita", ""),
            "url_berita": url_berita,
        })
        # Limit is applied after filtering and de-duplication.
        if max_items > 0 and len(out_items) >= max_items:
            break

    df = pd.DataFrame(out_items, columns=columns)
    time.sleep(random.uniform(*delay_range))
    return df

# --------------------------------------------------
# Ambil berita untuk banyak query
# --------------------------------------------------

def scrape_google_news_queries(queries: List[str], max_items: int, date_filters: List[str]) -> pd.DataFrame:
    """Scrape every query in turn and return all rows in one DataFrame.

    A progress bar advances once per query and shows the running number of
    collected articles. ``max_items`` and ``date_filters`` are forwarded to
    ``scrape_google_news_query`` unchanged.

    Returns:
        The concatenated per-query results with a fresh index, or an empty
        DataFrame with the standard columns when ``queries`` is empty.
    """
    frames: List[pd.DataFrame] = []
    article_total = 0

    with tqdm(total=len(queries), desc="Scraping queries", unit="query") as bar:
        for keyword in queries:
            frame = scrape_google_news_query(keyword, max_items=max_items, date_filters=date_filters, pbar=bar)
            frames.append(frame)
            article_total += len(frame)
            bar.update(1)
            bar.set_postfix({'Articles': article_total})

    if frames:
        return pd.concat(frames, ignore_index=True)
    return pd.DataFrame(columns=["query", "judul_berita", "tanggal_berita", "penulis_berita", "url_berita"])

# --------------------------------------------------
# Fungsi untuk menghapus duplikat menyeluruh
# --------------------------------------------------

def remove_duplicates_comprehensive(df: pd.DataFrame) -> pd.DataFrame:
    """Remove duplicate articles across all queries.

    A row is dropped when an earlier row has:
    1. the same ``url_berita``, or
    2. the same normalized title (lower-cased, punctuation replaced by spaces,
       whitespace collapsed and trimmed).

    Rows whose normalized title is empty are never merged on the title
    criterion, because a missing title says nothing about whether two
    articles are the same. A further pass on author + date + normalized title
    is not performed: any pair it could match already shares a normalized
    title and is removed by criterion 2.

    Args:
        df: DataFrame with columns [query, judul_berita, tanggal_berita,
            penulis_berita, url_berita].

    Returns:
        De-duplicated DataFrame with the same columns and a fresh index. An
        empty input is returned as is.
    """
    if df.empty:
        return df

    initial_count = len(df)

    # Progress bar for the de-duplication steps
    with tqdm(total=3, desc="Removing duplicates", unit="step") as pbar:
        # 1. Same URL (the same article can be returned by several queries)
        pbar.set_postfix_str("URLs")
        df_cleaned = df.drop_duplicates(subset=["url_berita"], keep='first').reset_index(drop=True)
        pbar.update(1)

        # 2. Normalize titles. Trimming happens last: replacing punctuation
        #    with spaces can leave leading/trailing blanks ("Judul!" -> "judul "),
        #    which would otherwise keep near-identical titles apart.
        pbar.set_postfix_str("Titles")
        judul_normalized = (
            df_cleaned['judul_berita']
            .fillna("")
            .astype(str)
            .str.lower()
            .str.replace(r'[^\w\s]', ' ', regex=True)
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
        )
        pbar.update(1)

        # 3. Drop later rows that repeat a non-empty normalized title
        pbar.set_postfix_str("Normalized titles")
        is_repeat = judul_normalized.ne("") & judul_normalized.duplicated(keep='first')
        df_cleaned = df_cleaned.loc[~is_repeat].reset_index(drop=True)
        pbar.update(1)

    final_count = len(df_cleaned)
    removed_count = initial_count - final_count
    print(f"Duplikasi dihapus: {initial_count} → {final_count} artikel (dihapus: {removed_count})")

    return df_cleaned

# --------------------------------------------------
# Main CLI (aman untuk notebook & script)
# --------------------------------------------------

def main():  # noqa: D401
    """Scrape Google News for the keywords in config.json and return the rows.

    ``config.json`` is read from the parent of this file's directory when
    ``__file__`` is defined, otherwise from the current working directory. Its
    ``keywords`` list supplies the queries and ``search_date`` the date filter.

    Returns:
        The de-duplicated DataFrame of scraped articles.

    Raises:
        SystemExit: If config.json is missing or contains no keywords.
    """
    base_dir = (
        os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        if "__file__" in globals()
        else os.getcwd()
    )

    config_path = os.path.join(base_dir, "config.json")
    if not os.path.exists(config_path):
        raise SystemExit(f"config.json tidak ditemukan di {base_dir}")
    with open(config_path, "r", encoding="utf-8") as config_file:
        config = json.load(config_file)

    queries = config.get("keywords", [])
    date_filters = config.get("search_date", [])  # list of dates (YYYY-MM-DD)
    # Per-query cap; change it here to alter the final result size
    max_items = 200
    if not queries:
        raise SystemExit("keywords kosong di config.json")

    print(f"Scraping Google News untuk {len(queries)} queries dengan max {max_items} artikel per query")
    if date_filters:
        print(f"Filter tanggal: {', '.join(date_filters)}")

    scraped = scrape_google_news_queries(queries, max_items=max_items, date_filters=date_filters)

    # De-duplicate across queries before handing the data on
    return remove_duplicates_comprehensive(scraped)


def convert_link():
    """Run the scrape, decode each article URL and save the result to Excel.

    Every value in ``url_berita`` is passed to ``gnewsdecoder``. When the
    result reports a truthy ``status`` its ``decoded_url`` replaces the
    original; when it does not, or the call raises, the original URL is kept
    and counted as a failure. The table is written to
    ``<base_dir>/daftar_berita/google_news_rss.xlsx``.
    """
    base_dir = (
        os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        if "__file__" in globals()
        else os.getcwd()
    )

    df = main()

    interval_time = 1  # passed as gnewsdecoder's optional `interval` argument
    daftar_berita = df['url_berita']

    print(f"\nMemulai decode {len(daftar_berita)} Google News URLs...")

    decoded = []
    success_count = 0
    error_count = 0

    # Progress bar for the URL decoding loop
    with tqdm(total=len(daftar_berita), desc="Decode URLs", unit="url") as pbar:
        for source_url in daftar_berita:
            final_url = source_url  # fallback whenever decoding does not succeed
            succeeded = False
            try:
                outcome = gnewsdecoder(source_url, interval=interval_time)
                if outcome.get("status"):
                    final_url = outcome["decoded_url"]
                    succeeded = True
            except Exception:  # noqa: BLE001
                # Keep the original URL; the failure is reflected in the counter.
                pass

            decoded.append(final_url)
            if succeeded:
                success_count += 1
            else:
                error_count += 1
            pbar.set_postfix({'✓': success_count, '✗': error_count})
            pbar.update(1)

    # Replace the URL column with the decoded values
    df['url_berita'] = decoded

    # Save the result
    out_dir = os.path.join(base_dir, "daftar_berita")
    os.makedirs(out_dir, exist_ok=True)
    out_xlsx = os.path.join(out_dir, "google_news_rss.xlsx")
    df.to_excel(out_xlsx, index=False)

    print(f"\nSelesai! File disimpan: {out_xlsx}")
    print(f"Total artikel: {len(df)}")
    print(f"URL decode: {success_count} berhasil, {error_count} gagal")


# Run the pipeline only once per session; the marker attribute on the running
# module records that a run has completed.
if hasattr(sys.modules[__name__], '_conversion_done'):
    print("Sudah dijalankan sebelumnya. Restart kernel jika ingin menjalankan ulang.")
elif __name__ == "__main__":
    convert_link()
    # Set only after convert_link returns, so an interrupted run can be retried.
    setattr(sys.modules[__name__], '_conversion_done', True)

Scraping Google News untuk 3 queries dengan max 200 artikel per query
Filter tanggal: 2025-10-02, 2025-10-01


Fetching: kilang: 100%|██████████| 3/3 [00:05<00:00,  1.86s/query, Articles=174]
Removing duplicates: 100%|██████████| 4/4 [00:00<00:00, 576.79step/s, Author+Date+Title]
Fetching: kilang: 100%|██████████| 3/3 [00:05<00:00,  1.86s/query, Articles=174]
Removing duplicates: 100%|██████████| 4/4 [00:00<00:00, 576.79step/s, Author+Date+Title]


Duplikasi dihapus: 174 → 162 artikel (dihapus: 12)

Memulai decode 162 Google News URLs...


Decode URLs:  20%|█▉        | 32/162 [00:59<04:01,  1.86s/url, ✓=32, ✗=0]


KeyboardInterrupt: 