# SETTING ENVIRONMENT


In [16]:
"""# mount the colab with google drive
from google.colab import drive
drive.mount('/content/drive')"""

"# mount the colab with google drive\nfrom google.colab import drive\ndrive.mount('/content/drive')"

In [17]:
# set folder tempat kerja (current working directory)
import os
cwd = '/Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita'
# cwd = '/content/drive/MyDrive/Monitoring Berita'

os.chdir(cwd)

# MAIN

In [18]:
# Daftar RSS Antara (hanya kategori yang diminta)
antaracategories = [
    'terkini',
    'ekonomi',
    'top-news'
]

# Bangun list URL RSS
url = [f'https://www.antaranews.com/rss/{cat}.xml' for cat in antaracategories]

print(f'Total RSS feed akan diproses: {len(url)}')
for i, u in enumerate(url, 1):
    print(f'{i:02d}. {u}')

Total RSS feed akan diproses: 3
01. https://www.antaranews.com/rss/terkini.xml
02. https://www.antaranews.com/rss/ekonomi.xml
03. https://www.antaranews.com/rss/top-news.xml


In [19]:
import pandas as pd
import requests, json, re, hashlib
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from pathlib import Path

# Load config
with open('config.json', 'r') as f:
    cfg = json.load(f)
search_dates = set(cfg.get('search_date', []))  # format: YYYY-MM-DD
keywords_raw = cfg.get('keywords', [])
# Normalisasi kata kunci (lower)
keywords = [k.lower().strip() for k in keywords_raw if k.strip()]

rss_urls = url  # dari sel sebelumnya

records = []

# Helper untuk ekstrak teks & cleaning
def clean_text(t):
    if t is None:
        return ''
    # Hilangkan HTML tags sederhana
    t = re.sub(r'<[^>]+>', ' ', t)
    # Normalisasi spasi
    return re.sub(r'\s+', ' ', t).strip()

# Tanggal parsing patterns (Antara biasanya RFC822)
from email.utils import parsedate_to_datetime

def parse_pub_date(s):
    if not s:
        return None, None
    try:
        dt = parsedate_to_datetime(s)
        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        # konversi ke WIB? (opsional) -> di sini tetap UTC
        return dt, dt.strftime('%Y-%m-%d')
    except Exception:
        # fallback coba beberapa format
        fmts = [
            '%a, %d %b %Y %H:%M:%S %z',
            '%Y-%m-%d %H:%M:%S%z',
            '%Y-%m-%dT%H:%M:%S%z',
            '%Y-%m-%d'
        ]
        for fmt in fmts:
            try:
                dt = datetime.strptime(s, fmt)
                return dt, dt.strftime('%Y-%m-%d')
            except Exception:
                pass
    return None, None

# Fungsi pencarian keywords (whole word case-insensitive)
keyword_pattern = re.compile(r'(' + '|'.join(re.escape(k) for k in keywords) + r')', re.IGNORECASE) if keywords else None

def find_keywords(text):
    if not keyword_pattern or not text:
        return []
    found = keyword_pattern.findall(text.lower())
    # normalisasi kembali ke bentuk list unik sesuai original keywords list order
    uniq = []
    for k in keywords:
        if k in found and k not in uniq:
            uniq.append(k)
    return uniq

for rss_url in rss_urls:
    try:
        resp = requests.get(rss_url, timeout=20)
        resp.raise_for_status()
    except Exception as e:
        print(f'Gagal ambil {rss_url}: {e}')
        continue

    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError as e:
        print(f'Parse error {rss_url}: {e}')
        continue

    # Cari channel/items
    channel = root.find('channel') if root.tag != 'channel' else root
    if channel is None:
        print(f'Channel tidak ditemukan: {rss_url}')
        continue

    for item in channel.findall('item'):
        title = clean_text(item.findtext('title'))
        link = clean_text(item.findtext('link'))
        pub_raw = item.findtext('pubDate') or item.findtext('pubdate') or item.findtext('{http://purl.org/dc/elements/1.1/}date')
        dt_obj, dt_str = parse_pub_date(pub_raw)
        creator = item.findtext('{http://purl.org/dc/elements/1.1/}creator') or item.findtext('author') or ''
        creator = clean_text(creator)
        description = clean_text(item.findtext('description'))
        full_text_for_match = ' '.join([title, description])
        kws = find_keywords(full_text_for_match)
        relevan = 1 if kws else 0

        # Filter tanggal bila search_dates diset
        if search_dates and dt_str and dt_str not in search_dates:
            continue

        records.append({
            'judul_berita': title,
            'tanggal_berita': dt_str or '',
            'penulis_berita': creator,
            'url_berita': link,
            'relevan': relevan,
            'keywords_found': ', '.join(kws)
        })

# Buat DataFrame
_df = pd.DataFrame(records)

# Sort by tanggal desc lalu judul
if not _df.empty:
    _df = _df.sort_values(['tanggal_berita', 'judul_berita'], ascending=[False, True]).reset_index(drop=True)

print(f'Total baris hasil: {len(_df)}')


Total baris hasil: 67


In [23]:
from datetime import datetime
output_dir = Path(cwd + '/daftar_berita') 
base_name = f'daftar_berita_antara'


xlsx_path = output_dir / f'{base_name}.xlsx'

if '_df' in globals() and not _df.empty:

    _df.to_excel(xlsx_path, index=False)

    print(f'Saved: {xlsx_path}')
else:
    print('DataFrame kosong, tidak disimpan.')

Saved: /Users/yusufpradana/Library/CloudStorage/OneDrive-Personal/Pekerjaan BMN/05. 2025/98_monitoring_berita/monitoring-berita/daftar_berita/daftar_berita_antara.xlsx


## Ambil & Filter RSS Antara
Memuat `config.json`, membaca RSS dari daftar URL, memfilter berdasarkan `search_date`, mendeteksi kemunculan kata kunci, dan menghasilkan DataFrame dengan kolom:

`judul_berita, tanggal_berita, penulis_berita, url_berita, relevan, keywords_found`.