In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

# Browser-like request headers passed to every requests.get() call in this script.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def get_article_content(link):
    """Download an article page and return its body text as a single string.

    The page is requested with the module-level ``headers`` and a 20-second
    timeout, parsed with the ``lxml`` parser, and the stripped text of every
    ``<p>`` and ``<h2>`` inside the main content ``div`` is joined with
    single spaces.

    Args:
        link: URL of the article page to fetch.

    Returns:
        The article text, or ``''`` when the content container is not found,
        the HTTP request fails, or any other error occurs while processing.
        Failures are reported with ``print`` and never raised to the caller.
    """
    try:
        res = requests.get(link, headers=headers, timeout=20)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, 'lxml')

        # Match the two CSS classes individually. Searching with
        # class_='c-detail read' only matches when the class attribute is
        # exactly that string, so a reordered or additional class on the
        # container would make the lookup fail.
        div_content = soup.select_one('div.c-detail.read')
        if not div_content:
            print(f"      ⚠️ Konten utama 'div.c-detail read' tidak ditemukan di: {link}")
            return ''

        # Remove blocks that must not leak into the extracted text: the
        # `baca-juga` div, `vicon` divs, and divs whose inline style
        # contains `page-break-after`.
        for element in div_content.select('div#baca-juga, div.vicon, div[style*="page-break-after"]'):
            element.decompose()

        # Strip each tag's text once and skip tags that end up empty.
        texts = (tag.get_text(strip=True) for tag in div_content.select('p, h2'))
        return ' '.join(text for text in texts if text)

    except requests.exceptions.RequestException as e:
        # Network-level problems (connection, timeout, HTTP error status).
        print(f"      ❌ Error jaringan saat mengambil artikel: {link} | {e}")
        return ''
    except Exception as e:
        # Best-effort scraper: any other failure is logged and the article is skipped.
        print(f"      ❌ Error umum saat memproses artikel: {link} | {e}")
        return ''

# Destination CSV; a row is written only for articles whose content was fetched.
output_filename = 'okezone_timnas_indonesia.csv'

with open(output_filename, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Header row: title, link, time, content.
    writer.writerow(['Judul', 'Link', 'Waktu', 'Konten'])

    # Walk the paginated search results, starting at page 1, until a page
    # contains no article entries or a page-level error occurs.
    page = 1
    while True:
        print(f"\n📄 Mencoba scraping halaman {page}...")

        url = f'https://search.okezone.com/loaddata/article/timnas%20indonesia/{page}'

        try:
            res = requests.get(url, headers=headers, timeout=20)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, 'lxml')

            # Each search hit is a <div class="desc-section">.
            articles = soup.find_all('div', class_='desc-section')

            # An empty page marks the end of the results.
            if not articles:
                print(f"✅ Halaman {page} kosong. Proses scraping selesai.")
                break

            print(f"   Ditemukan {len(articles)} artikel di halaman {page}.")

            for art in articles:
                try:
                    # The title anchor carries both the headline and the URL.
                    a_tag = art.find('a', class_='desc-text')
                    if a_tag is None:
                        continue
                    title = a_tag.get_text(strip=True)
                    link = a_tag['href']

                    # The timestamp anchor is optional; fall back to 'N/A'.
                    time_info_tag = art.find('a', class_='time-text')
                    if time_info_tag:
                        time_info = time_info_tag.get_text(strip=True)
                    else:
                        time_info = 'N/A'

                    print(f"   -> Mengambil konten: {title}")
                    content = get_article_content(link)

                    # Articles with no extracted content are not written.
                    if content:
                        writer.writerow((title, link, time_info, content))
                        print(f"   ✔️ Berhasil disimpan: {title}")

                    # Pause between article requests.
                    time.sleep(1)

                except Exception as e:
                    # A broken item must not abort the rest of the page.
                    print(f"   ❌ Error memproses item artikel: {e}")

            page += 1
            # Longer pause between result pages.
            time.sleep(2)

        except Exception as e:
            # Any page-level failure ends the whole run.
            print(f"❌ Gagal mengakses halaman data {page}: {e}. Menghentikan proses.")
            break

print(f"\n🎉 Semua data telah disimpan di file '{output_filename}'")


Output streaming akan dipotong hingga 5000 baris terakhir.
   ✔️ Berhasil disimpan: Erick Thohir Mulai Tugaskan Simon Tahamata, Minta Sistem Scouting Timnas Indonesia Segera Dihidupkan
   -> Mengambil konten: Kapten Timnas Indonesia Jay Idzes Kasih Tips Jitu agar Pemain Muda Tembus Panggung Dunia
   ✔️ Berhasil disimpan: Kapten Timnas Indonesia Jay Idzes Kasih Tips Jitu agar Pemain Muda Tembus Panggung Dunia

📄 Mencoba scraping halaman 66...
   Ditemukan 10 artikel di halaman 66.
   -> Mengambil konten: Komentar Ryan Flamingo di Unggahan Ole Romeny Pertanda Bakal Bela Timnas Indonesia?
   ✔️ Berhasil disimpan: Komentar Ryan Flamingo di Unggahan Ole Romeny Pertanda Bakal Bela Timnas Indonesia?
   -> Mengambil konten: FA China Pecat Branko Ivankovic Usai Kalah dari Timnas Indonesia, Tumbal Pertama Patrick Kluivert!
   ✔️ Berhasil disimpan: FA China Pecat Branko Ivankovic Usai Kalah dari Timnas Indonesia, Tumbal Pertama Patrick Kluivert!
   -> Mengambil konten: Jadwal Timnas

KeyboardInterrupt: 