In [None]:
# Mount Google Drive into the Colab runtime so output can be written to it
from google.colab import drive
drive.mount('/content/drive')

# Install the required libraries ("!" is an IPython shell escape, so this line
# only works inside a notebook cell, not in a plain Python script)
!pip install requests beautifulsoup4 markdownify -q

print("✅ Setup Selesai! Library terinstal dan Google Drive terhubung.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Setup Selesai! Library terinstal dan Google Drive terhubung.


In [None]:
import requests
from bs4 import BeautifulSoup
import os
import re
from markdownify import markdownify as md
import time

# ==============================================================================
# MAIN CONFIGURATION
# ==============================================================================

# 1. Root folder under which every scraped article is stored.
base_output_directory = "/content/drive/MyDrive/Studi_independen/coffee_article"

# 2. Layout of the knowledge base
#    - Each key is a folder name.
#    - A nested dictionary creates a sub-folder.
#    - The list of URLs for a folder must live under the key named "_urls".
knowledge_base_structure = {
    "01_Dasar_Kopi": {
        "_urls": [
            'https://en.wikipedia.org/wiki/Coffee',
            'https://en.wikipedia.org/wiki/Coffea_arabica',
            'https://en.wikipedia.org/wiki/Robusta_coffee',
            'https://en.wikipedia.org/wiki/Coffee_bean',
            'https://en.wikipedia.org/wiki/Specialty_coffee',
            'https://en.wikipedia.org/wiki/List_of_coffee_varieties'
        ]
    },
    "02_Metode_Proses": {
        "_urls": [
            'https://en.wikipedia.org/wiki/Coffee_processing',
            'https://en.wikipedia.org/wiki/Coffee_roasting',
            'https://en.wikipedia.org/wiki/Decaffeination'
        ]
    },
    "03_Asal_Usul": {
        "Afrika": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Coffee_production_in_Ethiopia',
                'https://en.wikipedia.org/wiki/Coffee_production_in_Kenya'
            ]
        },
        "Amerika_Latin": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Coffee_production_in_Colombia',
                'https://en.wikipedia.org/wiki/Coffee_production_in_Brazil',
            ]
        },
        "Asia": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Coffee_production_in_Indonesia',
                'https://en.wikipedia.org/wiki/Coffee_production_in_Vietnam'
            ]
        }
    },
    "04_Metode_Penyeduhan": {
        "Dripping_dan_Infusion": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Brewed_coffee',
                'https://en.wikipedia.org/wiki/Drip_brew',
                'https://en.wikipedia.org/wiki/Chemex_coffeemaker',
                'https://en.wikipedia.org/wiki/Hario_V60',
                'https://en.wikipedia.org/wiki/Kalita_Wave',
                'https://en.wikipedia.org/wiki/French_press',
                'https://en.wikipedia.org/wiki/AeroPress',
                'https://en.wikipedia.org/wiki/Cold_brew_coffee',
                'https://en.wikipedia.org/wiki/Nitro_cold_brew_coffee',
                'https://en.wikipedia.org/wiki/Indian_filter_coffee',
                'https://en.wikipedia.org/wiki/Vietnamese_iced_coffee',
                'https://en.wikipedia.org/wiki/Single-serve_coffee_container'
            ]
        },
        "Boiling_dan_Steeping": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Turkish_coffee',
                'https://en.wikipedia.org/wiki/Moka_pot',
                'https://en.wikipedia.org/wiki/Cowboy_coffee'
            ]
        },
        "Vacuum_dan_Pressure": {
            "_urls": [
                'https://en.wikipedia.org/wiki/Espresso',
                'https://en.wikipedia.org/wiki/Ristretto',
                'https://en.wikipedia.org/wiki/Lungo',
                'https://en.wikipedia.org/wiki/Doppio',
                'https://en.wikipedia.org/wiki/Vacuum_coffee_maker',
            ]
        }
    },
    "05_Terminologi_dan_Budaya": {
        "_urls": [
            'https://en.wikipedia.org/wiki/List_of_coffee_drinks',
            'https://en.wikipedia.org/wiki/Coffee_cupping',
            'https://en.wikipedia.org/wiki/History_of_coffee',
            'https://en.wikipedia.org/wiki/Coffee_culture',
        ]
    }
}

# ==============================================================================
# FUNGSI-FUNGSI UTAMA
# ==============================================================================

def clean_markdown_text(text):
    """Strip Wikipedia-specific artefacts from converted Markdown text.

    The steps, in order:
      1. Truncate the text at the earliest occurrence of a trailing
         boilerplate heading ("See also", "References", "Further reading",
         "External links").
      2. Remove linked-image syntax (``[![alt](src)](target)``).
      3. Replace internal ``/wiki/`` links with their visible link text.
      4. Drop the "Main article: " prefix and whole
         ``"X" redirects here. ...`` hatnote lines.
      5. Collapse runs of three or more newlines into one blank line.

    Args:
        text: Markdown text of one article.

    Returns:
        The cleaned Markdown with leading/trailing whitespace stripped.
    """
    # Cut off the irrelevant sections at the end of the article.
    stop_sections = ["## See also", "## References", "## Further reading", "## External links"]
    hits = [pos for pos in (text.find(section) for section in stop_sections) if pos != -1]
    text = text[:min(hits, default=len(text))]

    # Remove Markdown image syntax (images wrapped in a link).
    text = re.sub(r'\[!\[.*?\]\(.*?\)\]\(.*? ".*? Keterangan"\)', '', text)
    text = re.sub(r'\[!\[.*?\]\(.*?\)\]\(.*?\)','', text)

    # Turn internal Wikipedia links into plain text.
    #  - The link text may not contain brackets, so a match can never start at
    #    an earlier, unrelated link and swallow the text in between.
    #  - The URL may contain one level of balanced parentheses, e.g.
    #    /wiki/Java_(island).
    #  - An optional quoted title (with backslash-escaped quotes) is consumed
    #    together with the link, so nothing is left behind.
    text = re.sub(
        r'\[([^\[\]\n]*)\]'
        r'\(/wiki/[^\s()]*(?:\([^\s()]*\)[^\s()]*)*'
        r'(?:\s+"(?:[^"\\]|\\.)*")?\)',
        r'\1',
        text,
    )

    # Remove common Wikipedia phrases.
    text = re.sub(r'Main article: ', '', text)
    # Drop the whole hatnote line; removing only the part after the closing
    # quote would leave a dangling `"Term` fragment in the article.
    text = re.sub(r'^.*" redirects here\..*\n', '\n', text, flags=re.MULTILINE)

    # Collapse excessive blank lines.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

def scrape_and_convert_wikipedia(url, save_directory):
    """
    Download one Wikipedia page, convert its article body to cleaned Markdown,
    and write it to ``save_directory`` as ``<Title>.md``.

    Any exception raised along the way is caught and reported with a printed
    message; the function always returns None.
    """
    try:
        page = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        page.raise_for_status()

        document = BeautifulSoup(page.content, 'html.parser')
        heading = document.find('h1', id='firstHeading')
        article_body = document.find('div', id='mw-content-text')

        # Without both the heading and the body there is nothing to save.
        if not (heading and article_body):
            print(f"  -> ❌ Gagal menemukan elemen penting untuk URL: {url}")
            return

        title = heading.get_text(strip=True)

        # Strip unwanted HTML elements before the Markdown conversion.
        unwanted_selectors = (
            'div.reflist',
            'div.navbox',
            'table.infobox',
            'div#toc',
            'sup.reference',
            'span.mw-editsection',
            'div.thumb',
            'table.wikitable',
        )
        for css_selector in unwanted_selectors:
            for node in article_body.select(css_selector):
                node.decompose()

        # HTML -> Markdown, then remove the artefacts that survive conversion.
        raw_markdown = md(str(article_body), heading_style="ATX")
        article_markdown = clean_markdown_text(raw_markdown)

        # Build a filesystem-safe file name from the page title.
        safe_title = re.sub(r'[\\/*?:"<>|]', "", title)
        filename = safe_title.replace(" ", "_") + ".md"
        destination = os.path.join(save_directory, filename)

        with open(destination, 'w', encoding='utf-8') as out_file:
            out_file.writelines([f"# {title}\n\n", article_markdown])

        print(f"  -> ✅ Berhasil menyimpan: '{filename}'")

    except Exception as e:
        print(f"  -> ❌ Terjadi kesalahan saat memproses {url}: {e}")

def process_category(category_dict, current_path):
    """
    Recursively walk the nested category dictionary.

    Every key other than "_urls" is treated as a sub-folder name and descended
    into. A "_urls" entry creates the folder for ``current_path`` and scrapes
    each listed URL into it, pausing one second after every request.
    """
    for key, value in category_dict.items():
        if key != "_urls":
            # Sub-category: recurse with the folder name appended to the path.
            process_category(value, os.path.join(current_path, key))
            continue

        # URL list: scrape every entry into the current path.
        print(f"\n--- Memproses URL di Path: '{current_path}' ---")
        os.makedirs(current_path, exist_ok=True)
        for article_url in value:
            scrape_and_convert_wikipedia(article_url, current_path)
            time.sleep(1)  # Pause between requests

# ==============================================================================
# BLOK EKSEKUSI UTAMA
# ==============================================================================

if __name__ == "__main__":
    # Horizontal rule framing the start and end banners.
    separator = "=========================================================="

    print(separator)
    print(" Memulai Script Scraping Knowledge Base PIKOPI ".center(58))
    print(separator)

    # Walk the configured structure, starting from the base output directory.
    process_category(knowledge_base_structure, base_output_directory)

    print("\n" + separator)
    print(" Semua Proses Selesai ".center(58))
    summary_line = f"Data disimpan di folder: '{base_output_directory}'"
    print(summary_line.center(58))
    print(separator)

      Memulai Script Scraping Knowledge Base PIKOPI       

--- Memproses URL di Path: '/content/drive/MyDrive/Studi_independen/coffee_article/01_Dasar_Kopi' ---
  -> ✅ Berhasil menyimpan: 'Coffee.md'
  -> ✅ Berhasil menyimpan: 'Coffea_arabica.md'
  -> ✅ Berhasil menyimpan: 'Coffea_canephora.md'
  -> ✅ Berhasil menyimpan: 'Coffee_bean.md'
  -> ✅ Berhasil menyimpan: 'Specialty_coffee.md'
  -> ✅ Berhasil menyimpan: 'List_of_coffee_varieties.md'

--- Memproses URL di Path: '/content/drive/MyDrive/Studi_independen/coffee_article/02_Metode_Proses' ---
  -> ✅ Berhasil menyimpan: 'Coffee_production.md'
  -> ✅ Berhasil menyimpan: 'Coffee_roasting.md'
  -> ✅ Berhasil menyimpan: 'Decaffeination.md'

--- Memproses URL di Path: '/content/drive/MyDrive/Studi_independen/coffee_article/03_Asal_Usul/Afrika' ---
  -> ✅ Berhasil menyimpan: 'Coffee_production_in_Ethiopia.md'
  -> ✅ Berhasil menyimpan: 'Coffee_production_in_Kenya.md'

--- Memproses URL di Path: '/content/drive/MyDrive/Studi_independen/cof