In [None]:
import  requests
import  datetime
import  json
import  time
from    multiprocessing import Pool, cpu_count
from    bs4             import BeautifulSoup
from    tqdm            import tqdm

BASE_API = "https://wiki.archlinux.org/api.php"
BASE_URL = "https://wiki.archlinux.org/title"
HEADERS = {"User-Agent": "ArchWikiScraper/1.0 (chatbot training)"}
OUTPUT_JSONL = "archwiki_en_dataset.jsonl"
TITLES_LIST = "/work/titles.txt"

def get_all_pages():
    titles = []
    apcontinue = None
    while True:
        params = {
            "action": "query",
            "format": "json",
            "list": "allpages",
            "aplimit": "500",
        }
        if apcontinue:
            params["apcontinue"] = apcontinue

        response = requests.get(BASE_API, params=params, headers=HEADERS).json()
        pages = response["query"]["allpages"]
        
        titles.extend([p["title"] for p in pages if "/" not in p["title"] and "(" not in p["title"]])

        if "continue" in response:
            apcontinue = response["continue"]["apcontinue"]
            time.sleep(0.3)
        else:
            break
    return titles


def scrape_single_page(title):
    url = f"{BASE_URL}/{title.replace(' ', '_')}?action=render"
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"[!] Failed to fetch {title}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    page_data = []
    current_question = f"What is {title}?"
    current_context = ""

    for elem in soup.find_all(["h2", "h3", "p", "ul", "ol", "pre", "code"]):
        if elem.name in ["h2", "h3"]:
            if current_context.strip():
                page_data.append({
                    "input_text": current_question,
                    "target_text": current_context.strip()
                })
            current_question = elem.get_text().strip()
            current_context = ""
        else:
            current_context += "\n" + elem.get_text()

    return page_data

def scrape_batch(titles):
    all_data = []
    for title in titles:
        entries = scrape_single_page(title)
        all_data.extend(entries)
    return all_data

def scrape_all_parallel():
    titles = get_all_pages()
    print(f"🔍 Total pages to scrape: {len(titles)}")
    with open(TITLES_LIST, "w", encoding="utf-8") as f:
        for title in titles:
            f.write(title + "\n")
    print(f"📝 Saved all titles to {TITLES_LIST}")

    chunksize = len(titles) // cpu_count()
    with Pool(processes=cpu_count()) as pool:
        results = []
        with tqdm(total=len(titles), desc="Scraping pages", unit="page") as pbar:
            for batch in pool.imap(scrape_batch, [titles[i:i+chunksize] for i in range(0, len(titles), chunksize)]):
                results.append(batch)
                pbar.update(len(batch))

    all_entries = [entry for batch in results for entry in batch]

    with open(OUTPUT_JSONL, "w", encoding="utf-8") as f:
        for entry in all_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"✅ Done. Scraped {len(all_entries)} entries to {OUTPUT_JSONL}")


In [4]:
scrape_all_parallel()

🔍 Total pages to scrape: 7362


Scraping pages:   0%|          | 0/7362 [00:00<?, ?page/s]

[!] Failed to fetch Template;AUR?: 404 Client Error: Not Found for url: https://wiki.archlinux.org/title/Template;AUR??action=render


Scraping pages:   0%|          | 0/7362 [18:16<?, ?page/s]Process ForkPoolWorker-13:
Process ForkPoolWorker-24:
Process ForkPoolWorker-16:
Process ForkPoolWorker-23:
Process ForkPoolWorker-18:
Process ForkPoolWorker-15:
Process ForkPoolWorker-20:
Process ForkPoolWorker-21:
Process ForkPoolWorker-19:



KeyboardInterrupt: 