In [3]:
import requests
import csv
import time
import concurrent.futures
import os
import sys

WIKIDATA_API = "https://www.wikidata.org/w/api.php"
WIKIPEDIA_API = "https://en.wikipedia.org/w/api.php"

OUTPUT_FILE = "../../datasets/fetched/wiki_entities.csv"
FAILED_FILE = "failed_qids.txt"

HEADERS = {
    "User-Agent": "KrishBot/1.0 (contact: saikdvs@gmail.com)"
}

def chunked(iterable, size):
    for i in range(0, len(iterable), size):
        yield iterable[i:i + size]

def fetch_with_retries(url, params, retries=5, backoff=1):
    for attempt in range(retries):
        try:
            resp = requests.get(url, params=params, headers=HEADERS, timeout=30)
            if resp.status_code == 200:
                return resp.json()
            elif resp.status_code in (429, 503):
                wait = backoff * (2 ** attempt)
                print(f"Rate limited ({resp.status_code}), retrying in {wait}s...")
                time.sleep(wait)
            else:
                print(f"HTTP error {resp.status_code}: {resp.text[:200]}")
                return None
        except requests.exceptions.RequestException as e:
            print(f"Request error: {e}, retrying...")
            time.sleep(backoff * (2 ** attempt))
    return None

def get_wikipedia_title_from_qid(qid):
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "sitelinks",
        "sitefilter": "enwiki",
        "format": "json"
    }
    data = fetch_with_retries(WIKIDATA_API, params)
    if not data:
        return None
    entities = data.get("entities", {})
    if qid in entities:
        sitelinks = entities[qid].get("sitelinks", {})
        if "enwiki" in sitelinks:
            return sitelinks["enwiki"]["title"]
    return None

def get_wikipedia_extracts(titles):
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": True,
        "exintro": True,
        "titles": "|".join(titles),
        "format": "json"
    }
    data = fetch_with_retries(WIKIPEDIA_API, params)
    if not data:
        return {}
    pages = data.get("query", {}).get("pages", {})
    results = {}
    for page in pages.values():
        if "title" in page:
            title_norm = page["title"].replace("_", " ").lower()
            results[title_norm] = page.get("extract", "")
    return results

def process_batch(qids, writer, lock, failed_qids, row_counter):
    results = []
    for qid in qids:
        title = get_wikipedia_title_from_qid(qid)
        if not title:
            failed_qids.add(qid)
            continue
        results.append((qid, title))

    if not results:
        return row_counter

    titles = [title for _, title in results]
    extracts = get_wikipedia_extracts(titles)

    with lock:
        for qid, title in results:
            title_norm = title.replace("_", " ").lower()
            extract = extracts.get(title_norm, "")
            if not extract:
                failed_qids.add(qid)
                continue
            writer.writerow([qid, title, extract])
            row_counter += 1
        sys.stdout.flush()
    return row_counter

def main(qid_list):
    done_qids = set()
    row_counter = 0
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader, None)
            for row in reader:
                if row:
                    done_qids.add(row[0])
                    row_counter += 1

    qid_list = [qid for qid in qid_list if qid not in done_qids]
    print(f"{len(done_qids)} QIDs already processed, {len(qid_list)} remaining.")

    failed_qids = set()

    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        if row_counter == 0:
            writer.writerow(["QID", "Title", "Extract"])

        from threading import Lock
        lock = Lock()

        with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for batch in chunked(qid_list, 50):
                futures.append(executor.submit(process_batch, batch, writer, lock, failed_qids, row_counter))
                time.sleep(0.1)

            for future in concurrent.futures.as_completed(futures):
                row_counter = future.result()
                print(f"Total written so far: {row_counter}")

    if failed_qids:
        with open(FAILED_FILE, "a", encoding="utf-8") as f:
            for qid in failed_qids:
                f.write(qid + "\n")

if __name__ == "__main__":
    qids = ["Q128306", "Q16562", "Q52996", "Q128282"]  # Replace with your 15000 QIDs
    main(qids)


0 QIDs already processed, 4 remaining.


FileNotFoundError: [Errno 2] No such file or directory: '../../datasets/fetched/wiki_entities.csv'