In [86]:
import requests
import csv
import time
import concurrent.futures
import os
import sys
import pandas as pd
from wikidata.client import Client

In [87]:
# CSV read with pandas further down; its "wikidata_id" column supplies the QIDs.
INPUT_FILE = "fb_wiki_scoped_to_dataset.csv"
# CSV that fetch() appends (QID, Title, Extract) rows to; also re-read on start
# so that already-fetched QIDs are skipped.
OUTPUT_FILE = "wiki_entities_text.csv"
# Text file (one QID per line) that fetch() appends unresolved QIDs to by default
# and that retry_failed() reads back.
FAILED_FILE = "failed_qids.txt"
# File that retry_failed() asks fetch() to log failures to.
# NOTE(review): this is the same path as FAILED_FILE, so a retry appends its
# failures to the very file it has just read — confirm whether a separate file
# was intended.
FAILED_RETRY_FILE = "failed_qids.txt"

# HTTP headers sent with every request made by fetch_with_retries().
HEADERS = {
    "User-Agent": "KrishBot/1.0 (contact: saikdvs@gmail.com)"
}

In [88]:
def chunked(iterable, size):
    """Yield consecutive slices of ``iterable`` holding at most ``size`` items.

    ``iterable`` must support ``len()`` and slicing (list, str, array, ...).
    The final slice is shorter when the length is not a multiple of ``size``.
    """
    offsets = range(0, len(iterable), size)
    for start in offsets:
        stop = start + size
        yield iterable[start:stop]

In [89]:
def fetch_with_retries(url, params, retries=5, backoff=1):
    """GET ``url`` and return the decoded JSON body, retrying transient failures.

    Args:
        url: Endpoint to query.
        params: Query-string parameters passed to ``requests.get``.
        retries: Maximum number of attempts.
        backoff: Base delay in seconds; the delay doubles after every attempt.

    Returns:
        The parsed JSON payload, or ``None`` when the server answers with a
        non-retryable status or every attempt failed.

    Retried conditions: HTTP 429/503, network-level ``RequestException`` and a
    200 response whose body is not valid JSON. For 429/503 the server's
    ``Retry-After`` header (when given in seconds) is honoured if it asks for a
    longer pause than the exponential backoff.
    """
    for attempt in range(retries):
        wait = backoff * (2 ** attempt)
        try:
            resp = requests.get(url, params=params, headers=HEADERS, timeout=30)
        except requests.exceptions.RequestException as e:
            reason = f"Request error: {e}"
        else:
            if resp.status_code == 200:
                try:
                    return resp.json()
                except ValueError as e:
                    # Truncated / non-JSON body: treat it as transient.
                    reason = f"Invalid JSON in response: {e}"
            elif resp.status_code in (429, 503):
                retry_after = str(resp.headers.get("Retry-After", "")).strip()
                if retry_after.isdigit():
                    wait = max(wait, int(retry_after))
                reason = f"Rate limited ({resp.status_code})"
            else:
                print(f"HTTP error {resp.status_code}: {resp.text[:200]}")
                return None

        if attempt == retries - 1:
            # Nothing left to retry: do not waste time sleeping first.
            print(f"{reason}, giving up after {retries} attempts.")
            break
        print(f"{reason}, retrying in {wait}s...")
        time.sleep(wait)
    return None

In [90]:
def get_wikipedia_title_from_qid(qid):
    """
    Fetch the Wikipedia title for a given Wikidata QID.

    Returns a tuple ``(lang, title)``, e.g. ``('en', 'Python (programming language)')``,
    where ``lang`` is the Wikipedia subdomain to query. English is preferred;
    otherwise the first Wikipedia sitelink in the response is used.
    Returns ``(None, None)`` when the request fails, the entity is absent from
    the response, or it has no Wikipedia sitelink.

    Only real Wikipedia editions are considered: site ids end with ``wiki`` but
    so do several non-Wikipedia projects (Commons, Wikispecies, Wikidata, ...),
    which are excluded explicitly.
    """
    suffix = "wiki"
    # Site ids that end in "wiki" but are not language editions of Wikipedia.
    non_wikipedia_sites = frozenset({
        "commonswiki", "specieswiki", "metawiki", "wikidatawiki",
        "mediawikiwiki", "sourceswiki", "outreachwiki", "incubatorwiki",
        "wikimaniawiki", "foundationwiki", "wikifunctionswiki",
    })

    params = {
        "action": "wbgetentities",
        "ids": qid,
        "props": "sitelinks",
        "format": "json"
    }
    data = fetch_with_retries("https://www.wikidata.org/w/api.php", params)
    if not data:
        return None, None

    entities = data.get("entities", {})
    if qid not in entities:
        return None, None

    sitelinks = entities[qid].get("sitelinks", {})

    wiki_sitelinks = {
        site: link
        for site, link in sitelinks.items()
        if site.endswith(suffix) and site not in non_wikipedia_sites
    }
    if not wiki_sitelinks:
        return None, None

    # Prefer English Wikipedia if available
    if "enwiki" in wiki_sitelinks:
        return "en", wiki_sitelinks["enwiki"]["title"]

    # Otherwise return the first available Wikipedia sitelink
    first_key = next(iter(wiki_sitelinks))
    # Strip only the trailing "wiki"; site ids use "_" where the subdomain
    # uses "-" (e.g. "zh_min_nanwiki" -> "zh-min-nan").
    lang = first_key[:-len(suffix)].replace("_", "-")
    return lang, wiki_sitelinks[first_key]["title"]


In [91]:
from collections import defaultdict

def get_wikipedia_extracts(triplets):
    """Fetch plain-text intro extracts for a batch of Wikipedia pages.

    Args:
        triplets: Iterable of ``(qid, title, lang)`` tuples; ``lang`` is the
            Wikipedia subdomain the title belongs to.

    Returns:
        Dict mapping the normalised title (underscores replaced by spaces,
        lower-cased) to its extract. Pages the API returned without an extract
        map to ``""``.

    The extracts module returns only a limited number of extracts per response
    and signals the remainder through a ``continue`` object, so each request is
    repeated with the continuation parameters until the API stops returning
    one. Requests are also capped at 50 titles, the per-request limit for
    regular clients.

    NOTE(review): results are keyed by title only, so the same title in two
    languages within one call collides; callers currently look up by title.
    """
    max_titles_per_request = 50

    # Group titles by language
    lang_to_titles = defaultdict(list)
    for _qid, title, lang in triplets:
        lang_to_titles[lang].append(title)

    for item in lang_to_titles:
        print(item)

    results = {}

    for lang, titles in lang_to_titles.items():
        url = f"https://{lang}.wikipedia.org/w/api.php"
        for start in range(0, len(titles), max_titles_per_request):
            group = titles[start:start + max_titles_per_request]
            params = {
                "action": "query",
                "prop": "extracts",
                "explaintext": True,
                "exintro": True,
                "exlimit": "max",
                "titles": "|".join(group),
                "format": "json"
            }
            continuation = {}
            # Bounded loop: even at one extract per response this many rounds
            # suffice, so a misbehaving server cannot make us spin forever.
            for _ in range(len(group) + 1):
                data = fetch_with_retries(url, {**params, **continuation})
                if not data:
                    break
                pages = data.get("query", {}).get("pages", {})
                for page in pages.values():
                    if "title" not in page:
                        continue
                    title_norm = page["title"].replace("_", " ").lower()
                    extract = page.get("extract", "")
                    # Every response lists all pages, but only some carry an
                    # extract: never overwrite a found extract with "".
                    if extract or title_norm not in results:
                        results[title_norm] = extract
                continuation = data.get("continue")
                if not continuation:
                    break
    return results

In [92]:
def process_batch(qids, writer, lock, failed_qids, row_counter):
    """Resolve one batch of QIDs to Wikipedia extracts and write them out.

    Args:
        qids: QIDs handled by this batch.
        writer: CSV writer receiving ``[qid, title, extract]`` rows.
        lock: Lock held while rows are written.
        failed_qids: Set that collects QIDs with no title or no extract.
        row_counter: Starting count.

    Returns:
        ``row_counter`` plus the number of rows this batch wrote.
    """
    resolved = []
    for qid in qids:
        lang, title = get_wikipedia_title_from_qid(qid)
        if title:
            resolved.append((qid, title, lang))
        else:
            failed_qids.add(qid)

    if len(resolved) == 0:
        return row_counter

    extracts = get_wikipedia_extracts(resolved)

    with lock:
        for qid, title, _lang in resolved:
            # Same normalisation as the keys of the extracts mapping.
            lookup_key = title.replace("_", " ").lower()
            text = extracts.get(lookup_key, "")
            if text:
                writer.writerow([qid, title, text])
                row_counter += 1
            else:
                failed_qids.add(qid)
        sys.stdout.flush()
    return row_counter

In [93]:
def fetch(qid_list, failed_file=FAILED_FILE, batch_size=50, max_workers=5):
    """Fetch Wikipedia extracts for ``qid_list`` and append them to OUTPUT_FILE.

    QIDs already present in OUTPUT_FILE are skipped, so an interrupted run can
    be resumed. Duplicate QIDs in the input are processed once.

    Args:
        qid_list: Iterable of QID strings.
        failed_file: File that QIDs without a title/extract (or whose batch
            raised) are appended to, one per line.
        batch_size: Number of QIDs handed to each ``process_batch`` call.
        max_workers: Size of the thread pool.
    """
    from threading import Lock

    done_qids = set()
    row_counter = 0
    has_header = False
    if os.path.exists(OUTPUT_FILE):
        with open(OUTPUT_FILE, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            # Track the header separately from the row count: a file holding
            # only the header must not get a second header appended.
            has_header = next(reader, None) is not None
            for row in reader:
                if row:
                    done_qids.add(row[0])
                    row_counter += 1

    # dict.fromkeys drops duplicates while keeping the original order.
    qid_list = list(dict.fromkeys(qid for qid in qid_list if qid not in done_qids))
    print(f"{len(done_qids)} QIDs already processed, {len(qid_list)} remaining.")

    failed_qids = set()

    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        if not has_header:
            writer.writerow(["QID", "Title", "Extract"])

        lock = Lock()

        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_batch = {}
            for batch in chunked(qid_list, batch_size):
                # Each batch starts counting at 0 and reports only its own
                # rows; the running total is accumulated here.
                future = executor.submit(process_batch, batch, writer, lock, failed_qids, 0)
                future_to_batch[future] = batch
                time.sleep(0.1)

            for future in concurrent.futures.as_completed(future_to_batch):
                try:
                    row_counter += future.result()
                except Exception as exc:
                    # Keep the run alive and queue the whole batch for retry
                    # instead of losing every other batch's failure record.
                    print(f"Batch failed with {type(exc).__name__}: {exc}")
                    failed_qids.update(future_to_batch[future])
                    continue
                print(f"Total written so far: {row_counter}")

    if failed_qids:
        with open(failed_file, "a", encoding="utf-8") as f:
            for qid in failed_qids:
                f.write(f"{qid}\n")

In [94]:
def dedupe_csv(input_filepath, output_filepath):
    """
    Write a copy of a CSV file with exact duplicate rows removed, sorted by QID.

    Errors are reported with ``print`` and never raised.

    Args:
        input_filepath (str): CSV file to read; must contain a "QID" column.
        output_filepath (str): Destination of the deduplicated CSV.
    """
    try:
        # drop_duplicates keeps the first occurrence of each identical row.
        deduped = (
            pd.read_csv(input_filepath)
            .drop_duplicates()
            .sort_values(by=["QID"])
        )
        # index=False: do not emit the DataFrame index as an extra column.
        deduped.to_csv(output_filepath, index=False)
        print(f"Successfully deduplicated '{input_filepath}' and saved to '{output_filepath}'.")
    except FileNotFoundError:
        print(f"Error: The file '{input_filepath}' was not found.")
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{input_filepath}' is empty.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Deduplicate OUTPUT_FILE (the CSV that fetch() appends to) into a separate file.
# NOTE(review): this call sits before the cell that starts fetching; on a fresh
# run OUTPUT_FILE may not exist yet, in which case dedupe_csv only prints a
# "not found" message. Consider running it after fetch()/retry_failed().
dedupe_csv(input_filepath=OUTPUT_FILE, output_filepath="wiki_entities_text_dedup.csv")

Successfully deduplicated 'wiki_entities_text.csv' and saved to 'wiki_entities_text_dedup.csv'.


In [95]:
# Load the QIDs to fetch from the input CSV.
df = pd.read_csv(INPUT_FILE)
col = "wikidata_id"
# astype() returns a new Series (calling it without using the result changes
# nothing), so build the array from the converted values. Missing ids are
# dropped first so they do not turn into the literal string "nan", and
# duplicates are removed so no QID is requested twice.
qids = df[col].dropna().astype(str).drop_duplicates().to_numpy()

In [96]:
def retry_failed(batch_size=50, max_workers=5):
    """Re-run ``fetch`` for the QIDs recorded in FAILED_FILE.

    Blank lines are ignored and repeated QIDs are retried once (the file is
    written in append mode, so duplicates accumulate across runs). Failures of
    the retry are logged to FAILED_RETRY_FILE.

    Args:
        batch_size: Only echoed in the progress message.
        max_workers: Currently unused.

    NOTE(review): neither argument is forwarded to ``fetch``; they are kept so
    existing callers keep working.
    """
    if not os.path.exists(FAILED_FILE):
        print("No failed_qids.txt found, nothing to retry.")
        return

    with open(FAILED_FILE, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        failed_qids = list(dict.fromkeys(qid for qid in stripped if qid))

    if not failed_qids:
        print("No failed QIDs to retry.")
        return

    print(f"Retrying {len(failed_qids)} failed QIDs in batches of {batch_size}...")
    fetch(failed_qids, failed_file=FAILED_RETRY_FILE)


In [97]:
# Entry point of the notebook. Both calls are left commented out, presumably so
# that running every cell does not start the network fetch by accident.
#   fetch(qids)        -> fetches extracts for the QIDs loaded from INPUT_FILE
#   retry_failed(...)  -> re-runs fetch for the QIDs listed in FAILED_FILE
print("Uncomment to start fetching")
# fetch(qids)
# retry_failed(batch_size=50, max_workers=5)

Uncomment to start fetching
