#### Load WebQuestions Dataset

In [None]:
from datasets import load_dataset

# Load the WebQuestions dataset through `datasets.load_dataset`; its "train"
# and "test" splits are converted to DataFrames further down.
dataset = load_dataset("stanfordnlp/web_questions")

#### Convert dataset to dataframe

In [None]:
import pandas as pd

def to_dataframe(split):
    """Convert one dataset split into a pandas DataFrame.

    Args:
        split: Iterable of examples; each example must support item access
            for the "question", "answers" and "url" keys. The iterable is
            traversed exactly once, so one-shot iterators/generators work too.

    Returns:
        A DataFrame with the columns "question", "answers" and "fb_url"
        (the latter holding each example's "url" value), one row per example.

    Raises:
        KeyError: If an example lacks one of the three expected keys.
    """
    questions, answers, urls = [], [], []
    # Single pass over the split: iterating three separate times both repeats
    # the per-example decoding work and breaks on iterators that can only be
    # consumed once.
    for ex in split:
        questions.append(ex["question"])
        answers.append(ex["answers"])
        urls.append(ex["url"])
    return pd.DataFrame({
        "question": questions,
        "answers": answers,
        "fb_url": urls,
    })

# Build one DataFrame per split, training split first.
df_train, df_test = (to_dataframe(dataset[name]) for name in ("train", "test"))

#### Grab Freebase name from URL

In [None]:
# The Freebase name is whatever follows the last "/" of the Freebase URL.
for frame in (df_train, df_test):
    frame["freebase_name"] = frame["fb_url"].map(lambda url: url.rsplit("/", 1)[-1])

#### Map questions to the appropriate QID

In [None]:
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import lru_cache
import pandas as pd

# Single shared HTTP session; search_wikidata_qid() sends all of its
# Wikidata API requests through it.
session = requests.Session()

@lru_cache(maxsize=1000)
def _lookup_wikidata_qid(name):
    """Ask the Wikidata `wbsearchentities` API for *name* (cached).

    Returns the id of the first search hit, or None when the search comes
    back empty. Any network/HTTP/parsing problem is raised rather than
    swallowed: lru_cache only memoises return values, so raising keeps a
    transient failure from being stored as a permanent "no match".
    """
    response = session.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbsearchentities",
            "search": name,
            "language": "en",
            "format": "json",
        },
        timeout=10,
    )
    response.raise_for_status()
    matches = response.json().get("search")
    return matches[0]["id"] if matches else None


def search_wikidata_qid(name):
    """Return the Wikidata id of the top English search hit for *name*.

    Successful lookups (including "no result") are cached, so repeated names
    do not trigger another request. Failed lookups are reported and yield
    None for this call only; the next call with the same name tries again.

    Args:
        name: Search string sent to the Wikidata entity search.

    Returns:
        The id of the first search result, or None when nothing was found
        or the request failed.
    """
    try:
        return _lookup_wikidata_qid(name)
    except Exception as e:
        # Best-effort boundary: one failing lookup must not abort a whole
        # DataFrame.apply / batch run, so report it and carry on.
        print(f"Error searching for '{name}': {e}")
        return None


# Keep the cache helpers reachable under the public name, as they were when
# the public function itself carried the lru_cache decorator.
search_wikidata_qid.cache_info = _lookup_wikidata_qid.cache_info
search_wikidata_qid.cache_clear = _lookup_wikidata_qid.cache_clear

def search_wikidata_qid_batch(names, max_workers=10):
    """Resolve many names to Wikidata ids concurrently.

    Args:
        names: Iterable of (hashable) search strings; duplicates are allowed.
        max_workers: Size of the thread pool used for the lookups.

    Returns:
        Dict mapping every distinct name to the value returned by
        search_wikidata_qid for it, or None if the lookup raised.
    """
    results = {}

    # The result is keyed by name, so a duplicate can never add information.
    # Dropping duplicates up front also matters for load: identical names
    # submitted together would all miss the cache (it is only filled once a
    # lookup completes) and each fire their own HTTP request.
    unique_names = list(dict.fromkeys(names))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_name = {
            executor.submit(search_wikidata_qid, name): name
            for name in unique_names
        }

        # Collect results in completion order; order does not matter because
        # they are stored by name.
        for future in as_completed(future_to_name):
            name = future_to_name[future]
            try:
                results[name] = future.result()
            except Exception as e:
                print(f"Error processing '{name}': {e}")
                results[name] = None

    return results

import os

# Resolve every Freebase name to a Wikidata id, one lookup at a time;
# repeated names are answered from search_wikidata_qid's cache.
df_train["subject_qid"] = df_train["freebase_name"].apply(search_wikidata_qid)
df_test["subject_qid"] = df_test["freebase_name"].apply(search_wikidata_qid)

# Make sure the checkpoint directory exists before writing: on a fresh
# checkout it is missing, and the write would fail only after all the
# lookups above have already been paid for.
os.makedirs("checkpoints", exist_ok=True)
df_train.to_parquet("checkpoints/train_webquestions.parquet")
df_test.to_parquet("checkpoints/test_webquestions.parquet")

#### Grab all unique Q-IDs from WebQuestions

In [None]:
import pandas as pd

# Read back the train/test checkpoints written by the QID-mapping step.
df_train = pd.read_parquet("checkpoints/train_webquestions.parquet")
df_test = pd.read_parquet("checkpoints/test_webquestions.parquet")

# Stack both splits, then keep each non-missing subject Q-ID once
# (in order of first appearance).
df_all = pd.concat((df_train, df_test), ignore_index=True)
subject_qids = df_all["subject_qid"].dropna().drop_duplicates().tolist()

print(f"Total unique subject Q-IDs: {len(subject_qids)}")

#### Get Wikidata predicates related to QIDs from WebQuestions

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
import time
import random
import pandas as pd
from tqdm import tqdm
import time

# SPARQL client pointed at the Wikidata query endpoint and set to return
# JSON; get_triples() below reuses this one module-level instance.
# NOTE(review): Wikimedia's User-Agent policy asks automated clients to
# identify themselves, and SPARQLWrapper takes an `agent=` argument for
# that — confirm and consider setting one.
sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

def get_triples(qid, max_retries=5, base_delay=2):
    """Fetch the direct-property statements of one Wikidata entity.

    Runs a SPARQL query (through the module-level ``sparql`` client) for all
    ``prop/direct`` triples whose subject is ``qid`` and whose property has
    an English label.

    Args:
        qid: Entity id interpolated into the query as ``wd:<qid>``.
        max_retries: Maximum number of query attempts.
        base_delay: Base (seconds) of the exponential backoff applied when
            the error text indicates rate limiting.

    Returns:
        A list of dicts with the keys subject_id, subject_label,
        predicate_id, predicate_label, object_id and object_label. The two
        ``*_label`` entries for subject and object are placeholders holding
        the id itself. An empty list is returned on any error, or once the
        retries are used up.
    """
    query = f"""
        SELECT ?s ?p ?propEntity ?propLabel ?o WHERE {{
        BIND(wd:{qid} AS ?s)
        ?s ?p ?o .
        FILTER(STRSTARTS(STR(?p), "http://www.wikidata.org/prop/direct/"))
        ?propEntity wikibase:directClaim ?p .
        ?propEntity rdfs:label ?propLabel .
        FILTER(LANG(?propLabel)="en")
        }}
    """

    def _row_from_binding(binding):
        """Flatten one SPARQL result binding into a triple dict."""
        object_uri = binding.get("o", {}).get("value", "")
        # Wikidata URIs are reduced to their last path segment; any other
        # object value is kept verbatim.
        if "wikidata.org" in object_uri:
            object_id = object_uri.split("/")[-1]
        else:
            object_id = object_uri

        predicate_id = binding["propEntity"]["value"].split("/")[-1]
        return {
            "subject_id": qid,
            "subject_label": qid,  # placeholder
            "predicate_id": predicate_id,
            "predicate_label": binding.get("propLabel", {}).get("value", predicate_id),
            "object_id": object_id,
            "object_label": object_id,  # placeholder
        }

    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            sparql.setQuery(query)
            payload = sparql.query().convert()
            return [_row_from_binding(b) for b in payload["results"]["bindings"]]
        except Exception as e:
            message = str(e)
            # Anything that does not look like rate limiting is not retried.
            if "429" not in message and "Too Many Requests" not in message:
                print(f"Error for {qid}: {e}")
                return []
            # Rate limited on the last allowed attempt: give up.
            if attempt >= final_attempt:
                print(f"Failed to get triples for {qid} after {max_retries} attempts")
                return []
            # Exponential backoff with up to one second of random jitter.
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Rate limited for {qid}, attempt {attempt + 1}/{max_retries}. Waiting {delay:.1f}s...")
            time.sleep(delay)
    return []

# Query the entities one by one, pausing a second after each request, and
# gather every returned triple in a single flat list.
all_triples = []
for qid in tqdm(subject_qids):
    all_triples.extend(get_triples(qid))
    time.sleep(1.0)

# Persist the raw triples so later cells can start from the CSV.
df_triples = pd.DataFrame(all_triples)
df_triples.to_csv("checkpoints/webquestions_kg_descriptive.csv", index=False)

#### Labels for Subjects and Objects

In [None]:
import re
from more_itertools import chunked
import pandas as pd
from tqdm import tqdm
import time
import random
from SPARQLWrapper import SPARQLWrapper, JSON

df_kg = pd.read_csv("checkpoints/webquestions_kg_descriptive.csv")

valid_qid_pattern = re.compile(r"^Q\d+$")

# Pool every subject and object identifier, then keep only the ones whose
# string form looks like a Q-ID.
candidate_ids = set(df_kg['subject_id']).union(df_kg['object_id'])
valid_ids = {
    candidate
    for candidate in candidate_ids
    if valid_qid_pattern.match(str(candidate))
}

# Filled in by the label-fetching loop further down.
id_to_label = {}

sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setReturnFormat(JSON)

def get_labels_batch(qids, max_retries=3):
    """Fetch English labels for a batch of Wikidata entity ids.

    Sends one SPARQL query (through the module-level ``sparql`` client)
    that lists all ``qids`` in a VALUES clause and asks the label service
    for their English labels.

    Args:
        qids: Collection of entity ids, each interpolated as ``wd:<qid>``.
            An empty or None value short-circuits to an empty result.
        max_retries: Maximum number of query attempts.

    Returns:
        Dict mapping entity id (last path segment of the returned item URI)
        to its label. Empty when ``qids`` is empty, when a non-retryable
        error occurs, or when every attempt was rejected.
    """
    if not qids:
        return {}
    qid_str = " ".join(f"wd:{qid}" for qid in qids)
    query = f"""
    SELECT ?item ?itemLabel WHERE {{
      VALUES ?item {{ {qid_str} }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    for attempt in range(max_retries):
        try:
            sparql.setQuery(query)
            results = sparql.query().convert()
            return {
                r['item']['value'].split("/")[-1]: r['itemLabel']['value']
                for r in results["results"]["bindings"]
            }
        except Exception as e:
            message = str(e)
            # Only errors that look like throttling (429) or a temporary
            # block (403) are worth retrying.
            if "429" not in message and "403" not in message:
                print(f"Label fetch error: {e}")
                return {}
            # No attempt left: backing off now would only delay the caller
            # before returning the same empty result, so stop right away.
            if attempt >= max_retries - 1:
                print(f"Rate limit or forbidden (403). Giving up after {max_retries} attempts.")
                return {}
            # Exponential backoff with up to one second of random jitter.
            delay = 2 ** attempt + random.random()
            print(f"Rate limit or forbidden (403). Retry {attempt+1}. Waiting {delay:.1f}s...")
            time.sleep(delay)

    return {}

# Look labels up 50 ids per request, pausing two seconds after each one.
chunks = list(chunked(valid_ids, 50))

for chunk in tqdm(chunks, total=len(chunks)):
    id_to_label.update(get_labels_batch(chunk))
    time.sleep(2.0)

# Replace the placeholder labels with the fetched ones; ids without a
# fetched label keep the id itself as their label.
df_kg['subject_label'] = df_kg['subject_id'].map(id_to_label).fillna(df_kg['subject_id'])
df_kg['object_label'] = df_kg['object_id'].map(id_to_label).fillna(df_kg['object_id'])

df_kg.to_csv("checkpoints/webquestions_kg_descriptive_labeled.csv", index=False)

#### Filter out triples whose subject, object or predicate ID is not in the standard Q/P format

In [None]:
knowledge_graph = pd.read_csv('checkpoints/webquestions_kg_descriptive_labeled.csv')

# A triple is kept only when subject and object are Q-ids AND the predicate
# is a P-id; missing values count as non-matching (na=False).
condition = (knowledge_graph['subject_id'].str.match(r'^Q\d+$', na=False)
             & knowledge_graph['object_id'].str.match(r'^Q\d+$', na=False)
             & knowledge_graph['predicate_id'].str.match(r'^P\d+$', na=False))

# Count straight from the boolean mask instead of materialising two
# filtered copies of the frame just to take their length.
matching = int(condition.sum())
non_matching = len(knowledge_graph) - matching

# These are row (triple) counts for the combined Q/Q/P condition, so the
# messages say that rather than "IDs matching '^Q\d+$'".
print(f"Number of triples with Q-ID subject/object and P-ID predicate: {matching}")
print(f"Number of triples dropped for non-standard IDs: {non_matching}")

knowledge_graph = knowledge_graph[condition]

knowledge_graph.to_csv('checkpoints/webquestions_kg_descriptive_labeled_filtered.csv', index=False)