# Imports

In [None]:
import requests
import time

import pyarrow.parquet as pq
import pandas as pd

from time import sleep
from urllib.error import HTTPError


# Part 2: SPARQL Endpoint and Rate Limiting


In [None]:
# URL of the WikiData SPARQL query service; read as a module-level global by
# the query function defined later in this notebook.
sparql_endpoint = "https://query.wikidata.org/sparql"

def rate_limited(max_per_minute):
    """Build a decorator that spaces out calls to respect a rate limit.

    The decorated function is delayed (via ``time.sleep``) so that at least
    ``60 / max_per_minute`` seconds pass between the end of one call and the
    start of the next. The first call is never delayed.

    Args:
        max_per_minute: Maximum number of calls allowed per minute. Must be
            non-zero (a zero value raises ``ZeroDivisionError``).

    Returns:
        A decorator that wraps a function with the throttling behaviour and
        passes its arguments and return value through unchanged.
    """
    # Local import so this cell does not depend on the notebook's import cell
    # being edited.
    import functools

    min_interval = 60.0 / float(max_per_minute)

    def decorate(func):
        # Monotonic timestamp of when the previous call finished; None until
        # the wrapped function has been called once.
        last_finished = None

        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def rate_limited_function(*args, **kwargs):
            nonlocal last_finished
            if last_finished is not None:
                # A monotonic clock is immune to system clock adjustments,
                # which could otherwise produce bogus (or huge) waits.
                left_to_wait = min_interval - (time.monotonic() - last_finished)
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
            try:
                return func(*args, **kwargs)
            finally:
                # Record the call even when it raised: a failed request still
                # counts against the remote rate limit.
                last_finished = time.monotonic()

        return rate_limited_function

    return decorate


In [None]:
def retry_request(function):
    """Decorate ``function`` so throttling errors are retried with back-off.

    Every attempt goes through ``rate_limited(10)``. When an attempt raises
    ``HTTPError`` with code 429 or 403, the wrapper sleeps and tries again;
    the sleep starts at 10 seconds and grows by 5 seconds after each further
    failure within the same call. Any other ``HTTPError`` is re-raised, and
    other exception types propagate untouched.

    Args:
        function: The callable performing the request.

    Returns:
        A wrapper with the same call signature that returns whatever
        ``function`` returns once an attempt succeeds.
    """
    # Local import so this cell does not depend on the notebook's import cell
    # being edited.
    import functools

    DEFAULT_TIMEOUT = 5
    RETRYABLE_CODES = (429, 403)

    @rate_limited(10)
    def attempt(*args, **kwargs):
        # Single rate-limited try; retries below reuse the same limiter.
        return function(*args, **kwargs)

    @functools.wraps(function)
    def retried_function(*args, **kwargs):
        # Back-off state is per call. Previously it was reset on every
        # attempt, so the delay never actually increased between retries.
        timeout = DEFAULT_TIMEOUT
        # Loop instead of recursing so a long run of failures cannot hit the
        # interpreter's recursion limit.
        while True:
            try:
                return attempt(*args, **kwargs)
            except HTTPError as e:
                if e.code not in RETRYABLE_CODES:
                    raise
                timeout += 5
                print(f"Encountered {e.code}. Gonna sleep for {timeout} and retry")
                sleep(timeout)

    return retried_function


# WikiData SPARQL Querying




In [None]:
# Look up one predicate on WikiData; wrapped with the retry decorator
@retry_request
def query_predicate_description(predicate_id: str) -> dict:
    """Fetch the English label and description of a WikiData entity.

    Args:
        predicate_id: WikiData identifier (e.g. ``'P31'``) interpolated into
            the SPARQL query after the ``wd:`` prefix.

    Returns:
        A dict with keys ``'predicate_id'``, ``'label'`` and
        ``'description'``. ``'label'`` and ``'description'`` are ``None``
        when the query returns no bindings.

    Raises:
        HTTPError: If the endpoint answers with an HTTP error status. This is
            the exception type the retry decorator in this notebook handles.
    """
    # SPARQL query to retrieve description of the predicate
    sparql_query = f"""
    SELECT ?item ?itemLabel ?itemDescription
    WHERE {{
      wd:{predicate_id} rdfs:label ?itemLabel.
      wd:{predicate_id} schema:description ?itemDescription.
      FILTER(LANG(?itemLabel) = "en")
      FILTER(LANG(?itemDescription) = "en")
    }}
    """

    # NOTE(review): this imitates a browser User-Agent; Wikimedia's policy
    # reportedly asks for a descriptive client identifier -- confirm and
    # consider replacing.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json'
    }

    # Send the SPARQL query to WikiData. An explicit timeout keeps a stalled
    # connection from hanging the notebook forever.
    response = requests.get(
        sparql_endpoint,
        params={'query': sparql_query, 'format': 'json'},
        headers=headers,
        timeout=60,
    )

    # requests does not raise on error statuses by itself. Surface them as
    # urllib's HTTPError so the retry decorator can react to 429/403 instead
    # of failing later while parsing a non-JSON error body.
    if not response.ok:
        raise HTTPError(
            response.url,
            response.status_code,
            response.reason,
            response.headers,
            None,
        )

    # Parse the JSON response and pull out the first binding, if any
    data = response.json()
    bindings = data.get('results', {}).get('bindings', [])
    if not bindings:
        return {'predicate_id': predicate_id, 'label': None, 'description': None}

    result = bindings[0]
    return {
        'predicate_id': predicate_id,
        'label': result['itemLabel']['value'],
        'description': result['itemDescription']['value'],
    }


In [None]:
# Read the list of predicate IDs from the txt file (one ID per line)
file_path = '/content/predicates_filtered.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace and drop blank lines: an empty ID would
    # otherwise be sent to the endpoint as the invalid term `wd:`.
    predicate_ids = [line.strip() for line in file if line.strip()]


In [None]:
# Chunk the predicate IDs into consecutive groups of `batch_size`
# (the final group may be shorter); tune `batch_size` as required.
batch_size = 10
predicate_batches = [
    predicate_ids[start:start + batch_size]
    for start in range(0, len(predicate_ids), batch_size)
]


In [None]:
# Show the loaded predicate IDs as a quick sanity check before querying
print(predicate_ids)

['P1056', 'P1059', 'P10680', 'P10871', 'P1107', 'P112', 'P1128', 'P127', 'P1278', 'P1297', 'P1320', 'P138', 'P1448', 'P1451', 'P1454', 'P154', 'P155', 'P156', 'P159', 'P1616', 'P1619', 'P169', 'P17', 'P1716', 'P1789', 'P1796', 'P18', 'P1813', 'P1830', 'P199', 'P2002', 'P2003', 'P2013', 'P2088', 'P213', 'P2137', 'P2138', 'P2139', 'P214', 'P2226', 'P227', 'P2295', 'P2333', 'P2388', 'P2391', 'P2397', 'P2403', 'P2427', 'P244', 'P249', 'P2541', 'P2619', 'P2622', 'P2627', 'P2628', 'P2657', 'P276', 'P2771', 'P279', 'P281', 'P2828', 'P31', 'P3125', 'P3153', 'P3193', 'P3215', 'P3220', 'P3224', 'P3225', 'P3242', 'P3320', 'P3347', 'P3362', 'P3376', 'P3377', 'P3393', 'P3417', 'P349', 'P3500', 'P355', 'P3608', 'P373', 'P3744', 'P3797', 'P38', 'P3836', 'P407', 'P414', 'P4264', 'P4293', 'P4496', 'P452', 'P463', 'P488', 'P495', 'P5052', 'P5256', 'P528', 'P571', 'P576', 'P5798', 'P580', 'P585', 'P6204', 'P625', 'P6366', 'P6375', 'P642', 'P646', 'P669', 'P670', 'P6795', 'P740', 'P749', 'P856', 'P8563', 

In [None]:
# Query WikiData once per predicate ID and collect the result dicts.
# A plain loop (rather than a comprehension) keeps the partial results in
# `all_predicate_results` if the cell is interrupted midway.
all_predicate_results = []
for predicate in predicate_ids:
    predicate_info = query_predicate_description(predicate)
    all_predicate_results.append(predicate_info)



In [None]:
# Dump the raw result dicts, then tabulate them (one row per predicate)
print(all_predicate_results)
wikidata_df = pd.DataFrame(data=all_predicate_results)
# Trailing bare expression: the notebook renders the DataFrame as cell output
wikidata_df

[{'predicate_id': 'P1056', 'label': 'product or material produced or service provided', 'description': 'material or product produced by an organization, industry, facility, or process'}, {'predicate_id': 'P1059', 'label': 'CVR number', 'description': "unique identifier for a business in Denmark's Central Business Register (CVR), the official database of Danish businesses."}, {'predicate_id': 'P10680', 'label': 'franchisor', 'description': 'one who licenses some or all of its know-how, procedures, intellectual property, use of its business model, brand, and rights to sell its branded products and services to a franchisee'}, {'predicate_id': 'P10871', 'label': 'Delaware Division of Corporations file number', 'description': 'number associated with entities registered with the Delaware Department of State Division of Corporations'}, {'predicate_id': 'P1107', 'label': 'proportion', 'description': 'to be used as a qualifier, value must be between 0 and 1'}, {'predicate_id': 'P112', 'label': 

Unnamed: 0,predicate_id,label,description
0,P1056,product or material produced or service provided,material or product produced by an organizatio...
1,P1059,CVR number,unique identifier for a business in Denmark's ...
2,P10680,franchisor,"one who licenses some or all of its know-how, ..."
3,P10871,Delaware Division of Corporations file number,number associated with entities registered wit...
4,P1107,proportion,"to be used as a qualifier, value must be betwe..."
...,...,...,...
108,P910,topic's main category,main Wikimedia category
109,P9239,affiliated worker organisation,organization representing workers in a specifi...
110,P946,ISIN,identifier for a security
111,P972,catalog,"catalog for the item, or, as a qualifier of P5..."


In [None]:
from google.colab import drive, files

# Mount Google Drive so the results can be written to it
drive.mount('/content/gdrive')

# Write the DataFrame to Drive as Parquet, then trigger a browser download
parquet_file_path = "/content/gdrive/My Drive/wikidata_data_predicates_filtered.parquet"
wikidata_df.to_parquet(parquet_file_path)
files.download(parquet_file_path)



Mounted at /content/gdrive


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>