In [None]:
# Install runtime dependencies with the %pip magic so they land in the
# environment of the running kernel (a `!pip` subshell may target another one).
%pip install pyarrow requests

# Imports

In [None]:
import functools
import re
import time
from time import sleep
from urllib.error import HTTPError

import pandas as pd
import pyarrow.parquet as pq
import requests


# Loading Data

In [None]:
# Path of the uploaded parquet file (Colab filesystem) that holds the `qid` column.
data_file = "/content/business_entities (1).parquet"
# Read the file with pyarrow, then convert the Arrow table to a pandas DataFrame.
table = pq.read_table(data_file)
df = table.to_pandas()
# Bare expression so the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,qid
0,Q7964731
1,Q881127
2,Q7552448
3,Q7569295
4,Q107519888
...,...
312671,Q4393194
312672,Q54912544
312673,Q11319953
312674,Q642622




# Rate-Limiting Decorators



In [None]:
def rate_limited(max_per_minute):
    """Build a decorator that spaces out calls to the wrapped function.

    Args:
        max_per_minute: Maximum number of calls allowed per minute. The
            minimum gap enforced between two calls is
            ``60 / max_per_minute`` seconds.

    Returns:
        A decorator. The decorated function sleeps as long as needed so that
        it does not run sooner than the minimum gap after the previous call
        finished, then returns whatever the wrapped function returns.

    Raises:
        ZeroDivisionError: If ``max_per_minute`` is 0.
    """
    min_interval = 60.0 / float(max_per_minute)

    def decorate(func):
        # One-element list so the closure can rebind the value.
        # None means "never called yet", so the first call never waits.
        last_time_called = [None]

        @functools.wraps(func)  # keep the wrapped function's name/docstring
        def rate_limited_function(*args, **kargs):
            if last_time_called[0] is not None:
                # Monotonic clock: unaffected by wall-clock adjustments.
                elapsed = time.monotonic() - last_time_called[0]
                left_to_wait = min_interval - elapsed
                if left_to_wait > 0:
                    time.sleep(left_to_wait)
            try:
                return func(*args, **kargs)
            finally:
                # Record the call even when it raised: a failed call still
                # counts against the rate limit, so the next one must wait.
                last_time_called[0] = time.monotonic()

        return rate_limited_function

    return decorate

In [None]:
# Retry decorator
def retry_request(function):
    """Wrap ``function`` so it is rate limited and retried on HTTP 429/403.

    Every attempt (first try and retries alike) goes through a 20-calls-per-
    minute rate limiter. When an attempt raises an HTTP error with status 429
    or 403, the wrapper sleeps and tries again; the sleep starts at 10 seconds
    and grows by 5 seconds per consecutive failure, capped at 60 seconds.
    Any other HTTP error, and any non-HTTP exception, propagates unchanged.

    Both ``urllib.error.HTTPError`` (status in ``.code``) and
    ``requests.exceptions.HTTPError`` (status in ``.response.status_code``)
    are recognised, since the wrapped function may use either client.

    Args:
        function: The callable performing the HTTP request.

    Returns:
        A callable with the same signature and return value as ``function``.
    """
    DEFAULT_TIMEOUT = 5
    BACKOFF_STEP = 5
    MAX_TIMEOUT = 60
    RETRYABLE_STATUSES = (429, 403)

    @rate_limited(20)
    def attempt(*args, **kwargs):
        # Single rate-limited try; retries re-enter here so they are
        # throttled exactly like first attempts.
        return function(*args, **kwargs)

    @functools.wraps(function)
    def retried_function(*args, **kwargs):
        # Back-off state is local to this call, so one slow request does not
        # inflate the waits of later, unrelated requests.
        timeout = DEFAULT_TIMEOUT
        # Loop instead of recursing so a long outage cannot overflow the stack.
        # NOTE(review): retries are unbounded, as before; a permanent 403
        # will keep this loop spinning until interrupted.
        while True:
            try:
                return attempt(*args, **kwargs)
            except (HTTPError, requests.exceptions.HTTPError) as e:
                # urllib exposes the status as `.code`; requests exposes it
                # on the attached response object.
                status = getattr(e, "code", None)
                if status is None:
                    status = getattr(getattr(e, "response", None), "status_code", None)
                if status not in RETRYABLE_STATUSES:
                    raise
                timeout = min(timeout + BACKOFF_STEP, MAX_TIMEOUT)
                print(f"Encountered {status}. Gonna sleep for {timeout} and retry")
                sleep(timeout)

    return retried_function


# WikiData SPARQL Querying




In [None]:
# SPARQL endpoint for WikiData
sparql_endpoint = "https://query.wikidata.org/sparql"

@retry_request
def query_wikidata(qid):
    """Fetch the English label and description of one WikiData entity.

    Args:
        qid: WikiData entity identifier such as ``"Q7964731"``. Surrounding
            whitespace is ignored.

    Returns:
        ``{'name': <English label>, 'description': <English description or
        None>}`` taken from the first result row, or ``None`` when the query
        returns no rows (e.g. the entity has no English label).

    Raises:
        ValueError: If ``qid`` is not a letter Q, P or L followed by digits.
        requests.exceptions.HTTPError: If the endpoint answers with a 4xx/5xx
            status, so the real HTTP failure is visible instead of a JSON
            decoding error on the error page.
        requests.exceptions.RequestException: On connection failures or when
            the request exceeds the timeout.
    """
    # Normalise and validate before interpolating into the query text, so a
    # malformed value can neither break nor alter the SPARQL statement.
    qid = str(qid).strip()
    if not re.fullmatch(r"[QPL]\d+", qid):
        raise ValueError(f"Invalid WikiData entity id: {qid!r}")

    # SPARQL query to retrieve name and description in English
    sparql_query = f"""
    SELECT ?item ?itemLabel ?itemDescription
    WHERE {{
      wd:{qid} rdfs:label ?itemLabel.
      OPTIONAL {{ wd:{qid} schema:description ?itemDescription. FILTER(LANG(?itemLabel) = "en") FILTER(LANG(?itemDescription) = "en") }}
      FILTER(LANG(?itemLabel) = "en")
      FILTER(LANG(?itemDescription) = "en" || !BOUND(?itemDescription))
    }}
    """

    # NOTE(review): this User-Agent imitates a desktop browser. Wikimedia's
    # User-Agent policy asks scripts to identify themselves (tool name plus
    # contact details); consider replacing it to avoid being blocked.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json'
    }

    # Sending SPARQL query to WikiData; the timeout (seconds) keeps a stalled
    # connection from hanging the notebook indefinitely.
    response = requests.get(
        sparql_endpoint,
        params={'query': sparql_query, 'format': 'json'},
        headers=headers,
        timeout=60,
    )

    # Turn 4xx/5xx answers (rate limiting, bans, server errors) into an
    # exception before attempting to parse the body as JSON.
    response.raise_for_status()

    # json response
    data = response.json()

    # Extracting relevant information from the first result row, if any
    bindings = data.get('results', {}).get('bindings', [])
    if not bindings:
        return None

    result = bindings[0]
    name = result['itemLabel']['value']
    # The description is OPTIONAL in the query, so it may be missing.
    description = None
    if 'itemDescription' in result:
        description = result['itemDescription']['value']

    return {'name': name, 'description': description}



# Checking that the query works for a single example

In [None]:
# Example usage: look up one known QID as a smoke test.
# The literal carries no padding so exactly the identifier reaches the query.
qid = 'Q7964731'
company_info = query_wikidata(qid)
print(company_info)
if company_info:
    print(f"Name: {company_info['name']}")
    print(f"Description: {company_info['description']}")
else:
    print("No information found for the given QID.")


{'name': 'Walter E. Smithe', 'description': 'Illinois based Furnature Company'}
Name: Walter E. Smithe
Description: Illinois based Furnature Company


# Running the query for the first 200 QIDs

In [None]:
# Take the first 200 QIDs from the loaded table
sample_qids = df['qid'].head(200)

# Result columns, filled in the same order as sample_qids
names = []
descriptions = []

# Placeholder row used when WikiData returns nothing for a QID
_missing = {'name': None, 'description': None}

# Query each sampled QID and collect its label and description
for qid in sample_qids:
    company_info = query_wikidata(qid)
    row = company_info if company_info else _missing
    names.append(row['name'])
    descriptions.append(row['description'])

# Assemble the lookups into one DataFrame alongside their QIDs
wikidata_df = pd.DataFrame(
    {
        'qid': sample_qids,
        'name': names,
        'description': descriptions,
    }
)

# Show the DataFrame
print(wikidata_df)




            qid                                   name  \
0      Q7964731                       Walter E. Smithe   
1       Q881127                                Harting   
2      Q7552448       Society of Costa Rica Collectors   
3      Q7569295  Southeast Asia Basketball Association   
4    Q107519888                 Adobe Gastroenterology   
..          ...                                    ...   
195   Q18163724                   Laxey Towing Company   
196   Q54958332                              ASK Group   
197   Q94527372                   Carl Schneider & Co.   
198    Q1916070                                 Medela   
199   Q19894252  Progressive All-Student Unionist Camp   

                               description  
0         Illinois based Furnature Company  
1                                  company  
2                             organization  
3                     subzone of FIBA Asia  
4    organization in Tucson, United States  
..                               

In [None]:
# Mount Google Drive at /content/gdrive so later cells can write to it (Colab-only).
from google.colab import drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [None]:
# Write the collected WikiData results (wikidata_df) to a parquet file under the mounted Drive path.
parquet_file_path = "/content/gdrive/My Drive/wikidata_data_qid_200_only_eng.parquet"
wikidata_df.to_parquet(parquet_file_path)


In [None]:
# Hand the saved parquet file to Colab's download helper (Colab-only).
from google.colab import files
files.download(parquet_file_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>