In [None]:
import requests
import time

import pyarrow.parquet as pq
import pandas as pd
import numpy as np
from time import sleep
from urllib.error import HTTPError


In [None]:
# Input locations: the sampled Wikidata entities and the filtered predicate list.
data_file_qid = "/content/wikidata_data_2000_samples.parquet"
data_file_predicate = "/content/wikidata_data_predicates_filtered.parquet"

# Entities table; later cells read its 'qid', 'name' and 'description' columns.
table_qid = pd.read_parquet(data_file_qid)
print(table_qid)

# Predicates table; later cells read its 'predicate_id', 'label' and 'description' columns.
table_predicate = pd.read_parquet(data_file_predicate)
print(table_predicate)

             qid                                   name  \
0       Q7964731                       Walter E. Smithe   
1        Q881127                                Harting   
2       Q7552448       Society of Costa Rica Collectors   
3       Q7569295  Southeast Asia Basketball Association   
4     Q107519888                 Adobe Gastroenterology   
...          ...                                    ...   
1995  Q104427093                               Scandica   
1996  Q108528158                                   None   
1997   Q12834689                                   None   
1998  Q113144762                 Natural History Museum   
1999   Q27262555                  James Powell and Sons   

                                description  
0          Illinois based Furnature Company  
1                                   company  
2                              organization  
3                      subzone of FIBA Asia  
4     organization in Tucson, United States  
...            

In [None]:
# Report how many values are missing (as detected by Series.isna) in the text
# columns of the entity table.
for column in ('name', 'description'):
    missing_count = table_qid[column].isna().sum()
    print(f"\nNumber of None values in the '{column}' column: {missing_count}")

# Total number of predicate rows, printed before the per-column counts.
print(len(table_predicate))

# Same report for the text columns of the predicate table.
for column in ('label', 'description'):
    missing_count = table_predicate[column].isna().sum()
    print(f"\nNumber of None values in the '{column}' column of predicates: {missing_count}")


Number of None values in the 'name' column: 869

Number of None values in the 'description' column: 869
113

Number of None values in the 'label' column of predicates: 0

Number of None values in the 'description' column of predicates: 0


In [None]:
def format_query(name: str, label: str, name_description: str | None = None, label_description: str | None = None ):

    query = f"Generate search query for '{name}' "
    if name_description is not None and len(name_description)>1:
      query += f"({name_description})"

    query += f" to find '{label}' "
    if label_description:
      query+=f"({label_description}) in Google"
    else:
      query+=f"in Google"
    return query






In [None]:
# Generate 1000 LLM queries
#
# Each query pairs one randomly drawn entity (a row of table_qid) with one
# randomly drawn predicate (a row of table_predicate) and is phrased by
# format_query. Draws are independent, so the same pair may occur more than once.

NUM_QUERIES = 1000
RANDOM_SEED = 42  # fixed so that re-running the cell reproduces the same sample
rng = np.random.default_rng(RANDOM_SEED)

# An entity without a name or a predicate without a label cannot be phrased as
# a query. Such rows are excluded up front instead of being rejected one draw
# at a time, which would loop forever if no usable row existed. notna() covers
# both None and NaN.
named_entities = table_qid[table_qid['name'].notna()]
labelled_predicates = table_predicate[table_predicate['label'].notna()]
if named_entities.empty or labelled_predicates.empty:
    raise ValueError("Need at least one named entity and one labelled predicate to generate queries")


def _none_if_missing(value):
    """Return None for a missing value (None/NaN), otherwise the value unchanged."""
    return None if pd.isna(value) else value


buffer = []
for _ in range(NUM_QUERIES):
    # Generator.integers excludes the upper bound, so passing len(...) makes
    # every row eligible, including the last one. Rows are addressed by
    # position (.iloc) because the drawn numbers are positions, not index labels.
    entity = named_entities.iloc[rng.integers(0, len(named_entities))]
    predicate = labelled_predicates.iloc[rng.integers(0, len(labelled_predicates))]

    random_name = entity['name']
    random_label = predicate['label']
    random_name_description = _none_if_missing(entity['description'])
    random_label_description = _none_if_missing(predicate['description'])

    query = format_query(
        name=random_name,
        label=random_label,
        name_description=random_name_description,
        label_description=random_label_description,
    )
    buffer.append({
        "qid": entity['qid'],
        "predicate_id": predicate['predicate_id'],
        "qid_label": random_name,
        "qid_description": random_name_description,
        "predicate_label": random_label,
        "predicate_description": random_label_description,
        "llm_query": query,
    })


In [None]:
# Turn the collected query records (one dict per query) into a table with one
# column per dict key, then show it as the cell's output.
query_df = pd.DataFrame.from_records(buffer)
query_df

Unnamed: 0,qid,predicate_id,qid_label,qid_description,predicate_label,predicate_description,llm_query
0,Q4742512,P249,Americas Best Value Inn,American hotel chain,ticker symbol,identifier for a publicly traded share of a pa...,Generate search query for 'Americas Best Value...
1,Q75343246,P2627,Liga Drepturile si Datoriile Femeii,Romanian organisation for women's rights,ISO 9362 SWIFT/BIC code,Identifier ISO 9362 SWIFT/BIC code,Generate search query for 'Liga Drepturile si ...
2,Q30273275,P1278,Vallee Foundation,"organization in Boston, United States",Legal Entity Identifier,identifier for a legally distinct entity per I...,Generate search query for 'Vallee Foundation' ...
3,Q18290988,P3153,ISS,Firm,Crossref funder ID,identifier for an organisation that funds rese...,Generate search query for 'ISS' (Firm) to find...
4,Q30258898,P3608,Environmental Technologies (United States),"company in Pacific, United States",EU VAT number,VAT number assigned in the EU,Generate search query for 'Environmental Techn...
...,...,...,...,...,...,...,...
995,Q19873833,P1059,Hambro Magan,Defunct British private equity firm,CVR number,unique identifier for a business in Denmark's ...,Generate search query for 'Hambro Magan' (Defu...
996,Q111470544,P5256,Echo Street Capital,"Based in New York City, Echo Street manages ov...",OpenCorporates corporate grouping,companies grouped together at OpenCorporates,Generate search query for 'Echo Street Capital...
997,Q116892083,P8563,NANO CENTRUM AG,Czech company,Emporis company ID,former identifier for an individual architect ...,Generate search query for 'NANO CENTRUM AG' (C...
998,Q2996639,P6366,Coordination SUD,organization,Microsoft Academic ID,identifier for an object or topic in the Micro...,Generate search query for 'Coordination SUD' (...


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>