#### 1. Importing Libraries

In [2]:
from qwikidata.entity import WikidataItem, WikidataLexeme, WikidataProperty
from qwikidata.linked_data_interface import get_entity_dict_from_api
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import time
import os

#### 2. Get the list of people using SPARQL

In [3]:
# Client bound to the public Wikidata SPARQL endpoint; reused by the query cell below.
WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
sparql = SPARQLWrapper(WIKIDATA_SPARQL_ENDPOINT)

In [None]:
# Query text: every entity whose occupation (P106) is the selected QID,
# together with its label from the Wikibase label service.
people_query = """
    SELECT ?person ?personLabel WHERE {
      ?person wdt:P106 wd:Q43845 .  # Remaining QIDs for each occupation are Occupation: Scientist - Q901, Occupation: Businessperson - Q43845, Occupation: Politican - Q82955, Occupation: Actor - Q33999
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }  # Get English labels
    }
"""

# Ask for JSON so the response converts into plain Python dicts and lists.
sparql.setReturnFormat(JSON)
sparql.setQuery(people_query)

# Run the query against the endpoint and decode the response.
response = sparql.query()
results = response.convert()

In [5]:
# Number of rows (one per matched person) returned by the query.
result_count = len(results["results"]["bindings"])
print(result_count)

83193


In [6]:
# Flatten each SPARQL binding into a small record holding the entity URI and its label.
person_data = [
    {
        "businessperson": binding["person"]["value"],
        "businesspersonLabel": binding["personLabel"]["value"],
    }
    for binding in results["results"]["bindings"]
]

# Peek at the first few records as a sanity check.
print(person_data[:10])

[{'businessperson': 'http://www.wikidata.org/entity/Q181', 'businesspersonLabel': 'Jimmy Wales'}, {'businessperson': 'http://www.wikidata.org/entity/Q185', 'businesspersonLabel': 'Larry Sanger'}, {'businessperson': 'http://www.wikidata.org/entity/Q207', 'businesspersonLabel': 'George W. Bush'}, {'businessperson': 'http://www.wikidata.org/entity/Q306', 'businesspersonLabel': 'Sebastián Piñera'}, {'businessperson': 'http://www.wikidata.org/entity/Q360', 'businesspersonLabel': 'Julian Assange'}, {'businessperson': 'http://www.wikidata.org/entity/Q400', 'businesspersonLabel': 'Jenna Jameson'}, {'businessperson': 'http://www.wikidata.org/entity/Q607', 'businesspersonLabel': 'Michael Bloomberg'}, {'businessperson': 'http://www.wikidata.org/entity/Q1317', 'businesspersonLabel': 'Osama bin Laden'}, {'businessperson': 'http://www.wikidata.org/entity/Q1318', 'businesspersonLabel': 'Aníbal Zañartu'}, {'businessperson': 'http://www.wikidata.org/entity/Q1379', 'businesspersonLabel': 'Gary Gygax'}]


In [None]:
# Occupation slug for this run. It must match the occupation queried above
# (wd:Q43845, businessperson) and the "businessperson" key used when building
# person_data, because later cells index each record with `occupation` and
# build their file paths from it.
occupation = "businessperson"

# All artefacts for this occupation live under Data/Wikidata_JSON_<occupation>.
output_dir = 'Data/Wikidata_JSON_'+occupation
os.makedirs(output_dir, exist_ok=True)  # no error if the directory already exists

output_file = os.path.join(output_dir, occupation+'s_list.json')

# Save the results as a JSON file
with open(output_file, 'w') as f:
    json.dump(person_data, f, indent=4)

print(f"Data saved to {output_file}")

Data saved to Data/Wikidata_JSON_Businessperson/businesspersons_list.json


#### 2. Loading the List of People for an Occupation from the SPARQL Output

In [None]:
# Reload the saved list from the same occupation-derived location it was
# written to (Data/Wikidata_JSON_<occupation>/<occupation>s_list.json), so
# changing `occupation` does not leave this cell pointing at another
# occupation's directory.
list_path = os.path.join('Data/Wikidata_JSON_'+occupation, occupation+'s_list.json')
with open(list_path) as f:
    person_list = json.load(f)

# Reduce each entity URI (".../entity/Q181") to its last path segment ("Q181").
# Re-running the cell is harmless: a value without '/' is left unchanged.
for person in person_list:
    person[occupation] = person[occupation].split('/')[-1]

#### 3. Removing People Without English Labels

In [None]:
# Tunables for this (slow, network-bound) cell.
MAX_PERSONS = 5000   # only the first MAX_PERSONS entries of person_list are fetched
BATCH_SIZE = 200     # number of successfully processed persons per JSON partition

# Partitions are written next to the saved list for this occupation.
dump_dir = 'Data/Wikidata_JSON_'+occupation
os.makedirs(dump_dir, exist_ok=True)


def _save_partition(partition_num, en_batch, all_batch):
    """Write one checkpoint: the English-labelled batch and the full batch.

    partition_num -- string suffix used in both file names
    en_batch      -- entity dicts that have an 'en' label
    all_batch     -- every entity dict fetched for this partition
    """
    with open(f'{dump_dir}/en_{occupation}_dump_part{partition_num}.json', 'w') as fout:
        json.dump(en_batch, fout, indent=4)
    with open(f'{dump_dir}/{occupation}_dump_part{partition_num}.json', 'w') as fout:
        json.dump(all_batch, fout, indent=4)


count_persons = 0

t0 = time.time()

en_labelled_persons = []
all_persons = []

for person in person_list[:MAX_PERSONS]:
    try:
        # Get details from Wikidata
        person_details = get_entity_dict_from_api(person[occupation])
        all_persons.append(person_details)

        # Check for 'en' (English) label
        if 'en' in person_details['labels']:
            en_labelled_persons.append(person_details)

        count_persons += 1

        # Every BATCH_SIZE persons, save to JSON and start a fresh batch
        if count_persons % BATCH_SIZE == 0:
            partition_num = str(count_persons // BATCH_SIZE)
            _save_partition(partition_num, en_labelled_persons, all_persons)

            # Reset lists
            en_labelled_persons = []
            all_persons = []

            # Print progress
            print(f"Checkpoint {partition_num} reached, JSON dumps saved | Time Elapsed: {time.time() - t0:.2f} seconds")

    except Exception as e:
        # Best effort: report the failing entity and carry on with the next one.
        print(f"Error processing {occupation} {person[occupation]}: {e}")

# Flush whatever is left after the last full checkpoint. Without this, the
# entities fetched since the previous multiple of BATCH_SIZE would be dropped
# (this happens whenever failures keep the success count off a multiple).
if all_persons:
    partition_num = str(count_persons // BATCH_SIZE + 1)
    _save_partition(partition_num, en_labelled_persons, all_persons)
    print(f"Checkpoint {partition_num} reached, JSON dumps saved | Time Elapsed: {time.time() - t0:.2f} seconds")

# Final timing
t1 = time.time()
total = t1 - t0
print(f"Total time elapsed: {total:.2f} seconds")



Checkpoint 1 reached, JSON dumps saved | Time Elapsed: 47.34 seconds
Checkpoint 2 reached, JSON dumps saved | Time Elapsed: 86.72 seconds
Checkpoint 3 reached, JSON dumps saved | Time Elapsed: 129.98 seconds
Checkpoint 4 reached, JSON dumps saved | Time Elapsed: 171.63 seconds
Checkpoint 5 reached, JSON dumps saved | Time Elapsed: 211.51 seconds
Checkpoint 6 reached, JSON dumps saved | Time Elapsed: 250.72 seconds
Checkpoint 7 reached, JSON dumps saved | Time Elapsed: 291.11 seconds
Checkpoint 8 reached, JSON dumps saved | Time Elapsed: 329.89 seconds
Checkpoint 9 reached, JSON dumps saved | Time Elapsed: 369.02 seconds
Checkpoint 10 reached, JSON dumps saved | Time Elapsed: 407.61 seconds
Checkpoint 11 reached, JSON dumps saved | Time Elapsed: 447.21 seconds
Checkpoint 12 reached, JSON dumps saved | Time Elapsed: 485.14 seconds
Checkpoint 13 reached, JSON dumps saved | Time Elapsed: 522.73 seconds
Checkpoint 14 reached, JSON dumps saved | Time Elapsed: 560.66 seconds
Checkpoint 15 rea