In [26]:
import requests
import pandas as pd
import os
import time

def preprocess_name(name):
    """Normalize a raw name string into capitalized-word form for label lookups.

    Leading/trailing whitespace and surrounding double quotes are removed, runs
    of internal whitespace collapse to single spaces, and every word is
    capitalized. Hyphenated words are capitalized segment by segment, so
    "JEAN-PIERRE" becomes "Jean-Pierre" instead of "Jean-pierre" (which is what
    a plain ``str.capitalize()`` on the whole word would produce).

    Args:
        name: Raw name string, e.g. ' "nicolás GONZÁLEZ CASARES" '.

    Returns:
        The cleaned name as a str; an empty string if nothing remains.
    """
    # Trim outer whitespace first so that the quotes are actually at the edges.
    trimmed = name.strip().strip('"')
    return ' '.join(
        # Capitalize each hyphen-separated segment so compound names keep
        # their inner capital letters.
        '-'.join(segment.capitalize() for segment in word.split('-'))
        for word in trimmed.split()
    )

def generate_name_variations(name):
    """Return the name followed by each progressively shorter prefix of it.

    The name is split on whitespace; the first entry joins all words with
    single spaces, and each following entry drops one more trailing word,
    ending with the first word alone. A name with no words yields an empty
    list.

    Args:
        name: Name string to derive the variations from.

    Returns:
        List of str, longest variation first.
    """
    words = name.split()
    # Count down from the full word count to a single remaining word.
    return [' '.join(words[:count]) for count in range(len(words), 0, -1)]

def get_wikidata_url(name, retry=0):
    """Look up the Wikidata page URL of a living person by name.

    The name is normalized with ``preprocess_name`` and then tried from the
    full name down to its first word alone (see ``generate_name_variations``).
    For each variation the Wikidata SPARQL endpoint is asked for a human whose
    English label or alias equals it, excluding deceased people and a set of
    sports/arts occupations, and preferring politicians.

    Args:
        name: Raw name string. Non-string values are treated as missing.
        retry: Number of retries already used; the retry budget (3) is shared
            by all variations of the name.

    Returns:
        The ``https://www.wikidata.org/wiki/Q...`` URL of the first match, or
        one of the sentinel strings "Not found", "Request failed" or
        "API limit reached, stopped retrying.".
    """
    # Non-string input (e.g. the NaN pandas uses for an empty CSV cell) cannot
    # be normalized; report it as unmatched instead of raising.
    if not isinstance(name, str):
        return "Not found"

    processed_name = preprocess_name(name)
    name_variations = generate_name_variations(processed_name)
    url = 'https://query.wikidata.org/sparql'
    # NOTE(review): a generic browser User-Agent is sent here; Wikimedia's
    # User-Agent policy appears to ask for a descriptive one with contact
    # details - confirm and adjust before running large batches.
    headers = {'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json'}
    max_retries = 3

    for variation in name_variations:
        # Escape backslashes first, then double quotes, so the name cannot
        # terminate or corrupt the SPARQL string literal it is placed in.
        label = variation.replace('\\', '\\\\').replace('"', '\\"')
        # Query for the current variation (not the full name), so that shorter
        # forms are really tried when the longer ones have no match.
        query = f'''
            SELECT ?item WHERE {{
            {{
                ?item rdfs:label "{label}"@en.
            }} UNION {{
                ?item skos:altLabel "{label}"@en.
            }}
            FILTER EXISTS {{ ?item wdt:P31 wd:Q5. }}  # Ensure the item is an instance of human (Q5)
            OPTIONAL {{ ?item wdt:P106 wd:Q82955. BIND(true AS ?isPolitician) }}  # Check if a politician
            FILTER NOT EXISTS {{ ?item wdt:P570 ?deathDate. }}  # Exclude deceased individuals
            FILTER NOT EXISTS {{ ?item wdt:P106 wd:Q937857. }}  # Exclude association football players
            FILTER NOT EXISTS {{ ?item wdt:P106 wd:Q177220. }}  # Exclude singers
            FILTER NOT EXISTS {{ ?item wdt:P106 wd:Q2066131. }}  # Exclude athletes
            FILTER NOT EXISTS {{ ?item wdt:P106 wd:Q214917. }}  # Exclude playwrights
            FILTER NOT EXISTS {{ ?item wdt:P106 wd:Q201788. }}  # Exclude historians
            MINUS {{ ?item wdt:P106/wdt:P279* wd:Q2066131. }}  # Exclude if occupation is a subclass of "sportsperson"
            MINUS {{ ?item wdt:P106/wdt:P279* wd:Q349. }}     # Exclude if occupation is directly associated with "sports"
            BIND(COALESCE(?isPolitician, false) AS ?isPolitician)
            }} ORDER BY DESC(?isPolitician) LIMIT 1 
        '''

        # Retry loop for this variation only: a throttled request is repeated
        # in place instead of restarting from the first variation.
        while True:
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    params={'query': query, 'format': 'json'},
                    timeout=10,
                )
                response.raise_for_status()  # Raises HTTPError on 4xx/5xx
                bindings = response.json()['results']['bindings']
                if bindings:
                    entity_url = bindings[0]['item']['value']
                    # Turn the entity URI into the human-readable page URL.
                    return entity_url.replace(
                        'http://www.wikidata.org/entity/',
                        'https://www.wikidata.org/wiki/',
                    )
                break  # No match for this variation; try the next shorter one.
            except requests.exceptions.HTTPError as e:
                status = getattr(e.response, 'status_code', None)
                throttled_or_server_error = status is not None and (
                    status == 429 or 500 <= status < 600
                )
                if not throttled_or_server_error:
                    # Other HTTP errors (e.g. 400) are not retried; move on to
                    # the next variation.
                    break
                if retry >= max_retries:
                    return "API limit reached, stopped retrying."
                # Linear back-off: wait 10s, 20s, then 30s before retrying.
                time.sleep(10 * (retry + 1))
                retry += 1
            except requests.exceptions.RequestException:
                # Timeouts, connection failures and similar request errors.
                return "Request failed"

    return "Not found"

def process_names_in_batches(input_file, output_file, chunksize=100):
    """Annotate a CSV of names with Wikidata URLs, one batch at a time.

    The input CSV is read in chunks; for each chunk the ``name`` column is
    passed through ``get_wikidata_url`` and the result stored in a new
    ``wikidata_URL`` column. Each annotated chunk is appended to the output
    CSV as soon as it is done, so finished batches are already on disk if a
    later batch fails.

    Note that the output is opened in append mode: running this again on an
    output file that already has content adds the rows a second time.

    Args:
        input_file: Path of the CSV to read; must contain a ``name`` column.
        output_file: Path of the CSV to append the annotated rows to.
        chunksize: Number of input rows per batch (default 100).
    """
    # Only write the header when starting a new (missing or empty) output file.
    write_header = not (
        os.path.isfile(output_file) and os.path.getsize(output_file) > 0
    )

    for chunk in pd.read_csv(input_file, chunksize=chunksize):
        chunk['wikidata_URL'] = chunk['name'].apply(get_wikidata_url)
        chunk.to_csv(output_file, mode='a', index=False, header=write_header)
        # Later batches must never repeat the header row.
        write_header = False


In [27]:
# Smoke test: look up a single mixed-case name. This performs a live request
# against the Wikidata SPARQL endpoint, so the result depends on its current data.
get_wikidata_url("Nicolás GONZÁLEZ CASARES")

'https://www.wikidata.org/wiki/Q20535883'

In [28]:
# Input and output CSV locations, resolved against the current user's home
# directory so the cell is not tied to one machine's absolute path.
input_file_path = os.path.expanduser('~/Downloads/meps.csv')
output_file_path = os.path.expanduser('~/Downloads/meps_annotated.csv')

# Annotate every name in the input file, batch by batch.
# NOTE: results are appended to the output file; delete or rename an existing
# output before re-running this cell to avoid duplicated rows.
process_names_in_batches(input_file_path, output_file_path)

print("Processing completed.")

Processing completed.
