In [1]:
import openpyxl
import requests
import os
import time

def get_wikidata_info(wikidata_id, timeout=30):
    """
    Fetch description, aliases, VIAF codes and coordinates for a Wikidata item.

    Args:
        wikidata_id: Item identifier such as ``'Q64'`` (a ``'Q'`` followed by digits).
        timeout: Seconds to wait for the HTTP request before giving up. Without
            a timeout a single stalled connection would block the whole run.

    Returns:
        A dict with the keys ``DESCRIPTION_WIKIDATA``, ``ALSO_KNOWN_AS``,
        ``VIAF_CODE``, ``WIKIDATA_ID`` and ``COORDINATES`` (all strings; empty
        when the item has no such data), or ``None`` if the ID is malformed,
        the item cannot be found, or the request/processing fails. Failures
        are reported with ``print`` rather than raised.
    """
    # Basic validation of the Wikidata ID format
    if not (isinstance(wikidata_id, str) and wikidata_id.startswith('Q') and wikidata_id[1:].isdigit()):
        print(f"  - Invalid Wikidata ID format: '{wikidata_id}'. Skipping.")
        return None

    url = 'https://www.wikidata.org/w/api.php'
    params = {
        'action': 'wbgetentities',
        'ids': wikidata_id,
        'format': 'json',
        'props': 'descriptions|aliases|claims'
    }

    try:
        response = requests.get(
            url,
            params=params,
            headers={'User-Agent': 'MyCoolTool/1.0'},
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        data = response.json()

        # Check if the entity exists in the response
        entities = data.get('entities', {})
        entity = entities.get(wikidata_id)
        if entity is None:
            # NOTE(review): wbgetentities is documented to resolve redirects by
            # default, returning the entity keyed by the *target* ID with a
            # 'redirects' {'from', 'to'} marker - confirm against the API
            # reference. Without this fallback, merged items look "missing".
            for candidate in entities.values():
                if isinstance(candidate, dict) and candidate.get('redirects', {}).get('from') == wikidata_id:
                    entity = candidate
                    break
        if not entity or 'missing' in entity:
            print(f"  - Could not find data for Wikidata ID: {wikidata_id}")
            return None

        info = {
            'DESCRIPTION_WIKIDATA': '',
            'ALSO_KNOWN_AS': '',
            'VIAF_CODE': '',
            'WIKIDATA_ID': wikidata_id,
            'COORDINATES': ''
        }

        # English description, if any
        info['DESCRIPTION_WIKIDATA'] = entity.get('descriptions', {}).get('en', {}).get('value', '')

        # English "also known as" labels, joined with semicolons
        aliases_en = entity.get('aliases', {}).get('en', [])
        if aliases_en:
            info['ALSO_KNOWN_AS'] = '; '.join(alias.get('value', '') for alias in aliases_en)

        claims = entity.get('claims', {})

        # All VIAF codes (P214), joined with semicolons. Claims without a
        # string value are skipped so that one odd claim cannot make the join
        # fail and discard the rest of the row.
        viaf_values = (
            claim.get('mainsnak', {}).get('datavalue', {}).get('value', '')
            for claim in claims.get('P214', [])
        )
        info['VIAF_CODE'] = '; '.join(v for v in viaf_values if v and isinstance(v, str))

        # Coordinates (P625): only the first claim is used, formatted "lat, lon"
        coords_claim = claims.get('P625', [])
        if coords_claim:
            coords_value = coords_claim[0].get('mainsnak', {}).get('datavalue', {}).get('value', {})
            if isinstance(coords_value, dict):
                lat = coords_value.get('latitude')
                lon = coords_value.get('longitude')
                if lat is not None and lon is not None:
                    info['COORDINATES'] = f"{lat}, {lon}"

        return info

    except requests.exceptions.RequestException as e:
        print(f"  - An error occurred while fetching data for {wikidata_id}: {e}")
        return None
    except Exception as e:
        # Deliberate best-effort: one malformed response must not abort the batch.
        print(f"  - An unexpected error occurred while processing {wikidata_id}: {e}")
        return None

def _extract_wikidata_id(link):
    """Return the last path segment of a Wikidata link, ignoring whitespace,
    a trailing slash, and any ``?query`` or ``#fragment`` suffix."""
    cleaned = link.strip()
    for separator in ('#', '?'):
        cleaned = cleaned.split(separator, 1)[0]
    return cleaned.rstrip('/').rsplit('/', 1)[-1].strip()


def process_excel_file(file_path):
    """
    Enrich the target worksheets of an XLSX file with data from Wikidata.

    For each of the sheets PLACE, PERSON, ABSTRACT_CHARACTER, EVENT and
    INSTITUTION that exists and has a ``WIKIDATA_LINK_<SHEET>`` column, every
    data row's link is resolved through ``get_wikidata_info`` and the result
    is written to ``<FIELD>_<SHEET>`` columns (existing columns are reused,
    missing ones are appended after the last used column). The workbook is
    then saved back to ``file_path``.

    Args:
        file_path: Path to the workbook to update in place.

    Returns:
        None. Problems opening or saving the file are printed, not raised.
    """
    print(f"IMPORTANT: Please ensure the file '{os.path.basename(file_path)}' is closed to avoid errors.")

    try:
        workbook = openpyxl.load_workbook(file_path)
    except Exception as e:
        print(f"Error opening the Excel file: {e}. Is the file open in another program?")
        return

    #  1) PROCESS ONLY SPECIFIC WORKSHEETS
    target_sheets = ['PLACE', 'PERSON', 'ABSTRACT_CHARACTER', 'EVENT', 'INSTITUTION']

    for sheet_name in target_sheets:
        if sheet_name not in workbook.sheetnames:
            print(f"\nWorksheet '{sheet_name}' not found in this file. Skipping.")
            continue

        worksheet = workbook[sheet_name]
        entity_name = sheet_name
        print(f"\nProcessing worksheet: '{sheet_name}'...")

        wikidata_link_col_name = f"WIKIDATA_LINK_{entity_name}"

        header = [cell.value for cell in worksheet[1]]

        try:
            wikidata_link_col_idx = header.index(wikidata_link_col_name) + 1
        except ValueError:
            print(f"  - Column '{wikidata_link_col_name}' not found. Skipping this sheet.")
            continue

        #  2) CONDITIONALLY DEFINE HEADERS (COORDINATES FOR PLACE/EVENT ONLY)
        base_headers = ['DESCRIPTION_WIKIDATA', 'ALSO_KNOWN_AS', 'VIAF_CODE', 'WIKIDATA_ID']
        if entity_name in ['PLACE', 'EVENT']:
            base_headers.append('COORDINATES')

        #  3) MAP EACH SUFFIXED HEADER TO A COLUMN, CREATING MISSING ONES
        col_map = {}
        last_col = worksheet.max_column
        for base in base_headers:
            col_name = f"{base}_{entity_name}"
            if col_name in header:
                col_map[col_name] = header.index(col_name) + 1
            else:
                last_col += 1
                worksheet.cell(row=1, column=last_col, value=col_name)
                col_map[col_name] = last_col

        print(f"  - Data will be written to columns (Header: Index): {col_map}")

        # Process each data row (row 1 is the header)
        for row_idx in range(2, worksheet.max_row + 1):
            wikidata_url = worksheet.cell(row=row_idx, column=wikidata_link_col_idx).value

            if not (wikidata_url and isinstance(wikidata_url, str)):
                print(f"  - Row {row_idx}: No Wikidata URL found. Skipping.")
                continue

            # Normalise the link first: a trailing space, slash or "#fragment"
            # would otherwise make a valid ID fail validation and be skipped.
            wikidata_id = _extract_wikidata_id(wikidata_url)
            print(f"  - Row {row_idx}: Processing Wikidata ID '{wikidata_id}'...")

            scraped_data = get_wikidata_info(wikidata_id)

            if scraped_data:
                # base_headers already excludes COORDINATES for sheets that do
                # not carry it, and its entries are the keys of scraped_data.
                for base in base_headers:
                    worksheet.cell(
                        row=row_idx,
                        column=col_map[f"{base}_{entity_name}"],
                        value=scraped_data[base],
                    )

            # Be polite to the API and wait a little
            time.sleep(0.1)

    try:
        workbook.save(file_path)
        print(f"\n--- Successfully saved the updated data to '{file_path}' ---")
    except Exception as e:
        print(f"\nCRITICAL ERROR: Could not save the Excel file: {e}")
        print("Please ensure the file is not open elsewhere and you have permission to write to that location.")

if __name__ == '__main__':
    # Entry point: enrich every .xlsx workbook found in the current working
    # directory (in a notebook this is the folder the kernel was started in).
    script_directory = os.getcwd()
    print(f"Searching for .xlsx files in: {script_directory}")

    excel_files = []
    for entry in os.listdir(script_directory):
        if entry.endswith('.xlsx'):
            excel_files.append(entry)

    if excel_files:
        print(f"Found files: {excel_files}")
        for file_name in excel_files:
            # Excel creates '~$name.xlsx' lock files while a workbook is open;
            # they match the extension but are not real workbooks.
            if not file_name.startswith('~$'):
                file_path = os.path.join(script_directory, file_name)
                print(f"\n=========================================")
                print(f"Processing file: {file_name}")
                print(f"=========================================")
                process_excel_file(file_path)
    else:
        print("No .xlsx files found in the script's directory.")

    print("\nAll files processed.")

Searching for .xlsx files in: c:\Users\vojim\Desktop\ERA_IFF_FINAL\phase_1-excel_preparation_and_enrichment
Found files: ['BA-EG.xlsx', 'BPC-AB.xlsx', 'columns_trace.xlsx']

Processing file: BA-EG.xlsx
IMPORTANT: Please ensure the file 'BA-EG.xlsx' is closed to avoid errors.

Processing worksheet: 'PLACE'...
  - Data will be written to columns (Header: Index): {'DESCRIPTION_WIKIDATA_PLACE': 4, 'ALSO_KNOWN_AS_PLACE': 5, 'VIAF_CODE_PLACE': 6, 'WIKIDATA_ID_PLACE': 8, 'COORDINATES_PLACE': 9}
  - Row 2: No Wikidata URL found. Skipping.
  - Row 3: Processing Wikidata ID 'Q89804530'...
  - Row 4: Processing Wikidata ID 'Q64'...
  - Row 5: Processing Wikidata ID 'Q23725'...
  - Row 6: Processing Wikidata ID 'Q2044'...
  - Row 7: Processing Wikidata ID 'Q20050'...
  - Row 8: Processing Wikidata ID 'Q71'...
  - Row 9: Processing Wikidata ID 'Q1055'...
  - Row 10: Processing Wikidata ID 'Q41'...
  - Row 11: Processing Wikidata ID 'Q38'...
  - Row 12: Processing Wikidata ID 'Q2079'...
  - Row 13: 