In [9]:
import pandas as pd
import requests
from time import sleep
from tqdm import tqdm

def search_wikipedia_api(display_name):
    """
    Search Wikipedia API for an artist using their display name
    """
    search_url = "https://en.wikipedia.org/w/api.php"
    
    # Construct search query - try with and without "artist" qualifier
    search_terms = [
        f"{display_name} artist",
        f"{display_name} painter",
        f"{display_name}"
    ]
    
    headers = {
        'User-Agent': 'ArtistResearchBot/1.0 (your@email.com)'
    }
    
    for search_term in search_terms:
        try:
            params = {
                "action": "query",
                "format": "json",
                "list": "search",
                "srsearch": search_term,
                "srlimit": 1
            }
            
            response = requests.get(search_url, params=params, headers=headers)
            response.raise_for_status()
            data = response.json()
            
            if data["query"]["search"]:
                page_id = data["query"]["search"][0]["pageid"]
                
                # Get page content
                params = {
                    "action": "query",
                    "format": "json",
                    "prop": "extracts|categories",
                    "exintro": True,
                    "explaintext": True,
                    "pageids": page_id
                }
                
                content_response = requests.get(search_url, params=params, headers=headers)
                content_response.raise_for_status()
                content_data = content_response.json()
                
                page_content = content_data["query"]["pages"][str(page_id)]
                
                # Check if the page is about an artist by looking at categories
                categories = page_content.get("categories", [])
                is_artist = any("artist" in str(cat).lower() or 
                              "painter" in str(cat).lower() or 
                              "sculptor" in str(cat).lower() 
                              for cat in categories)
                
                if is_artist:
                    return {
                        "extract": page_content["extract"],
                        "page_id": page_id,
                        "confidence": "high" if "artist" in search_term else "medium"
                    }
                
        except Exception as e:
            print(f"Error searching for {display_name}: {str(e)}")
            continue
            
    return {"extract": "", "page_id": None, "confidence": "none"}

def process_artists_file(input_file, output_file):
    """
    Process CSV file containing artist information
    Expected column: display_name (in format "Firstname Lastname")
    """
    df = pd.read_csv(input_file)
    
    # Create empty lists to store results
    extracts = []
    page_ids = []
    confidence_levels = []
    
    # Process each artist with a progress bar
    for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing artists"):
        result = search_wikipedia_api(row['DisplayName'])
        extracts.append(result['extract'][:500] if result['extract'] else '')  # Limit to first 500 chars
        page_ids.append(result['page_id'])
        confidence_levels.append(result['confidence'])
        
        # Be nice to Wikipedia's servers
        # sleep(1)
    
    # Add new columns to the dataframe
    df['wikipedia_extract'] = extracts
    df['wikipedia_page_id'] = page_ids
    df['match_confidence'] = confidence_levels
    
    # Save the results
    df.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")
    
    # Print some statistics
    total = len(df)
    found = sum(1 for x in page_ids if x is not None)
    print(f"\nStats:")
    print(f"Total artists processed: {total}")
    print(f"Artists found: {found} ({(found/total)*100:.1f}%)")
    print(f"Artists not found: {total - found} ({((total-found)/total)*100:.1f}%)")

In [10]:
if __name__ == "__main__":
    input_file = "female_artists_with_work_counts.csv"
    output_file = "female_artists_with_work_counts_wiki.csv"
    process_artists_file(input_file, output_file)

Processing artists: 100%|██████████| 2435/2435 [33:23<00:00,  1.22it/s] 

Results saved to female_artists_with_work_counts_wiki.csv

Stats:
Total artists processed: 2435
Artists found: 1617 (66.4%)
Artists not found: 818 (33.6%)





'/Users/susiesyli/Desktop/poetic-data-final/data'