In [1]:
!pip install SPARQLWrapper



In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

In [5]:
# Prefix stripped from class IRIs so the dictionary keys are short class names.
DBPEDIA_ONTOLOGY_PREFIX = "http://dbpedia.org/ontology/"

# Initialize SPARQL endpoint
dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql")

# Count the distinct instances of every class declared rdfs:subClassOf
# dbo:Person, smallest class first.
# NOTE(review): the OPTIONAL block only binds ?wikiPage/?article, which are
# neither selected nor counted, so it cannot change ?count; consider removing
# it to lighten the query.
dbpedia_sparql.setQuery("""
   SELECT ?subclass (COUNT(DISTINCT ?instance) AS ?count)
   WHERE {
     ?instance a ?subclass.
     ?subclass rdfs:subClassOf dbo:Person.
     FILTER (?subclass != dbo:Person)
     OPTIONAL {
        ?instance schema:about ?wikiPage .
        ?wikiPage foaf:isPrimaryTopicOf ?article .
        FILTER(STRSTARTS(STR(?article), "https://en.wikipedia.org"))
        FILTER(LANG(?article) = "en")
     }
   }
   GROUP BY ?subclass
   ORDER BY ASC(?count)
""")

# Set return format to JSON
dbpedia_sparql.setReturnFormat(JSON)

# Execute the query
results = dbpedia_sparql.query().convert()

# Map short class name -> instance count (as int). The name `subclasses` is
# bound exactly once, as a dict, so its type never changes between cells.
subclasses = {
    binding["subclass"]["value"].replace(DBPEDIA_ONTOLOGY_PREFIX, ""): int(binding["count"]["value"])
    for binding in results["results"]["bindings"]
}

In [6]:
# Display the per-subclass instance counts collected by the query above.
subclasses

{'Judge': 124,
 'Monarch': 245,
 'Spy': 261,
 'AmericanLeader': 264,
 'Pilot': 286,
 'HorseTrainer': 355,
 'PoliceOfficer': 413,
 'Presenter': 670,
 'BusinessPerson': 691,
 'Astronaut': 738,
 'Engineer': 885,
 'Chef': 897,
 'Youtuber': 900,
 'PlayboyPlaymate': 979,
 'Economist': 1720,
 'Journalist': 1858,
 'Model': 2045,
 'BeautyQueen': 2987,
 'Philosopher': 2987,
 'Religious': 4832,
 'Architect': 5574,
 'Criminal': 6081,
 'Noble': 7949,
 'Academic': 10663,
 'Coach': 10954,
 'Royalty': 22720,
 'Cleric': 25434,
 'SportsManager': 29156,
 'MilitaryPerson': 50255,
 'Writer': 51821,
 'Scientist': 52119,
 'OfficeHolder': 66597,
 'Artist': 107644,
 'Politician': 200848,
 'OrganisationMember': 456914,
 'Athlete': 578933}

In [7]:
# Exclude the OrganisationMember class from the subclasses to be processed.
# A default is supplied so that re-running this cell does not raise KeyError
# once the key has already been removed.
subclasses.pop("OrganisationMember", None)

456914

In [8]:
# Display the counts again to confirm that OrganisationMember was removed.
subclasses

{'Judge': 124,
 'Monarch': 245,
 'Spy': 261,
 'AmericanLeader': 264,
 'Pilot': 286,
 'HorseTrainer': 355,
 'PoliceOfficer': 413,
 'Presenter': 670,
 'BusinessPerson': 691,
 'Astronaut': 738,
 'Engineer': 885,
 'Chef': 897,
 'Youtuber': 900,
 'PlayboyPlaymate': 979,
 'Economist': 1720,
 'Journalist': 1858,
 'Model': 2045,
 'BeautyQueen': 2987,
 'Philosopher': 2987,
 'Religious': 4832,
 'Architect': 5574,
 'Criminal': 6081,
 'Noble': 7949,
 'Academic': 10663,
 'Coach': 10954,
 'Royalty': 22720,
 'Cleric': 25434,
 'SportsManager': 29156,
 'MilitaryPerson': 50255,
 'Writer': 51821,
 'Scientist': 52119,
 'OfficeHolder': 66597,
 'Artist': 107644,
 'Politician': 200848,
 'Athlete': 578933}

In [5]:
import requests
from datetime import datetime

# Function to get the year of the first publication date of an article
def publication_year_MediaWiki(article_title, timeout=30):
    """Return the year of the earliest revision of an English Wikipedia article.

    Queries the MediaWiki API for the oldest revision (``rvdir=newer``,
    ``rvlimit=1``) of ``article_title`` and extracts the year from its
    timestamp.

    Parameters
    ----------
    article_title : str
        Page title. Percent-encoded titles are decoded first, so both raw and
        already-encoded titles are sent to the API correctly.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 30).

    Returns
    -------
    int or None
        The year of the first revision, or ``None`` when the title is empty,
        the request fails, the page has no revisions, or the response does not
        have the expected structure.
    """
    # Local import keeps the function usable even if only this cell is run.
    from urllib.parse import unquote

    if not article_title:
        return None

    # Let `requests` build the query string: characters such as '&', '+', '#'
    # and '?' in a title would otherwise corrupt a hand-assembled URL.
    # Decoding first avoids double-encoding titles that are already escaped.
    params = {
        "action": "query",
        "prop": "revisions",
        "rvlimit": 1,
        "rvprop": "timestamp",
        "rvdir": "newer",
        "titles": unquote(article_title),
        "format": "json",
    }

    try:
        response = requests.get(
            "https://en.wikipedia.org/w/api.php", params=params, timeout=timeout
        )
    except requests.RequestException as e:
        # Connection problems and timeouts are reported like any other failure.
        print("Request failed:", e)
        return None

    if response.status_code != 200:  # Check if the request was successful
        print("Request failed with status code:", response.status_code)
        return None

    try:
        pages = response.json()["query"]["pages"]
        first_page = next(iter(pages.values()))
        timestamp = first_page["revisions"][0]["timestamp"]
        # Parse the timestamp string and extract the year
        return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ").year
    except (ValueError, KeyError, IndexError, StopIteration, TypeError, AttributeError):
        # Invalid JSON, missing page/revisions, or an unexpected payload shape.
        return None

In [6]:
import time

def get_gender_from_wikidata(wikidata_id, max_retries=3, retry_delay=2):
    """Look up the English label of the sex-or-gender (P21) value of a Wikidata item.

    Parameters
    ----------
    wikidata_id : str or None
        Wikidata item identifier such as ``"Q42"``. An empty or ``None`` value
        returns ``None`` immediately without issuing a query.
    max_retries : int, optional
        Total number of query attempts before giving up (default 3).
    retry_delay : float, optional
        Seconds to sleep between failed attempts (default 2).

    Returns
    -------
    str or None
        The gender label of the first result row, or ``None`` if the item has
        no P21 statement, no id was given, or every attempt failed.
    """
    # Without an id the query would contain `wd:None`, which can only fail.
    if not wikidata_id:
        return None

    # Initialize SPARQL endpoint
    wikidata_sparql = SPARQLWrapper('https://query.wikidata.org/sparql')
    query = f"""
    SELECT ?genderLabel WHERE {{
      wd:{wikidata_id} wdt:P21 ?gender.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    wikidata_sparql.setQuery(query)
    wikidata_sparql.setReturnFormat(JSON)

    # Bounded retry loop: unlike a recursive retry, the result of a successful
    # later attempt is returned and a permanent failure terminates.
    for attempt in range(1, max_retries + 1):
        try:
            results = wikidata_sparql.query().convert()
            bindings = results["results"]["bindings"]
            if bindings:
                return bindings[0]["genderLabel"]["value"]
            return None
        except Exception as e:
            if attempt == max_retries:
                print(f"Error: {e}. Giving up after {max_retries} attempts.")
                return None
            print(f"Error: {e}. Retrying after {retry_delay} seconds...")
            time.sleep(retry_delay)

    return None


In [7]:
# Default number of attempts for the DBpedia query inside run_code. The name is
# read at call time, so a later cell that rebinds `retries` changes this too.
retries = 2


def _binding_int(binding, key):
    """Return binding[key]["value"] as an int, or None if unbound or not an integer."""
    cell = binding.get(key)
    if cell is None:
        return None
    try:
        return int(cell["value"])
    except (TypeError, ValueError):
        return None


def run_code(subclass, i, offset):
    """Fetch one page of instances of a DBpedia class, enrich them, and save them as CSV.

    For each instance returned by DBpedia the function looks up the gender on
    Wikidata (when a Wikidata id is available) and the year of the first
    Wikipedia revision, then writes all rows to
    ``{subclass}/{subclass}iteration{i}.csv``.

    Parameters
    ----------
    subclass : str
        Short DBpedia ontology class name (interpolated as ``dbo:{subclass}``).
    i : int
        Page index; used only in the output file name.
    offset : int
        OFFSET applied to the ordered result set (pages hold up to 10000 rows).

    Raises
    ------
    Exception
        Re-raises the last DBpedia query error once all attempts are used up.
    """
    # Local import keeps the function usable even if only this cell is run.
    import os

    # Initialize SPARQL endpoint
    dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql")

    # The inner SELECT is ordered so that OFFSET/LIMIT page through a stable
    # sequence of rows.
    dbpedia_sparql.setQuery(f"""
    SELECT ?instance ?wikidataID ?calculatedAge ?birthYear
    WHERE {{
        SELECT DISTINCT ?instance ?wikidataID ?calculatedAge ?birthYear
        WHERE {{
            ?instance rdf:type dbo:{subclass} .
            OPTIONAL {{ ?instance dbo:birthDate ?birthDate }}
            OPTIONAL {{ ?instance dbo:age ?age }}
            OPTIONAL {{ ?instance dbo:deathDate ?deathDate }}
            OPTIONAL {{
                ?instance owl:sameAs ?wikidataID .
                FILTER regex(str(?wikidataID), "wikidata.org/entity/")
            }}
            OPTIONAL {{
                ?instance schema:about ?wikiPage .
                ?wikiPage foaf:isPrimaryTopicOf ?article .
                FILTER(STRSTARTS(STR(?article), "https://en.wikipedia.org"))
                FILTER(LANG(?article) = "en")
            }}

            # Bind year only if birthDate is available and is a valid date
            BIND(IF(BOUND(?birthDate), YEAR(?birthDate), UNDEF) AS ?birthYear)

            # Bind calculatedAge only if birthYear is valid
            BIND(IF(BOUND(?age), ?age,
                    IF(BOUND(?deathDate) ,
                        (YEAR(?deathDate) - IF(BOUND(?birthYear), ?birthYear, YEAR(NOW()))),
                        (YEAR(NOW()) - IF(BOUND(?birthYear), ?birthYear, YEAR(NOW())))
                    )) AS ?calculatedAge)
            }}
            ORDER BY ?instance
        }}
        OFFSET {offset}
        LIMIT 10000
    """)

    # Define the query format
    dbpedia_sparql.setReturnFormat(JSON)
    dbpedia_sparql.setTimeout(60)  # Set timeout

    # Execute the query exactly once per attempt; at least one attempt is made
    # even if `retries` is set to zero or less.
    max_attempts = max(1, retries)
    for attempt in range(1, max_attempts + 1):
        try:
            results = dbpedia_sparql.query().convert()
            break
        except Exception as e:
            print(f"{e}, internal retrying... ({attempt}/{max_attempts})")
            if attempt == max_attempts:
                raise
            time.sleep(2)

    # Build one record per result row.
    data_list = []
    for result in results["results"]["bindings"]:
        wikiDataID = (
            result["wikidataID"]["value"].replace("http://www.wikidata.org/entity/", "")
            if "wikidataID" in result
            else None
        )
        # Skip the Wikidata lookup when there is no id to look up.
        gender = get_gender_from_wikidata(wikiDataID) if wikiDataID else None
        instance = result["instance"]["value"].replace("http://dbpedia.org/resource/", "")
        # ?birthYear and ?calculatedAge are unbound when the BIND expressions
        # cannot be evaluated, so both are treated as optional.
        birth_year = _binding_int(result, "birthYear")
        age = _binding_int(result, "calculatedAge")
        publication_year = publication_year_MediaWiki(instance)
        data_list.append({
            'subclass': subclass,
            'instance': instance,
            'wikiDataID': wikiDataID,
            'gender': gender,
            'age': age,
            'birthYear': birth_year,
            'publication_year': publication_year,
        })

    # Explicit columns give an empty page a header row as well; nullable
    # integer dtypes keep years/ages integral in the CSV despite missing values.
    columns = ['subclass', 'instance', 'wikiDataID', 'gender', 'age', 'birthYear', 'publication_year']
    df = pd.DataFrame(data_list, columns=columns).astype(
        {'age': 'Int64', 'birthYear': 'Int64', 'publication_year': 'Int64'}
    )

    # Make sure the per-subclass output directory exists before writing.
    os.makedirs(subclass, exist_ok=True)
    df.to_csv(f"{subclass}/{subclass}iteration{i}.csv", index=False)

In [8]:
# Crawl every subclass page by page (10000 rows per page), giving each page up
# to `retries` attempts before the error is propagated.
retries = 5
for subclass, instance_count in subclasses.items():
    page_offsets = range(0, instance_count + 1, 10000)
    for i, offset in enumerate(page_offsets):
        print("Running iteration", i, "..... for subclass", subclass)
        for attempts in range(1, retries + 1):
            try:
                run_code(subclass, i, offset)
            except Exception as e:
                print(f"{e}, main retrying... ({attempts}/{retries})")
                time.sleep(2)
                # Out of attempts: surface the last error to the notebook.
                if attempts == retries:
                    raise e
            else:
                # Page written successfully; move on to the next offset.
                break

Running iteration 0 ..... for subclass Judge


KeyboardInterrupt: 

In [None]:
# Number of subclasses currently held in the dictionary.
len(subclasses)

In [None]:
# Sum the instance counts of all subclasses that have fewer than 400000 instances.
# NOTE(review): the original last line read `print(total + )`, a syntax error
# with a missing right-hand operand; confirm whether another term was meant to
# be added to the total.
total = sum(count for count in subclasses.values() if count < 400000)
print(total)