In [1]:
!pip install pandas
!pip install rdflib
!pip install SPARQLWrapper

Collecting rdflib
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-7.0.0
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Installing collected packages: SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0


In [2]:
# import pandas library using alias pd
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON
import numpy as np
import time
from tqdm import tqdm
import pickle

In [3]:
import time
import datetime
import logging
    
logger = logging.getLogger()
    
def setup_file_logger(log_file):
    hdlr = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr) 
    logger.setLevel(logging.INFO)
    
def log(message):
    #outputs to Jupyter console
    print('{} {}'.format(datetime.datetime.now(), message))
    #outputs to file
    logger.info(message)
    
setup_file_logger('out.log')

## 1) Data Preparation

#### 1.1 Get List of subclasses and the number of instance for each of them

In [4]:
# Initialize SPARQL endpoint
dbpedia_sparql = SPARQLWrapper("https://dbpedia.org/sparql")

# Define the SPARQL query
dbpedia_sparql.setQuery("""
   SELECT ?subclass (COUNT(DISTINCT ?instance) AS ?count)
   WHERE {
     ?instance a ?subclass.
     ?subclass rdfs:subClassOf dbo:Person.
     FILTER (?subclass != dbo:Person)
   }
   GROUP BY ?subclass
   ORDER BY ASC(?count)
""")

# Set return format to JSON
dbpedia_sparql.setReturnFormat(JSON)

# Execute the query and process the results
results = dbpedia_sparql.query().convert()

subclasses = {}
for result in results["results"]["bindings"]:
    subclass_uri = result["subclass"]["value"]
    count = result["count"]["value"]
    subclass = subclass_uri.replace("http://dbpedia.org/ontology/", "")
    subclasses[subclass] = int(count)  # Convert count to integer

In [5]:
subclasses

{'Judge': 124,
 'Monarch': 245,
 'Spy': 261,
 'AmericanLeader': 264,
 'Pilot': 286,
 'HorseTrainer': 355,
 'PoliceOfficer': 413,
 'Presenter': 670,
 'BusinessPerson': 691,
 'Astronaut': 738,
 'Engineer': 885,
 'Chef': 897,
 'Youtuber': 900,
 'PlayboyPlaymate': 979,
 'Economist': 1720,
 'Journalist': 1858,
 'Model': 2045,
 'BeautyQueen': 2987,
 'Philosopher': 2987,
 'Religious': 4832,
 'Architect': 5574,
 'Criminal': 6081,
 'Noble': 7949,
 'Academic': 10663,
 'Coach': 10954,
 'Royalty': 22720,
 'Cleric': 25434,
 'SportsManager': 29156,
 'MilitaryPerson': 50255,
 'Writer': 51821,
 'Scientist': 52119,
 'OfficeHolder': 66597,
 'Artist': 107644,
 'Politician': 200848,
 'OrganisationMember': 456914,
 'Athlete': 578933}

In [6]:
subclasses.pop("Judge")

124

#### 1.2 Combine Collected Data into a Dataframe

In [7]:
import math

df = pd.read_csv('Judge.csv')

# load the data
for subclass, count in subclasses.items():
    try:
        if count < 10000:
            temporary_df = pd.read_csv(f'{subclass}.csv')
            df = pd.concat([df, temporary_df], ignore_index=True)
        else:
            for i in range(0, math.ceil(count / 10000) + 2):
                temporary_df = pd.read_csv(f'{subclass}/{subclass}iteration{i}.csv')
                df = pd.concat([df, temporary_df], ignore_index=True)
    except FileNotFoundError:
        print (f'{subclass}/{subclass}iteration{i}.csv NOT FOUND')
    
df.shape

Academic/Academiciteration1.csv NOT FOUND
Coach/Coachiteration2.csv NOT FOUND
Royalty/Royaltyiteration3.csv NOT FOUND
Cleric/Clericiteration3.csv NOT FOUND
SportsManager/SportsManageriteration3.csv NOT FOUND
MilitaryPerson/MilitaryPersoniteration6.csv NOT FOUND
Writer/Writeriteration6.csv NOT FOUND
Scientist/Scientistiteration7.csv NOT FOUND
OfficeHolder/OfficeHolderiteration7.csv NOT FOUND
Artist/Artistiteration12.csv NOT FOUND
Politician/Politicianiteration20.csv NOT FOUND
OrganisationMember/OrganisationMemberiteration0.csv NOT FOUND
Athlete/Athleteiteration1.csv NOT FOUND


(979648, 7)

In [8]:
# Check if all the subclasses are present in the dataframe to be preprocessed
df['subclass'].value_counts()

Athlete            250573
Politician         191191
Artist             120000
OfficeHolder        70000
Scientist           70000
Writer              60000
MilitaryPerson      56218
Cleric              30000
SportsManager       29192
Royalty             27736
Coach               10958
Academic            10000
Noble               10000
Philosopher          8875
Criminal             7076
Architect            5590
Religious            4868
BeautyQueen          2992
Economist            2738
Model                2048
Journalist           1865
PlayboyPlaymate       979
Youtuber              901
Chef                  899
Engineer              891
Astronaut             742
BusinessPerson        691
Presenter             671
PoliceOfficer         413
HorseTrainer          358
Pilot                 286
AmericanLeader        265
Spy                   261
Monarch               247
Judge                 124
Name: subclass, dtype: int64

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 979648 entries, 0 to 979647
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   subclass          979648 non-null  object 
 1   instance          979648 non-null  object 
 2   wikiDataID        948740 non-null  object 
 3   gender            926522 non-null  object 
 4   age               979648 non-null  int64  
 5   birthYear         250573 non-null  float64
 6   publication_year  957813 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 52.3+ MB


From above, it can be seen that the wikiDataID, , gender, birthYear and publication_year column has some null values.

#### 1.3 Removing Duplicated Rows

In [10]:
duplicated_rows = df[df.duplicated(subset=['instance', 'wikiDataID'], keep=False)].sort_values(by=['instance'])
duplicated_rows

Unnamed: 0,subclass,instance,wikiDataID,gender,age,birthYear,publication_year
537892,Politician,100th_Delaware_General_Assembly,Q4546229,,0,,2009.0
347886,OfficeHolder,100th_Delaware_General_Assembly,Q4546229,,0,,2009.0
537893,Politician,101st_Delaware_General_Assembly,Q4546348,,0,,2009.0
347888,OfficeHolder,101st_Delaware_General_Assembly,Q4546348,,0,,2009.0
537894,Politician,102nd_Delaware_General_Assembly,Q4546419,,0,,2009.0
...,...,...,...,...,...,...,...
728981,Politician,Ōyama_Iwao,Q359819,male,74,,2003.0
217876,MilitaryPerson,Živko_Budimir,Q2478723,male,62,,2011.0
729044,Politician,Živko_Budimir,Q2478723,male,62,,2011.0
729047,Politician,Živojin_Mišić,Q1138038,male,66,,2003.0


We can see above that there are 134,353 rows that have duplicated instance and wikiDataID, sicne we only need one rows of these duplicated rows, we wil kepp only the first rows of such.

In [11]:
df1 = df.drop_duplicates(subset=['instance', 'wikiDataID'], keep='first')
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 911850 entries, 0 to 979647
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   subclass          911850 non-null  object 
 1   instance          911850 non-null  object 
 2   wikiDataID        880999 non-null  object 
 3   gender            859866 non-null  object 
 4   age               911850 non-null  int64  
 5   birthYear         245539 non-null  float64
 6   publication_year  890068 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 55.7+ MB


In [12]:
df1.duplicated().sum() 

0

After dropping all the duplicates, we can infer that there are 979648 - 911850 = 67,798 rows that have duplicated insatnce and wikiDataID and have been removed.

In [13]:
df1['wikiDataID'].duplicated().sum()

148921

However, the WikiDataID columns still shows some duplication which should not happpened sicne we have deleted all the duplicates and since wikiDataID is unique. Thus, I think this happened for some error when collecting the data through the API's where two or more same instance might have been represented by mutiple wikiDataID but only one of them is correct

In [14]:
duplicated_wikidata = df1[df1.duplicated(subset=['subclass', 'instance'], keep=False)].sort_values(by=['instance'])
duplicated_wikidata

Unnamed: 0,subclass,instance,wikiDataID,gender,age,birthYear,publication_year
41338,Criminal,"2016_St._Cloud,_Minnesota_knife_attack",Q42915632,,-8,,2016.0
41337,Criminal,"2016_St._Cloud,_Minnesota_knife_attack",Q26933993,,-8,,2016.0
349407,OfficeHolder,A._K._Fazlul_Huq,Q3242246,male,89,,2005.0
349408,OfficeHolder,A._K._Fazlul_Huq,Q9258264,,89,,2005.0
217955,Writer,A._L._Kennedy,Q278872,female,59,,2004.0
...,...,...,...,...,...,...,...
728657,Politician,Éric_Dupond-Moretti,Q93228978,male,63,,2017.0
728831,Politician,Ömer_Çelik,Q91468051,male,56,,2013.0
728830,Politician,Ömer_Çelik,Q297516,male,56,,2013.0
102404,Royalty,Şehzade_Kasım,Q21523791,male,0,,2022.0


After some initial observation, it is found that my intuation is correct, for example the instance Ömer_Çelik have two wikiDataID but only one is correct which is Q297516. Meanwhile, the wikiDataID = Q91468051 associated with him deos not exists. I plan to run a sparql query in wikidata to get the actual wikidataId for each of these instances and replace them in the df

In [15]:
instances_with_multiple_ids = duplicated_wikidata.groupby(['subclass', 'instance'])['wikiDataID'].unique()
instances_with_multiple_ids

subclass  instance         
Academic  Alan_Martin_Boase                               [Q18917167, Q19258718]
          Albert_Elsen                                    [Q59628885, Q19753752]
          Andre_Franke                                   [Q67470670, Q114345084]
          Andrew_Lambert                                   [Q4757681, Q60103671]
          Autumn_Stanley       [Q95682724, Q95680876, Q95637638, Q95336311, Q...
                                                     ...                        
Writer    Max_Müller                                         [Q60074, Q55068911]
          Mbongeni_Ngema                                   [Q10327912, Q6799750]
          Mel_Odom_(author)                                 [Q4331780, Q6810802]
          Melinda_Metz         [Q22107416, Q238361, Q23780935, Q2375655, Q237...
Youtuber  TimTheTatman                                   [Q61789029, Q111584802]
Name: wikiDataID, Length: 2625, dtype: object

In [16]:
df2 = df1[~df1.duplicated(subset=['subclass', 'instance'], keep='first')].sort_values(by=['instance'])
df2

Unnamed: 0,subclass,instance,wikiDataID,gender,age,birthYear,publication_year
417884,Artist,!PAUS3,Q3466056,male,43,,2011.0
417885,Artist,$pacely,Q73507574,male,32,,2018.0
38134,Criminal,%22Baby_Lollipops%22_murder,Q64875934,,63,,2023.0
417886,Artist,%22Bassy%22_Bob_Brockmann,Q13416958,male,62,,2013.0
537884,Politician,%22Big%22_Donnie_MacLeod,Q18645783,male,75,,2014.0
...,...,...,...,...,...,...,...
102470,Royalty,Ḫarapšili,Q742087,female,0,,2012.0
102471,Royalty,Ḫattušili_III,Q297588,male,0,,2004.0
26705,Religious,Ṭhānissaro_Bhikkhu,Q7710407,male,75,,2006.0
102472,Royalty,Ỷ_Lan,Q10843033,female,0,,2010.0


In [17]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 791004 entries, 417884 to 102473
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   subclass          791004 non-null  object 
 1   instance          791004 non-null  object 
 2   wikiDataID        760153 non-null  object 
 3   gender            743194 non-null  object 
 4   age               791004 non-null  int64  
 5   birthYear         242387 non-null  float64
 6   publication_year  769224 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 48.3+ MB


Actual df = 911850, duplicated wikiDataID = 123471, unique wikidataid from the duplicated = 2625; Cleaned df = 911850 - 123471 + 2625 = **791004** which is consistent with the information above.

In [18]:
i=0
for (subclass, instance), ids in instances_with_multiple_ids.items():
    if len(ids) > 5:
        i+=1
print(i)

144


In [19]:
def get_instance_from_dbpedia(wikidata_id):
    # Initialize the SPARQL wrapper
    dbpedia_sparql = SPARQLWrapper('https://dbpedia.org/sparql')

    # Define the query to get the DBpedia resource URI
    query = f"""
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX wd: <http://www.wikidata.org/entity/>

    SELECT ?instance WHERE {{
      ?instance owl:sameAs wd:{wikidata_id}.
    }}
    """
    
    dbpedia_sparql.setQuery(query)
    dbpedia_sparql.setReturnFormat(JSON)
    
    while True:
        try:
            # Execute the query and fetch results
            results = dbpedia_sparql.query().convert()
                
            if results["results"]["bindings"]:
                instance = results["results"]["bindings"][0]["instance"]["value"]
                return instance.replace("http://dbpedia.org/resource/", "")
            else:
                return None
        
        except Exception as e:
            print(f"Error: {e}. Retrying after 2 seconds...")
            time.sleep(2)
            get_instance_from_dbpedia(wikidata_id)

def get_correct_wikidata_ids(instances_with_multiple_ids):
    mapping = {}
    total_instances = len(instances_with_multiple_ids)
    
    with tqdm(total=total_instances, desc='Processing Instances') as pbar:
        for (subclass, instance), ids in instances_with_multiple_ids.items():
            if len(ids) <= 5:
                for wikidata_id in ids:
                    name = get_instance_from_dbpedia(wikidata_id)
                    if instance == name:
                        mapping[(subclass, instance)] = wikidata_id
                        break
            else: 
                mapping[(subclass, instance)] = None
            pbar.update(1)
                
    return mapping

# Example usage:
# Assuming `instances_with_multiple_ids` is defined somewhere
# wikidata_mapping = get_correct_wikidata_ids(instances_with_multiple_ids)

In [20]:
# wikidata_mapping = get_correct_wikidata_ids(instances_with_multiple_ids)

In [21]:
# wikidata_mapping

In [22]:
# # Specify the path where you want to save the pickle file
# file_path = "wikidata_mapping.pkl"

# # Open the file in write-binary mode and save the dictionary
# with open(file_path, 'wb') as file:
#     pickle.dump(wikidata_mapping, file)

# print("Dictionary saved to file using pickle.")

In [23]:
# # Specify the path of the file you want to read
file_path = "wikidata_mapping.pkl"

# Open the file in read-binary mode and load the dictionary
with open(file_path, 'rb') as file:
    wikidata_mapping = pickle.load(file)

print("Dictionary loaded from file using pickle.")

Dictionary loaded from file using pickle.


In [24]:
df2['wikiDataID'].duplicated().sum()

31641

In [25]:
# Define function to get the correct wikiDataID from the mapping
def get_correct_wikidata_id(row, mapping):
    key = (row['subclass'], row['instance'])
    return mapping.get(key, row['wikiDataID'])  # Fallback to the original wikiDataID if no mapping is found

# Update df2 with the correct wikiDataID
df2['correct_wikiDataID'] = df2.apply(lambda row: get_correct_wikidata_id(row, wikidata_mapping), axis=1)

# Drop the old wikiDataID column and rename new column
df2 = df2.drop(columns='wikiDataID')
df2 = df2.rename(columns={'correct_wikiDataID': 'wikiDataID'})

print("Updated df2 with Correct WikiDataID:")
df2

Updated df2 with Correct WikiDataID:


Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
417884,Artist,!PAUS3,male,43,,2011.0,Q3466056
417885,Artist,$pacely,male,32,,2018.0,Q73507574
38134,Criminal,%22Baby_Lollipops%22_murder,,63,,2023.0,Q64875934
417886,Artist,%22Bassy%22_Bob_Brockmann,male,62,,2013.0,Q13416958
537884,Politician,%22Big%22_Donnie_MacLeod,male,75,,2014.0,Q18645783
...,...,...,...,...,...,...,...
102470,Royalty,Ḫarapšili,female,0,,2012.0,Q742087
102471,Royalty,Ḫattušili_III,male,0,,2004.0,Q297588
26705,Religious,Ṭhānissaro_Bhikkhu,male,75,,2006.0,Q7710407
102472,Royalty,Ỷ_Lan,female,0,,2010.0,Q10843033


In [26]:
df2['wikiDataID'].duplicated().sum()

31578

In [27]:
#### Initial combined dataset (without prerpocessing/cleaning)
# df1.to_csv(f"tryfinal.csv", index=False)

## 2) Handling Missing Data 

<strong> Identify the Number of Missing Values </strong>

In [28]:
df2.isna().sum()

subclass                 0
instance                 0
gender               47810
age                      0
birthYear           548617
publication_year     21780
wikiDataID           30995
dtype: int64

#### 2.1 Missing Data: Gender Column

In [29]:
# summarization of Culmen length (mm) before imputation
df2["gender"].describe()

count     743194
unique        52
top         male
freq      615154
Name: gender, dtype: object

In [30]:
gender_distribution_count = df2["gender"].value_counts()
gender_distribution_count

male                                                                          615154
female                                                                        127442
trans woman                                                                      225
non-binary                                                                       158
trans man                                                                         54
male organism                                                                     30
genderfluid                                                                       17
eunuch                                                                            16
intersex woman                                                                    15
intersex man                                                                      10
genderqueer                                                                        7
transgender                                                      

There are 52 unique values for the gender column, with male being the majority gender for the instances. There are also 53,078 instances that are not assigned to a gender which we need to handle later. There are also some unwanted values that can be seem to be starting with a common link which is "http://www.wikidata.org/.well-known/genid". We will start by replacing those values (associated to 28 rows) with null before deciding what to do with the null values 

In [31]:
# Replace NaN values with an empty string temporarily
s1 = df2['gender'].fillna('')

# Create a boolean mask for unwanted gender values
unwanted_gender_mask = s1.str.contains('http://www.wikidata.org/.well-known/genid', regex=False)

# Count the unwanted gender values
unwanted_gender_count = unwanted_gender_mask.sum()

# Replace unwanted gender values with NaN
df2.loc[unwanted_gender_mask, 'gender'] = np.nan

# Output the count of unwanted gender values
unwanted_gender_count

28

In [32]:
gender_distribution_count = df2["gender"].value_counts()
gender_distribution_count

male                  615154
female                127442
trans woman              225
non-binary               158
trans man                 54
male organism             30
genderfluid               17
eunuch                    16
intersex woman            15
intersex man              10
transgender                7
genderqueer                7
intersex                   6
two-spirit                 5
female organism            4
agender                    4
transmasculine             4
cisgender woman            2
androgynos                 1
bigender                   1
travesti                   1
cisgender man              1
faʻafafine                 1
undisclosed gender         1
Name: gender, dtype: int64

In [33]:
# summarization of Culmen length (mm) before imputation
df2["gender"].describe()

count     743166
unique        24
top         male
freq      615154
Name: gender, dtype: object

In [34]:
df1['gender'].isna().sum()

51984

#### 2.2 Missing Data: Age Column

There age column should not have any missing values. However, we will check if the age values make sense.

In [35]:
df2['age'].describe()

count    791004.000000
mean         36.583676
std          80.368008
min       -3395.000000
25%           0.000000
50%          44.000000
75%          70.000000
max        4431.000000
Name: age, dtype: float64

We can see that the minimum value for age is -3395 which is not possible in the real world. Thus, we will inspect further and get all rows with age less than and equal to 0.

In [36]:
df2.loc[df2['age'] <= 0] 

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
537885,Politician,%22Bulldog%22_Ben_Robinson,male,0,,2022.0,Q112440117
417888,Artist,%22Frantic%22_Fay_Thomas,female,-46,,2020.0,Q101445578
161666,MilitaryPerson,%60Adnan_%60Uqla,,0,,2019.0,
347884,OfficeHolder,''Congregatio_Immaculati_Cordis_Mariae'',,0,,,
74738,Royalty,'Abd_Allah_II_ibn_'Ali_'Abd_ash-Shakur,male,0,,2005.0,Q2820897
...,...,...,...,...,...,...,...
27093,Religious,Ḥayyim_Yitzḥak_Mussafia,male,0,,2020.0,Q104528636
102470,Royalty,Ḫarapšili,female,0,,2012.0,Q742087
102471,Royalty,Ḫattušili_III,male,0,,2004.0,Q297588
102472,Royalty,Ỷ_Lan,female,0,,2010.0,Q10843033


There are 276,088 rows with age <= 0. We could try using the wikidata api to retrieve the value for age since we have only used dbpedia to retrieve the age before. To do that, we still need the wikiDataID of the instances. Thus, we will make sure that majority of the instances with age<=0 does have wikiDataID as some of them might have null values.

In [37]:
df2.loc[((df2['age'] <= 0) & (df2['wikiDataID'].isna()))]

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
161666,MilitaryPerson,%60Adnan_%60Uqla,,0,,2019.0,
347884,OfficeHolder,''Congregatio_Immaculati_Cordis_Mariae'',,0,,,
26240,Philosopher,'Abd_al-Haqq_al-Dehlawi__1,,0,,,
161667,MilitaryPerson,'Ali_ibn_Aban_al-Muhallabi,,0,,,
161668,MilitaryPerson,'Ali_ibn_Muhammad_(Zanj_leader),,0,,,
...,...,...,...,...,...,...,...
102435,Royalty,Şehzade_Sultan,,0,,,
102441,Royalty,Şemsiruhsar_Hatun,,0,,2014.0,
217870,MilitaryPerson,Žanis_Bļumbergs,,0,,2020.0,
217874,MilitaryPerson,Željko_Ražnatović,,0,,2020.0,


In [95]:
wrong_age_subset = df2.loc[((df2['age'] <= 0) & (df2['wikiDataID'].notna())), ['instance', 'wikiDataID', 'age']].values.tolist()
len(wrong_age_subset)

213170

Out of all 237,923 instances that has age <=0, there are 213,170 instances that have the wikidataID and 24,753 instances without wikidataID. We will try to get the age of those instances with wikidataID as they have more counts. After that, we will decide what to do with those without wikidataID

In [39]:
import time
from tqdm import tqdm

def get_age_from_wikidata(wikidata_id):
    """
    Fetches the age of an individual from Wikidata using their Wikidata ID.
    
    Args:
        wikidata_id (str): The Wikidata ID of the individual.
    
    Returns:
        int or None: The calculated age if found, otherwise None.
    """
    from SPARQLWrapper import SPARQLWrapper, JSON
    
    # Initialize the SPARQL wrapper
    wikidata_sparql = SPARQLWrapper('https://query.wikidata.org/sparql')

    # Define the SPARQL query to get birth and death dates
    query = f"""
    SELECT ?birthDate ?deathDate (YEAR(COALESCE(?deathDate, NOW())) - YEAR(?birthDate) AS ?age)
    WHERE {{
        wd:{wikidata_id} wdt:P569 ?birthDate.  # Date of birth
        OPTIONAL {{ wd:{wikidata_id} wdt:P570 ?deathDate. }}  # Date of death (optional)
    }}
    """
    
    wikidata_sparql.setQuery(query)
    wikidata_sparql.setReturnFormat(JSON)
    
    max_retries = 3
    retries = 0

    while retries < max_retries:
        try:
            # Execute the query and fetch results
            results = wikidata_sparql.query().convert()
            
            if results["results"]["bindings"]:
                # Fetch age from results
                age = results["results"]["bindings"][0]["age"]["value"]
                return int(age)
            else:
                return None
        
        except Exception as e:
            print(f"Error: {e}. Retrying after 2 seconds...")
            retries += 1
            time.sleep(3)
    
    print("Max retries reached. Could not fetch data.")
    return None

def subset_get_age(wrong_age_subset, log_file="age0_processed_instances.txt"):
    mapping = {}
    total_instances = len(wrong_age_subset)
    
    # Read the log file to find already processed instances
    processed = set()
    try:
        with open(log_file, 'r') as f:
            for line in f:
                instance, wikidata_id, age = line.strip().split('\t')
                processed.add((instance, wikidata_id))
                mapping[(instance, wikidata_id)] = int(age)
    except FileNotFoundError:
        pass
    
    with tqdm(total=total_instances, desc='Processing Instances') as pbar, open(log_file, 'a') as f:
        for instance, wikidata_id, _ in wrong_age_subset:
            if (instance, wikidata_id) in processed:
                pbar.update(1)
                continue
            
            age = get_age_from_wikidata(wikidata_id)
            mapping[(instance, wikidata_id)] = age
            
            # Log the instance to the file
            f.write(f"{instance}\t{wikidata_id}\t{age}\n")
            f.flush()  # Ensure the data is written to the file immediately
            
            pbar.update(1)
    
    return mapping

# Example usage:
# wrong_age_subset = [("Instance1", "Q42", 42), ("Instance2", "Q123", 36), ...]
# subset_get_age(wrong_age_subset)

In [55]:
def age_txt_to_dict(filename): 
    age_mapping = {} 
    with open(filename, 'r') as f:
        for line in f:
            instance, wikidata_id, age = line.strip().split('\t')
            if age and age != 'None':  # Ensure age is neither empty nor 'None'
                age_mapping[(instance, wikidata_id)] = int(age)
            else: 
                age_mapping[(instance, wikidata_id)] = None
    return age_mapping

In [68]:
# age_mapping = subset_get_age(wrong_age_subset)
age_mapping = age_txt_to_dict('age0_processed_instances')

In [69]:
len(age_mapping)

213170

In [57]:
# Define function to get the correct age using WikidataID
def get_correct_age(row, mapping):
    key = (row['instance'], row['wikiDataID'])
    if row['age'] == 0:
        row['age'] = -1      #reassign all age == 0 in the initial df to -1 so that we can distigush distinguish intances with actual age = 0
    return mapping.get(key, row['age'])  # Fallback to the original wikiDataID if no mapping is found

# Create a copy of df2 to keep the original data intact
df3 = df2.copy()

# Apply the function to update the 'age' column in the new DataFrame with a progress bar
df3['age'] = df3.apply(lambda row: get_correct_age(row, age_mapping), axis=1)

# The df3 DataFrame now has the updated 'age' column with fetched data from Wikidata where applicable
df3

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
417884,Artist,!PAUS3,male,43.0,,2011.0,Q3466056
417885,Artist,$pacely,male,32.0,,2018.0,Q73507574
38134,Criminal,%22Baby_Lollipops%22_murder,,63.0,,2023.0,Q64875934
417886,Artist,%22Bassy%22_Bob_Brockmann,male,62.0,,2013.0,Q13416958
537884,Politician,%22Big%22_Donnie_MacLeod,male,75.0,,2014.0,Q18645783
...,...,...,...,...,...,...,...
102470,Royalty,Ḫarapšili,female,3574.0,,2012.0,Q742087
102471,Royalty,Ḫattušili_III,male,0.0,,2004.0,Q297588
26705,Religious,Ṭhānissaro_Bhikkhu,male,75.0,,2006.0,Q7710407
102472,Royalty,Ỷ_Lan,female,67.0,,2010.0,Q10843033


In [58]:
df3.isna().sum()

subclass                 0
instance                 0
gender               47838
age                  58765
birthYear           548617
publication_year     21780
wikiDataID           30995
dtype: int64

In [59]:
df3.loc[((df3['age'] == 0) & (df3['wikiDataID'].notna()))]

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
418092,Artist,A_Ge,female,0.0,,2011.0,Q4656945
74759,Royalty,Abalgamash,male,0.0,,2020.0,Q305264
102508,Cleric,Abaskhiron_the_Soldier,male,0.0,,2013.0,Q16965864
102549,Cleric,Abercius_of_Hieropolis,male,0.0,,2007.0,Q319319
102603,Cleric,Abraham_of_Egypt,male,0.0,,2007.0,Q4055992
...,...,...,...,...,...,...,...
102017,Royalty,Zabibe,female,0.0,,2006.0,Q8063479
102020,Royalty,Zacharias_III_of_Makuria,male,0.0,,2005.0,Q139495
102200,Royalty,Zhuozi_(Jin),male,0.0,,2012.0,Q870356
102304,Royalty,Æthelflæd_of_Damerham,female,0.0,,2007.0,Q4129783


In [60]:
df3.loc[((df3['age'] == 0) & (df3['wikiDataID'].isna()))]

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID


In [61]:
df3.loc[((df3['age'] <=-1) & (df3['wikiDataID'].isna()))] 

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
161666,MilitaryPerson,%60Adnan_%60Uqla,,-1.0,,2019.0,
347884,OfficeHolder,''Congregatio_Immaculati_Cordis_Mariae'',,-1.0,,,
26240,Philosopher,'Abd_al-Haqq_al-Dehlawi__1,,-1.0,,,
161667,MilitaryPerson,'Ali_ibn_Aban_al-Muhallabi,,-1.0,,,
161668,MilitaryPerson,'Ali_ibn_Muhammad_(Zanj_leader),,-1.0,,,
...,...,...,...,...,...,...,...
102435,Royalty,Şehzade_Sultan,,-1.0,,,
102441,Royalty,Şemsiruhsar_Hatun,,-1.0,,2014.0,
217870,MilitaryPerson,Žanis_Bļumbergs,,-1.0,,2020.0,
217874,MilitaryPerson,Željko_Ražnatović,,-1.0,,2020.0,


#### Rechecking for Age more than 100

In [62]:
df3.loc[((df3['age'] >= 100) & (df3['wikiDataID'].notna()))]

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
28501,Religious,6th_Dalai_Lama,male,341.0,,2004.0,Q25251
74753,Royalty,A'annepada,male,100.0,,2020.0,Q277645
28222,Religious,A'isha_bint_Talha,female,100.0,,2006.0,Q4646815
538089,Politician,A._Andrews,male,206.0,,2012.0,Q4647512
538102,Politician,A._B._Comfort,male,140.0,,2015.0,Q19560274
...,...,...,...,...,...,...,...
102445,Royalty,Şivekar_Sultan,female,389.0,,2020.0,Q25477824
27207,Religious,Şükrullah,male,100.0,,2013.0,Q16854270
729002,Politician,Şükrü_Elekdağ,male,100.0,,2011.0,Q4353302
729054,Politician,Ștefan_Foriș,male,132.0,,2006.0,Q838826


In [70]:
age100_subset = df3.loc[((df3['age'] >= 100) & (df3['wikiDataID'].notna())), ['instance', 'wikiDataID', 'age']].values.tolist()
len(age100_subset)

22811

In [73]:
# age100_mapping = subset_get_age(age100_subset, 'age100_processed_instances')
age100_mapping = age_txt_to_dict('age100_processed_instances')

In [74]:
# Define function to get the correct age using WikidataID
def get_correct_age2(row, mapping):
    key = (row['instance'], row['wikiDataID'])
    return mapping.get(key, row['age'])  # Fallback to the original wikiDataID if no mapping is found

# Create a copy of df2 to keep the original data intact
df4 = df3.copy()

# Apply the function to update the 'age' column in the new DataFrame with a progress bar
df4['age'] = df4.apply(lambda row: get_correct_age(row, age100_mapping), axis=1)

# The df3 DataFrame now has the updated 'age' column with fetched data from Wikidata where applicable
df4

Unnamed: 0,subclass,instance,gender,age,birthYear,publication_year,wikiDataID
417884,Artist,!PAUS3,male,43.0,,2011.0,Q3466056
417885,Artist,$pacely,male,32.0,,2018.0,Q73507574
38134,Criminal,%22Baby_Lollipops%22_murder,,63.0,,2023.0,Q64875934
417886,Artist,%22Bassy%22_Bob_Brockmann,male,62.0,,2013.0,Q13416958
537884,Politician,%22Big%22_Donnie_MacLeod,male,75.0,,2014.0,Q18645783
...,...,...,...,...,...,...,...
102470,Royalty,Ḫarapšili,female,3574.0,,2012.0,Q742087
102471,Royalty,Ḫattušili_III,male,-1.0,,2004.0,Q297588
26705,Religious,Ṭhānissaro_Bhikkhu,male,75.0,,2006.0,Q7710407
102472,Royalty,Ỷ_Lan,female,67.0,,2010.0,Q10843033


In [75]:
age100_subset = df4.loc[((df4['age'] >= 100) & (df4['wikiDataID'].notna())), ['instance', 'wikiDataID', 'age']].values.tolist()
len(age100_subset)

8910

In [76]:
df4.isna().sum()

subclass                 0
instance                 0
gender               47838
age                  59347
birthYear           548617
publication_year     21780
wikiDataID           30995
dtype: int64

In [77]:
# extract the index of "." value in the ratings column
convert_null_index = df4.loc[df4["age"] <= -1, "age"].index

In [78]:
len(convert_null_index)

26043

In [79]:
# change the value of "-1" to null/nan
df4.loc[convert_null_index, "age"] = np.nan

In [80]:
df4.isna().sum()

subclass                 0
instance                 0
gender               47838
age                  85390
birthYear           548617
publication_year     21780
wikiDataID           30995
dtype: int64

In [81]:
check_null_index = df4.loc[df4["age"] <= -1, "age"].index
len(check_null_index)

0

In [82]:
df4['age'].describe()

count    705614.000000
mean         62.229974
std          56.987616
min           0.000000
25%          44.000000
50%          62.000000
75%          77.000000
max        6226.000000
Name: age, dtype: float64

In [83]:
df5 = df4.dropna(subset=['gender'])

In [85]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 743166 entries, 417884 to 102472
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   subclass          743166 non-null  object 
 1   instance          743166 non-null  object 
 2   gender            743166 non-null  object 
 3   age               692222 non-null  float64
 4   birthYear         239532 non-null  float64
 5   publication_year  740524 non-null  float64
 6   wikiDataID        743025 non-null  object 
dtypes: float64(3), object(4)
memory usage: 45.4+ MB


In [84]:
df5.isna().sum()

subclass                 0
instance                 0
gender                   0
age                  50944
birthYear           503634
publication_year      2642
wikiDataID             141
dtype: int64