In [1]:
# Upload of the training data
# files.upload() is Colab's upload helper; its return value is kept in `uploaded`.

from google.colab import files
uploaded = files.upload()

# Import of modules

!pip install wikidata --quiet

import matplotlib.pyplot as plt
import pandas as pd
from wikidata.client import Client
import numpy as np

from tqdm.auto import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading the data
# The file is read as tab-separated values; the name must match the file uploaded at the top of this cell.
train_df = pd.read_csv('[MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv', sep='\t')

Saving [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv to [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv


In [2]:
def extract_entity_id(url):
    """
    Return the entity id found at the end of an entity URL.

    Arguments:
      url : str, e.g. "http://www.wikidata.org/entity/Q306"

    Returns:
      str : the last non-empty path segment of the URL (e.g. "Q306").
    """
    # Trailing slashes are removed before splitting; otherwise the last
    # segment would be an empty string and the lookup would use an empty id.
    return url.strip().rstrip("/").split("/")[-1]

In [3]:
def get_claims(wikidata_url):
    """
    Fetch the claims of the item designated by a Wikidata URL.

    Arguments:
      wikidata_url : str, URL whose last path segment is the entity id

    Returns:
      The "claims" entry of the loaded item data, or an empty dict when
      the item data has no such entry.
    """
    # Id of the item, taken from the end of the URL
    qid = extract_entity_id(wikidata_url)

    # A new client is created on every call; load=True asks it to load the item data
    entity = Client().get(qid, load=True)

    return entity.data.get("claims", {})

In [4]:
def get_first_row_with_statement(df, statement):
    """
    Find the first row of a dataframe whose item has a specific statement.

    Arguments:
      df : pandas.DataFrame; the value in the first column of each row is
           passed to get_claims (assumed to be the item's Wikidata URL)
      statement : str, property id to look for (e.g. "P571")

    Returns:
      int : position of the first row whose claims contain `statement`,
      or None when no row has it. A message is printed in both cases.
    """
    # Iterate over range(len(df)), not len(df) + 1: the extra index raised an
    # IndexError whenever no row had the statement, so the "No row" message
    # below could never be reached.
    for row in range(len(df)):
        wikidata_url = df.iloc[row, 0]
        claims = get_claims(wikidata_url)

        # A missing property and an empty list of statements both count as absent
        if claims.get(statement):
            print(f"The first row with the statement {statement} is : ", row)
            return row

    print(f"No row in the dataset, has the statement {statement}")
    return None

# Test
#get_first_row_with_statement(df=train_df, statement="P571")

In [5]:
def extract_year_from_date(date):
    """
    Extract the year from a signed timestamp string.

    Arguments:
      date : str such as "+1949-12-01T00:00:00Z" or "-0044-03-15T00:00:00Z"

    Returns:
      str : the year digits exactly as written in the timestamp, prefixed
      with "-" when the timestamp is negative (e.g. "1949", "-0044").
    """
    # A fixed [1:5] slice loses the sign of negative years and truncates
    # years written with more than four digits, so the year is taken as
    # everything between the optional sign and the first "-" separator.
    sign = "-" if date.startswith("-") else ""
    year = date.lstrip("+-").split("-", 1)[0]
    return sign + year


def _last_year_for_property(claims, property_id):
    """
    Return the year of the last statement of `property_id` that carries a
    time value, or np.nan when there is none.
    """
    year = np.nan
    for statement in claims.get(property_id, []):
        value = statement.get("mainsnak", {}).get("datavalue", {}).get("value", {})
        # Statements without a value, or whose value is not a dict, are skipped
        # instead of raising and losing every date of the item.
        time = value.get("time") if isinstance(value, dict) else None
        if time:
            # No break: when several statements have a time, the last one wins
            year = extract_year_from_date(time)
    return year


def get_dates(wikipedia_url):
    """
    Function that extracts several dates (as years) of an item.

    Arguments:
      wikipedia_url : str, URL of the item; it is passed to get_claims

    Returns:
      tuple of 5 values, in this order:
        date_of_birth, creation_date, start_time, date_of_publication,
        start_time_in_event
      Each value is the year returned by extract_year_from_date for the
      corresponding property, or np.nan when the item has no time value
      for that property.
    """
    claims = get_claims(wikipedia_url)

    # "P569" : Date of Birth
    date_of_birth = _last_year_for_property(claims, "P569")

    # "P571" : Creation Date
    creation_date = _last_year_for_property(claims, "P571")

    # "P574" : Start Time
    # NOTE(review): confirm on Wikidata that P574 is the intended property
    # for a start time (P580 may be the one meant) - TODO verify
    start_time = _last_year_for_property(claims, "P574")

    # "P577" : Date of Publication
    date_of_publication = _last_year_for_property(claims, "P577")

    # "P584" : Start Time in event
    # NOTE(review): confirm the meaning of P584 on Wikidata - TODO verify
    start_time_in_event = _last_year_for_property(claims, "P584")

    return date_of_birth, creation_date, start_time, date_of_publication, start_time_in_event


# Due to some errors when we apply get_dates to the dataframe, we create a safer wrapper function

def safe_get_dates(wikidata_url):
    """
    Call get_dates and fall back to five None values when it fails.

    Arguments:
      wikidata_url : str, passed unchanged to get_dates

    Returns:
      The tuple returned by get_dates, or (None, None, None, None, None)
      when get_dates raises an Exception.
    """
    try:
        return get_dates(wikidata_url)
    except Exception:
        # Exception rather than a bare except: a bare except also swallows
        # KeyboardInterrupt / SystemExit, which makes a long run over the
        # whole dataframe impossible to interrupt.
        return None, None, None, None, None

In [7]:
# We merge the variables, in order to have only one.

def merge_variables(df, variables):
    """
    Collapse several columns of a dataframe into a single Series.

    Arguments:
      df : pandas.DataFrame
      variables : list of column names, read from left to right

    Returns:
      pandas.Series : for each row, the first non-missing value found
      across `variables` (missing when every column is missing).
    """
    selected = df[variables]
    # Back-filling along the columns moves the first non-missing value of
    # each row into the leftmost column.
    filled = selected.bfill(axis=1)
    first_column = filled.iloc[:, 0]
    return first_column

In [9]:
# Application to the whole dataset

# Temporary columns, in the same order as the tuple returned by safe_get_dates
variables = [
    'date_of_birth',
    'creation_date',
    'start_time',
    'date_of_publication',
    'start_time_in_event',
]

# One lookup per row; each returned tuple is expanded into one column per value
dates_by_property = train_df['item'].apply(safe_get_dates).apply(pd.Series)
train_df[variables] = dates_by_property

# Keep a single 'date' column (first value found across the temporary columns),
# stored as a nullable integer; values that cannot be parsed become missing
merged_dates = merge_variables(train_df, variables)
train_df['date'] = pd.to_numeric(merged_dates, errors='coerce').astype('Int64')
train_df = train_df.drop(columns=variables)

# Visualisation
train_df.head()

Unnamed: 0,item,name,description,type,category,subcategory,label,date
0,http://www.wikidata.org/entity/Q306,Sebastián Piñera,Chilean entrepreneur and politician (1949–2024),entity,politics,politician,cultural exclusive,1949
1,http://www.wikidata.org/entity/Q12735,John Amos Comenius,"Czech teacher, educator, philosopher and write...",entity,politics,politician,cultural representative,1592
2,http://www.wikidata.org/entity/Q1752,Macrinus,Roman emperor from 217 to 218,entity,politics,politician,cultural representative,165
3,http://www.wikidata.org/entity/Q1639,Lamine Diack,Senegalese sports manager (1933–2021),entity,politics,politician,cultural representative,1933
4,http://www.wikidata.org/entity/Q9588,Richard Nixon,President of the United States from 1969 to 1974,entity,politics,politician,cultural representative,1913


In [11]:
# Exportation
# The dataframe is written as JSON Lines (one record per line), then the
# file is handed to the Colab download helper.

output_path = "train_df_dates.json"
train_df.to_json(output_path, orient="records", lines=True)
files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>