<a href="https://colab.research.google.com/github/miczkejedrzej/MNLP-project-1/blob/main/Data_collection_instances_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import of the data: open Colab's file-upload dialog so the training TSV can
# be sent to the runtime (it is read further down with pd.read_csv).

from google.colab import files
uploaded = files.upload()

!pip install wikidata --quiet

import pandas as pd
from wikidata.client import Client
import requests
import seaborn as sns
import matplotlib.pyplot as plt
from collections import deque
from functools import lru_cache
from tqdm import tqdm

# Training split: tab-separated file, read from the Colab /content directory.
df_train = pd.read_csv('/content/[MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv', sep = '\t')

!pip install datasets --quiet

from datasets import load_dataset
from huggingface_hub import login

# NOTE(review): the token is blank here (presumably removed before sharing).
# A real Hugging Face token is likely required for this cell to run; prefer
# reading it from an environment variable or a Colab secret over hard-coding it.
login(token="")

# Validation split of the course dataset, converted to a pandas DataFrame.
dataset = load_dataset('sapienzanlp/nlp2025_hw1_cultural_dataset')
df_dev = dataset['validation'].to_pandas()

Saving [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv to [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv


# Subclasses (P279)

The P279 property ("subclass of") links a class to its superclass. Example: volcano is a subclass of mountain.



## Subclass depth

 We are looking for the shortest path from the given item to a 'root' item, i.e. an item that is not itself a subclass of anything (it has no P279 statement). We then define the depth of each item in the graph of subclasses as the number of nodes in that shortest path, counting both the item and the root.

In [None]:
# SPARQL endpoint that every query in this notebook is sent to.
WIKIDATA_SPARQL_URL = "https://query.wikidata.org/sparql"
# Headers attached to each request: ask for SPARQL results as JSON and
# identify the client.
# NOTE(review): the contact address looks like a placeholder — confirm.
HEADERS = {
    "Accept": "application/sparql-results+json",
    "User-Agent": "ExplorateurWikidata/0.1 (truc@truc.com)"
}

def extract_entity_id(url):
    """Return the entity id found at the end of a Wikidata entity URL.

    Surrounding whitespace and trailing slashes are ignored, so
    ``"http://www.wikidata.org/entity/Q42/"`` yields ``"Q42"`` instead of an
    empty string. A bare id such as ``"Q42"`` is returned unchanged.

    Args:
        url: Entity URL (or bare id) as a string.

    Returns:
        The last non-empty path segment of ``url``.
    """
    # Drop trailing slashes first; otherwise the last segment would be "".
    return url.strip().rstrip("/").split("/")[-1]

# Cache to avoid requesting the same item several times during the graph search
@lru_cache(maxsize=None)
def get_superclasses(qid):
    """Return the direct superclasses (P279) of a Wikidata item.

    Args:
        qid: Wikidata entity id, e.g. ``"Q8072"``.

    Returns:
        Tuple with the id of every direct superclass; empty when the item has
        no P279 statement. Successful results are memoised per ``qid``.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (e.g. when the client is being rate-limited).
        requests.Timeout: If the endpoint does not answer in time.
    """
    query = f"""
    SELECT ?superclass WHERE {{
      wd:{qid} wdt:P279 ?superclass.
    }}
    """
    response = requests.get(
        WIKIDATA_SPARQL_URL,
        params={"query": query},
        headers=HEADERS,
        timeout=60,  # seconds; a stalled connection must not hang the whole run
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    results = response.json()["results"]["bindings"]
    return tuple(r["superclass"]["value"].split("/")[-1] for r in results)

def get_subclass_depth_bfs(url, max_depth=20):
    """Return the subclass depth of an item, found with a breadth-first search.

    The search starts at the item itself (depth 1) and follows P279 links
    upward level by level. It stops at the first item that has no superclass.

    Args:
        url: Wikidata entity URL (or bare id) of the starting item.
        max_depth: Deepest level that is examined; nodes at this level are
            checked for being a root but are not expanded further.

    Returns:
        The number of nodes on the shortest path from the item to a root, or
        ``None`` when no root is found within ``max_depth`` levels.
    """
    start_qid = extract_entity_id(url)
    # Mark nodes when they are enqueued so each one is queued at most once;
    # this also protects against cycles in the subclass graph.
    visited = {start_qid}
    queue = deque([(start_qid, 1)])

    while queue:
        current_qid, depth = queue.popleft()

        parents = get_superclasses(current_qid)
        if not parents:
            return depth
        if depth >= max_depth:
            # Too deep to expand, but other nodes still queued at this level
            # may be roots, so keep draining the queue instead of giving up.
            continue
        for parent_qid in parents:
            if parent_qid not in visited:
                visited.add(parent_qid)
                queue.append((parent_qid, depth + 1))

    return None


In [None]:
# Application to the datasets
# tqdm.pandas() registers progress_apply on pandas objects, so the slow
# per-item Wikidata lookups show a progress bar.
tqdm.pandas()
df_train["subclass_depth"]= df_train["item"].progress_apply(get_subclass_depth_bfs)

100%|██████████| 6251/6251 [17:11<00:00,  6.06it/s]


In [None]:
# Same depth feature for the validation split (progress_apply needs tqdm.pandas() to have run).
df_dev["subclass_depth"]= df_dev["item"].progress_apply(get_subclass_depth_bfs)

## Number of subclasses downward

Here we are looking for the number of items that are direct subclasses of the given item.

In [None]:
def get_direct_subclasses(url):
    """Return the number of direct subclasses (P279) of the given item.

    Args:
        url: Wikidata entity URL (or bare id) of the item.

    Returns:
        How many items declare the given item as their direct superclass.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.Timeout: If the endpoint does not answer in time.
    """
    qid = extract_entity_id(url)
    query = f"""
    SELECT ?subclass WHERE {{
      ?subclass wdt:P279 wd:{qid} .
    }}
    """
    response = requests.get(
        WIKIDATA_SPARQL_URL,
        params={"query": query},
        headers=HEADERS,
        timeout=60,  # seconds; a stalled connection must not hang the whole run
    )
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    results = response.json()["results"]["bindings"]

    return len(results)


In [None]:
# Application to the datasets: one SPARQL request per row of the training split
df_train["subclasses"] = df_train["item"].apply(get_direct_subclasses)

In [None]:
# Same direct-subclass count for the validation split.
df_dev["subclasses"] = df_dev["item"].apply(get_direct_subclasses)

# Instances of (P31) downward

Here we explore the instances of each entity: we count how many Wikidata entities are instances (P31) of the given entity. Later we group the counts by label and plot them to see whether there is any meaningful correlation.

In [None]:
def get_instances_of(url):
  """Return the number of items that are 'instances of' (P31) the given entity.

  Only instances that carry an English label are counted, because the query
  joins on ``rdfs:label`` and filters on the ``en`` language tag.

  Args:
      url: Wikidata entity URL (or bare id) of the entity.

  Returns:
      Number of result rows returned by the query.

  Raises:
      requests.HTTPError: If the endpoint answers with an error status.
      requests.Timeout: If the endpoint does not answer in time.
  """
  qid = extract_entity_id(url)
  query = f"""
  SELECT ?entity ?label WHERE {{
    ?entity wdt:P31 wd:{qid}.
    ?entity rdfs:label ?label .
    FILTER(LANG(?label) = "en")
  }}
  """
  response = requests.get(
      WIKIDATA_SPARQL_URL,
      params={"query": query},
      headers=HEADERS,
      timeout=60,  # seconds; a stalled connection must not hang the whole run
  )
  # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
  response.raise_for_status()
  results = response.json()["results"]["bindings"]

  return len(results)


In [None]:
# Application to the datasets: one SPARQL request per row of the training split
df_train["instances_of"] = df_train["item"].apply(get_instances_of)

In [None]:
# Same instance count for the validation split.
df_dev["instances_of"] = df_dev["item"].apply(get_instances_of)

# Instances of (P31) upward
Again "instance of", but in the reverse direction: we count how many other entities the given item is an instance of.

In [None]:
def get_instances_of_up(url):
  """Return the number of classes the given entity is an 'instance of' (P31).

  Args:
      url: Wikidata entity URL (or bare id) of the entity.

  Returns:
      Number of result rows returned by the query, one per P31 value.

  Raises:
      requests.HTTPError: If the endpoint answers with an error status.
      requests.Timeout: If the endpoint does not answer in time.
  """
  qid = extract_entity_id(url)
  query = f"""
  SELECT ?class WHERE {{
      wd:{qid} wdt:P31 ?class .
    }}
  """
  response = requests.get(
      WIKIDATA_SPARQL_URL,
      params={"query": query},
      headers=HEADERS,
      timeout=60,  # seconds; a stalled connection must not hang the whole run
  )
  # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
  response.raise_for_status()
  results = response.json()["results"]["bindings"]

  return len(results)

In [None]:
# Application to the datasets: one SPARQL request per row of the training split
df_train["instances_of_up"] = df_train["item"].apply(get_instances_of_up)

In [None]:
# Same "instance of" count for the validation split.
df_dev["instances_of_up"] = df_dev["item"].apply(get_instances_of_up)

# Exportation

In [None]:
# Write both enriched splits as JSON Lines (one record per line).
df_train.to_json('trainset_subclass_instances.json', orient='records', lines=True)
df_dev.to_json('devset_subclass_instances.json', orient='records', lines=True)

# Hand the exported files to Colab's download helper.
from google.colab import files
files.download('trainset_subclass_instances.json')
files.download('devset_subclass_instances.json')