<a href="https://colab.research.google.com/github/miczkejedrzej/MNLP-project-1/blob/main/Data_collection_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [158]:
# Import of the training data

from google.colab import files
uploaded = files.upload()

# Import of moduls

!pip install wikidata --quiet
!pip install googletrans==4.0.0-rc1 --quiet
!pip install Levenshtein --quiet

import matplotlib.pyplot as plt
import pandas as pd
from wikidata.client import Client
import requests
from googletrans import Translator
import Levenshtein

from tqdm.auto import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading the data
train_df = pd.read_csv('[MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv', sep='\t')

Saving [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned.tsv to [MNLP 2025 HW1] train set [PUBLIC] - train_cleaned (9).tsv


In [159]:
def extract_entity_id(url):
    return url.strip().split("/")[-1]

In [160]:
def get_sitelinks(wikidata_url):

  # Extract the id of the item
  entity_id = extract_entity_id(wikidata_url)

  # Extract information of the item
  client = Client()
  item = client.get(entity_id, load=True)

  # Extract the sitelinks of the item
  sitelinks = item.data.get("sitelinks", {})

  return sitelinks



def get_first_1000_characters(wikidata_url):

  """
  This function collects the 1000 first characters in every wikipedia page, associated to 10 languages, with respect to the wikidata url.

  Arguments:
    wikidata_url : str

  Return :
    dict_lang : the language, and it's associated first 1000 characters
  """

  # Initialisation of parameters, list and dictionnary

  selected_languages = ["en", "fr", "es", "zh", "ar", "hi", "ru", "pt", "ja", "sw"]
  dict_lang = {}

  # Extraction of the Wikipedia pages informations, for the selected item
  sitelinks = get_sitelinks(wikidata_url)

  # Extraction of the wikipedia page, for every languages
  for lang in selected_languages:
      lang_wiki = sitelinks.get(f"{lang}wiki")
      if not lang_wiki:
          continue

      title = lang_wiki["title"]

      api_url = f"https://{lang}.wikipedia.org/w/api.php"
      params = {
          "action": "query",
          "prop": "extracts",
          "explaintext": True,
          "titles": title,
          "format": "json",
          "redirects": 1
      }

      try:
        res = requests.get(api_url, params=params).json()
        page = next(iter(res["query"]["pages"].values()))
        text = page.get("extract", "")
        dict_lang[lang] = text[:1000] # Selection of the first 1000 characters only

        # Pause to avoid being blocked by the API
        time.sleep(1)

      except Exception as e:
          continue

  return dict_lang

# Test
#wikidata_url = train_df.iloc[0,0]
#dict_languages = get_first_1000_characters(wikidata_url)

In [161]:
def translate_to_english(sentence, lang):

  """
  Function that translate a text in english

  Argument :
    sentence : str, the text to translate
    lang : str, the source language

  Returns :
    translation : str, translated text
  """

  translator = Translator()

  try:
      # Translation
      translation = translator.translate(sentence, src=lang, dest='en')
      return translation.text

  except Exception as e:
    return None

# Test
#translate_to_english('essai', 'fr')

In [162]:
# Comparison between english texts, and translated texts :

# Jaccard Similitary

def jaccard_similarity(text1, text2):
  set1 = set(text1.lower().split())
  set2 = set(text2.lower().split())

  intersection = len(set1 & set2)
  union = len(set1 | set2)

  return intersection / union

# Levenstein Distance

def levenshtein_distance(text1, text2):
  return Levenshtein.distance(text1, text2)

In [163]:
# Add empty columns in the dataframe, to store the scores
languages = ["en", "fr", "es", "zh", "ar", "hi", "ru", "pt", "ja", "sw"]

for lang in languages:
  train_df[f'levenshtein_dist_{lang}'] = None
  train_df[f'jaccard_sim_{lang}'] = None


def get_similarity_metrics(row, wikidata_url):
  dict_languages = get_first_1000_characters(wikidata_url)
  dict_english = {}

  # Translate un each languages
  for lang, text in dict_languages.items():
    dict_english[lang] = translate_to_english(text, lang)

  # Get the scores between original english, and translated english text
  for lang, text in dict_english.items():
    if text:
      # Get the scores
      levenshtein_dist = levenshtein_distance(dict_languages['en'], dict_languages[lang])
      jaccard_sim = jaccard_similarity(dict_languages['en'], dict_languages[lang])


      # Prepare the vector to plug in the dataframe
      row[f'levenshtein_dist_{lang}'] = levenshtein_dist
      row[f'jaccard_sim_{lang}'] = jaccard_sim

  return row

# Test on the first 5 lines
train_df.iloc[1:5] = train_df.iloc[1:5].apply(lambda row: get_similarity_metrics(row, row[0]), axis=1)
train_df.head()

Unnamed: 0,item,name,description,type,category,subcategory,label,levenshtein_dist_en,jaccard_sim_en,levenshtein_dist_fr,...,levenshtein_dist_hi,jaccard_sim_hi,levenshtein_dist_ru,jaccard_sim_ru,levenshtein_dist_pt,jaccard_sim_pt,levenshtein_dist_ja,jaccard_sim_ja,levenshtein_dist_sw,jaccard_sim_sw
0,http://www.wikidata.org/entity/Q306,Sebastián Piñera,Chilean entrepreneur and politician (1949–2024),entity,politics,politician,cultural exclusive,,,,...,,,,,,,,,,
1,http://www.wikidata.org/entity/Q12735,John Amos Comenius,"Czech teacher, educator, philosopher and write...",entity,politics,politician,cultural representative,0.0,1.0,787.0,...,887.0,0.014019,918.0,0.026906,781.0,0.055046,982.0,0.007634,,
2,http://www.wikidata.org/entity/Q1752,Macrinus,Roman emperor from 217 to 218,entity,politics,politician,cultural representative,0.0,1.0,785.0,...,,,934.0,0.017937,671.0,0.042654,991.0,0.025,784.0,0.054054
3,http://www.wikidata.org/entity/Q1639,Lamine Diack,Senegalese sports manager (1933–2021),entity,politics,politician,cultural representative,0.0,1.0,801.0,...,,,925.0,0.030973,,,977.0,0.022222,,
4,http://www.wikidata.org/entity/Q9588,Richard Nixon,President of the United States from 1969 to 1974,entity,politics,politician,cultural representative,0.0,1.0,803.0,...,935.0,0.0,934.0,0.018433,783.0,0.061321,981.0,0.007812,793.0,0.043478
