# Data Prep

In [1]:
%pip install datasets tensorflow pandas python-terrier transformers sklearn tf-keras sentence-transformers sentencepiece torchvision torchaudio
%pip install --upgrade datasets ipywidgets

Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25l- error
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[15 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
  [31m   [0m - if the 'sklearn' package is used by one of your dependencies,
  [31m   [0m   it would be great if you take some time to track which package uses
  [31m   [0m   'sklearn' instead of 'sciki

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from datasets import load_dataset
from concurrent.futures import ThreadPoolExecutor, as_completed

import mltable
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

import pyterrier as pt
from transformers import pipeline, AutoModel, AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
# Print the version string of the imported pyterrier package.
print(pt.__version__)

In [None]:
# Build the Azure ML client via MLClient.from_config with default Azure
# credentials, then fetch version "1" of the "ir-project" data asset.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())
data_asset = ml_client.data.get("ir-project", version="1")

base_path = data_asset.path
filenames = ["de_train", "en_train", "fr_train"]

# One raw DataFrame per CSV under the asset path, keyed by the file's base name.
dataframes = {
    filename: pd.read_csv(os.path.join(base_path, f"{filename}.csv"))
    for filename in filenames
}

In [None]:
# Display the dict of raw DataFrames, keyed by CSV base name.
dataframes

In [None]:
# Seed NumPy's legacy global RNG so np.random draws are repeatable.
# The seed function must be *called*: assigning `np.random.seed = 42` would
# replace the function with an int and seed nothing.
np.random.seed(42)

In [None]:
# Characters that survive cleaning by default: ASCII letters and digits, a few
# punctuation and currency symbols, the space and the backslash.
_ALLOWED_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.,;?!$%₤£€&*/[{}] \\"
)


def clean_text(text, extra_allowed=""):
    """Strip structural markers from an article and drop non-whitelisted characters.

    The markers ``_START_ARTICLE_``, ``_START_SECTION_`` and ``_NEWLINE_`` are
    each replaced by a single space; ``_START_PARAGRAPH_`` is replaced by ``;``.
    Every remaining character that is not in the whitelist is removed.

    Args:
        text: The raw article string.
        extra_allowed: Optional string of additional characters to keep, e.g.
            ``"äöüÄÖÜß"`` for German or ``"éèêàçù"`` for French. With the
            default (empty) value the whitelist is ASCII-centric, so accented
            letters are removed ("Müller" becomes "Mller").

    Returns:
        The cleaned string.
    """
    text = text.replace("_START_ARTICLE_", " ")
    text = text.replace("_START_PARAGRAPH_", ";")
    text = text.replace("_START_SECTION_", " ")
    text = text.replace("_NEWLINE_", " ")

    # Only build an extended set when the caller asked for extra characters.
    allowed = _ALLOWED_CHARS.union(extra_allowed) if extra_allowed else _ALLOWED_CHARS
    return ''.join(char for char in text if char in allowed)

In [None]:
# Display the raw frame read from en_train.csv.
dataframes['en_train']

In [None]:
# Display the raw frame read from de_train.csv.
dataframes['de_train']

In [None]:
# Display the raw frame read from fr_train.csv.
dataframes['fr_train']

In [None]:
def _as_text(value):
    """Return ``value`` decoded as UTF-8 when it is ``bytes``, otherwise unchanged."""
    return value.decode('utf-8') if isinstance(value, bytes) else value


# Raw per-language frames keyed by two-letter code; each entry is replaced
# below by a cleaned frame with columns text / version_id / wikidata_id.
datasets = {'en': dataframes['en_train'],
            'de': dataframes['de_train'],
            'fr': dataframes['fr_train']}

for lang in datasets.keys():
    df = datasets[lang]
    print('language:', lang)
    print('# articles:', len(df['text']))

    # Iterate over the columns directly rather than df.iterrows(): iterrows
    # builds a pandas Series for every row, which is needlessly slow on large
    # frames. The progress bar tracks the text column, where the work happens.
    texts = [clean_text(_as_text(raw)) for raw in tqdm(df['text'], total=len(df))]
    version_ids = [int(_as_text(raw)) for raw in df['version_id']]
    wikidata_ids = [_as_text(raw) for raw in df['wikidata_id']]

    # Building from a dict of columns yields a fresh 0..n-1 index and keeps the
    # three columns even when the input frame is empty.
    datasets[lang] = pd.DataFrame({
        'text': texts,
        'version_id': version_ids,
        'wikidata_id': wikidata_ids,
    })

In [None]:
# Display the cleaned English frame.
datasets['en']

In [None]:
# Display only the cleaned 'text' column of the English frame.
datasets['en']['text']

In [None]:
langs = ['en', 'de', 'fr']
output_path = 'Users/kgar/'

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
os.makedirs(output_path, exist_ok=True)

# Write one "<lang>_train_formatted.csv" per language. The default index=True
# is kept on purpose so the on-disk layout (leading unnamed index column) is
# unchanged for anything that already reads these files.
for lang in langs:
    datasets[lang].to_csv(os.path.join(output_path, lang + '_train_formatted.csv'))
datasets

In [None]:
# dict.copy() is shallow: the new dict would still reference the same
# DataFrame objects, so column edits made through `df` would also alter
# `datasets`. Copy every frame so `datasets` keeps its current contents.
df = {lang_code: frame.copy() for lang_code, frame in datasets.items()}

In [None]:
langs = ['en', 'de', 'fr']

# The part of 'text' before the first ';' becomes 'title'; everything after
# that first ';' stays in 'text' (empty string when there is no ';').
for lang in langs:
    frame = df[lang]
    pieces = frame['text'].map(lambda article: article.partition(';'))
    frame['title'] = pieces.map(lambda parts: parts[0])
    frame['text'] = pieces.map(lambda parts: parts[2])

In [None]:
# Preview the first rows of the English frame after the title/text split.
df['en'].head()

In [None]:
# Write "<lang>_train_formatted_with_titles.csv" for each language, in the
# order en, de, fr, using pandas' default to_csv options.
for lang in ('en', 'de', 'fr'):
    target = os.path.join(output_path, lang + '_train_formatted_with_titles.csv')
    df[lang].to_csv(target)

In [None]:
output_path = 'Users/kgar/'
filenames = [
    "de_train_formatted_with_titles",
    "en_train_formatted_with_titles",
    "fr_train_formatted_with_titles",
]

# `df` is reset to an empty dict here; the frames read below are stored in
# `dataframes`, not in `df`.
df = {}

# Read each CSV back and register it under the first two characters of its
# file name ('de', 'en', 'fr'), alongside whatever `dataframes` already holds.
dataframes.update(
    (filename[:2], pd.read_csv(os.path.join(output_path, f"{filename}.csv")))
    for filename in filenames
)

In [None]:
# Display the dict of frames, which now also holds the reloaded 'de', 'en' and 'fr' entries.
dataframes

In [None]:
def _rows_with_ids(frame, ids):
    """Return the rows of ``frame`` whose 'wikidata_id' value is in ``ids``."""
    return frame[frame['wikidata_id'].isin(ids)]


def _stack(frames):
    """Concatenate ``frames`` row-wise under a fresh 0..n-1 index.

    The trailing ``reset_index()`` additionally stores that index in an
    'index' column; it is kept so the resulting frames have the same columns
    as before.
    """
    return pd.concat(frames, ignore_index=True).reset_index()


dataset_en = dataframes['en']
dataset_de = dataframes['de']
dataset_fr = dataframes['fr']

# Compare ids as strings in all three frames (this overwrites the column in place).
for frame in (dataset_en, dataset_de, dataset_fr):
    frame['wikidata_id'] = frame['wikidata_id'].astype(str)

en_ids = set(dataset_en['wikidata_id'])

# English/German: ids present in both, then the matching rows (en first, de second).
common_de_ids = en_ids.intersection(set(dataset_de['wikidata_id']))
common_de_en = _rows_with_ids(dataset_en, common_de_ids)
common_de_de = _rows_with_ids(dataset_de, common_de_ids)
common_de = _stack([common_de_en, common_de_de])

# English/French: same construction (en first, fr second).
common_fr_ids = en_ids.intersection(set(dataset_fr['wikidata_id']))
common_fr_en = _rows_with_ids(dataset_en, common_fr_ids)
common_fr_fr = _rows_with_ids(dataset_fr, common_fr_ids)
common_fr = _stack([common_fr_en, common_fr_fr])

# All three languages: ids shared by both pairs; rows stacked as de, fr, en.
common_de_fr_ids = common_de_ids.intersection(common_fr_ids)
common_de_fr = _stack([
    _rows_with_ids(dataset_de, common_de_fr_ids),
    _rows_with_ids(dataset_fr, common_de_fr_ids),
    _rows_with_ids(dataset_en, common_de_fr_ids),
])

# Report the number of shared ids. The stacked frames hold one row per
# language for each article, so their lengths would count every article
# two or three times.
print('number of common articles across en and de:', len(common_de_ids))
print('number of common articles across en and fr:', len(common_fr_ids))
print('number of common articles across en, de and fr:', len(common_de_fr_ids))

In [None]:
# Spot-check: show the en/de rows whose wikidata_id is 'Q191828'.
common_de[common_de['wikidata_id'] == 'Q191828']

In [None]:
# Spot-check: show the en/de/fr rows whose wikidata_id is 'Q191828'.
common_de_fr[common_de_fr['wikidata_id'] == 'Q191828']

In [None]:
# Spot-check: show the en/fr rows whose wikidata_id is 'Q191828'.
common_fr[common_fr['wikidata_id'] == 'Q191828']

In [None]:
# Write each combined frame to "<name>_final.csv" under output_path and print
# its name once the write has returned.
for name, frame in (
    ('common_de', common_de),
    ('common_fr', common_fr),
    ('common_de_fr', common_de_fr),
):
    frame.to_csv(os.path.join(output_path, name + '_final.csv'))
    print(name)