In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime (force_remount=True is passed on every run).
drive_root = '/content/drive'
drive.mount(drive_root, force_remount=True)

# Drive folder that the cells below read the WikiLinkGraphs files from / write them to.
dataset_path = '/content/drive/My Drive/IR Proposal/WikiLinkGraphs'

Mounted at /content/drive


# Downloading WikiLinkGraphs Dataset

In [39]:
# Download configuration: language editions to fetch and the snapshot to use.
languages = 'de en es fr it nl pl ru sv'.split()

# Snapshot date embedded in the file names; use '2010-03-01' for the 2010 data.
snapshot_date = '2018-03-01'
suffix = 'wiki.wikilink_graph.' + snapshot_date + '.csv.gz'

# Query string of the download links; it ends up in the saved file names too.
download = '?download=1'

In [3]:
# Download zipped data
for lang in languages: # Make sure correct year is specified in the below link
  !wget -P '/content/drive/My Drive/IR Proposal/WikiLinkGraphs' https://zenodo.org/records/2539424/files/{lang}wiki.wikilink_graph.2010-03-01.csv.gz?download=1


--2024-04-03 07:30:37--  https://zenodo.org/records/2539424/files/dewiki.wikilink_graph.2010-03-01.csv.gz?download=1
Resolving zenodo.org (zenodo.org)... 188.184.98.238, 188.185.79.172, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.184.98.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 322664409 (308M) [text/plain]
Saving to: ‘/content/drive/My Drive/IR Proposal/WikiLinkGraphs/dewiki.wikilink_graph.2010-03-01.csv.gz?download=1’


2024-04-03 07:30:52 (20.7 MB/s) - ‘/content/drive/My Drive/IR Proposal/WikiLinkGraphs/dewiki.wikilink_graph.2010-03-01.csv.gz?download=1’ saved [322664409/322664409]

--2024-04-03 07:30:52--  https://zenodo.org/records/2539424/files/enwiki.wikilink_graph.2010-03-01.csv.gz?download=1
Resolving zenodo.org (zenodo.org)... 188.184.103.159, 188.185.79.172, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.184.103.159|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 979640321 (934M) 

In [4]:
import os
import gzip
import shutil

# Decompress every downloaded snapshot (2018 or 2010, depending on `suffix`)
# into the same Drive folder.
for lang in languages:
    # The archive was saved under its URL name, i.e. including the query string.
    lang_path = os.path.join(dataset_path, lang + suffix + download)
    # Target name: the archive name without '.gz' and without the query string.
    # It is derived from `suffix` instead of cutting a fixed 14 characters off
    # `lang_path`, so it stays correct if `download` ever changes.
    output_file = os.path.join(dataset_path, lang + suffix[:-len('.gz')])
    print(f'Unzipping {lang} to {output_file}...')
    # Stream the decompressed bytes to disk; the files are too large to hold in memory.
    with gzip.open(lang_path, 'rb') as file_in, open(output_file, 'wb') as file_out:
        shutil.copyfileobj(file_in, file_out)
    print(f'Finished unzipping {lang}.')

Unzipping de to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/dewiki.wikilink_graph.2010-03-01.csv...
Finished unzipping de.
Unzipping en to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/enwiki.wikilink_graph.2010-03-01.csv...
Finished unzipping en.
Unzipping es to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/eswiki.wikilink_graph.2010-03-01.csv...
Finished unzipping es.
Unzipping fr to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/frwiki.wikilink_graph.2010-03-01.csv...
Finished unzipping fr.
Unzipping it to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/itwiki.wikilink_graph.2010-03-01.csv...
Finished unzipping it.
Unzipping nl to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/nlwiki.wikilink_graph.2010-03-01.csv...
Finished unzipping nl.
Unzipping pl to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/plwiki.wikilink_graph.2010-03-01.csv...
Finished unzipping pl.
Unzipping ru to /content/drive/My Drive/IR Proposal/WikiLinkGraphs/ruwiki.wikilink_graph.2

# Exploring the Graph

In [5]:
# Clear the large analysis objects so nothing from a previous run is kept alive.
df = G = edgelist = None

In [7]:
import networkx as nx
import os
import pandas as pd

# Language editions of the WikiLinkGraphs dataset to analyse.
languages = 'de en es fr it nl pl ru sv'.split()

# Disabled: ways to load a single edge list into `df`.
# df = pd.read_csv(os.path.join(dataset_path, lang + suffix[:-3]), sep='\t')
# df = pd.read_csv(os.path.join(dataset_path, 'experiment.csv'), sep='\t')

In [6]:
# Display the currently loaded edge list (the notebook renders it as a truncated table).
# NOTE(review): both read_csv calls in the previous cell are commented out and an
# earlier cell sets df = None, so on a fresh kernel this shows nothing — confirm
# which file the saved output came from.
df

Unnamed: 0,page_id_from,page_title_from,page_id_to,page_title_to
0,10,AccessibleComputing,411964,Computer accessibility
1,12,Anarchism,5013592,6 February 1934 crisis
2,12,Anarchism,2181459,Abstentionism
3,12,Anarchism,839656,Adolf Brand
4,12,Anarchism,2731583,Adolf Hitler
...,...,...,...,...
163380002,56716274,Patriarch Visarion I,50787040,Vissarion of Bulgaria
163380003,56716279,Visarion I,50787040,Vissarion of Bulgaria
163380004,56722248,Friction stir spot welding,56705430,Cuvillier Verlag
163380005,56722248,Friction stir spot welding,30809609,International Institute of Welding


In [40]:
# Per-language summary of the link graph: number of distinct articles, their share
# of the edge-list rows, average degree and density.
for lang in languages:
  # Drop references to the previous iteration's objects before loading the next file.
  df = None
  G = None
  edgelist = None

  # Only the two id columns are used below, so the title columns are skipped at
  # read time instead of being loaded and then dropped; this keeps memory use down.
  edgelist = pd.read_csv(
      os.path.join(dataset_path, lang + suffix[:-3]),
      sep='\t',
      usecols=['page_id_from', 'page_id_to'],
  )
  num_rows = len(edgelist)

  # An article is any page id that occurs as a link source or a link target.
  # Columns are addressed by name, matching the graph construction below.
  c0 = set(edgelist['page_id_from'].unique())
  c2 = set(edgelist['page_id_to'].unique())
  num_unique = len(c0.union(c2))
  print(f'{lang} Number of articles:', num_unique)
  print(f'{lang} Fraction of total entries: {num_unique/num_rows*100:0.2f}%')

  # Create the directed graph, then release the frame.
  G = nx.from_pandas_edgelist(edgelist, source='page_id_from', target='page_id_to', create_using=nx.DiGraph())
  edgelist = None

  # Calculate the degree of each node
  degrees = dict(G.degree())
  # Calculate the average degree
  average_degree = sum(degrees.values()) / len(degrees)
  print(f'{lang} Average Degree:', average_degree)

  # Graph density
  density = nx.density(G)
  print(f'{lang} Density:', density)

pl Number of articles: 1684606
pl Fraction of total entries: 6.50%
ru Number of articles: 3360531
ru Fraction of total entries: 8.99%
nl Number of articles: 2626527
nl Fraction of total entries: 10.17%
sv Number of articles: 6131736
sv Fraction of total entries: 11.70%


In [37]:
# Sanity check on the Swedish snapshot: distinct values in the first and third
# columns, their naive sum, and the size of their union.
sv_file = os.path.join(dataset_path, 'sv' + suffix[:-3])
df = pd.read_csv(sv_file, sep='\t')

c0 = set(df.iloc[:, 0].unique())
c2 = set(df.iloc[:, 2].unique())
num_unique = len(c0 | c2)

print(len(c0), len(c2), len(c0) + len(c2), num_unique)

531677 404682 936359 542900


# Matching Articles by ID

In [None]:
%pip install datasets pywikibot qwikidata wikidataintegrator pandas numpy matplotlib seaborn requests qwikidata
%pip install tqdmL

In [12]:
class ArticleInfo:
  """Plain container for the fields extracted for one article/item.

  Attributes:
    name: label of the item.
    categories: labels of the items referenced by its "instance of" claims.
    subclass_of: labels of the items referenced by its "subclass of" claims.
    part_of: labels of the items referenced by its "part of" claims.
    has_part: labels of the items referenced by its "has part" claims.
  """

  def __init__(self, name, categories=None, subclass_of=None, part_of=None, has_part=None):
    """Store the given fields; every omitted list defaults to a new empty list.

    The defaults are None rather than [] because a mutable default is created
    once and would be shared by every instance built without that argument.
    Lists passed in by the caller are stored as-is (not copied).
    """
    self.name = name
    self.categories = [] if categories is None else categories
    self.subclass_of = [] if subclass_of is None else subclass_of
    self.part_of = [] if part_of is None else part_of
    self.has_part = [] if has_part is None else has_part

In [19]:
def get_label_for_qid(qid, lang: str = 'en'):
    """Return the label of a Wikidata item in the requested language.

    Args:
        qid: Wikidata item id, e.g. 'Q38533'.
        lang: language code passed to the item's get_label(). It defaults to 'en'
            so that callers which pass only the id keep working.

    Returns:
        Whatever WDItemEngine.get_label(lang) returns for the item.
        NOTE(review): the saved notebook output shows '' for one lookup, which
        looks like the "no label" result — confirm against the library.
    """
    # A WDItemEngine is constructed for the id on every call.
    item = wdi_core.WDItemEngine(wd_item_id=qid)
    return item.get_label(lang)

def _claim_target_qids(item_data, property_id):
    """Return the item ids that the claims of one property point to."""
    qids = []
    for claim in item_data.get('claims', {}).get(property_id, []):
        # Claims whose main snak carries no 'datavalue' (or a value without an
        # 'id') are skipped instead of raising KeyError.
        value = claim.get('mainsnak', {}).get('datavalue', {}).get('value')
        if isinstance(value, dict) and 'id' in value:
            qids.append(value['id'])
    return qids


def fetch_article_info(wikidata_id, lang: str = 'en'):
    """Fetch one Wikidata item and collect the labels of selected related items.

    Args:
        wikidata_id: Wikidata item id, e.g. 'Q38533'.
        lang: language code used for every label lookup; defaults to 'en',
            the language the function always used for the item's own name.

    Returns:
        An ArticleInfo built from the item's label and the labels of the items
        referenced by its P31, P279, P361 and P527 claims, in that order.
    """
    item = wdi_core.WDItemEngine(wd_item_id=wikidata_id)
    item_data = item.get_wd_json_representation()

    name = item.get_label(lang)

    def related_labels(property_id):
        # The language is passed explicitly: get_label_for_qid takes it as a
        # parameter, and calling it with only the id raised TypeError.
        return [get_label_for_qid(qid, lang)
                for qid in _claim_target_qids(item_data, property_id)]

    category_names = related_labels('P31')
    subclass_of_names = related_labels('P279')
    part_of_names = related_labels('P361')
    has_part_names = related_labels('P527')

    return ArticleInfo(name, category_names, subclass_of_names, part_of_names, has_part_names)

In [14]:
from wikidataintegrator import wdi_core

# Fetch one item by id and keep its raw JSON representation for inspection.
sample_qid = 'Q38533'
item = wdi_core.WDItemEngine(wd_item_id=sample_qid)
item_data = item.get_wd_json_representation()

In [17]:
# Fetch the extracted fields for one item id and print each of them.
# NOTE(review): 56716274 also appears as a Wikipedia page_id in the edge list
# displayed earlier ("Patriarch Visarion I"). Wikipedia page ids are not Wikidata
# QIDs, and the saved output (empty name, instance of 'photograph') suggests this
# resolves to an unrelated item — confirm the intended QID.
wikidata_id = "Q56716274"
article_info = fetch_article_info(wikidata_id)

print(f"name: {article_info.name}")
print(f"instance of: {article_info.categories}")
print(f"subclass of: {article_info.subclass_of}")
print(f"part of: {article_info.part_of}")
print(f"has part: {article_info.has_part}")

name: 
instance of: ['photograph']
subclass of: []
part of: []
has part: []


In [23]:
# Look up the label of Q11111 in the language with code 'ch'.
# NOTE(review): 'ch' is the code for Chamorro; if Chinese was intended the code
# would be 'zh' — the saved output is an empty string, so please confirm.
get_label_for_qid("Q11111", 'ch')

''

In [None]:
import requests

def get_article_id(article_name, language_code):
    """Look up the Wikidata item id (QID) linked to a Wikipedia article.

    Args:
        article_name: title of the article, e.g. "Albert Einstein".
        language_code: language edition of Wikipedia, e.g. "en".

    Returns:
        The QID as a string, or None when no item is linked to that title.

    Raises:
        requests.RequestException: on network failure, timeout or an HTTP error status.
    """
    # wbgetentities is a Wikidata API module, so the request goes to wikidata.org;
    # the Wikipedia language edition is selected through the `sites` parameter.
    url = "https://www.wikidata.org/w/api.php"

    params = {
        "action": "wbgetentities",
        "sites": f"{language_code}wiki",
        "titles": article_name,
        "props": "info",  # only the id is needed, not labels/claims/sitelinks
        "format": "json",
    }

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()

    # A title without an item comes back as a placeholder entry (key "-1")
    # carrying a "missing" marker; that placeholder must not be reported as an id.
    for entity_id, entity in (data.get("entities") or {}).items():
        if "missing" not in entity:
            return entity_id

    # Return None if article ID is not found
    return None

# Example: resolve one English Wikipedia title and report the outcome.
article_name, language_code = "Albert Einstein", "en"
article_id = get_article_id(article_name, language_code)

if not article_id:
    print(f"Article '{article_name}' not found in {language_code} Wikipedia.")
else:
    print(f"Article ID for '{article_name}' in {language_code} Wikipedia: {article_id}")

# Getting average article length

In [None]:
!pip install -q tensorflow-datasets

In [None]:
import os

# Drive folder that holds the wiki40b data.
# path = '/content/drive/My Drive/shared folder/IR Proposal/wiki40b_multi_languages'
path = '/content/drive/My Drive/IR Proposal/wiki40b_multi_languages'
# exist_ok=True creates the folder when it is missing and is a no-op when it is
# already there, without a separate existence check.
os.makedirs(path, exist_ok=True)

In [None]:
import tensorflow_datasets as tfds
import os

languages = ['en', 'de', 'ru', 'fr', 'es', 'it', 'pt', 'nl', 'hu', 'sv', 'fi', 'no', 'ro', 'tr', 'da']

# First download data to drive
# for lang in languages:
    # ds, _ = tfds.load(f'wiki40b/{lang}', split='train', with_info=True, download=True, data_dir=path)
# Subsequently just load data from drive instead of downloading
# ds_en, info = tfds.load(f'wiki40b/en', split='train', with_info=True, download=False, data_dir=path)

# Report the number of training articles and the average article length per language.
for lang in languages:
  ds, info = tfds.load(f'wiki40b/{lang}', split='train', with_info=True, download=False, data_dir=path)
  num_articles = len(ds)
  print('LANGUAGE:', lang)
  print('NUMBER OF ARTICLES:', num_articles)
  # .numpy() on the text feature yields UTF-8 bytes; decode first so the length is
  # counted in characters and stays comparable across scripts (a Cyrillic letter
  # takes two bytes).
  total_chars = sum(len(article["text"].numpy().decode("utf-8")) for article in ds)
  article_avg_len = total_chars / num_articles
  # The label previously always said "en", whatever language was being processed.
  print(f"avg len of article => {lang}: {article_avg_len}")