In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from urllib.parse import unquote
import requests
import tarfile
import os

In [2]:
url = 'https://snap.stanford.edu/data/wikispeedia/wikispeedia_paths-and-graph.tar.gz'
file_name = 'wikispeedia_paths-and-graph.tar.gz'

if not os.path.exists("wikispeedia_paths-and-graph"):
    response = requests.get(url, stream=True)
    with open(file_name, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    
    with tarfile.open(file_name, 'r:gz') as tar:
        root_folder = os.path.commonprefix(tar.getnames())
        tar.extractall(path='.')

### Articles

- The list of all articles.
- Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
- FORMAT:   article

In [3]:
articles = pd.read_csv('wikispeedia_paths-and-graph/articles.tsv',  sep='\t', comment='#', header=None)

In [4]:
articles['articles'] = articles[0].apply(lambda x: unquote(x))
articles = articles.drop(0, axis=1)
articles

Unnamed: 0,articles
0,Áedán_mac_Gabráin
1,Åland
2,Édouard_Manet
3,Éire
4,Óengus_I_of_the_Picts
...,...
4599,Zionism
4600,Zirconium
4601,Zoroaster
4602,Zuid-Gelders


### Categories

- Hierarchical categories of all articles.
- Many articles have more than one category. Some articles have no category.
- Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
- FORMAT:   article   category

In [5]:
categories = pd.read_csv('wikispeedia_paths-and-graph/categories.tsv',  sep='\t', comment='#', header=None)
categories.columns = ['article', 'category']

for col in categories.columns:
    categories[col] = categories[col].apply(lambda x:unquote(x))

categories

Unnamed: 0,article,category
0,Áedán_mac_Gabráin,subject.History.British_History.British_Histor...
1,Áedán_mac_Gabráin,subject.People.Historical_figures
2,Åland,subject.Countries
3,Åland,subject.Geography.European_Geography.European_...
4,Édouard_Manet,subject.People.Artists
...,...,...
5199,Zirconium,subject.Science.Chemistry.Chemical_elements
5200,Zoroaster,subject.People.Religious_figures_and_leaders
5201,Zuid-Gelders,subject.Geography.European_Geography
5202,Zuid-Gelders,subject.Language_and_literature.Languages


### Links

- The list of all links between articles.
- Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
- FORMAT:   linkSource   linkTarget

In [6]:
links = pd.read_csv('wikispeedia_paths-and-graph/links.tsv',  sep='\t', comment='#', header=None)
links.columns = ['linkSource', 'linkTarget']

for col in links.columns:
    links[col] = links[col].apply(lambda x:unquote(x))

links

Unnamed: 0,linkSource,linkTarget
0,Áedán_mac_Gabráin,Bede
1,Áedán_mac_Gabráin,Columba
2,Áedán_mac_Gabráin,Dál_Riata
3,Áedán_mac_Gabráin,Great_Britain
4,Áedán_mac_Gabráin,Ireland
...,...,...
119877,Zulu,South_Africa
119878,Zulu,Swaziland
119879,Zulu,United_Kingdom
119880,Zulu,Zambia


### Finished 

- Successful (i.e., finished) Wikispeedia paths.
- Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
- Articles in a path are separated by ";".
- Back clicks are represented as "<".
- Ratings are optionally given by the user after finishing the game and range from 1 ("easy") to 5 ("brutal").
- Missing ratings are represented as "NULL".
- FORMAT:   hashedIpAddress   timestamp   durationInSec   path   rating

In [7]:
paths_finished_infos = pd.read_csv('wikispeedia_paths-and-graph/paths_finished.tsv',  sep='\t', comment='#', header=None)
paths_finished_infos.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating']
paths_finished_infos['path'].apply(lambda x: unquote(x))
paths_finished_infos

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [8]:
paths_finished = paths_finished_infos['path']
paths_finished

0        14th_century;15th_century;16th_century;Pacific...
1        14th_century;Europe;Africa;Atlantic_slave_trad...
2        14th_century;Niger;Nigeria;British_Empire;Slav...
3           14th_century;Renaissance;Ancient_Greece;Greece
4        14th_century;Italy;Roman_Catholic_Church;HIV;R...
                               ...                        
51313                     Yagan;Ancient_Egypt;Civilization
51314    Yagan;Folklore;Brothers_Grimm;<;19th_century;C...
51315    Yagan;Australia;England;France;United_States;T...
51316    Yarralumla%2C_Australian_Capital_Territory;Aus...
51317                              Ziad_Jarrah;Germany;Jew
Name: path, Length: 51318, dtype: object

### Unfinished Paths

- Unsuccessful (i.e., unfinished) Wikispeedia paths.
- Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
- Articles in a path are separated by ";".
- Back clicks are represented as "<".
- There are two types of quitting:
- (1) "**timeout**" means that no click was made for 30 minutes;
- (2) "**restart**" means that the user started a new game without finishing the current one.
- FORMAT:   hashedIpAddress   timestamp   durationInSec   path   target   type

In [9]:
paths_unfinished_infos = pd.read_csv('wikispeedia_paths-and-graph/paths_unfinished.tsv',  sep='\t', comment='#', header=None)
paths_unfinished_infos.columns = ['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type']
paths_unfinished_infos['path'].apply(lambda x: unquote(x))
paths_unfinished_infos

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout
...,...,...,...,...,...,...
24870,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,Cholera,restart
24871,232f992e57d43e8d,1389787697,6,Modern_history,Hollandic,restart
24872,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,The_Beatles,timeout
24873,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,Alan_Turing,timeout


In [10]:
paths_unfinished = paths_unfinished_infos['path']
paths_unfinished

0                                           Obi-Wan_Kenobi
1                                            Julius_Caesar
2                     Malawi;Democracy;Alexander_the_Great
3                                                 Paraguay
4                                         Paraguay;Bolivia
                               ...                        
24870    Franz_Kafka;Tuberculosis;World_Health_Organiza...
24871                                       Modern_history
24872    Computer_programming;Linguistics;Culture;Popul...
24873    Jamaica;United_Kingdom;World_War_II;Battle_of_...
24874            Mark_Antony;Rome;Tennis;Hawk-Eye;Computer
Name: path, Length: 24875, dtype: object

### Shortest Path Distance Matrix

- The shortest-path distances between all pairs of articles, computed using the Floyd-Warshall algorithm.
- FORMAT: One row per article (the "source" of the shortest paths), in the same order as in articles.tsv. 
- Each row contains the distances from the source to all articles (the "targets" of the shortest paths),
- again in the order of articles.tsv.
- The shortest-path distance is represented as a single digit, with no separators between values. This
- is possible because the longest shortest path happens to be of length 9.
- An underscore ("_") is used to indicate that the target cannot be reached from the source.

In [11]:
spdm = pd.read_csv('wikispeedia_paths-and-graph/shortest-path-distance-matrix.txt', comment='#', header=None)
spdm.columns = ['distance']
spdm = spdm['distance'].apply(lambda x: pd.Series(list(x)))

In [12]:
spdm_np = spdm.to_numpy()
print(spdm_np)

[['0' '_' '_' ... '4' '4' '2']
 ['_' '0' '_' ... '3' '3' '3']
 ['_' '_' '0' ... '3' '3' '3']
 ...
 ['_' '_' '_' ... '0' '3' '3']
 ['_' '_' '_' ... '4' '0' '3']
 ['_' '_' '_' ... '3' '3' '0']]
