In [3]:
# Shell escape: runs `open ./` in the system shell.
# NOTE(review): presumably the macOS `open` command (shows the working directory in the
# file browser) — confirm; it is not portable and nothing later in the notebook uses it.
!open ./

In [95]:
from pandas import DataFrame
class PrintableList(list):
    """A ``list`` that displays as a pandas table in notebook output."""

    def as_data_frame(self):
        """Return the items as a ``DataFrame``.

        When the list is non-empty and its first item exposes ``_fields``,
        those values are used as the column labels; otherwise pandas picks
        its default labels.
        """
        rows = list(self)
        if not rows or not hasattr(rows[0], "_fields"):
            return DataFrame(rows)
        return DataFrame(rows, columns=rows[0]._fields)

    def _repr_html_(self):
        """Return the HTML representation of ``as_data_frame()``."""
        frame = self.as_data_frame()
        return frame._repr_html_()

In [96]:
def read_wikipedia_data():
    """Placeholder: performs no work and returns ``None``."""
    return None

def parse_file(path):
    """Read a UTF-8, tab-separated file into a list of rows.

    Each non-empty line is split on tab characters and every field is
    percent-decoded with ``urllib.parse.unquote``.

    Parameters
    ----------
    path : str or path-like
        Location of the file to read.

    Returns
    -------
    list of list of str
        One inner list per non-empty line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # Imported here so the function does not depend on an import made in some other cell.
    from urllib.parse import unquote

    links = []
    with open(path, encoding="utf8") as f:
        # Iterate the file directly instead of materialising every line with readlines().
        for line in f:
            line = line.rstrip("\n")
            if not line:
                # An empty line would become [''] and break callers that unpack each row.
                continue
            links.append([unquote(field) for field in line.split("\t")])
    return links
# Tab-separated input files, later read with parse_file; both paths are relative to the
# notebook's working directory.
links_path = "./wikispeedia_paths-and-graph/links_edited.tsv"
category_path = "./wikispeedia_paths-and-graph/categories_edited.tsv"

class Article:
    """An article with a name, a category path and a list of linked article names.

    Instances are iterable (yielding name, categories, links) and expose
    ``_fields`` with the matching column labels, so a collection of them
    can be turned into a table.

    Parameters
    ----------
    name : str
        Article name.
    categories : str
        Dot-separated category path, optionally starting with ``"subject."``.
    """

    # Root segment removed from the front of every category path.
    _CATEGORY_ROOT = "subject."

    def __init__(self, name, categories):
        self.name = name
        # Strip the root only when it is the leading prefix: a plain replace() would also
        # eat "subject." inside a later segment (e.g. "School_subject.Maths").
        if categories.startswith(self._CATEGORY_ROOT):
            categories = categories[len(self._CATEGORY_ROOT):]
        self.categories = categories.split(".")
        self.links = []

    def add_link(self, article):
        """Append ``article`` to this article's links (duplicates are kept)."""
        self.links.append(article)

    @property
    def _fields(self):
        """Column labels, in the same order as the values produced by ``__iter__``."""
        return ["name", "categories", "links"]

    def __repr__(self):
        return "<{name} : {cat} => {links}>".format(name=self.name,
                                                   cat=self.categories,
                                                   links=list(self.links))

    def __iter__(self):
        """Iterate over name, categories and links, in that order."""
        return iter((self.name, self.categories, self.links))

# Build the article table from scratch on every run so the cell can be re-executed safely.
articles = {}
links_data = parse_file(links_path)
categories_data = parse_file(category_path)

# One Article per row of the category file, keyed by article name; a later row with the
# same name replaces the earlier one.
for article, categories in categories_data:
    articles[article] = Article(article, categories)

# Every link row is recorded on both of its endpoints.
for a, b in links_data:
    # Check both endpoints before touching either, so a missing article cannot leave a
    # one-sided link behind (previously `a` kept a link to an unknown `b`).
    if a not in articles or b not in articles:
        print("Cannot find key : {}".format((a,b)))
        continue
    articles[a].add_link(b)
    articles[b].add_link(a)

Cannot find key : ('Friend_Directdebit', 'Directdebit')
Cannot find key : ('Pac-Man', 'Pikachu')
Cannot find key : ('Pikachu', 'Book')
Cannot find key : ('Pikachu', 'Computer_and_video_games')
Cannot find key : ('Pikachu', 'Continent')
Cannot find key : ('Pikachu', 'Film')
Cannot find key : ('Pikachu', 'Forest')
Cannot find key : ('Pikachu', 'Homestar_Runner')
Cannot find key : ('Pikachu', 'Mammal')
Cannot find key : ('Pikachu', 'New_York_City')
Cannot find key : ('Pikachu', 'North_America')
Cannot find key : ('Pikachu', 'Pokémon_Trading_Card_Game')
Cannot find key : ('Pikachu', 'Popular_culture')
Cannot find key : ('Pikachu', 'The_Simpsons')
Cannot find key : ('Pikachu', 'Tree')
Cannot find key : ('Sonic_the_Hedgehog_(character)', 'Pikachu')
Cannot find key : ('Sponsorship_Directdebit', 'Directdebit')
Cannot find key : ('Sponsorship_Directdebit', 'Friend_Directdebit')


In [97]:
# Articles whose link list contains "Monty_Python".
python = PrintableList(
    [entry for entry in articles.values() if "Monty_Python" in entry.links]
)

In [98]:
# Bare expression: displays the PrintableList bound to `python` as a table.
python

Unnamed: 0,name,categories,links
0,"Minneapolis,_Minnesota","[Geography, North_American_Geography]","[Bob_Dylan, Canadian_Pacific_Railway, Chicago_..."
1,Actor,"[Language_and_literature, Theatre]","[Charles_II_of_England, Drama, Film, Greece, H..."
2,Weston-super-Mare,"[Geography, Geography_of_Great_Britain]","[Avon_Gorge, Monty_Python, Tourism, Bristol, E..."
3,Scotland,"[Geography, Geography_of_Great_Britain]","[Áedán_mac_Gabráin, Óengus_I_of_the_Picts, 10t..."
4,Guinness,"[Everyday_life, Drink]","[Beer, Dublin, Economy_of_the_Republic_of_Irel..."
5,World_War_I,"[History, Military_History_and_War]","[19th_century, 20th_century, A._E._J._Collins,..."
6,Music,"[Music, Musical_genres_styles_eras_and_events]","[19th_century, 20th_century, A_cappella, Aesth..."
7,England,"[Geography, Geography_of_Great_Britain]","[Óengus_I_of_the_Picts, 11th_century, 12th_cen..."
8,"Theatre_Royal,_Drury_Lane","[Language_and_literature, Theatre]","[Colley_Cibber, George_III_of_the_United_Kingd..."
9,Douglas_Adams,"[People, Writers_and_critics]","[Arsenal_F.C., Atheism, Cambridge, Dad's_Army,..."


In [99]:
from collections import Counter
# The ten most frequent values in the second field of the link rows, with their counts.
Counter(second for _first, second in links_data).most_common(10)

[('United_States', 1551),
 ('United_Kingdom', 972),
 ('France', 959),
 ('Europe', 933),
 ('England', 751),
 ('World_War_II', 751),
 ('Germany', 743),
 ('India', 611),
 ('English_language', 598),
 ('London', 587)]

In [100]:
# Every Article in `articles`, wrapped in a PrintableList so it can be shown as a table.
ls = PrintableList(articles.values())

In [103]:
import csv
# One row per (article, linked article) pair; category lists are re-joined with "." so
# each fits in a single CSV field.
data = []
for art in ls:
    for ln in art.links:
        linked = articles.get(ln)
        if linked is None:
            # The link names an article that has no entry in `articles`; leave it out.
            continue
        data.append([art.name, ".".join(art.categories), linked.name, ".".join(linked.categories)])

# newline="" hands line-ending control to the csv module, as its documentation requires;
# without it, platforms that translate "\n" write an extra "\r" on every row.
with open("wikipedia_relation_data.csv", "w", encoding="utf8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["article_name","article_category","linked_name","linked_category"])
    writer.writerows(data)

# Kept as the final expression of the cell so the table is what the cell displays.
PrintableList(data)

Unnamed: 0,0,1,2,3
0,Kuwait_City,Geography.Geography_of_the_Middle_East,15th_Marine_Expeditionary_Unit,History.Military_History_and_War
1,Kuwait_City,Geography.Geography_of_the_Middle_East,Asia,Geography.Geography_of_Asia
2,Kuwait_City,Geography.Geography_of_the_Middle_East,Kuwait,Geography.Geography_of_the_Middle_East.Middle_...
3,Kuwait_City,Geography.Geography_of_the_Middle_East,Arabic_language,Language_and_literature.Languages
4,Kuwait_City,Geography.Geography_of_the_Middle_East,Capital,Citizenship.Politics_and_government
5,Kuwait_City,Geography.Geography_of_the_Middle_East,City,Geography.General_Geography
6,Kuwait_City,Geography.Geography_of_the_Middle_East,Emirate,Citizenship.Politics_and_government
7,Kuwait_City,Geography.Geography_of_the_Middle_East,Iraq,Geography.Geography_of_the_Middle_East.Middle_...
8,Kuwait_City,Geography.Geography_of_the_Middle_East,Kuwait,Geography.Geography_of_the_Middle_East.Middle_...
9,Kuwait_City,Geography.Geography_of_the_Middle_East,Persian_Gulf,Geography.General_Geography
