In [1]:
# Setup 
from flask import Flask, render_template, request
import os
import pygraphviz as pgv
from pyvis.network import Network
from itertools import chain
import collections 
import networkx as nx
import requests
import json
# use dynetx for dynamic network visualization? -- when I can download and incoporate revision history? 

PATH = "/home/teijehidde/Documents/Git Blog and Coding/Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/"
DATA_FILE = "networkdataTEST.json" 

In [175]:
# Function: download data node_title from Wikimedia API and save to json file.  
def downloadNetwork(node_title, language = "en"): # other language are fr, nl, de. - ONLY ONE LANGUAGE CAN BE DEFINED HERE! 

    # setup and load existing data node.
    API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []

    # step 1: download data on the central (Ego) node of the network. 
    # setup API query and initial API call 
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())
    
    # Continue API call until all data on ego node has been downloaded. 
    while 'continue' in wiki_data[-1].keys():

        PARAMS["plcontinue"] = wiki_data[-1]['continue']['plcontinue']

        response = S.get(url=API_ENDPOINT, params=PARAMS)
        wiki_data.append(response.json())
    
    # step 2: use generator to download data on all additional nodes. 
    # setup API query for first generator API call (used to download data on all pages that are linked to node_title) 
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())

    # Continue API call until all data on network is downloaded. 
    while 'continue' in wiki_data[-1].keys():

        PARAMS["plcontinue"] = wiki_data[-1]['continue']['plcontinue']
        response = S.get(url=API_ENDPOINT, params=PARAMS)
        wiki_data.append(response.json())
    
    # step 3: transforming and saving data to JSON file.    
    # Loading previously saved link data.
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # transforming data from API into unified dictionary. 
    for item in wiki_data:
        for node in list(item['query']['pages'].keys()):
            node_data = {'node_ID': node, 'title': item['query']['pages'][node]['title'], 'links': [], 'ego': [], 'language': language, 'AvailableLanguages': []} 

            if node in network_data.keys(): 
                node_data = network_data[node]

            if 'links' in item['query']['pages'][node].keys():
                for link in item['query']['pages'][node]['links']: 
                    node_data['links'].append(link['title'])
            node_data['links'] = list(set(node_data['links']))

            node_data['ego'].append(node_title)
            node_data['ego'] = list(set(node_data['ego']))
            
            if 'langlinks' in item['query']['pages'][node].keys():
                node_data['AvailableLanguages'] = item['query']['pages'][node]['langlinks']
                        
            network_data[language + node] = node_data

    # Saving data to json file. 
    try: 
        with open(PATH + DATA_FILE, 'w') as outfile:
            json.dump(network_data, outfile)
            print("Data succesfully saved. Wiki network name: " + node_title + "; downloaded in language: " + language + ".")

    except: 
        print("Something went wrong. Check code.")

# optional, for debugging: 
#    finally:
#        return wiki_data


In [182]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_language = "en", requested_languages = []): # "de", "fr", "nl"

    # download data fro JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # make a list of the language that are available for requested page. 
    availabe_langs_titles = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == original_language][0]
    list_available_langs = []
    for item in availabe_langs_titles: 
        list_available_langs.append(item['lang'])

    # If no languages are requested, the function shows available languages. 
    if requested_languages == []:
        print('The wikipedia page is available in the following languages:')         
        print(list_available_langs)
    
    # Goes through avialble languages of a wikipedia page, and downloads those that were requested (using the downloadNetwork() function). 
    else:
        for item in availabe_langs_titles: 
            if item['lang'] in requested_languages: 
                downloadNetwork(node_title = item['*'], language = item['lang'])
    
    print("Download of additional languages finished.") 


In [178]:
# TEST 
# downloadNetwork('Michael_Lee_(basketball,_born_1986)')

Data succesfully saved. Wiki network name: Michael_Lee_(basketball,_born_1986); downloaded in language: en.


In [179]:
# TEST 
# downloadAdditionalLanguage(node_title='Michael_Lee_(basketball,_born_1986)', requested_languages = ['fr','it','de'])

Data succesfully saved. Wiki network name: Anoviara; downloaded in language: fr.
Data succesfully saved. Wiki network name: Anoviara; downloaded in language: it.
Download of additional languages finished. If no networks were downloaded, check if pages are available in requested language(s).


In [183]:
# TEST
downloadAdditionalLanguage(node_title='Japan', requested_languages = ['fr','it','de'])

TypeError: can only concatenate str (not "set") to str

In [154]:
# from here on scripts to use in runtime. 

In [155]:
# load data from json file. 
with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)

# create object that lists available networks 
ego_networks = [(v['ego']) for (k,v) in network_data.items()]
ego_networks = set(list(chain(*ego_networks)))

In [156]:
# initiate class Node. 
class WikiNode:
    def __init__(self,node_title):
        self.node_title = node_title
        self.node_links = network_data[node_title]['links']


In [157]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
    
    def __init__(self,node_title):
        # initiate network as class WikiNode, add additional attributes for class WikiNetwork 
        if node_title in available_networks:
            WikiNode.__init__(self, node_title)
            self.network_nodes = []
            self.network_edges = [] 
            self.network_status = []
        if node_title not in available_networks: 
            print("Node not available. Download using download Network function.") 
        
        # Links are here used to build the network.  
        node_title_links = [v['title'] for (k,v) in network_data.items() if v['ego'] == [node_title]]
        self.node_links.append(node_title_links)

        for link in self.node_links:
            Node2 = WikiNode(link)
            purged_nodes = [x for x in Node2.node_links if x in self.node_links]
            purged_edges = []
            for purged_node in purged_nodes:
                purged_edges.append((link,purged_node))  
            self.network_nodes = self.network_nodes + purged_nodes
            self.network_edges = self.network_edges + purged_edges                             
        self.nodes_count = Counter(self.network_nodes)
        print("Data Succesfully loaded.")

    def getStatusNetwork(self):     
        return Counter(self.network_status)

    def getNodesEdges(self,threshold):
        selected_nodes = [k for k,v in self.nodes_count.items() if float(v) >= threshold]
        selected_edges = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]

        nodes_network = []
        for node in selected_nodes:
            node_tuple = (node, {"name": node})
            nodes_network.append(node_tuple)

        return (nodes_network,selected_edges)

In [158]:
def drawGraph(WikiNodesEdges):
    
    Graph = nx.Graph()

    # Graph.add_nodes_from(WikiNodesEdges[0])
    Graph.add_edges_from(WikiNodesEdges[1])

    netdraw = Network('2000px', '2000px')
    netdraw.from_nx(Graph)
    netdraw.barnes_hut()

    netdraw.show("wikigraphEN.html")

In [140]:
# FROM HERE RUN TIME STARTS 

In [159]:
downloadNetwork('London')

KeyError: 'plcontinue'

In [151]:
test = WikiNetwork('Netrin')

KeyError: 'Netrin'

In [420]:
# setup objects 


wiki_dicts = []

In [421]:
# setup API query and initial API call 
node_title = ""
S = requests.Session()
PARAMS = {
    "action": "query",
    "generator": "links",
    "titles": node_title,
    "gplnamespace": 0, 
    "gpllimit": 500, 
    "plnamespace": 0, 
    "pllimit": 500, 
    "prop": "links",
    "format": "json"
}
response = S.get(url=API_ENDPOINT, params=PARAMS)
wiki_dicts.append(response.json())


In [424]:
test = response.json()

In [433]:
test["query"]["pages"]['title'].keys()

KeyError: 'title'

In [24]:
# Continue API call until all data on network is downloaded. 
node_title = 'Robert_L._Spencer'
language = 'en'
wiki_data = test[1]
network_data = {}

In [25]:
for item in wiki_data:
    for node in list(item['query']['pages'].keys()):
        node_data = {'node_ID': node, "title": item['query']['pages'][node]['title'], 'links': [], 'ego': [], "language": []} 

        if node in network_data.keys(): 
            node_data = network_data[node]

        node_data['ego'].append(node_title)
        node_data['language'].append(language)
        if 'links' in item['query']['pages'][node].keys():
            for link in item['query']['pages'][node]['links']: 
                node_data['links'].append(link['title'])
        
        node_data['ego'] = list(set(node_data['ego']))
        node_data['language'] = list(set(node_data['language']))
        node_data['links'] = list(set(node_data['links']))
        
        network_data[node] = node_data

In [28]:
with open(PATH + DATA_FILE, 'w') as outfile:
    json.dump(network_data, outfile)

In [511]:
wiki_links = set()

for link in wiki_dicts[0]['query']['pages']['14019']['links']:
    wiki_links.add(link['title'])

In [512]:
wiki_links

{'ALAM',
 'Alliance of Automobile Manufacturers',
 'American Society of Mechanical Engineers',
 'Automobile',
 'BTU',
 'Basal rate',
 'Bibcode (identifier)',
 'Bore (engine)',
 'Brake specific fuel consumption',
 'Chevrolet Corvette',
 'Chrysler FirePower engine',
 'Citroën 2CV',
 'DIN',
 'Deutsches Institut für Normung',
 'Diesel engine',
 'Directional drilling',
 'Directive 80/1269/EEC',
 'Doi (identifier)',
 'Draft horse',
 'Draft horses',
 'Drawbar (haulage)',
 'Drilling mud',
 'Drilling rig',
 'Dynamometer',
 'Dynamometer car',
 'Electric motor',
 'Encyclopædia Britannica Eleventh Edition',
 'Ente Nazionale Italiano di Unificazione',
 'European units of measurement directives',
 'Exhaust manifold',
 'Flywheel',
 'Foot (unit)',
 'Foot-pound (energy)',
 'Force',
 'GM LS engine',
 'General Conference on Weights and Measures',
 'Germany',
 'HMS Agincourt (1865)',
 'HMS Albacore (1856)',
 'HMS Bellerophon (1865)',
 'HMS Dee (1832)',
 'HMS Harpy (1845)',
 'HMS Hector (1862)',
 'HMS Jack

In [502]:
'links' in wiki_dicts[0]['query']['pages']['14019'].keys()

True

In [507]:
wiki_dicts[0]['query']['pages']['14019']['links'][1]['title']

'Alliance of Automobile Manufacturers'

In [388]:
test.update(wiki_dicts[1])

In [485]:
for item in list(wiki_dicts): 
    for page in item['query']['pages']: 
        print(len(page.keys())) 

AttributeError: 'str' object has no attribute 'keys'

In [498]:
wiki_dicts[0]['query']['pages']['14019'].keys()

dict_keys(['pageid', 'ns', 'title', 'links'])

In [497]:
test = int(list(wiki_dicts[0]['query']['pages'].keys()))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

In [494]:
test.sort()

In [521]:
with open(PATH + DATA_FILE, 'w') as outfile:
    json.dump(network_data, outfile)

TypeError: Object of type set is not JSON serializable

In [437]:
node_data = {'node_ID': "node", 'links': "links_wiki", 'ego': set(), "language": set()}

In [439]:
type(node_data['ego'])

set

In [246]:
data_wiki_temp['continue']['plcontinue']

'1256|0|Claudius'

In [470]:
data_wiki['query']['pages']['12789341'].keys()

TypeError: 'set' object is not subscriptable

In [342]:
language = "en"

In [343]:
API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php"

In [344]:
API_ENDPOINT




'https://en.wikipedia.org/w/api.php'

In [88]:
len(test)

2