In [6]:
# Setup 
from flask import Flask, render_template, request
import os
import pygraphviz as pgv
from pyvis.network import Network
from itertools import chain
from collections import Counter
import collections 
import networkx as nx
import requests
import json
# TODO: consider dynetx for dynamic network visualization once revision history can be downloaded and incorporated.

# Folder and file name of the JSON store that holds every downloaded network.
# PATH is concatenated directly with DATA_FILE, so it must end with a path separator.
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere.
PATH = "/home/teijehidde/Documents/Git Blog and Coding/Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/"
DATA_FILE = "DATA.json" 

# Loading JSON file.
# On a first run the store does not exist yet. Start from an empty network in
# that case, so downloadNetwork() can create the file instead of this cell
# failing with FileNotFoundError.
if os.path.isfile(PATH + DATA_FILE):
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
else:
    network_data = {}

In [267]:
# Function: download data node_title from Wikimedia API and save to json file.  
def _fetch_all_batches(session, endpoint, base_params):
    """Run one MediaWiki query and follow its continuation until it is exhausted.

    Returns the list of decoded JSON responses, in the order they were received.
    """
    batches = []
    continuation = {}
    while True:
        # Build a fresh dict every round: the base query plus ONLY the values of
        # the most recent 'continue' object. Re-using one mutated dict would keep
        # sending an old 'plcontinue' after the generator has moved on.
        params = {**base_params, **continuation}
        response = session.get(url=endpoint, params=params)
        response.raise_for_status()
        batch = response.json()
        batches.append(batch)

        if 'continue' not in batch:
            return batches
        continuation = batch['continue']


# Function: download data node_title from Wikimedia API and save to json file.
def downloadNetwork(node_title, language = "en"): 
    """Download the link network around a Wikipedia page and store it in the JSON file.

    Two queries are sent to the API of the requested language edition:
    one for the page itself (links, info and language links) and one that uses
    the page's links as a generator to fetch the links of every linked page.
    The results are merged into the module-level ``network_data`` dictionary
    under the key ``language + page id`` and ``network_data`` is then written
    to ``PATH + DATA_FILE``.

    Parameters
    ----------
    node_title : str
        Title of the central (ego) page.
    language : str, optional
        Language code used as the wikipedia.org subdomain (default "en").

    Returns
    -------
    None. The outcome of the save step is reported with print().

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    api_endpoint = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org

    # step 1: query for the central (Ego) node of the network.
    ego_params = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }

    # step 2: generator query for all pages that are linked to from node_title.
    network_params = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }

    # One session for both queries; the with-block closes it afterwards.
    with requests.Session() as session:
        wiki_data = _fetch_all_batches(session, api_endpoint, ego_params)
        wiki_data += _fetch_all_batches(session, api_endpoint, network_params)

    # step 3: transforming data from API into unified dictionary.
    # A response without 'query'/'pages' (e.g. an API error) contributes nothing.
    page_batches = [batch.get('query', {}).get('pages', {}) for batch in wiki_data]

    # 3a: collect the ids of all pages that occur in any batch.
    node_ids = set()
    for pages in page_batches:
        node_ids.update(pages)

    # 3b: merge the batches page by page.
    for node in node_ids:
        item_name = language + node

        # Continue from the stored record when this page was downloaded before.
        if item_name in network_data:
            node_data = network_data[item_name]
        else:
            node_data = {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': language, 'AvailableLanguages': []}

        langlinks = []
        for pages in page_batches:
            if node not in pages:
                continue
            page = pages[node]

            node_data['title'] = page['title']
            for link in page.get('links', []):
                node_data['links'].append(link['title'])
            # Language links may be spread over several continuation batches.
            langlinks.extend(page.get('langlinks', []))
            node_data['ego'].append(node_title)

        if langlinks:
            node_data['AvailableLanguages'] = langlinks

        # Downloading the same network again must not duplicate links;
        # dict.fromkeys removes repeats while keeping the first-seen order.
        node_data['links'] = list(dict.fromkeys(node_data['links']))
        node_data['ego'] = list(set(node_data['ego']))

        network_data[item_name] = node_data

    # Step 4: Saving data to json file.
    try: 
        # Serialise first: opening the file with 'w' truncates it, so a failure
        # during serialisation must happen before the file is touched.
        serialized = json.dumps(network_data)
        with open(PATH + DATA_FILE, 'w') as outfile:
            outfile.write(serialized)
        print("Data succesfully saved. Wiki network name: " + node_title + "; downloaded in language: " + language + ".")

    except (OSError, TypeError, ValueError) as error: 
        print("Something went wrong. Check code.")
        print(error)

# optional, for debugging: 
#    finally:
#        return wiki_data


In [137]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_language = "en", requested_languages = []): # "de", "fr", "nl"
    """List or download other language versions of a stored ego network.

    The page is looked up in the JSON file by title and language. Without
    requested languages the function only prints the language codes the page
    is available in. Otherwise downloadNetwork() is called for every
    requested language that the page is available in.

    Parameters
    ----------
    node_title : str
        Title of the page as stored for ``original_language``.
    original_language : str, optional
        Language of the stored page (default "en").
    requested_languages : list of str, optional
        Language codes to download. The default list is only read, never modified.

    Returns
    -------
    None. All results are reported with print().
    """
    # load data from JSON file.
    with open(PATH + DATA_FILE) as json_file:
        stored = json.load(json_file)

    # Look up the language links of the requested page.
    matches = [
        record['AvailableLanguages']
        for record in stored.values()
        if record['title'] == node_title and record['language'] == original_language
    ]
    if not matches:
        # Report the problem instead of failing with an IndexError on matches[0].
        print("Node not available. Download using downloadNetwork function.")
        return

    available_langs_titles = matches[0]
    list_available_langs = [item['lang'] for item in available_langs_titles]

    # If no languages are requested, the function shows available languages.
    if not requested_languages:
        print('The wikipedia page is available in the following languages:')
        print(list_available_langs)
        return

    # A single language code passed as a plain string is treated as a one-item list.
    if isinstance(requested_languages, str):
        requested_languages = [requested_languages]

    # Go through the available languages of the page and download those that were requested.
    for item in available_langs_titles:
        if item['lang'] in requested_languages:
            downloadNetwork(node_title = item['*'], language = item['lang'])

    # Make it visible when a requested language could not be served.
    unavailable = [lang for lang in requested_languages if lang not in list_available_langs]
    if unavailable:
        print("Not available in the following requested languages:")
        print(unavailable)

    print("Download of additional languages finished.") 


In [294]:
# function: provide titles of networks that are saved in the JSON file. Also provides the language they were saved in. 
def getNetworks(): 
    """Print the ego networks saved in the JSON file, with their languages.

    The printed dictionary maps every title that occurs in some record's
    'ego' list to the languages of all records carrying that title.
    Nothing is returned.
    """
    # Read the store from disk rather than relying on data already in memory.
    with open(PATH + DATA_FILE) as json_file:
        stored = json.load(json_file)

    # Gather every title that was used as the centre of a download.
    ego_titles = set()
    for record in stored.values():
        ego_titles.update(record['ego'])

    # For each of those titles, list the language of every record with that title.
    overview = {
        title: [record['language'] for record in stored.values() if record['title'] == title]
        for title in ego_titles
    }

    print(overview)

In [185]:
# initiate class Node. 
class WikiNode:
    """A single page record taken from the module-level ``network_data``.

    The record is selected by title and language; when several records match,
    the first one is used. An IndexError is raised when none matches.
    """

    def __init__(self, node_title, language):

        # Collect the records that carry both the requested title and language.
        candidates = []
        for record in network_data.values():
            if record['title'] == node_title and record['language'] == language:
                candidates.append(record)

        # First match wins; an empty list raises IndexError here.
        record = candidates[0]

        # Copy the fields of the record onto the instance.
        self.node_ID = record['node_ID']
        self.node_title = record['title']
        self.node_language = record['language']
        self.node_links = record['links']


In [297]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
    """Ego network of a page, built from the module-level ``network_data``.

    For every page the ego page links to, the links of that page are compared
    with the links of the ego page. Links they have in common become the nodes
    and edges of the network.

    Attributes
    ----------
    network_nodes : list
        Titles of shared links, one entry per occurrence.
    network_edges : list of tuple
        (linked page title, shared link title) pairs.
    nodes_count : collections.Counter
        Number of occurrences of every title in network_nodes.
    network_status : list
        Initialised empty; not filled anywhere in this class.
    """

    def __init__(self,node_title,language):
        """Load the ego network of ``node_title`` in ``language``.

        Prints a message and leaves the network empty when ``node_title`` was
        never downloaded as an ego network. Raises IndexError (from WikiNode)
        when the title is known but has no record in ``language``.
        """
        # Set the attributes first, so getNodesEdges() also works on an
        # instance for which nothing could be loaded.
        self.network_nodes = []
        self.network_edges = [] 
        self.network_status = []
        self.nodes_count = Counter()

        # list available ego networks
        available_ego_networks = set(chain.from_iterable(v['ego'] for v in network_data.values()))

        if node_title not in available_ego_networks:
            print("Node not available. Download using downloadNetwork function.") 
            return

        # initiate network as class WikiNode
        WikiNode.__init__(self, node_title, language)

        # Set for fast membership tests; node_links itself keeps its order.
        ego_links = set(self.node_links)

        # Links are here used to build the network.
        for link in self.node_links:
            try:     
                linked_node = WikiNode(link, language)
            except Exception: 
                # Best effort: a linked page without a stored record is skipped.
                print('Loading of node ' + link + ' failed.')
                continue

            shared_links = [x for x in linked_node.node_links if x in ego_links]
            self.network_nodes.extend(shared_links)
            self.network_edges.extend((link, shared) for shared in shared_links)

        # Count once, after all links have been processed.
        self.nodes_count = Counter(self.network_nodes)
        print("Data Succesfully loaded.")

    def getStatusNetwork(self):
        """Placeholder; only prints 'WIP'."""
        print('WIP')

    def getNodesEdges(self,threshold):
        """Select the nodes that occur at least ``threshold`` times, and their edges.

        Returns a tuple ``(nodes, edges)``: ``nodes`` is a list of
        ``(title, {"name": title})`` tuples and ``edges`` is a list of
        ``(title, title)`` tuples whose two ends are both selected nodes.
        """
        selected_nodes = [k for k,v in self.nodes_count.items() if v >= threshold]

        # Set for fast membership tests while filtering the edges.
        selected = set(selected_nodes)
        selected_edges = [(a,b) for a,b in self.network_edges if a in selected and b in selected]

        nodes_network = [(node, {"name": node}) for node in selected_nodes]

        return (nodes_network,selected_edges)

In [242]:
def drawGraph(WikiNodesEdges, filename = "test.html", height = '2000px', width = '2000px'):
    """Draw a network with pyvis and show it as an HTML file.

    Parameters
    ----------
    WikiNodesEdges : sequence
        Nodes at index 0 and edges at index 1, in the form returned by
        WikiNetwork.getNodesEdges().
    filename : str, optional
        Name of the HTML file passed to pyvis (default "test.html").
    height, width : str, optional
        Size of the drawing canvas (default '2000px' each).

    Returns
    -------
    None.
    """
    graph = nx.Graph()

    graph.add_nodes_from(WikiNodesEdges[0])
    graph.add_edges_from(WikiNodesEdges[1])

    # pyvis takes the height first, then the width.
    netdraw = Network(height, width)
    netdraw.from_nx(graph)
    netdraw.barnes_hut()

    netdraw.show(filename)

In [135]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [268]:
# Download the English network around 'Secularism' and save it to the JSON file.
downloadNetwork(node_title = 'Secularism', language = "en")

Data succesfully saved. Wiki network name: Secularism; downloaded in language: en.


In [269]:
# No requested_languages are passed, so this call only lists the languages the page is available in.
downloadAdditionalLanguage(node_title = 'Secularism')

The wikipedia page is available in the following languages:
['af', 'ar', 'ary', 'arz', 'ast', 'az', 'be', 'be-x-old', 'bg', 'bh', 'bn', 'br', 'bs', 'ca', 'ckb', 'cs', 'cy', 'da', 'de', 'diq', 'el', 'eo', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hi', 'hr', 'hy', 'id', 'is', 'it', 'ja', 'ka', 'kn', 'ko', 'la', 'lad', 'lfn', 'li', 'lo', 'lv', 'mk', 'ml', 'mr', 'ms', 'mzn', 'ne', 'new', 'nl', 'pa', 'pl', 'pnb', 'pt', 'ro', 'ru', 'sd', 'sh', 'simple', 'sk', 'sq', 'sr', 'su', 'sv', 'ta', 'te', 'tg', 'th', 'tr', 'tt', 'uk', 'ur', 'uz', 'vi', 'wuu', 'xmf', 'yi', 'zh', 'zh-yue']
Download of additional languages finished.


In [270]:
# Download the German, French, Arabic and Dutch versions of the 'Secularism' network.
downloadAdditionalLanguage(
    node_title = 'Secularism',
    original_language = "en",
    requested_languages = ["de", "fr", "ar", "nl"],
)

Data succesfully saved. Wiki network name: علمانية; downloaded in language: ar.
Data succesfully saved. Wiki network name: Säkularismus; downloaded in language: de.
Data succesfully saved. Wiki network name: Laïcité; downloaded in language: fr.
Data succesfully saved. Wiki network name: Secularisme; downloaded in language: nl.
Download of additional languages finished.


In [295]:
# Print an overview of the ego networks stored in the JSON file and the languages they are stored in.
getNetworks()

{'Secularism': ['en'], 'Terrorism': ['en'], 'Religion': ['de', 'fr', 'en'], 'Terrorismus': ['de'], 'Säkularismus': ['de'], 'England': ['en'], 'دين (معتقد)': ['ar'], 'علمانية': ['ar'], 'Terrorisme': ['fr'], 'Religie': ['nl'], 'إرهاب': ['ar'], 'Laïcité': ['fr'], 'Secularisme': ['nl']}


In [318]:
# Load one WikiNetwork per (title, language) pair.
# The order matters: later cells address the networks by their index in this list.
Networks = []
for ego_title, ego_language in (
    ('Secularism', 'en'),
    ('Säkularismus', 'de'),
    ('Laïcité', 'fr'),
    ('علمانية', 'ar'),
    ('Secularisme', 'nl'),
):
    Networks.append(WikiNetwork(ego_title, language=ego_language))

Data Succesfully loaded.
Data Succesfully loaded.
Loading of node Church of Scotland Act 1921 failed.
Loading of node Critique de la Scientologie failed.
Loading of node Gennade II Scholarius failed.
Data Succesfully loaded.
Data Succesfully loaded.
Data Succesfully loaded.


In [321]:
# Threshold passed to getNodesEdges() for each network, in the order of Networks:
# a node is kept when its count in nodes_count is at least this value.
thresholds = [15, 2, 5, 2, 1]

Graph = []
for network_index, threshold in enumerate(thresholds):
    Graph.append(Networks[network_index].getNodesEdges(threshold))


In [322]:
# Draw the graph at index 4, i.e. the nodes and edges selected from Networks[4] ('Secularisme', language 'nl').
drawGraph(Graph[4])


In [310]:
# Number of networks currently held in Networks.
# NOTE(review): the recorded output (4) comes from an earlier execution (In [310]) than the
# cell that appends five networks (In [318]) - re-run the notebook top to bottom to refresh it.
len(Networks)

4