In [20]:
# Setup 
from flask import Flask, render_template, request
import os
import pygraphviz as pgv
from pyvis.network import Network
from collections import Counter
import networkx as nx
import requests
import json

# Directory that holds the cached link data. Set the WIKI_SNA_PATH environment
# variable to run the notebook from another location; the default is the
# original local checkout. os.path.join(..., "") guarantees a trailing
# separator, because file names are built with plain `PATH + DATA_FILE`.
PATH = os.path.join(
    os.environ.get(
        "WIKI_SNA_PATH",
        "/home/teijehidde/Documents/Git Blog and Coding/Project one (wikipedia SNA)/Code/",
    ),
    "",
)
# Name of the JSON cache file inside PATH.
DATA_FILE = "networkdata3.json"
WIKI_URL = "https://en.wikipedia.org"
# Endpoint that receives the link queries.
API_ENDPOINT = WIKI_URL + "/w/api.php"
# Same value that downloadNode sends as `pllimit`; the constant itself is not
# referenced in the cells shown here.
LIMIT_LINKS_PER_NODE = 500
# Presumably meant as the callLimit for WikiNetwork.downloadNetwork; not
# referenced in the cells shown here - TODO confirm.
LIMIT_API_REQUESTS = 100

In [21]:
# Loading previously saved link data.
# On a first run there is no cache file yet. Start from an empty store in that
# case so downloadNode() can create the file, instead of failing before any
# data can be downloaded.
try:
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
except FileNotFoundError:
    network_data = {}
    print("No saved data found at " + PATH + DATA_FILE + "; starting with an empty network.")

In [22]:
# Function: download the link data for node_title from the Wikimedia API, add it to the in-memory network_data object and save it to the JSON file.
def downloadNode(node_title, continue_pageid = None):
    """Download one batch of outgoing links for a Wikipedia page and cache it.

    Sends a single `prop=links` query to API_ENDPOINT, merges the result into
    the module-level `network_data` dict under `node_title`, and rewrites the
    JSON cache file at PATH + DATA_FILE.

    Args:
        node_title: Title of the page whose links are requested.
        continue_pageid: `plcontinue` token from a previous, incomplete
            response. When omitted and the cached entry is 'incomplete', the
            token stored in that entry is used so the download resumes where
            it stopped instead of fetching (and duplicating) the first batch.

    Returns:
        The entry stored in `network_data[node_title]`. Its 'status' is
        'complete', 'incomplete' (more batches remain; 'plcontinue' holds the
        next token) or 'dead' (the response carried no usable 'links' list).

    Raises:
        KeyError / StopIteration: if the response has no 'query' -> 'pages'
            content at all. Nothing is stored or saved in that case.
        Exceptions from `requests` or from decoding the response propagate.
    """
    # Setup: carry over links gathered by earlier batches of an unfinished download.
    links_wiki = []
    stored = network_data.get(node_title)
    if stored is not None and stored['status'] == 'incomplete':
        links_wiki = links_wiki + stored['links']
        if continue_pageid is None:
            # Resume rather than restart: re-requesting the first batch would
            # append links that are already in the list.
            continue_pageid = stored.get('plcontinue')

    # Requesting data via the Wikimedia API.
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": node_title,
        "prop": "links",
        "plcontinue": continue_pageid, # requests leaves out parameters whose value is None.
        "plnamespace": 0, # only load wikipedia main/articles.
        "pllimit": 500 # can go up to 500.
    }
    # The context manager closes the session's connections once the response is in.
    with requests.Session() as S:
        response = S.get(url=API_ENDPOINT, params=PARAMS)

    # Transforming the response + error handling.
    data_wiki = response.json()
    node = next(iter(data_wiki['query']['pages']))

    try:
        for x in data_wiki['query']['pages'][node]['links']:
            links_wiki.append(x['title'])

        node_data = {'status': 'complete', 'node_ID': node, 'links': links_wiki, 'timestamp': 'TODO', 'ego': 0} # , 'revisions': 'TODO'
        if 'continue' in data_wiki:
            node_data['status'] = 'incomplete'
            node_data['plcontinue'] = data_wiki['continue']['plcontinue']
    except (KeyError, TypeError):
        # A page entry without a 'links' list is recorded as dead. Only shape
        # errors of the response are handled here, so a keyboard interrupt or
        # an unrelated failure is no longer mistaken for a dead page.
        node_data = {'status': 'dead', 'timestamp': 'TODO'}

    network_data[node_title] = node_data

    # Write to a temporary file and swap it in, so that an interruption during
    # the dump cannot leave a truncated cache file behind.
    target_file = PATH + DATA_FILE
    temp_file = target_file + '.tmp'
    with open(temp_file, 'w') as outfile:
        json.dump(network_data, outfile)
    os.replace(temp_file, target_file)
    print("Data succesfully saved. Wikipage name: " + node_title + ". Status: " + network_data[node_title]['status'] + ".")

    return network_data[node_title]


In [23]:
# Define class WikiNode.
class WikiNode:
    """One Wikipedia page as seen through the cached link data.

    Attributes:
        node_title: Title of the page.
        node_status: The 'status' stored for the title in `network_data`, or
            'empty' when the title has no entry there.
        node_links: Titles this page links to. Filled only when the status is
            'complete'; otherwise an empty list.
    """

    def __init__(self,node_title):
        self.node_title = node_title
        self.node_status = 'empty'
        self.node_links = []

        # Read the cached entry. Without one the node stays 'empty'.
        if node_title in network_data:
            self.node_status = network_data[node_title]['status']

        # Links are only exposed once the whole link list has been downloaded.
        if self.node_status == 'complete':
            # Copy the list: handing out the cached list itself would let any
            # change to node_links leak into network_data and, on the next
            # save, into the JSON file.
            self.node_links = list(network_data[node_title]['links'])

    def downloadNode(self, continue_pageid=None):
        """Download this page's links until the cached entry is no longer pending.

        Calls the module-level downloadNode() once if the title has no entry
        yet, then keeps calling it with the stored 'plcontinue' token while
        the entry is 'incomplete' or 'empty'. Finally re-runs __init__ so the
        instance reflects the refreshed data.

        Args:
            continue_pageid: Accepted for compatibility; not used, because the
                continuation token is always read from `network_data`.
        """
        if self.node_title not in network_data:
            downloadNode(self.node_title)
        while network_data[self.node_title]['status'] in ('incomplete', 'empty'):
            # .get() yields None when no token is stored, as the original did.
            next_token = network_data[self.node_title].get('plcontinue')
            downloadNode(self.node_title, continue_pageid = next_token)

        # Re-initialise through the instance so that subclasses rebuild their
        # own derived state as well.
        self.__init__(self.node_title)

In [24]:
# Define class WikiNetwork.
class WikiNetwork(WikiNode):
    """Ego network of a Wikipedia page, built from the cached link data.

    The network consists of the central page plus every page it links to. For
    each of those pages whose link list is 'complete', only the links that
    point back into the network are kept.

    Attributes (in addition to those of WikiNode):
        network_nodes: Link targets inside the network, one entry per
            occurrence (so titles repeat).
        network_edges: (source, target) title pairs, parallel to network_nodes.
        network_status: Status of every page visited while building.
        nodes_count: Counter of network_nodes, i.e. how often each title is
            linked to from 'complete' pages of the network. Empty when the
            central page is not 'complete'.
    """

    def __init__(self,node_title):
        # Initialise as a WikiNode, then add the network-level attributes.
        WikiNode.__init__(self, node_title)
        self.network_nodes = []
        self.network_edges = []
        self.network_status = []
        # Always defined, so getNodesEdges() returns an empty result instead of
        # raising AttributeError when the central page has no complete data.
        self.nodes_count = Counter()

        # Only a central page with a complete link list can seed a network.
        if self.node_status == 'complete':
            # Add the central page to its own network. Build a new list rather
            # than appending in place: node_links may be the very list stored
            # in network_data, and appending to it would add a spurious
            # self-link to the cache (and the saved JSON) on every construction.
            self.node_links = self.node_links + [node_title]
            members = set(self.node_links)  # constant-time membership tests

            for link in self.node_links:
                Node2 = WikiNode(link)
                self.network_status.append(Node2.node_status)
                if Node2.node_status == 'complete':
                    # Keep only the links that stay inside the network.
                    purged_nodes = [x for x in Node2.node_links if x in members]
                    self.network_nodes.extend(purged_nodes)
                    self.network_edges.extend((link, purged_node) for purged_node in purged_nodes)
            self.nodes_count = Counter(self.network_nodes)
            print("Data Succesfully loaded.")


    def getStatusNetwork(self):
        """Return a Counter of the statuses of the pages visited while building."""
        return Counter(self.network_status)


    def downloadNetwork(self,callLimit):
        """Download link data for pages of the network that are still pending.

        Args:
            callLimit: Stop after this many pages have been downloaded. The
                check happens after each download, so at least one pending
                page is downloaded whenever there is one.
        """
        if self.node_status != 'complete':
            self.downloadNode()

        call = 0
        # network_nodes repeats titles; visit each one once, in first-seen order.
        for link in dict.fromkeys(self.network_nodes):
            Node2 = WikiNode(link)
            if Node2.node_status in ('incomplete', 'empty'):
                Node2.downloadNode()
                call = call + 1
                print(call)
                if call >= callLimit: break

    def getNodesEdges(self,threshold):
        """Select the nodes counted at least `threshold` times and their edges.

        Args:
            threshold: Minimum value in nodes_count for a node to be kept.

        Returns:
            A tuple (nodes, edges): nodes is a list of (title, {"name": title})
            pairs and edges is the list of (source, target) pairs whose two
            endpoints were both kept.
        """
        selected_nodes = [k for k,v in self.nodes_count.items() if float(v) >= threshold]
        selected_lookup = set(selected_nodes)
        selected_edges = [(a,b) for a,b in self.network_edges if a in selected_lookup and b in selected_lookup]

        nodes_network = [(node, {"name": node}) for node in selected_nodes]

        return (nodes_network,selected_edges)

In [25]:
def drawGraph(WikiNodesEdges):
    """Draw the edges of a (nodes, edges) pair and write them to wikigraph.html.

    Args:
        WikiNodesEdges: Indexable pair such as the one returned by
            WikiNetwork.getNodesEdges(); only element [1], the edge list,
            is used.
    """
    nx_graph = nx.Graph()

    # Nodes come into existence through the edge endpoints only. The node list
    # in WikiNodesEdges[0] is not added (the original call
    # `Graph.add_nodes_from(WikiNodesEdges[0])` is commented out), so a node
    # without any edge does not appear in the drawing.
    nx_graph.add_edges_from(WikiNodesEdges[1])

    canvas = Network('2000px', '2000px')
    canvas.from_nx(nx_graph)
    canvas.barnes_hut()

    canvas.show("wikigraph.html")

In [36]:
# Build the network around the "Terrorism" article from the link data cached in network_data.
wikinet = WikiNetwork("Terrorism")

Data Succesfully loaded.


In [41]:
# Keep only the nodes counted at least 50 times in wikinet.nodes_count, plus the edges between them.
graph_data = wikinet.getNodesEdges(50)

In [78]:
# Sanity check: is the central article present in the cached data?
'Terrorism' in network_data.keys()

True

In [37]:
# Tally the status of every page that was visited while building the network.
wikinet.getStatusNetwork()

Counter({'complete': 779, 'dead': 4})

In [42]:
# Render the selected edges as an interactive graph written to wikigraph.html.
drawGraph(graph_data)