In [1]:
# Setup 
from flask import Flask, render_template, request
import os
import pygraphviz as pgv
from pyvis.network import Network
import networkx as nx
import requests
import json

# Directory that holds the data file. The default is the original author's
# local checkout; set the WIKI_SNA_DATA_DIR environment variable to run the
# notebook from another machine without editing this cell.
# os.path.join(..., '') guarantees the trailing separator that the
# `PATH + DATA_FILE` concatenations further down rely on.
PATH = os.path.join(
    os.environ.get(
        'WIKI_SNA_DATA_DIR',
        '/home/teijehidde/Documents/Git Blog and Coding/Project one (wikipedia SNA)/Code/',
    ),
    '',
)
# Name of the JSON file (inside PATH) that stores the downloaded link data.
DATA_FILE = 'networkdata.json'
# Wikipedia edition to query and the Action API endpoint derived from it.
WIKI_URL = "https://en.wikipedia.org"
API_ENDPOINT = WIKI_URL + "/w/api.php"
# NOTE(review): the two limits below are not referenced by any code visible in
# this notebook — presumably reserved for paging through API results.
LIMIT_LINKS_PER_NODE = 500
LIMIT_API_REQUESTS = 100

In [2]:
# Function A: load the data for node_title from the data file.
def loadNode(node_title):
    """Return the stored record for ``node_title`` from the JSON data file.

    The file at ``PATH + DATA_FILE`` is opened and parsed on every call.

    Parameters
    ----------
    node_title : str
        Key to look up in the top-level JSON object.

    Returns
    -------
    The value stored under ``node_title`` (callers in this notebook read its
    ``'edges'`` entry), or ``None`` when the data file cannot be opened.

    Raises
    ------
    KeyError
        If the file was read but contains no entry for ``node_title``.
    """
    try:
        with open(PATH + DATA_FILE) as json_file:
            network_data = json.load(json_file)
    except OSError:
        print("Error: Could not find " + DATA_FILE + ". Please check if file is present in directory, or change DATA_FILE value.")
        return None

    # Deliberately silent on success: this function runs once per node, so a
    # per-call confirmation message would flood the notebook output.
    return network_data[node_title]

In [3]:
# Function B: download the link data for node_title from the Wikimedia API.
def downloadNode(node_title):
    """Download the article links of one Wikipedia page and store them.

    A single ``prop=links`` query is sent to ``API_ENDPOINT``; only the first
    batch of results is used (continuation is not followed). On success the
    record is merged into the JSON file at ``PATH + DATA_FILE`` (existing
    entries are kept, an entry with the same title is replaced) and returned.

    No rate limiting is applied here; callers that loop over many titles
    should pause between calls.

    Parameters
    ----------
    node_title : str
        Title of the page to query.

    Returns
    -------
    dict or None
        ``{'node_ID', 'ego', 'date_time', 'edges', 'revisions'}`` where
        ``'edges'`` is the list of linked titles, or ``None`` when the request
        fails or the response contains no link list for the page (for example
        when the title does not exist). The record is still returned when the
        download worked but the data file could not be read or written.
    """
    params = {
        "action": "query",
        "format": "json",
        "titles": node_title,
        "prop": "links",
        "plnamespace": 0,   # only load wikipedia main/articles.
        "pllimit": 'max',   # largest batch the API allows per request
    }

    # Request and parsing share one handler so that every failure mode gives
    # the same message and the same None result.
    try:
        response = requests.get(API_ENDPOINT, params=params, timeout=30)
        response.raise_for_status()
        pages = response.json()['query']['pages']
        node = next(iter(pages))
        edges = [link['title'] for link in pages[node]['links']]
    except (requests.RequestException, ValueError, KeyError, TypeError, StopIteration):
        print("Error: an exception occured while downloading " + node_title + ".")
        return None

    print("Links data on page " + node_title + " successfully downloaded.")
    node_data = {'node_ID': node, 'ego': 0, 'date_time': 'TODO', 'edges': edges, 'revisions': 'TODO'}

    # Merge into what is already on disk instead of relying on a global dict.
    data_path = PATH + DATA_FILE
    try:
        with open(data_path) as json_file:
            network_data = json.load(json_file)
    except FileNotFoundError:
        network_data = {}  # first download: start a new data file
    except (OSError, ValueError):
        # Unreadable or corrupt file: do not overwrite it, but still hand the
        # freshly downloaded record back to the caller.
        print("Error: could not read " + DATA_FILE + "; downloaded data was not saved.")
        return node_data

    network_data[node_title] = node_data

    # Write to a temporary file first so that an interrupted write cannot
    # truncate the existing data file.
    tmp_path = data_path + '.tmp'
    try:
        with open(tmp_path, 'w') as outfile:
            json.dump(network_data, outfile)
        os.replace(tmp_path, data_path)
    except OSError:
        print("Error: could not write " + DATA_FILE + "; downloaded data was not saved.")
        return node_data

    print("Data succesfully downloaded and saved.")
    return node_data

In [4]:
# Define class Node.
class Node:
    """One article title together with the titles it links to.

    Attributes
    ----------
    title : str
        The title passed to the constructor.
    nodes : list or None
        Linked titles read from the data file via ``loadNode``; ``[]`` when
        the title could not be loaded; ``None`` after a failed ``getUpdate``.
    """

    def __init__(self, node_title):
        """Load the edges of ``node_title`` from the data file, if present."""
        self.title = node_title
        try:
            self.nodes = loadNode(node_title)['edges']
        except Exception:
            # Covers a missing title (KeyError) and a missing file (loadNode
            # returns None). `Exception` rather than a bare except so that a
            # kernel interrupt still stops a long build loop.
            self.nodes = []
            print("Data on " + node_title + " not available in "  + DATA_FILE + ". Node returned empty")

    def getEdges(self):
        """Return a dict mapping every linked title to ``None``.

        When ``self.nodes`` is ``None`` an error string is returned instead
        of a dict.
        """
        try:
            return dict.fromkeys(self.nodes)
        except TypeError:
            return "Error: " + self.title + " is an empty node."

    def getUpdate(self):
        """Refresh ``self.nodes`` via ``downloadNode``; set it to ``None`` on failure."""
        try:
            self.nodes = downloadNode(self.title)['edges']
        except Exception:
            # downloadNode returning None ends up here as a TypeError.
            self.nodes = None
            print("This title does not seem to exist on " + WIKI_URL + ".")


In [166]:
class WikiNetwork(Node):
    """Link network around one article, built from the local data file.

    Attributes
    ----------
    nodes : list
        Titles linked from the central article, with the central title itself
        appended at the end.
    network_nodes : dict
        Maps a title to a count: the central title starts at 1, and every
        title gains 1 for each entry of ``nodes`` whose stored edge list
        contains it.
    """

    def __init__(self, node_title):
        """Load the central article and count links among its neighbours."""
        Node.__init__(self, node_title)
        # The central article is processed like its neighbours so that its
        # own edges are counted as well.
        self.nodes.append(self.title)

        self.network_nodes = {node_title: 1}
        for item in self.nodes:
            for node in Node(item).nodes:
                self.network_nodes[node] = self.network_nodes.get(node, 0) + 1

    def getNodesEdges(self, threshold):
        """Select titles by count and list the edges between them.

        Parameters
        ----------
        threshold : number
            Minimum value in ``network_nodes`` for a title to be kept.

        Returns
        -------
        tuple
            ``(nodes_network, edges_network)``: a list of
            ``(title, {"name": title})`` tuples and a list of
            ``(source_title, target_title)`` tuples. An edge is emitted for
            every stored link from one selected title to another, so a pair
            can appear in both directions.
        """
        selected_nodes = [k for k, v in self.network_nodes.items() if float(v) >= threshold]
        # Set copy for constant-time membership tests in the inner loop; the
        # list is kept because it fixes the output order.
        selected_lookup = set(selected_nodes)

        nodes_network = [(node, {"name": node}) for node in selected_nodes]

        edges_network = []
        for node in selected_nodes:
            for node2 in Node(node).nodes:
                if node2 in selected_lookup:
                    edges_network.append((node, node2))

        return (nodes_network, edges_network)


In [175]:
# Build the link network around the "Terrorism" article from the local data file.
# NOTE(review): the variable is named "zevenaar" although the title loaded is
# "Terrorism" — presumably left over from an earlier run; confirm before renaming.
zevenaar_network = WikiNetwork("Terrorism")

Data on Allied of World War II not available in networkdata.json. Node returned empty
Data on Dawabsheh not available in networkdata.json. Node returned empty
Data on International Journal of Disaster Medicine not available in networkdata.json. Node returned empty


In [176]:
# Keep only titles whose count in network_nodes is at least 7 and collect the
# edges between them; the result is a (nodes, edges) pair.
nodes_edges = zevenaar_network.getNodesEdges(7)

turned empty
Data on Gaul not available in networkdata.json. Node returned empty
Data on Hinduism not available in networkdata.json. Node returned empty
Data on Hong Kong not available in networkdata.json. Node returned empty
Data on Georgetown University not available in networkdata.json. Node returned empty
Data on RAND Corporation not available in networkdata.json. Node returned empty
Data on SELIBR (identifier) not available in networkdata.json. Node returned empty
Data on 14th Dalai Lama not available in networkdata.json. Node returned empty
Data on Agnosticism not available in networkdata.json. Node returned empty
Data on Atheism not available in networkdata.json. Node returned empty
Data on Ethnic nationalism not available in networkdata.json. Node returned empty
Data on Foreign Affairs not available in networkdata.json. Node returned empty
Data on 9/11 not available in networkdata.json. Node returned empty
Data on Culture of fear not available in networkdata.json. Node returned

In [178]:
# Number of edge tuples in the selection (second element of the pair).
len(nodes_edges[1])

46076

In [186]:
# Empty undirected NetworkX graph that will hold the selected network.
Graph = nx.Graph()

In [187]:
# nodes_edges[0] holds (title, attribute-dict) tuples, nodes_edges[1] holds
# (source, target) tuples, as returned by getNodesEdges.
Graph.add_nodes_from(nodes_edges[0])
Graph.add_edges_from(nodes_edges[1])

In [188]:
# pyvis network for rendering; both size arguments are 2000px.
net = Network('2000px', '2000px')

In [189]:
# Copy the nodes and edges of the NetworkX graph into the pyvis network.
net.from_nx(Graph)

In [190]:
# Lay the network out with pyvis' Barnes-Hut physics, using default settings.
net.barnes_hut()

In [191]:
# Render the network to graph.html.
# net.prep_notebook()  (left disabled)
net.show("graph.html")

In [171]:
# Check network size: total and mean length of the items in edges_zevenaar.
# NOTE(review): edges_zevenaar is not defined in any cell shown here, and this
# cell's execution count (171) is lower than that of the cells above it —
# confirm where the variable comes from before relying on this cell.
sum_len = sum(len(item) for item in edges_zevenaar)
average = sum_len / len(edges_zevenaar)

print(sum_len)
print(average)

967
13.430555555555555
