In [126]:
# Setup 
from flask import Flask, render_template, request
import os
import pygraphviz as pgv
from pyvis.network import Network
from collections import Counter
import networkx as nx
import requests
import json

PATH = "/home/teijehidde/Documents/Git Blog and Coding/Project one (wikipedia SNA)/Code/"
DATA_FILE = "networkdata3.json"
WIKI_URL = "https://en.wikipedia.org"
API_ENDPOINT = WIKI_URL + "/w/api.php"
LIMIT_LINKS_PER_NODE = 500
LIMIT_API_REQUESTS = 100

In [127]:
# Loading previously saved link data.
with open(PATH + DATA_FILE) as json_file:
    network_data = json.load(json_file)

In [121]:
# Function: download data node_title from Wikimedia API, add to network_data runtime object and save to json file.  
def downloadNode(node_title, continue_pageid = None):

    # setup 
    links_wiki = []

    # requesting data via wikimedia API.  
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": node_title,
        "prop": "links",
        "plcontinue": continue_pageid, #NB: THIS SHOULD BE MAX PAGEID of result, then call AGAIN... 
        "plnamespace": 0, # only load wikipedia main/articles. 
        "pllimit": 500 # can go up to 500. Go for max? 
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)

    # Transforming dat response + error handling.  
    data_wiki = response.json()
    node = next(iter(data_wiki['query']['pages']))
    
    try: 
        for x in data_wiki['query']['pages'][node]['links']:
            links_wiki.append(x['title'])
                
        node_data = {'status': 'complete', 'node_ID': node, 'links': links_wiki, 'timestamp': 'TODO', 'ego': 0, 'revisions': 'TODO'}
        if 'continue' in data_wiki.keys(): 
            node_data['status'] = 'incomplete' 
            node_data['plcontinue'] = data_wiki['continue']['plcontinue']
        network_data[node_title] = node_data
        return network_data[node_title]
        
    except:
        node_data = {'status': 'dead', 'timestamp': 'TODO'}
        network_data[node_title] = node_data
        return network_data[node_title]
    
    finally:   
        with open(PATH + DATA_FILE, 'w') as outfile:
            json.dump(network_data, outfile)
            print("Data succesfully saved. Wikipage name: " + node_title + ". Status: " + network_data[node_title]['status'] + ".")


In [135]:
# initiate class Node. 
class WikiNode:
    def __init__(self,node_title):
        self.title = node_title
        self.status_node = 'empty'
        self.links = []
        
        # here the json file is read. If the data on wikipedia links is not available, it returns the node as 'empty'. 
        if node_title in network_data.keys():
            self.status_node = network_data[node_title]['status']
        
        # if there is data on wikipedia links available (if the node is not a deadlink, thus 'alive') links are added to the instance of WikiNode. 
        if self.status_node == 'complete':
            self.links = network_data[node_title]['links']

    # In case data is not available, this method calls a function to call the wikimedia API to download data from wikimedia. 
    def downloadNode(self,continue_pageid=None):
        
        if network_data[self.title]['status'] == 'incomplete': 
            downloadNode(self.title, continue_pageid = network_data[self.title]['plcontinue'])
        else: downloadNode(self.title)
        self.__init__(self.title)

In [136]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
    
    def __init__(self,node_title):
        # initiate network as class WikiNode, add additional attributes for class WikiNetwork 
        WikiNode.__init__(self, node_title)
        self.nodes = []
        self.edges = [] 
        self.status_network = [] 
        
        # if the central node has links (if it is 'alive' and not a deadlink on wikipedia) then the links are here used to build the network.  
        if self.status_node == 'complete':
            self.links.append(node_title)

            for link in self.links:
                Node2 = WikiNode(link)
                self.status_network  = self.status_network + [Node2.status_node] 
                if Node2.status_node == 'complete':
                    purged_nodes = [x for x in Node2.links if x in self.links]
                    purged_edges = []
                    for purged_node in purged_nodes:
                        purged_edges.append((link,purged_node))  
                    self.nodes = self.nodes + purged_nodes
                    self.edges = self.edges + purged_edges                             
            self.nodes_count = Counter(self.nodes)
            print("Data Succesfully loaded.")
        
        self.status_network_overview = Counter(self.status_network)


    def downloadNetwork(self,callLimit): 
        if self.status_node != 'complete': 
            self.downloadNode()

        call = 0
        for link in self.nodes:
            Node2 = WikiNode(link)
            if Node2.status_node != 'complete':
                Node2.downloadNode()
                call = call + 1
                print(call)
                if call >= callLimit: break

    def getNodesEdges(self,threshold):
        selected_nodes = [k for k,v in self.nodes_count.items() if float(v) >= threshold]
        selected_edges = [(a,b) for a,b in self.edges if a in selected_nodes and b in selected_nodes]

        nodes_network = []
        for node in selected_nodes:
            node_tuple = (node, {"name": node})
            nodes_network.append(node_tuple)

        return (nodes_network,selected_edges)

In [124]:
def drawGraph(WikiNodesEdges):
    
    Graph = nx.Graph()

    Graph.add_nodes_from(WikiNodesEdges[0])
    Graph.add_edges_from(WikiNodesEdges[1])

    netdraw = Network('2000px', '2000px')
    netdraw.from_nx(Graph)
    netdraw.barnes_hut()

    netdraw.show("wikigraph.html")

In [137]:
wikinet = WikiNetwork('Terrorism')

In [139]:
terror_temp = downloadNode("terrorism")
network_data.keys()

Data succesfully saved. Wikipage name: terrorism. Status: incomplete.


dict_keys(['Terrorism', 'terrorism'])

In [138]:
wikinet.downloadNetwork(700)

Data succesfully saved. Wikipage name: Terrorism. Status: complete.
Data Succesfully loaded.


KeyError: 'Online youth radicalization'

In [116]:
network_data['Terrorism']['status']

'complete'

In [114]:
wikinet.status_node

'complete'

In [86]:
terror_temp

'incomplete'

In [28]:
downloadNode("Zevenaar")

KeyError: 'query'

In [109]:
downloadNode("Terrorism", )

Data succesfully saved. Wikipage name: Terrorism. Status: incomplete.


{'status': 'incomplete',
 'node_ID': '30636',
 'links': ['13 Vendémiaire',
  '1975 Dutch train hostage crisis',
  '1983 Beirut barracks bombings',
  '1983 United States embassy bombing',
  '2002 Bali bombing',
  '2002 Bali bombings',
  '2003 invasion of Iraq',
  '2007 Lebanon conflict',
  '2008 Mumbai attacks',
  '7 July 2005 London bombings',
  'Abu Ghraib torture and prisoner abuse',
  'Abu Sayyaf',
  'Adam and Eve',
  'Advanced Imaging Technology',
  'Aerial warfare',
  'AfPak',
  'Afghan Mujahideen',
  'Afghanistan',
  'African National Congress',
  'Agence France Press',
  'Agro-terrorism',
  'Air combat manoeuvring',
  'Air supremacy',
  'Aircraft hijacking',
  'Al-Qaeda',
  'Al-Qaeda in the Arabian Peninsula',
  'Al-Qaeda insurgency in Yemen',
  'Al-Shabaab (militant group)',
  'Al Qaeda',
  'Alan B. Krueger',
  'Alberto Fujimori',
  'Algeria',
  'Allied of World War II',
  'American Political Science Review',
  'Amphibious warfare',
  'Anarchism',
  'Anarchism in France',
  'An

In [74]:
test = downloadNode("Terrorism")

Data succesfully saved. Wikipage name: Terrorism. Status: incomplete.


In [110]:
downloadNode('Terrorism', continue_pageid = network_data['Terrorism']['plcontinue'])

Data succesfully saved. Wikipage name: Terrorism. Status: complete.


{'status': 'complete',
 'node_ID': '30636',
 'links': ['Online youth radicalization',
  'Operation Active Endeavour',
  'Operation Condor',
  'Operation Eagle Assist',
  'Operation Enduring Freedom',
  'Operation Enduring Freedom – Horn of Africa',
  'Operation Enduring Freedom – Philippines',
  'Operation Juniper Shield',
  'Operation Linda Nchi',
  'Operation Noble Eagle',
  'Operational level of war',
  'Operational manoeuvre group',
  'Operations research',
  'Oppression',
  'Order of Assassins',
  'Organization of Ukrainian Nationalists',
  'Organized crime',
  'Osama bin Laden',
  'Outline of war',
  'Overmatch',
  'PMC (identifier)',
  'PMID (identifier)',
  'Pakistan',
  "Pakistan's role in the War on Terror",
  'Pakistan and state-sponsored terrorism',
  'Pakistan and state terrorism',
  'Palestine (region)',
  'Palestine Liberation Organization',
  'Palestinian Authority Martyrs Fund',
  'Paper terrorism',
  'Participants in Operation Enduring Freedom',
  'Patriot Act',
  'Pa

In [104]:
network_data['Terrorism']['plcontinue']

['plcontinue']

In [157]:
graph_data = wikinet.getNodesEdges(2)

In [158]:
drawGraph(graph_data)

In [51]:
  
S = requests.Session()
PARAMS = {
    "action": "query",
    "format": "json",
    "titles": "Terrorism",
    "generator": "links",
    # "plcontinue": ,
    "plnamespace": 0, # only load wikipedia main/articles. 
    "pllimit": 600 # can go up to 500. Go for max? 
}
response = S.get(url=API_ENDPOINT, params=PARAMS)

# Transforming response to network data format + error handling.  
data = response.json()
node = next(iter(data_wiki['query']['pages']))
links = data_wiki['query']['pages'][node]['pa']


KeyError: 'links'

In [61]:
# setup 
links_wiki = []
node_title = "Terrorism"
continue_pageid = 1

# requesting data via wikimedia API.
def APIcall(node_title,continue_pageid = None): 
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": node_title,
        "prop": "links",
        "continue": '||', 
        "plcontinue": None, #NB: THIS SHOULD BE MAX PAGEID of result, then call AGAIN... 
        "plnamespace": 0, # only load wikipedia main/articles. 
        "pllimit": 500 # can go up to 500. Go for max? 
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)

    # Transforming dat response + error handling.  
    data_wiki = response.json()
    node = next(iter(data_wiki['query']['pages'])

    for x in data_wiki['query']['pages'][node]['links']:
        links_wiki.append(x['title'])

In [62]:
data_wiki['continue']

{'plcontinue': '30636|0|Online_youth_radicalization', 'continue': '||'}