In [2]:
# Setup packages
import os
from itertools import chain
from collections import Counter
import collections 
import networkx as nx
import requests
import json

# all these network algorithms are currently being tried out. 
from networkx.algorithms import approximation
from networkx.algorithms import community
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms.community import greedy_modularity_communities

# For network visualization: 
from pyvis.network import Network
# use dynetx for dynamic network visualization? -- when I can download and incorporate revision history?

# Directory and file name of the JSON cache that holds everything downloaded so far.
PATH = "/home/teijehidde/Documents/Git Blog and Coding/Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/"
DATA_FILE = "DATA.json" 

# Loading JSON file. On a first run the cache file does not exist yet; start with
# an empty dictionary instead of crashing so that the download functions can be
# used to fill it.
try:
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
except FileNotFoundError:
    network_data = {}

In [3]:
def SaveData(wiki_data, node_title, language):
    """Merge raw API responses into the global ``network_data`` and write it to disk.

    Parameters
    ----------
    wiki_data : list of dict
        Decoded JSON responses; pages are read from ``item['query']['pages']``.
        Responses without a ``'query'`` entry are skipped.
    node_title : str
        Title of the ego node this download belongs to; it is recorded in the
        ``'ego'`` list of every page found in ``wiki_data``.
    language : str
        Language code; together with the page id it forms the key under which
        the page is stored in ``network_data``.

    Side effects: updates the module-level ``network_data`` dictionary and
    overwrites the file ``PATH + DATA_FILE``. Errors while serialising or
    writing are reported with ``print`` rather than raised.
    """
    # step 1: transforming data from API into unified dictionary. 
    # 1a: creating list of available nodes (de-duplicated, first-seen order). 
    all_nodes = list(dict.fromkeys(
        node
        for item in wiki_data
        for node in item.get('query', {}).get('pages', {})
    ))

    # 1b: Using all_nodes to go through raw data from API. 
    for node in all_nodes:
        item_name = language + node

        # Continue from the stored entry when this page was saved before.
        node_data = network_data.get(item_name)
        if node_data is None:
            node_data = {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': language, 'AvailableLanguages': []}

        for item in wiki_data:
            page = item.get('query', {}).get('pages', {}).get(node)
            if page is None:
                continue

            node_data['title'] = page['title']
            node_data['links'].extend(link['title'] for link in page.get('links', []))

            if 'langlinks' in page:
                node_data['AvailableLanguages'] = page['langlinks']

            node_data['ego'].append(node_title)

        # Downloading the same page again must not store its links twice.
        node_data['links'] = list(dict.fromkeys(node_data['links']))
        node_data['ego'] = list(set(node_data['ego']))

        network_data[item_name] = node_data

    # Step 2: Saving data to json file. 
    try: 
        # Serialise before opening the file: mode 'w' truncates it, so a failure
        # during serialisation would otherwise wipe the existing cache.
        serialised = json.dumps(network_data)
        with open(PATH + DATA_FILE, 'w') as outfile:
            outfile.write(serialised)
    except (OSError, TypeError, ValueError) as error: 
        print("Something went wrong. Check code. (" + repr(error) + ")")
    else:
        print("Data succesfully saved. Wiki node name: " + node_title + "; downloaded in language: " + language + ".")

# optional, for debugging: 
#   finally:
#       return wiki_data


In [4]:
# Function: download data on links, background data and available language for ONE node: 'node_title' using Wikimedia API and save to json file. 
def downloadNode(node_title, language = "en"): 
    """Download links, page info and language links for ONE page and save them.

    Parameters
    ----------
    node_title : str
        Title of the Wikipedia page to query.
    language : str, default "en"
        Language code used to build the API host name
        (``https://<language>.wikipedia.org/w/api.php``).

    The API is queried repeatedly for as long as the last response contains a
    ``'continue'`` entry; all responses are then handed to ``SaveData``.
    """
    # setup and load existing data node.
    API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []

    # step 1: download data on the node of the network. 
    # setup API query and initial API call 
    PARAMS = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    # The context manager closes the session's connections when done.
    with requests.Session() as S:
        response = S.get(url=API_ENDPOINT, params=PARAMS)
        wiki_data.append(response.json())

        # Continue API call until all data on ego node has been downloaded. 
        while 'continue' in wiki_data[-1]:
            # Start from a fresh copy of the base query and add every value of the
            # latest 'continue' block. Several props are requested, so the block
            # may hold e.g. 'llcontinue' without 'plcontinue'.
            PARAMS_CONT = dict(PARAMS)
            PARAMS_CONT.update(wiki_data[-1]['continue'])

            response = S.get(url=API_ENDPOINT, params=PARAMS_CONT)
            wiki_data.append(response.json())

    # step 2: transform and save data:  
    SaveData(wiki_data, node_title=node_title, language=language)   


In [5]:
# Function: download data on links from ALL PAGES linked to 'node_title' (but excluding node_title itself!) from Wikimedia API and save to json file.  
def downloadNetwork(node_title, language = "en"): 
    """Download the links of ALL pages linked from ``node_title`` and save them.

    Uses the API's ``links`` generator on ``node_title``, so the returned pages
    are the pages it links to, not ``node_title`` itself.

    Parameters
    ----------
    node_title : str
        Title of the ego page whose linked pages are downloaded.
    language : str, default "en"
        Language code used to build the API host name.

    The API is queried repeatedly for as long as the last response contains a
    ``'continue'`` entry; all responses are then handed to ``SaveData``.
    """
    # setup and load existing data node.
    API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []

    # step 1: use generator to download data on all additional nodes. 
    # setup API query for first generator API call (used to download data on all pages that are linked to node_title) 
    print("Downloading Wiki network name: " + node_title + " in language: " + language + ". Please note that this can take a while.")
    PARAMS = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }
    # The context manager closes the session's connections when done.
    with requests.Session() as S:
        response = S.get(url=API_ENDPOINT, params=PARAMS)
        wiki_data.append(response.json())

        # Continue API call until all data on network is downloaded. 
        while 'continue' in wiki_data[-1]:
            # Build each follow-up request from a fresh copy of the base query plus
            # the latest 'continue' block only. Reusing one dict would keep a stale
            # 'plcontinue' from an earlier batch in later requests.
            PARAMS_CONT = dict(PARAMS)
            PARAMS_CONT.update(wiki_data[-1]['continue'])

            response = S.get(url=API_ENDPOINT, params = PARAMS_CONT)
            wiki_data.append(response.json())

    # step 2: transform and save data:  
    SaveData(wiki_data, node_title=node_title, language=language)   

In [6]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_language = "en", requested_languages = []): # "de", "fr", "nl"
    """Show or download other language versions of an already downloaded page.

    Parameters
    ----------
    node_title : str
        Title of the page as stored in the data file.
    original_language : str, default "en"
        Language in which ``node_title`` was downloaded.
    requested_languages : list of str, optional
        Language codes to download. When empty (the default) the function only
        prints the language codes that are available for the page.

    Raises
    ------
    IndexError
        If the data file has no entry with this title and language.
    """
    # load data from JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # look up the language links stored for the requested page. 
    available_languages = next(
        (v['AvailableLanguages'] for v in network_data.values()
         if v['title'] == node_title and v['language'] == original_language),
        None,
    )
    if available_languages is None:
        # Same exception type as the former "[...][0]" lookup, but with a usable message.
        raise IndexError(
            "No saved node with title '" + str(node_title) + "' in language '"
            + str(original_language) + "'. Download it first using downloadNode()."
        )

    # If no languages are requested, the function shows available languages. 
    if not requested_languages:
        print('The wikipedia page is available in the following languages:')         
        print([item['lang'] for item in available_languages])
    
    # Goes through available languages of a wikipedia page, and downloads those that were requested
    # (using the downloadNode() and downloadNetwork() functions). 
    else:
        for item in available_languages: 
            if item['lang'] in requested_languages: 
                downloadNode(node_title = item['*'], language = item['lang'])
                downloadNetwork(node_title = item['*'], language = item['lang'])
    
        print("Download of additional languages finished.") 


In [7]:
# function: provide titles of networks that are saved in the JSON file. Also provides the language they were saved in. 
def getDownloadedNetworks(): 
    """Return the saved pages whose title is used as an ego-network name.

    Reads the JSON data file, collects every title that occurs in any entry's
    ``'ego'`` list, and returns one ``{'lang': ..., '*': ...}`` dictionary
    (language and title) for each stored entry whose title is among them.
    """
    # read the stored nodes from the JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        stored_nodes = json.load(json_file)

    # gather the names of all ego networks mentioned by any stored node. 
    ego_titles = set()
    for record in stored_nodes.values():
        ego_titles.update(record['ego'])

    # describe each matching node by its language and title. 
    return [
        {'lang': record['language'], '*': record['title']}
        for record in stored_nodes.values()
        if record['title'] in ego_titles
    ]

In [8]:
# Initiate class Node. 
class WikiNode:
    """A single saved Wikipedia page, looked up in ``network_data`` by title and language."""

    def __init__(self, node_title, language):
        # Collect every stored entry that matches both title and language.
        matches = [
            entry
            for entry in network_data.values()
            if entry['title'] == node_title and entry['language'] == language
        ]

        # The first match is used; indexing raises IndexError when there is none.
        node_data = matches[0]

        # Copy the stored fields onto the instance.
        self.node_title = node_data['title']
        self.node_ID = node_data['node_ID']
        self.node_links = node_data['links']
        self.node_language = node_data['language']


In [54]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
    """Ego network of a saved Wikipedia page.

    The central page is loaded as a ``WikiNode``. Every page it links to is
    loaded as well (when present in ``network_data``), and only links between
    pages that the central page itself links to are kept.

    Attributes
    ----------
    network_nodes : dict
        Successfully loaded linked pages, keyed by their ``node_ID``.
    network_links : list of str
        Titles of all kept link targets (with repetitions).
    network_edges : list of tuple
        ``(source_title, target_title)`` pairs for all kept links.
    network_status : list
        Initialised empty; not filled by this class.
    nodes_count : collections.Counter
        For every loaded page title and every kept link target: how often the
        title occurs in ``network_links``. Used by the ``threshold`` filters.
    """
   
    def __init__(self,node_title,language):
        # make set of available ego networks
        available_networks = set(chain.from_iterable(v['ego'] for v in network_data.values()))

        # Create the containers up front so that getNodes()/getEdges() return
        # empty results instead of failing when the node is not available.
        self.network_nodes = {}
        self.network_links = []
        self.network_edges = [] 
        self.network_status = []
        self.nodes_count = Counter()

        if node_title not in available_networks:
            print("Node not available. Download using downloadNetwork() function.") 
            return

        # initiate the central node of the network as class WikiNode
        WikiNode.__init__(self, node_title, language)
        ego_links = set(self.node_links)  # set gives constant-time membership tests

        # Go through node_links of the central node (node_title) to build network.
        for link in self.node_links:
            try:     
                Node2 = WikiNode(link, language)             
            except (IndexError, KeyError): 
                # WikiNode raises IndexError when the page was never saved.
                print('Loading of node ' + link + ' failed.')
                continue

            purged_links = [x for x in Node2.node_links if x in ego_links]
            self.network_nodes[Node2.node_ID] = Node2
            self.network_links.extend(purged_links)
            self.network_edges.extend((link, purged_link) for purged_link in purged_links)
            # Register the page itself so it is counted even with no incoming links.
            self.nodes_count.setdefault(link, 0)

        # Count by title (not by node id) because edges are expressed in titles.
        self.nodes_count.update(self.network_links)
        print("Data Succesfully loaded.")

    def getNodes(self,threshold=0):
        """Return ``(title, {"name": title})`` tuples for titles whose count is at least ``threshold``."""
        return [
            (title, {"name": title})
            for title, count in self.nodes_count.items()
            if float(count) >= threshold
        ]

    def getEdges(self,threshold=0):
        """Return the edges whose two endpoints both have a count of at least ``threshold``."""
        selected_nodes = {k for k,v in self.nodes_count.items() if float(v) >= threshold}
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]

        return edges_network

    def getStatsNetwork(self): 
        """Placeholder; currently only prints 'WIP'."""
        # TODO: return a dictionary with stats on network: 
        # - triangles
        # - degree_centrality 
        # - ... 

        print('WIP')

    def getStatsNodes(self, nodes):
        """Placeholder; currently only prints 'WIP' (``nodes`` is not used yet)."""
        # TODO: return an numpy array with stats per node: 
        # - triangles
        # - degree_centrality 
        # - ... 
        # if nodes == None: 
          #  node = self.node_links

        print('WIP')
    
    def getNetworkCommunities(self,threshold=0):
        """Return the result of ``greedy_modularity_communities`` on the graph built from ``getEdges(threshold)``."""
        # TODO: return an numpy array with stats per community. Add in an overall library.  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(threshold))
        
        return greedy_modularity_communities(G)
    
    def drawGraph(self,threshold=0,name='no_name'):
        """Render the graph built from ``getEdges(threshold)`` with pyvis into ``<name>.html``."""
        G = nx.Graph()
        G.add_edges_from(self.getEdges(threshold))

        netdraw = Network('2000px', '2000px')
        netdraw.from_nx(G)
        netdraw.barnes_hut()

        title = name + ".html"
        netdraw.show(title)



In [83]:
# Class: a collection of wiki networks of the same topic, in different languages. Automatically takes all languages that have been downloaded before. 
class WikiNetworkCollection():
    """Wiki networks on the same topic in several languages.

    Holds a ``WikiNetwork`` for ``node_title`` in ``original_language`` plus
    one for every language link of that page that ``getDownloadedNetworks()``
    also reports as downloaded.

    Attributes
    ----------
    networks : dict
        ``WikiNetwork`` instances keyed by ``'<lang>_<title>'``.
    """

    def __init__(self,node_title,original_language):

        # make list available ego networks
        all_downloaded_networks = getDownloadedNetworks()

        # Language links of the requested page. The language is matched as well,
        # because the same title can be stored for more than one language.
        # Raises IndexError when the page is not in network_data.
        available_languages = [
            v['AvailableLanguages']
            for v in network_data.values()
            if v['title'] == node_title and v['language'] == original_language
        ][0]

        # The original language always comes first, so this list is never empty.
        topic_networks = [{'lang': original_language, '*': node_title}] + [v for v in available_languages if v in all_downloaded_networks]

        # initiate class WikiNetwork for each available language.  
        self.networks = {}
        for network in topic_networks:
            self.networks[network['lang'] + '_' + network['*']] = WikiNetwork(node_title = network['*'], language = network['lang'])

    def getStatsIsomorphism(self):
        """Placeholder; currently only prints 'WIP'."""
        # TODO: return an numpy array with stats per network: 
        # - ... related to similarities / difference of network to other networks in collection. 
        print('WIP')


In [61]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [84]:
# Build the network collection for the English 'Oxford' page.
test = WikiNetworkCollection(node_title='Oxford', original_language='en')

Data Succesfully loaded.
Data Succesfully loaded.
Data Succesfully loaded.


In [91]:
# Display the collected link titles of the English 'Oxford' network (long output).
test.networks['en_Oxford'].network_links

 Coachway',
 'Monmouth',
 'National Express Coaches',
 'Newport, Wales',
 'Oxford Ring Road',
 'Oxfordshire',
 'Oxfordshire County Council',
 'River Thames',
 'Wales',
 'Wayback Machine',
 'Wheatley, Oxfordshire',
 'Witney',
 'Worcester, England',
 'A40 road',
 'Oxford Ring Road',
 'A40 road',
 'A40 road (Great Britain)',
 'A423 road',
 'A44 road',
 'Bath, Somerset',
 'Botley, Oxfordshire',
 'Botley Road',
 'Bristol',
 'Chippenham',
 'Cumnor Hill',
 'Faringdon',
 'Geographic coordinate system',
 'Headington',
 'Headington Hill',
 'High Street, Oxford',
 'Magdalen Bridge',
 'Oxford Ring Road',
 'Oxford United F.C.',
 'Oxford railway station',
 'Oxpens Road',
 'River Thames',
 'South East England',
 "St Clement's, Oxford",
 'Swindon',
 'Vale of White Horse',
 'A34 road (England)',
 'A40 road',
 'A4142 road',
 'A420 road',
 'A44 road',
 'Banbury',
 'Berkshire',
 'Birmingham',
 'Coventry',
 'Geographic coordinate system',
 'M40 motorway',
 'Oxford Ring Road',
 'A34 road (England)',
 'A40 r

In [170]:
# Scratch cell: reproduces the selection logic of WikiNetworkCollection.__init__ step by step.
node_title = "Oxford"
original_language = 'en'

all_downloaded_networks = getDownloadedNetworks()
# Match the language too: the same title can be stored for several languages.
available_languages = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == original_language][0]

topic_networks = [{'lang': original_language, '*': node_title}] + [v for v in available_languages if v in all_downloaded_networks]

In [172]:
# Second entry: the first additional language found among the downloaded networks.
topic_networks[1]

{'lang': 'de', '*': 'Oxford'}