In [22]:
# Setup packages
import os
from itertools import chain
from collections import Counter
import collections 
import requests
import json
import pandas as pd
import numpy as np

# all these network algorithms are currently being tried out. 
import networkx as nx
from networkx.algorithms import approximation
from networkx.algorithms import community
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms.community import greedy_modularity_communities
from networkx.utils import not_implemented_for 
__all__ = [
    "eccentricity",
    "diameter",
    "radius",
    "periphery",
    "center",
    "barycenter",
    "degree_centrality",
    "constraint", 
    "local_constraint", 
    "effective_size"
]


# For network visualization: 
from pyvis.network import Network
# use dynetx for dynamic network visualization? -- when I can download and incoporate revision history? 

PATH = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
DATA_FILE = "DATA.json" 

# Loading JSON file: 
with open(PATH + DATA_FILE) as json_file:
    network_data = json.load(json_file)

In [23]:
def SaveData(wiki_data, node_title, lang):
    # step 1: transforming data from API into unified dictionary. 
    # 1a: creating list of available nodes. 
    all_nodes = []

    for item in wiki_data:
        all_nodes = all_nodes + list(item['query']['pages'].keys())
    all_nodes = list(set(all_nodes))

    # 1b: Using all_nodes to go through raw data from API -- in this case this should just by 1 node. 
    for node in all_nodes:
        node_data = {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': lang, 'AvailableLanguages': []}
        
        item_name = lang + node
        if item_name in network_data.keys():
            node_data = network_data[item_name]
        
        for item in wiki_data:
            if node in item['query']['pages'].keys(): 
                node_data['title'] = item['query']['pages'][node]['title']

                if 'links' in item['query']['pages'][node].keys():
                    for link in item['query']['pages'][node]['links']: 
                        node_data['links'].append(link['title'])

                if 'langlinks' in item['query']['pages'][node].keys():
                    node_data['AvailableLanguages'] = item['query']['pages'][node]['langlinks']

                node_data['ego'].append(node_title)
        
        node_data['ego'] = list(set(node_data['ego']))
        
        network_data[lang + node] = node_data

    # Step 2: Saving data to json file. 
    try: 
        with open(PATH + DATA_FILE, 'w') as outfile:
            json.dump(network_data, outfile)
            print("Data succesfully saved. Wiki node name: " + node_title + "; downloaded in language: " + lang + ".")

    except: 
        print("Something went wrong. Check code.")

# optional, for debugging: 
#   finally:
#       return wiki_data


In [24]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_lang, additional_langs = []): # "de", "fr", "nl"

    # download data fro JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # make a list of the language that are available for requested page. 
    available_languages = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == original_lang][0]
    list_available_languages = []
    for item in available_languages: 
         list_available_languages.append(item['lang'])

    # If no languages are requested, the function shows available languages. 
    if additional_langs == []:
        pass

    if additional_langs == ["available_langs"]:
        print('The wikipedia page is available in the following languages:')         
        print(list_available_languages)
    
    # Goes through avialble languages of a wikipedia page, and downloads those that were requested (using the downloadNetwork() function). 
    else:
        for item in available_languages: 
            if item['lang'] in additional_langs:
                downloadNetworks(node_title = item['*'], original_lang = item['lang'], additional_langs = [])
    
        print("Download of additional languages finished.") 


In [25]:
# Function: download data on links from ALL PAGES linked to 'node_title' (but excluding node_title itself!) from Wikimedia API and save to json file.  
def downloadNetworks(node_title, original_lang = "en", additional_langs = ["ar" "de", "fr", "nl"]): 

    # setup and load existing data node.
    API_ENDPOINT = "https://" + original_lang + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []

    # step 1: download data on the central node of the network (incl. available languages). 
    # setup API query and initial API call 
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())
    
    # Continue API call until all data on ego node has been downloaded. 
    while 'continue' in wiki_data[-1].keys():
        
        PARAMS_CONT = PARAMS
        PARAMS_CONT["plcontinue"] = wiki_data[-1]['continue']['plcontinue']

        response = S.get(url=API_ENDPOINT, params=PARAMS_CONT)
        wiki_data.append(response.json())

    # step 2: use generator to download data on all additional nodes. 
    # setup API query for first generator API call (used to download data on all pages that are linked to node_title) 
    print("Downloading Wiki network name: " + node_title + " in language: " + original_lang + ". Please note that this can take a while.")
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())

    # Continue API call until all data on network is downloaded. 
    while 'continue' in wiki_data[-1].keys():

        PARAMS_CONT = PARAMS
        if 'plcontinue' in wiki_data[-1]['continue']:
            PARAMS_CONT["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 

        if 'gplcontinue' in wiki_data[-1]['continue']: 
            PARAMS_CONT["gplcontinue"] = wiki_data[-1]['continue']['gplcontinue']

        response = S.get(url=API_ENDPOINT, params = PARAMS_CONT)
        wiki_data.append(response.json())

    # step 3: transform and save data:  
    SaveData(wiki_data, node_title=node_title, lang=original_lang)

    # step 4: download additional languages: 
    downloadAdditionalLanguage(node_title = node_title, original_lang = original_lang, additional_langs = additional_langs)



In [26]:
# function: provide titles of networks that are saved in the JSON file. Also provides the language they were saved in. 
def getDownloadedNetworks(): 

    # download data from JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # create set of ego network names.  
    downloaded_networks = [(v['ego']) for (k,v) in network_data.items()]
    downloaded_networks = set(list(chain(*downloaded_networks)))
    downloaded_networks = [v for (k,v) in network_data.items() if v['title'] in downloaded_networks]

    # print names of ego networks and language that they have been downloaded in. 
    items = {}  
    for network in downloaded_networks: 
        items[network['title'] + ' (' + network['language'] + ')'] = {'lang':  network['language'], '*': network['title']}    
    return(items)

In [27]:
def degree_centrality(G):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/centrality/degree_alg.html#degree_centrality
    
    if len(G) <= 1:
        return {n: 1 for n in G}

    s = 1.0 / (len(G) - 1.0)
    centrality = {n: d * s for n, d in G.degree()}
    return centrality

def eccentricity(G, v=None, sp=None):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/distance_measures.html#eccentricity

    order = G.order()

    e = {}
    for n in G.nbunch_iter(v):
        if sp is None:
            length = nx.single_source_shortest_path_length(G, n)
            L = len(length)
        else:
            try:
                length = sp[n]
                L = len(length)
            except TypeError as e:
                raise nx.NetworkXError('Format of "sp" is invalid.') from e
        if L != order:
            if G.is_directed():
                msg = (
                    "Found infinite path length because the digraph is not"
                    " strongly connected"
                )
            else:
                msg = "Found infinite path length because the graph is not" " connected"
            raise nx.NetworkXError(msg)

        e[n] = max(length.values())

    if v in G:
        return e[v]  # return single value
    else:
        return e


In [28]:
# Initiate class Node. 
class WikiNode:
    def __init__(self, node_title, lang):
        
        # Select node in JSON file (by title and language). 
        node_data = [v for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == lang][0]
        
        # Extract data and place in instance of Wikinode class. 
        self.node_title = node_data['title']
        self.node_ID = node_data['node_ID']
        self.node_links = node_data['links']
        self.node_lang = node_data['language']


In [29]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
   
    def __init__(self,node_title, lang):
        
        # initiate the central node of the network as class WikiNode, add additional attributes for class WikiNetwork 
        WikiNode.__init__(self, node_title, lang)
        self.network_nodes = {}
        self.network_links = []
        self.network_edges = [] 
        self.network_status = []
    
        # Go through node_links of the central node (node_title) to build network.
        for link in self.node_links:
            try:     
                Node2 = WikiNode(link, lang)             
                purged_links = [x for x in Node2.node_links if x in self.node_links]
                purged_edges = []
                for purged_link in purged_links:
                    purged_edges.append((link,purged_link))  
                self.network_nodes[Node2.node_ID] = Node2
                self.network_links = self.network_links + purged_links
                self.network_edges = self.network_edges + purged_edges
            except: 
                print('Loading of node ' + link + ' failed.')
            self.links_count = Counter(self.network_links)

    def getNodes(self, type="cytoscape", threshold=0):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        
        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]

        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]

    def getEdges(self,type="cytoscape", threshold=0):  
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]

        if type == 'networkx':
            return edges_network

        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a,b in edges_network]
    
    def getCommunities(self,threshold=0):  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx', threshold= threshold))
        return greedy_modularity_communities(G)

    def getStatsNodes(self,threshold=0):
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx', threshold= threshold))

        data = {}
        degree_centrality_nodes = degree_centrality(G)
        eccentricity_nodes = eccentricity(G)

        for item in G.nodes: 
            data[item] = {'Centrality': degree_centrality_nodes[item], 'Eccentricity': eccentricity_nodes[item]} 

        return(data)
    
    def getStatsCommunities(self, node):
        # TODO: return an numpy array with stats per node: 
        # Algorithms to consider (see networkx):
        # - Distance measures: barycenter, center, [ALSO APPLY TO COMMUNITIES?]
        # - dominating_set(G, start_with=None) [ALSO APPLY TO COMMUNITIES?]
        # - Group Centrality
        # - ... 
        # if nodes == None: 
          #  node = self.node_links

        print('WIP')
    
    def getStatsNetwork(self): 
        return(
            pd.DataFrame(
                {
                    "A": self.node_ID,
                    "B": pd.Timestamp("20200102"),
                    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
                    "D": np.array([3] * 4, dtype="int32"),
                    "E": pd.Categorical(["test", "train", "test", "train"]),
                    "F": self.node_title,
                }
                )
            )   
        # TODO: return a dictionary with stats on network: 
        # Algorithms to consider (see networkx):
        # - number nodes, number edges, average edges per node. 
        # - Distance measures: barycenter, center, [ALSO APPLY TO COMMUNITIES?]
        # - dominating_set(G, start_with=None) [ALSO APPLY TO COMMUNITIES?]
        # - node_connectivity
        # - k_components
        # - average_clustering
        # - Small-world
        # - Summarization [NB - possibly use to improve render time graph visualizations.]


In [30]:
# Class: a collection of wiki networks of the same topic, in different languages. Automatically takes all languages that have been downloaded before. 
class WikiNetworkCollection():

        def __init__(self,chosen_networks):
        
                # make list available ego networks
                all_downloaded_networks = getDownloadedNetworks()
                available_languages = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title][0]
                
                topic_networks = []
                topic_networks = [{'lang': original_language, '*': node_title}] + [v for v in available_languages if v in all_downloaded_networks]

                # initiate tclass WikiNetwork for each available language.  
                self.networks = {}
                if topic_networks is not []:
                        for network in topic_networks:
                                self.networks[network['lang'] + '_' + network['*']] = WikiNetwork(node_title = network['*'], language = network['lang'])

        def getStatsIsomorphism(self):
        # TODO: return an numpy array with stats per network: 
        # - ... related to similarities / difference of network to other networks in collection. 
                print('WIP')
        # Algorithms to consider: 
        # - networkx: isomorphism. 
        # - networkx: Similarity Measures


In [10]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [11]:
getDownloadedNetworks()

{'Flask (en)': {'lang': 'en', '*': 'Flask'},
 'Kolba (de)': {'lang': 'de', '*': 'Kolba'},
 'Royston (en)': {'lang': 'en', '*': 'Royston'},
 'Oxford (en)': {'lang': 'en', '*': 'Oxford'},
 'Flask (fr)': {'lang': 'fr', '*': 'Flask'},
 'Oxford (de)': {'lang': 'de', '*': 'Oxford'},
 'Oxford (fr)': {'lang': 'fr', '*': 'Oxford'},
 'Vaccine (en)': {'lang': 'en', '*': 'Vaccine'},
 'Vaccine (fr)': {'lang': 'fr', '*': 'Vaccine'},
 'Vaccin (fr)': {'lang': 'fr', '*': 'Vaccin'},
 'Vaccine (de)': {'lang': 'de', '*': 'Vaccine'},
 'Impfstoff (de)': {'lang': 'de', '*': 'Impfstoff'},
 'Secularism (en)': {'lang': 'en', '*': 'Secularism'},
 'علمانية (ar)': {'lang': 'ar', '*': 'علمانية'},
 'Laïcité (fr)': {'lang': 'fr', '*': 'Laïcité'},
 'Secularisme (nl)': {'lang': 'nl', '*': 'Secularisme'},
 'لقاح (ar)': {'lang': 'ar', '*': 'لقاح'},
 'Aşı (tıp) (tr)': {'lang': 'tr', '*': 'Aşı (tıp)'}}

In [13]:
# downloadNetworks("Vaccine", original_lang='en', additional_langs=['de', 'ar', 'fr', 'tr'])

In [237]:
with open(PATH + DATA_FILE) as json_file:
    network_data = json.load(json_file)
value = 'Oxford (en)'
all_networks = getDownloadedNetworks()

wiki_page = WikiNetwork(node_title=all_networks[value]['*'], lang=all_networks[value]['lang'] )
list_colours = ['red', 'blue', 'purple','orange','green','olive', 'maroon', 'brown','lime','teal' ]

nodes = wiki_page.getNodes(type='cytoscape', threshold=0)
edges = wiki_page.getEdges(type='cytoscape', threshold=0)
communities = wiki_page.getCommunities()
stats_nodes = wiki_page.getStatsNodes(threshold=0)

d = {'selector': [ ],
      'style':   [ ] } 

for item in stats_nodes: 
    d['selector'].append(item)
    d['style'].append({ 'background-opacity': stats_nodes[item]['Centrality'], 
                        'width': stats_nodes[item]['Centrality'], 
                        'height': stats_nodes[item]['Centrality']
                    })



Loading of node C. S. Lewis Nature Reserve failed.
Loading of node Labstep failed.


In [239]:
my_stylesheet = pd.DataFrame(data=d)

In [240]:
my_stylesheet

Unnamed: 0,selector,style
0,102 Dalmatians,{'background-opacity': 0.0012422360248447205}
1,Wayback Machine,{'background-opacity': 0.2}
2,2011 United Kingdom census,{'background-opacity': 0.03850931677018634}
3,Birmingham,{'background-opacity': 0.1863354037267081}
4,British Asian,{'background-opacity': 0.03229813664596273}
...,...,...
801,List of attractions in Oxford,{'background-opacity': 0.0012422360248447205}
802,Oxford Town Hall,{'background-opacity': 0.0012422360248447205}
803,Ramallah,{'background-opacity': 0.0012422360248447205}
804,Tourist attraction,{'background-opacity': 0.0012422360248447205}
