In [1]:
# Setup packages (might be able to remove a few of these.)
# packages needed for downloading, saving and loading data 
import os
from collections import Counter
import collections
from dash_bootstrap_components._components.Col import Col 
import json
from networkx.algorithms.traversal.depth_first_search import dfs_labeled_edges
import pandas as pd
import numpy as np
from sklearn import preprocessing

# packages for creation classes and network analysis 
import networkx as nx
from itertools import chain
# import communities
# from networkx.algorithms import approximation
# from networkx.algorithms import community
from networkx.algorithms.community import greedy_modularity_communities
from networkx.utils import not_implemented_for 
__all__ = [
    "eccentricity",
    "diameter",
    "radius",
    "periphery",
    "center",
    "barycenter",
    "degree_centrality",
    "constraint", 
    "local_constraint", 
    "effective_size"
]

# Dash packages for presentation analysis  
import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_cytoscape as cyto
import dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import plotly.express as px

# setup layout and paths
path = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
data_file = "data_new2.json" 
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])
styles = {
    'pre': {
        'border': 'thin lightgrey solid',
        'overflowX': 'scroll'
    }
}

In [2]:
def degree_centrality(G):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/centrality/degree_alg.html#degree_centrality
    
    if len(G) <= 1:
        return {n: 1 for n in G}

    s = 1.0 / (len(G) - 1.0)
    centrality = {n: d * s for n, d in G.degree()}
    return centrality

def eccentricity(G, v=None, sp=None):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/distance_measures.html#eccentricity

    order = G.order()

    e = {}
    for n in G.nbunch_iter(v):
        if sp is None:
            length = nx.single_source_shortest_path_length(G, n)
            L = len(length)
        else:
            try:
                length = sp[n]
                L = len(length)
            except TypeError as e:
                raise nx.NetworkXError('Format of "sp" is invalid.') from e
        if L != order:
            if G.is_directed():
                msg = (
                    "Found infinite path length because the digraph is not"
                    " strongly connected"
                )
            else:
                msg = "Found infinite path length because the graph is not" " connected"
            raise nx.NetworkXError(msg)

        e[n] = max(length.values())

    if v in G:
        return e[v]  # return single value
    else:
        return e


In [42]:
# Initiate class Node. 
class WikiNode:
    def __init__(self, node_title, lang, network_data):

        node_data = network_data.loc[network_data['title'] == node_title].loc[network_data['lang'] == lang]
        
        self.node_title = node_data[['title']].iloc[0,0] # iloc[0,0] needed because there can be two instance of same wikipage in the dataframe: one as centralnode (with langlinks) and one as a normal node of other network (without langlinks).  
        self.node_ID = node_data[['uniqueid']].iloc[0,0]
        self.node_links = node_data[['links']].iloc[0,0]
        self.node_lang = node_data[['lang']].iloc[0,0]

In [43]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
   
    def __init__(self,node_title, lang):
        
        saved_network_data = pd.read_json((path + data_file), orient='split')
        
        # initiate the central node of the network as class WikiNode, add additional attributes for class WikiNetwork 
        WikiNode.__init__(self, node_title, lang, network_data = saved_network_data)
        self.network_nodes = {}
        self.network_links = []
        self.network_edges = [] 
        self.network_status = []
        
        # Go through node_links of the central node (node_title) to build network.
        try: 
            for link in self.node_links + [self.node_title]:
                Node2 = WikiNode(link, lang, network_data = saved_network_data) # NB: the links are not always in the same language as the network. - for now it just skips. 
                purged_links = [x for x in Node2.node_links if x in self.node_links]
                purged_edges = []
                for purged_link in purged_links:
                    purged_edges.append((link,purged_link))  
                self.network_nodes[Node2.node_ID] = Node2
                self.network_links = self.network_links + purged_links
                self.network_edges = self.network_edges + purged_edges
        except: 
            pass
        self.links_count = Counter(self.network_links)

    def getNodes(self, type="cytoscape", threshold=0):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        
        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]
        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]

    def getEdges(self,type="cytoscape", threshold=0):  
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]
        
        if type == 'networkx':
            return edges_network
        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a,b in edges_network]

    def getCommunities(self,threshold=0):  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx', threshold= threshold))
        return greedy_modularity_communities(G)

    def getStatsNodes(self, threshold = 0):
        # I think this will work much better with pandas... For next sprint. (see: https://stackoverflow.com/questions/46711557/calculating-min-and-max-over-a-list-of-dictionaries-for-normalizing-dictionary-v)
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx', threshold= threshold))

        data = {}
        degree_centrality_nodes = degreeCentrality(G)
        eccentricity_nodes = eccentricity(G) 

        for item in G.nodes: 
            data[item] = {'Centrality': round(degree_centrality_nodes[item], 4), 'Eccentricity': eccentricity_nodes[item]} 

        return(data)


In [72]:
network_data_df = pd.read_json((path + data_file), orient='split')
available_wiki_networks = network_data_df.loc[network_data_df['langlinks'].notnull()].loc[network_data_df['lang'] == 'en']['title'].values.tolist()

In [74]:
network_data_df = pd.read_json((path + data_file), orient='split')
# wiki_page_options = [v['AvailableLanguages'] for v in network_data.values() if v['title'] == all_networks[selected_network]['*'] if v['language'] == all_networks[selected_network]['lang']]
# language_options = [selected_network] + [k for k,v in all_networks.items() if {'lang': v['lang'], '*': v['*']} in wiki_page_options[0]]

In [83]:
all_networks_keys = network_data_df.loc[network_data_df['langlinks'].notnull()]['title'].values.tolist()
all_networks_values = network_data_df.loc[network_data_df['langlinks'].notnull()]['lang'].values.tolist()
all_networks = dict(zip(all_networks_keys, all_networks_values))


In [95]:
node_title = 'Verkhovna Rada'
lang = 'en'

node_title_langlinks = network_data_df.loc[network_data_df['langlinks'].notnull()].loc[network_data_df['title'] == node_title].loc[network_data_df['lang'] == lang]['langlinks'].values.tolist()[0]
node_title_langlinks = [i['*'] for i in node_title_langlinks]

In [102]:
language_options = [{k,v} for k,v in all_networks.items() if k in node_title_langlinks] 
language_options = ["{} ({})".format(v,k) for k,v in language_options] 


In [103]:
test 

['Verkhovna Rada (en)',
 'المجلس الأعلى الأوكراني (ar)',
 'de (Werchowna Rada)',
 'fr (Rada (Ukraine))',
 'nl (Verchovna Rada)']

In [55]:
test = WikiNetwork(node_title='Hydrology', lang = 'en')

In [61]:
test.getNodes(type = 'networkx')

[('Atmospheric science', {'name': 'Atmospheric science'}),
 ('Bibcode (identifier)', {'name': 'Bibcode (identifier)'}),
 ('Doi (identifier)', {'name': 'Doi (identifier)'}),
 ('Earth science', {'name': 'Earth science'}),
 ('Geochemistry', {'name': 'Geochemistry'}),
 ('Geodesy', {'name': 'Geodesy'}),
 ('Geological Society of America', {'name': 'Geological Society of America'}),
 ('Geology', {'name': 'Geology'}),
 ('Geophysics', {'name': 'Geophysics'}),
 ('ISBN (identifier)', {'name': 'ISBN (identifier)'}),
 ('John Wiley & Sons', {'name': 'John Wiley & Sons'}),
 ('Meteorology', {'name': 'Meteorology'}),
 ('Oceanography', {'name': 'Oceanography'}),
 ('Paleontology', {'name': 'Paleontology'}),
 ('Petrology', {'name': 'Petrology'}),
 ('Robert E. Horton', {'name': 'Robert E. Horton'}),
 ('S2CID (identifier)', {'name': 'S2CID (identifier)'}),
 ('Seismology', {'name': 'Seismology'}),
 ('Tectonophysics', {'name': 'Tectonophysics'}),
 ('United States Geological Survey',
  {'name': 'United States 

In [11]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [10]:
all_networks = getDownloadedNetworks()

In [19]:
# test download. 
test = downloadNetworks(node_title="Cambridge", original_lang="fr")

Downloading Wiki network name: Cambridge in language: fr. Please note that this can take a while.
Data succesfully saved. Wiki node name: Cambridge; downloaded in language: fr.
Downloading Wiki network name: Cambridge (Verenigd Koninkrijk) in language: nl. Please note that this can take a while.
Data succesfully saved. Wiki node name: Cambridge (Verenigd Koninkrijk); downloaded in language: nl.
Download of additional languages finished.
Download of additional languages finished.


In [130]:
# turning wikipedia API output into a list of panda dataframes 
df = [pd.DataFrame.from_dict(item['query']['pages']).transpose() for item in test]

In [138]:
# merging all these dataframes into one, using 'update' method. 
# note that does not work yet for intiial node, because different data (columns) were downloaded. I need to download langlinks and info for all nodes. Using zip to make files smaller.  
for item in df: df[1].update(item)

In [141]:
# the resulting df is the one on which the 'update' method was run. 
df_result = df[1]

ns                                                         0
title                               Chichester (Royaume-Uni)
missing                                                  NaN
pageid                                                769145
links      [{'ns': 0, 'title': 'Aberdeen'}, {'ns': 0, 'ti...
Name: 769145, dtype: object

In [88]:
result = pd.concat(test3)

In [89]:
result

Unnamed: 0,contentmodel,langlinks,lastrevid,length,links,ns,pageid,pagelanguage,pagelanguagedir,pagelanguagehtmlcode,title,touched,missing
15789,wikitext,"[{'lang': 'af', '*': 'Cambridge'}, {'lang': 'a...",179254172,17453,"[{'ns': 0, 'title': '1025'}, {'ns': 0, 'title'...",0,15789,fr,ltr,fr,Cambridge,2021-08-02T19:50:20Z,
-1,,,,,,0,,,,,Cambridge City F.C.,,
-2,,,,,,0,,,,,"Chesterton, Cambridge",,
-3,,,,,,0,,,,,City Ground (Cambridge),,
-4,,,,,,0,,,,,Gerri Bird,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40398,,,,,,0,40398,,,,York,,
10300,,,,,,0,10300,,,,Écosse,,
27816,,,,,,0,27816,,,,Édimbourg,,
11495371,,,,,"[{'ns': 0, 'title': 'Accolade (architecture)'}...",0,11495371,,,,Église Saint-Bene't,,


In [118]:
result2 = result.groupby(['pageid','title'])['links'].apply(lambda x: x + x).reset_index()

In [119]:
result2

Unnamed: 0,index,links
0,15789,"[{'ns': 0, 'title': '1025'}, {'ns': 0, 'title'..."
1,15789,
2,15789,
3,15789,
4,15789,
...,...,...
23216,1377936,
23217,1377936,
23218,1377936,
23219,1377936,


In [112]:
result2.iloc[50]['links']

nan

In [100]:
if 'nan' in result2: result2.remove('nan')

In [101]:
result2

Unnamed: 0,pageid,title,links
0,269,Alphabet phonétique international,"[{'ns': 0, 'title': '1886'}, {'ns': 0, 'title'..."
1,1348,Héraldique,"[{'ns': 0, 'title': '1806'}, {'ns': 0, 'title'..."
2,1367,Hongrie,"nan,[{'ns': 0, 'title': '.hu'}, {'ns': 0, 'tit..."
3,1490,Irlande du Nord,"nan,nan,nan,nan,[{'ns': 0, 'title': '12 juille..."
4,1712,Liste des pays du monde,"nan,nan,nan,nan,nan,[{'ns': 0, 'title': 'Abkha..."
...,...,...,...
210,13226163,Sawtry,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
211,13230180,Yaxley (Cambridgeshire),"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
212,13793011,Kettle's Yard,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
213,14131764,Lord Lieutenant du Cambridgeshire,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
