In [1]:
# Setup packages 
from collections import Counter
from dash_bootstrap_components._components.Row import Row
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms
import networkx.utils # (algorithms to check out: "approximation", "eccentricity", "diameter", "radius", "periphery", "center", "barycenter", "Community" "degree_centrality", "constraint", "local_constraint", "effective_size") 

# Dash packages needed for building dash app 

import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_cytoscape as cyto
import dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate

# Filesystem locations: the project root and the JSON dump relative to it.
path = "/home/teijehidde/Documents/Git Blog and Coding/"
data_file = "data_dump/data_new2.json"

# Local copy of https://codepen.io/chriddyp/pen/bWLwgP.css -- should appear in credits!
# NOTE: this path is only stored here; the Dash app below is created with the
# Bootstrap theme and is not handed this stylesheet.
external_stylesheets = path + 'Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/stylesheet.css'

# The Dash application, styled with the dash-bootstrap-components BOOTSTRAP theme.
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])

# CSS fragments and colour settings shared by the layout.
styles = dict(
    pre=dict(border='thin lightgrey solid', overflowX='scroll'),
)
colors = dict(
    background='white',  # use color coding later.
    text='gray',
)

# Named colours available as a palette (ten entries).
list_colours = [
    'red', 'blue', 'purple', 'orange', 'green',
    'olive', 'maroon', 'brown', 'lime', 'teal',
]

In [6]:
# Load the page table from the JSON dump (stored with orient='split').
network_data_df = pd.read_json(path + data_file, orient='split')

# Titles of the English rows that carry language links. Both conditions are
# combined into a single mask over the same frame, so the selection does not
# depend on pandas re-aligning a full-length mask against a filtered frame.
all_networks = network_data_df.loc[
    network_data_df['langlinks'].notnull() & (network_data_df['lang'] == 'en'),
    'title',
].tolist()
# new version: all_networks = network_data_df.loc[network_data_df['ego'] == True].loc[network_data_df['lang'] == 'en']['title'].values.tolist()

In [7]:
# setting up classes WikiNode and WikiNetwork. 
class WikiNode:
    """A single Wikipedia page, looked up by title and language in a page table.

    Attributes:
        node_title: value of the 'title' column of the matched row.
        node_ID: value of the 'uniqueid' column of the matched row.
        node_links: value of the 'links' column of the matched row.
        node_lang: value of the 'lang' column of the matched row.
    """

    def __init__(self, node_title, lang, network_data):
        """Find the page in ``network_data`` and copy its fields onto the node.

        Args:
            node_title: page title to match against the 'title' column.
            lang: language code to match against the 'lang' column.
            network_data: pandas DataFrame with at least the columns
                'title', 'lang', 'uniqueid' and 'links'.

        Raises:
            IndexError: if no row matches both the title and the language.
        """
        # Combine both conditions into one mask over the same frame instead of
        # chaining two .loc calls with a mask built from the unfiltered frame.
        is_match = (network_data['title'] == node_title) & (network_data['lang'] == lang)
        node_data = network_data.loc[is_match]

        if node_data.empty:
            # Same exception type the positional lookup used to raise, but
            # with a message that says which page was missing.
            raise IndexError(
                "no page titled {!r} with lang {!r} in network_data".format(node_title, lang)
            )

        # Only the first match is used: the same wikipage can appear twice in
        # the dataframe, once as a central node (with langlinks) and once as a
        # normal node of another network (without langlinks).
        self.node_title = node_data['title'].iloc[0]
        self.node_ID = node_data['uniqueid'].iloc[0]
        self.node_links = node_data['links'].iloc[0]
        self.node_lang = node_data['lang'].iloc[0]

In [63]:
class WikiNetwork(WikiNode):
    """Network of the pages linked from one central Wikipedia page.

    The central page is looked up in the module-level ``network_data_df``.
    Every page it links to (and the central page itself) is then looked up in
    the same frame and language; links between those pages become edges.

    Attributes:
        threshold: minimum count in ``links_count`` a title needs to be
            returned by getNodes / getEdges.
        network_nodes: dict mapping node_ID to the WikiNode of each page found.
        network_links: list starting with the central title, followed by every
            in-network link target (repeats are kept so they can be counted).
        network_edges: list of (source_title, target_title) tuples.
        network_status: initialised as an empty list; not used by this class.
        skipped_links: links for which no node could be built.
        links_count: Counter over ``network_links``.
    """

    def __init__(self, node_title, lang, threshold = 0):
        """Build the network around ``node_title``.

        Args:
            node_title: title of the central page.
            lang: language code of the central page; linked pages are looked
                up in the same language.
            threshold: minimum link count used by getNodes / getEdges.

        Raises:
            IndexError: if the central page itself is not in the data.
        """
        WikiNode.__init__(self, node_title, lang, network_data = network_data_df)
        self.threshold = threshold
        self.network_nodes = {}
        self.network_links = [node_title]
        self.network_edges = []
        self.network_status = []
        self.skipped_links = []

        # Set for constant-time "is this a link of the central page" checks.
        central_links = set(self.node_links)

        # Go through node_links of the central node (node_title) to build network.
        for link in self.node_links + [self.node_title]:
            try:
                # NB: the links are not always in the same language as the
                # network, in which case the lookup fails and the link is skipped.
                Node2 = WikiNode(link, lang, network_data = network_data_df)
                purged_links = [x for x in Node2.node_links if x in central_links]
                self.network_nodes[Node2.node_ID] = Node2
                self.network_links.extend(purged_links)
                self.network_edges.extend((link, purged_link) for purged_link in purged_links)
            except Exception:
                # Still best-effort, but no longer swallows KeyboardInterrupt /
                # SystemExit, and the skipped link is kept for inspection.
                self.skipped_links.append(link)
        self.links_count = Counter(self.network_links)

    def _selectedNodes(self):
        """Return the titles whose link count reaches the threshold, in count order."""
        return [k for k, v in self.links_count.items() if float(v) >= self.threshold]

    def _buildGraph(self):
        """Return an undirected networkx graph built from the selected edges only."""
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))
        return G

    def getNodes(self, type="cytoscape"):
        """Return the selected nodes in the requested format.

        Args:
            type: 'cytoscape' for a list of ``{'data': {'id', 'label'}}`` dicts,
                'networkx' for a list of ``(title, {'name': title})`` tuples.

        Raises:
            ValueError: if ``type`` is neither of the two supported values.
        """
        selected_nodes = self._selectedNodes()

        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]
        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]
        raise ValueError("type must be 'networkx' or 'cytoscape', got {!r}".format(type))

    def getEdges(self,type="cytoscape"):
        """Return the edges whose two endpoints are both selected nodes.

        Args:
            type: 'cytoscape' for a list of ``{'data': {'source', 'target'}}``
                dicts, 'networkx' for a list of ``(source, target)`` tuples.

        Raises:
            ValueError: if ``type`` is neither of the two supported values.
        """
        selected_nodes = set(self._selectedNodes())
        edges_network = [(a, b) for a, b in self.network_edges if a in selected_nodes and b in selected_nodes]

        if type == 'networkx':
            return edges_network
        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a, b in edges_network]
        raise ValueError("type must be 'networkx' or 'cytoscape', got {!r}".format(type))

    def getCommunities(self):
        """Return a list of one-entry dicts ``{title: community_number}``.

        Communities come from greedy_modularity_communities on the graph of
        selected edges; they are numbered in the order that function returns them.
        """
        communities = greedy_modularity_communities(self._buildGraph())
        return [{node: number} for number, members in enumerate(communities) for node in members]

    def getStatsNodes(self):
        """Return a DataFrame of per-node statistics, indexed by page title.

        Columns: 'degree_centrality', 'eccentricity', 'community',
        'normalized_centrality' (degree centrality min-max scaled; 0.0 for every
        node when all centralities are equal) and 'degree_centrality_rounded'
        (two decimals). Only titles that occur in at least one selected edge
        are included, because the graph is built from edges alone.
        """
        G = self._buildGraph()
        communities = greedy_modularity_communities(G)
        degree_centrality_nodes = networkx.algorithms.centrality.degree_centrality(G)
        # NOTE(review): networkx documents eccentricity as failing on a graph
        # that is not connected; a high threshold may produce one -- confirm.
        eccentricity_nodes = networkx.algorithms.distance_measures.eccentricity(G)
        dict_communities = {node: number for number, members in enumerate(communities) for node in members}

        df = pd.DataFrame({'degree_centrality':pd.Series(degree_centrality_nodes), 'eccentricity':pd.Series(eccentricity_nodes), 'community':pd.Series(dict_communities)})

        centrality = df['degree_centrality']
        val_max = centrality.max()
        val_min = centrality.min()
        value_range = val_max - val_min
        if value_range == 0:
            # All nodes equally central: avoid 0/0 turning the column into NaN.
            df['normalized_centrality'] = 0.0
        else:
            df['normalized_centrality'] = (centrality - val_min) / value_range
        df['degree_centrality_rounded'] = centrality.round(2)

        return df

In [19]:
# Scratch cell: look up the rows for one page by hand, with the same
# title + language filter that WikiNode.__init__ applies.
node_title = 'Vacuna'
lang = 'es'
network_data = network_data_df

# One combined mask over the same frame rather than two chained .loc calls.
node_data = network_data.loc[(network_data['title'] == node_title) & (network_data['lang'] == lang)]


In [64]:
# Build the network around the Spanish-language page 'Vacuna' (default threshold).
wiki_page = WikiNetwork(node_title='Vacuna', lang='es')

In [65]:
# Per-node statistics of wiki_page as a DataFrame; the cells below query this frame.
stats_nodes = wiki_page.getStatsNodes()

In [66]:
# Distinct community numbers present in the statistics frame.
set(stats_nodes['community'])

{0, 1, 2, 3, 4}

In [74]:
# For every community: the row(s) holding that community's highest degree centrality.
[
    members.loc[members['degree_centrality'] == members['degree_centrality'].max()]
    for members in (
        stats_nodes.loc[stats_nodes['community'] == int(number)]
        for number in set(stats_nodes['community'].tolist())
    )
]

[        degree_centrality  eccentricity  community  normalized_centrality  \
 Vacuna                1.0             1          0                    1.0   
 
         degree_centrality_rounded  
 Vacuna                        1.0  ,
                         degree_centrality  eccentricity  community  \
 Control de autoridades            0.84106             2          1   
 Wikidata                          0.84106             2          1   
 
                         normalized_centrality  degree_centrality_rounded  
 Control de autoridades                   0.84                       0.84  
 Wikidata                                 0.84                       0.84  ,
         degree_centrality  eccentricity  community  normalized_centrality  \
 Cólera           0.152318             2          2               0.146667   
 
         degree_centrality_rounded  
 Cólera                       0.15  ,
                               degree_centrality  eccentricity  community  \
 Experimentac

In [55]:
# Rows of the statistics frame that belong to community 2.
stats_nodes[stats_nodes['community'].eq(2)]

Unnamed: 0,degree_centrality,eccentricity,community,normalized_centrality
Afganistán,0.14,2,2,0.133333
Cólera,0.15,2,2,0.146667
Pakistán,0.15,2,2,0.14


In [47]:
# Index label(s) of the node(s) with the highest degree centrality overall.
stats_nodes[stats_nodes['degree_centrality'].eq(stats_nodes['degree_centrality'].max())].index

Index(['Vacuna'], dtype='object')

In [41]:
# Min-max scale degree centrality and store it as a column on stats_nodes.
val_min = min(stats_nodes['degree_centrality'])
val_max = max(stats_nodes['degree_centrality'])

stats_nodes['normalized_centrality'] = (stats_nodes['degree_centrality'] - val_min) / (val_max - val_min)
# pd.concat([pd_nodes, stats_nodes], axis = 1)


In [43]:
# Display the per-node statistics frame.
stats_nodes

Unnamed: 0,degree_centrality,eccentricity,community,normalized_centrality
2009 swine flu pandemic vaccine,0.328767,3,0,0.545714
2017 Democratic Republic of the Congo Ebola virus outbreak,0.015411,4,0,0.022857
ABC News (Australia),0.003425,5,3,0.002857
ACE inhibitor,0.294521,4,1,0.488571
AIDS,0.029110,4,0,0.045714
...,...,...,...,...
Yellow fever vaccine,0.166096,4,0,0.274286
Yersinia pestis,0.011986,4,0,0.017143
Zika virus,0.013699,4,0,0.020000
Zika virus vaccine,0.162671,4,0,0.268571
