In [185]:
# Setup packages 
from collections import Counter
from collections import ChainMap
from dash_bootstrap_components._components.Row import Row
import pandas as pd
from pandas.core.common import flatten
import numpy as np
import networkx as nx
from networkx.algorithms import approximation
from networkx.algorithms.community import greedy_modularity_communities
import networkx.algorithms
import networkx.utils # (algorithms to check out: "approximation", "eccentricity", "diameter", "radius", "periphery", "center", "barycenter", "Community" "degree_centrality", "constraint", "local_constraint", "effective_size") 

# Dash packages needed for building dash app 

import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_cytoscape as cyto
import dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate

# setup layout and paths
path = "/home/teijehidde/Documents/Git Blog and Coding/"
data_file = "data_dump/data_new6.json" 
external_stylesheets = path + 'Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/stylesheet.css' # downloaded from: https://codepen.io/chriddyp/pen/bWLwgP.css. Should appear in credits! 
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])
styles = {
    'pre': {
        'border': 'thin lightgrey solid',
        'overflowX': 'scroll'
    }
}
colors = {
    'background': 'white', # use color coding later. 
    'text': 'gray'
}
list_colours = ['red', 'blue', 'purple','orange','green','olive', 'maroon', 'brown','lime','teal' ]

In [186]:
# loading data. 
network_data_df = pd.read_json((path + data_file), orient='split')
all_networks = network_data_df.loc[network_data_df['ego'] == True].loc[network_data_df['lang'] == 'en']['title'].values.tolist()

In [187]:
# setting up classes WikiNode and WikiNetwork. 
class WikiNode:
    def __init__(self, node_title, lang, network_data):

        node_data = network_data.loc[network_data['title'] == node_title].loc[network_data['lang'] == lang]
        
        self.node_title = node_data[['title']].iloc[0,0] # iloc[0,0] needed because there can be two instance of same wikipage in the dataframe: one as centralnode (with langlinks) and one as a normal node of other network (without langlinks).  
        self.node_ID = node_data[['uniqueid']].iloc[0,0]
        self.node_links = node_data[['links']].iloc[0,0]
        self.node_lang = node_data[['lang']].iloc[0,0]
        try: 
            self.node_translation = node_data['langlinks'].values.tolist()[0][0]['*']
        except: 
            self.node_translation = '.'

In [197]:
class WikiNetwork(WikiNode):
   
    def __init__(self, node_title, lang, threshold = 0):
        
        WikiNode.__init__(self, node_title, lang, network_data = network_data_df)
        self.threshold = threshold
        self.network_nodes = {}
        self.network_links = [node_title]
        self.network_edges = [] 
        self.network_status = []
        
        # Go through node_links of the central node (node_title) to build network.
        for link in self.node_links + [self.node_title]:
            try: 
                Node2 = WikiNode(link, lang, network_data = network_data_df) # NB: the links are not always in the same language as the network. It throws an error as result. - for now it just skips. 
                purged_links = [x for x in Node2.node_links if x in self.node_links]
                purged_edges = []
                for purged_link in purged_links:
                    purged_edges.append((link,purged_link))  
                self.network_nodes[Node2.node_ID] = Node2
                self.network_links = self.network_links + purged_links
                self.network_edges = self.network_edges + purged_edges
            except: 
                pass
        self.links_count = Counter(self.network_links)

    def getNodes(self, type="cytoscape"):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= self.threshold]
        
        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]
        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]

    def getEdges(self,type="cytoscape"):  
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= self.threshold]
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]
        
        if type == 'networkx':
            return edges_network
        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a,b in edges_network]

    def getTranslations(self):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= self.threshold]
        translation_list = [(self.network_nodes[i].node_title, self.network_nodes[i].node_translation) for i in self.network_nodes if self.network_nodes[i].node_title in selected_nodes]
        
        translation_nodes = {} 
        for node in selected_nodes:
            try: 
                translation_nodes[node] = [t[1] for t in translation_list if t[0] == node][0]
            except: 
                translation_nodes[node] = '...'

        return pd.DataFrame({'translation':pd.Series(translation_nodes)})

    def getStatsNodes(self):
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))
        centrality_nodes = networkx.algorithms.centrality.eigenvector_centrality(G)
        # eccentricity_nodes = networkx.algorithms.distance_measures.eccentricity(G)
        df = pd.DataFrame({'network_centrality':pd.Series(centrality_nodes)}) #  'eccentricity':pd.Series(eccentricity_nodes) 

        val_max = max(df['network_centrality'])
        val_min = min(df['network_centrality'])
        df[['centrality_normalized']] = df[['network_centrality']].apply(lambda x: (x - val_min) / (val_max - val_min), result_type = 'expand')
        df[['centrality_rounded']] = df[['network_centrality']].apply(lambda x: round(x, 2))

        return df

    def getStatsCommunities(self):
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))
        communities = greedy_modularity_communities(G)
        dict_communities = {key:value for value in range(len(communities)) for key in communities[value] }

        community_centrality_nodes = {}
        for community in communities:
            selected_edges = [(a,b) for a,b in G.edges if a in community if b in community]
            G_community = nx.Graph()
            G_community.add_edges_from(selected_edges)
            community_centrality_nodes.update(networkx.algorithms.centrality.degree_centrality(G))
            
        return pd.DataFrame({'community':pd.Series(dict_communities), 'community_centrality': pd.Series(community_centrality_nodes)}) 

    def getStatsNetwork(self):
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))
        return networkx.algorithms.approximation.clustering_coefficient.average_clustering(G, trials=1000, seed=10)


In [198]:
wiki_page = WikiNetwork('ワクチン', 'ja')

In [199]:
wiki_page.getStatsNetwork()

AttributeError: 'NoneType' object has no attribute 'is_directed'

In [168]:
node_title = 'ワクチン'
lang = 'ja'

nodes = wiki_page.getNodes(type='cytoscape')
edges = wiki_page.getEdges(type='cytoscape')
nodes_translations = wiki_page.getTranslations()
stats_nodes = wiki_page.getStatsNodes()
stats_communities = wiki_page.getStatsCommunities()

# pd_nodes = pd.DataFrame([{'page_ID': v.node_ID, 'title': v.node_title} for v in wiki_page.network_nodes.values()]).set_index('title', drop = False)
pd_nodes = pd.concat([nodes_translations, stats_nodes, stats_communities], axis = 1).reset_index() # pd_nodes,
pd_nodes = pd_nodes.rename(columns={'index': 'title'})

# {'node_title': node_title, 'lang': lang, 'nodes_network': nodes, 'edges_network': edges, 'nodes_stats': pd_nodes.to_dict('records')} 



In [169]:
pd_nodes

Unnamed: 0,title,translation,network_centrality,centrality_normalized,centrality_rounded,community,community_centrality
0,1798年,1798,0.000992,0.000991,0.0,0,0.011730
1,1971年,1971,0.003558,0.025093,0.0,0,0.049853
2,2019年,2019,0.001732,0.007941,0.0,0,0.043988
3,ATCコード A,ATC code A,0.098432,0.916228,0.1,1,0.460411
4,ATCコード D,ATC code D,0.098432,0.916228,0.1,1,0.460411
...,...,...,...,...,...,...,...
337,RVSV-ZEBOVワクチン,,0.000904,0.000167,0.0,0,0.005865
338,挿入 (遺伝学),,0.000918,0.000298,0.0,0,0.005865
339,MMRVワクチン,,0.000921,0.000323,0.0,0,0.005865
340,2017年コンゴ民主共和国 エボラウイルス感染症発生,,0.000886,0.000000,0.0,0,0.002933


In [164]:
selected_nodes = [k for k,v in wiki_page.links_count.items() if float(v) >= wiki_page.threshold]
[(wiki_page.network_nodes[i].node_title, wiki_page.network_nodes[i].node_translation) for i in wiki_page.network_nodes if wiki_page.network_nodes[i].node_title in selected_nodes]

['1798',
 '1971',
 '2019',
 'ATC code A',
 'ATC code D',
 'ATC code D09',
 'Hepatitis A vaccine',
 'BCG vaccine',
 '.',
 '.',
 'Hepatitis B vaccine',
 '.',
 'COVID-19 vaccine',
 'Hepatitis C',
 '.',
 'DNA vaccine',
 '.',
 'Hib vaccine',
 'International Standard Book Number',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 'RNA vaccine',
 'T-cell receptor',
 'WHO Model List of Essential Medicines',
 'Cancer vaccine',
 '.',
 'Immunologic adjuvant',
 'Atopic dermatitis',
 'Anabolic steroid',
 'Centers for Disease Control and Prevention',
 'American English',
 '.',
 'Angiotensin II receptor blocker',
 'ACE inhibitor',
 'Antivirus software',
 'International Certificate of Vaccination or Prophylaxis',
 'United Kingdom',
 'British English',
 'Influenza',
 'Influenza vaccine',
 '.',
 'Virus',
 'Viral vector vaccine',
 '.',
 'Edward Jenner',
 'Ebola',
 '.',
 '.',
 '.',
 'Otto Gottlieb Mohnike',
 'Calcium channel blocker',
 'Guillain–Barré syndrome',
 'GlaxoSmithKline',
 'Coccidioidomycosis',
 '.',
 'Cohor

In [65]:
wiki_page.network_nodes

{'1718': <__main__.WikiNode at 0x7fe55eec89d0>,
 '1796': <__main__.WikiNode at 0x7fe55e100730>,
 '1879': <__main__.WikiNode at 0x7fe55e100df0>,
 '1881': <__main__.WikiNode at 0x7fe55e100130>,
 '1884': <__main__.WikiNode at 0x7fe55e100250>,
 '1885': <__main__.WikiNode at 0x7fe55e100940>,
 '1890': <__main__.WikiNode at 0x7fe55e100790>,
 '1897': <__main__.WikiNode at 0x7fe55e100850>,
 '1926': <__main__.WikiNode at 0x7fe55e100490>,
 '1927': <__main__.WikiNode at 0x7fe55e1004f0>,
 '1937': <__main__.WikiNode at 0x7fe55e1007f0>,
 '1945': <__main__.WikiNode at 0x7fe55e100a60>,
 '1952': <__main__.WikiNode at 0x7fe55e100bb0>,
 '1954': <__main__.WikiNode at 0x7fe55e100e20>,
 '1962': <__main__.WikiNode at 0x7fe55e100f40>,
 '1964': <__main__.WikiNode at 0x7fe55e100820>,
 '1967': <__main__.WikiNode at 0x7fe55e1009d0>,
 '1970': <__main__.WikiNode at 0x7fe55e100b50>,
 '1974': <__main__.WikiNode at 0x7fe55e100c70>,
 '1977': <__main__.WikiNode at 0x7fe55e14fa60>,
 '1978': <__main__.WikiNode at 0x7fe55e1

In [34]:
nodes = wiki_page.getNodes(type='cytoscape')
edges = wiki_page.getEdges(type='cytoscape')
stats_nodes = wiki_page.getStatsNodes()
stats_communities = wiki_page.getStatsCommunities()
pd_nodes = pd.concat([stats_nodes, stats_communities], axis = 1).reset_index()
pd_nodes = pd_nodes.rename(columns={'index': 'title'})

stats_nodes = {'node_title': 'لقاح', 'lang': 'ar', 'nodes_network': nodes, 'edges_network': edges, 'nodes_stats': pd_nodes.to_dict('records')} 

In [35]:
pd_nodes

Unnamed: 0,title,network_centrality,centrality_normalized,centrality_rounded,community,community_centrality
0,Ad5-nCOV,0.029839,0.220337,0.03,1,0.247761
1,Ad26.COV2.S,0.028962,0.213615,0.03,1,0.238806
2,BBV152,0.029839,0.220337,0.03,1,0.247761
3,إبيفاك كورونا,0.030260,0.223556,0.03,1,0.250746
4,تجربة سريرية,0.049902,0.374040,0.05,1,0.346269
...,...,...,...,...,...,...
331,Betz Halloran,0.001080,0.000000,0.00,0,0.002985
332,Clin. Infect. Dis.,0.001080,0.000000,0.00,0,0.002985
333,أنثى البقرة,0.001080,0.000000,0.00,0,0.002985
334,القاموس الجهنمي، الطبعة السادسة,0.001080,0.000000,0.00,0,0.002985


In [12]:
pd.DataFrame.from_dict(stats_nodes['nodes_stats']) 

Unnamed: 0,page_ID,title,network_centrality,centrality_normalized,centrality_rounded,community,community_centrality
0,ar8338317,Ad26.COV2.S,0.028962,0.213615,0.03,1,0.238806
1,ar8030088,Ad5-nCOV,0.029839,0.220337,0.03,1,0.247761
2,ar8030084,BBV152,0.029839,0.220337,0.03,1,0.247761
3,ar2588132,Open access,0.001333,0.001941,0.00,1,0.005970
4,ar6874832,أبحاث فيروس الهربس البسيط,0.028765,0.212103,0.03,1,0.226866
...,...,...,...,...,...,...,...
331,,,0.001080,0.000000,0.00,0,0.002985
332,,,0.001080,0.000000,0.00,0,0.002985
333,,,0.001080,0.000000,0.00,0,0.002985
334,,,0.001080,0.000000,0.00,0,0.002985


In [10]:
int(stats_nodes.loc['وبائيات']['community'])

AttributeError: 'dict' object has no attribute 'loc'

In [62]:
nodes = wiki_page.getNodes(type='cytoscape')
edges = wiki_page.getEdges(type='cytoscape')
stats_nodes = wiki_page.getStatsNodes()
stats_communities = wiki_page.getStatsCommunities()

pd_nodes = pd.DataFrame([{'page_ID': v.node_ID, 'title': v.node_title} for v in wiki_page.network_nodes.values()]).set_index('title', drop = False)
pd_nodes = pd.concat([pd_nodes, stats_nodes, stats_communities], axis = 1)

data = {'nodes_network': nodes, 'edges_network': edges, 'nodes_stats': pd_nodes.to_dict('records')} 

In [65]:
stats_nodes = pd.DataFrame.from_dict(data['nodes_stats']) 
stats_nodes = stats_nodes.set_index('title', drop = False)

In [66]:
stats_nodes

Unnamed: 0_level_0,page_ID,title,network_centrality,centrality_normalized,centrality_rounded,community,community_centrality
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2009 swine flu pandemic vaccine,en23224587,2009 swine flu pandemic vaccine,0.316393,0.315271,0.32,0,0.316393
2017 Democratic Republic of the Congo Ebola virus outbreak,en54039081,2017 Democratic Republic of the Congo Ebola vi...,0.016393,0.014778,0.02,0,0.016393
ABC News (Australia),en4797328,ABC News (Australia),0.004918,0.003284,0.00,0,0.004918
ACE inhibitor,en2767,ACE inhibitor,0.283607,0.282430,0.28,1,0.283607
AIDS,en36056314,AIDS,0.029508,0.027915,0.03,0,0.029508
...,...,...,...,...,...,...,...
Zika virus vaccine,en53907564,Zika virus vaccine,0.157377,0.155993,0.16,0,0.157377
Zoster vaccine,en8125462,Zoster vaccine,0.162295,0.160920,0.16,0,0.162295
Vaccine,en32653,Vaccine,1.000000,1.000000,1.00,0,1.000000
,,,0.001639,0.000000,0.00,0,0.001639


In [68]:
int(stats_nodes.loc['ACE inhibitor']['community'])

1