In [1]:
# Setup packages (might be able to remove a few of these.)
# packages needed for downloading, saving and loading data 
import os
from collections import Counter
import collections
from dash_bootstrap_components._components.Col import Col 
import json
from networkx.algorithms.traversal.depth_first_search import dfs_labeled_edges
import pandas as pd
import numpy as np
from sklearn import preprocessing

# packages for creation classes and network analysis 
import networkx as nx
from itertools import chain
# import communities
# from networkx.algorithms import approximation
# from networkx.algorithms import community
from networkx.algorithms.community import greedy_modularity_communities
from networkx.utils import not_implemented_for 
__all__ = [
    "eccentricity",
    "diameter",
    "radius",
    "periphery",
    "center",
    "barycenter",
    "degree_centrality",
    "constraint", 
    "local_constraint", 
    "effective_size"
]

# Dash packages for presentation analysis  
import dash
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
import dash_cytoscape as cyto
import dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import plotly.express as px

# setup layout and paths
path = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
data_file = "data_new2.json" 
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP])
styles = {
    'pre': {
        'border': 'thin lightgrey solid',
        'overflowX': 'scroll'
    }
}

In [2]:
def degreeCentrality(G):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/centrality/degree_alg.html#degree_centrality
    
    if len(G) <= 1:
        return {n: 1 for n in G}

    s = 1.0 / (len(G) - 1.0)
    centrality = {n: d * s for n, d in G.degree()}
    return centrality

def eccentricity(G, v=None, sp=None):
# copy-pasted from: https://networkx.org/documentation/stable/_modules/networkx/algorithms/distance_measures.html#eccentricity

    order = G.order()

    e = {}
    for n in G.nbunch_iter(v):
        if sp is None:
            length = nx.single_source_shortest_path_length(G, n)
            L = len(length)
        else:
            try:
                length = sp[n]
                L = len(length)
            except TypeError as e:
                raise nx.NetworkXError('Format of "sp" is invalid.') from e
        if L != order:
            if G.is_directed():
                msg = (
                    "Found infinite path length because the digraph is not"
                    " strongly connected"
                )
            else:
                msg = "Found infinite path length because the graph is not" " connected"
            raise nx.NetworkXError(msg)

        e[n] = max(length.values())

    if v in G:
        return e[v]  # return single value
    else:
        return e


In [46]:
def normalizing(val, max, min):
    return (val - min) / (max - min); 

In [3]:
# Initiate class Node. 
class WikiNode:
    def __init__(self, node_title, lang, network_data):

        node_data = network_data.loc[network_data['title'] == node_title].loc[network_data['lang'] == lang]
        
        self.node_title = node_data[['title']].iloc[0,0] # iloc[0,0] needed because there can be two instance of same wikipage in the dataframe: one as centralnode (with langlinks) and one as a normal node of other network (without langlinks).  
        self.node_ID = node_data[['uniqueid']].iloc[0,0]
        self.node_links = node_data[['links']].iloc[0,0]
        self.node_lang = node_data[['lang']].iloc[0,0]

In [101]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
   
    def __init__(self,node_title, lang, threshold = 0):
        
        saved_network_data = pd.read_json((path + data_file), orient='split')
        
        # initiate the central node of the network as class WikiNode, add additional attributes for class WikiNetwork 
        WikiNode.__init__(self, node_title, lang, network_data = saved_network_data)
        self.threshold = threshold
        self.network_nodes = {}
        self.network_links = []
        self.network_edges = [] 
        self.network_status = []
        
        # Go through node_links of the central node (node_title) to build network.
        
        for link in self.node_links + [self.node_title]:
            try: 
                Node2 = WikiNode(link, lang, network_data = saved_network_data) # NB: the links are not always in the same language as the network. It throws an error as result. - for now it just skips. 
                purged_links = [x for x in Node2.node_links if x in self.node_links]
                purged_edges = []
                for purged_link in purged_links:
                    purged_edges.append((link,purged_link))  
                self.network_nodes[Node2.node_title] = Node2
                self.network_links = self.network_links + purged_links
                self.network_edges = self.network_edges + purged_edges
            except: 
                pass
        self.links_count = Counter(self.network_links)

    def getNodes(self, type="cytoscape"):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= self.threshold]
        
        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]
        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]

    def getEdges(self,type="cytoscape"):  
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= self.threshold]
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]
        
        if type == 'networkx':
            return edges_network
        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a,b in edges_network]

    def getCommunities(self):  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))
        communities = greedy_modularity_communities(G)

        result = []
        for number in range(len(communities)): 
            result = result + [{i: number} for i in list(communities[number])] 

        return result

    def getStatsNodes(self, integrate = False):
        # I think this will work much better with pandas... For next sprint. (see: https://stackoverflow.com/questions/46711557/calculating-min-and-max-over-a-list-of-dictionaries-for-normalizing-dictionary-v)
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx'))

        communities = greedy_modularity_communities(G)
        degree_centrality_nodes = degreeCentrality(G)
        eccentricity_nodes = eccentricity(G)

        dict_communities = {key:value for value in range(len(communities)) for key in communities[value] }
        # for number in range(len(communities)):
        #    for node in communities[number]: 
        #        key, value = node, number
        #        dict_communities[key] = value 

        normalized_centrality_nodes = {}
        centrality_max = max(degree_centrality_nodes.values())
        centrality_min = min(degree_centrality_nodes.values())
        for node in degree_centrality_nodes.keys():
            normalized_centrality_nodes[node] = normalizing(val= degree_centrality_nodes[node], max= centrality_max, min = centrality_min)        

        if integrate == True: 
            for node in self.network_nodes.keys():
                try: 
                    self.network_nodes[node].node_centrality = degree_centrality_nodes[node]
                    self.network_nodes[node].normalized_centrality = normalized_centrality_nodes[node]
                    self.network_nodes[node].eccentricity = eccentricity_nodes[node]
                    self.network_nodes[node].community = dict_communities[node]
                except: 
                    pass
                    # print(node + ' failed.')
            print ('Integration of Stats to nodes done.')

        else:
            return pd.DataFrame({'degree_centrality':pd.Series(degree_centrality_nodes), 'normalized_centrality':pd.Series(normalized_centrality_nodes), 'eccentricity':pd.Series(eccentricity_nodes), 'community':pd.Series(dict_communities)}) 


In [102]:
test = WikiNetwork('Vaccine', lang = 'en')

In [103]:
stats_nodes = test.getStatsNodes(integrate = False)
    
#    

TypeError: unhashable type: 'slice'

In [98]:
name = stats_nodes.index[0]
stats_nodes.loc[name]

degree_centrality        0.328767
normalized_centrality    0.545714
eccentricity             3.000000
community                0.000000
Name: 2009 swine flu pandemic vaccine, dtype: float64

In [99]:
list_selectors = ['[label = "{}"]'.format(i) for i in stats_nodes.index]

In [100]:
list_selectors

['[label = "2009 swine flu pandemic vaccine"]',
 '[label = "2017 Democratic Republic of the Congo Ebola virus outbreak"]',
 '[label = "ABC News (Australia)"]',
 '[label = "ACE inhibitor"]',
 '[label = "AIDS"]',
 '[label = "ALVAC-CEA vaccine"]',
 '[label = "ATC code A"]',
 '[label = "ATC code B"]',
 '[label = "ATC code C"]',
 '[label = "ATC code D"]',
 '[label = "ATC code D09"]',
 '[label = "ATC code G"]',
 '[label = "ATC code H"]',
 '[label = "ATC code J"]',
 '[label = "ATC code J07"]',
 '[label = "ATC code L"]',
 '[label = "ATC code M"]',
 '[label = "ATC code N"]',
 '[label = "ATC code P"]',
 '[label = "ATC code R"]',
 '[label = "ATC code S"]',
 '[label = "ATC code V"]',
 '[label = "ATCvet code QI"]',
 '[label = "Acellular"]',
 '[label = "Addiction medicine"]',
 '[label = "Adenoviridae"]',
 '[label = "Adenovirus vaccine"]',
 '[label = "Adhesive bandage"]',
 '[label = "Adjuvant"]',
 '[label = "Advisory Committee on Immunization Practices"]',
 '[label = "Afghanistan"]',
 '[label = "Alfa

In [90]:
list_styles = []
list_colours = ['red', 'blue', 'purple','orange','green','olive', 'maroon', 'brown','lime','teal' ]

for node in stats_nodes.index:
    list_styles.append({'background-color': list_colours[selected_community], 
                        'background-opacity': stats_nodes[node]['centrality_normed'] + .2, 
                        'shape': 'ellipse',
                        'width': (stats_nodes[node]['centrality_normed'] * 5) + 1, 
                        'height': (stats_nodes[node]['centrality_normed'] * 5) + 1,
                        }) 

['2009 swine flu pandemic vaccine',
 '2017 Democratic Republic of the Congo Ebola virus outbreak',
 'ABC News (Australia)',
 'ACE inhibitor',
 'AIDS',
 'ALVAC-CEA vaccine',
 'ATC code A',
 'ATC code B',
 'ATC code C',
 'ATC code D',
 'ATC code D09',
 'ATC code G',
 'ATC code H',
 'ATC code J',
 'ATC code J07',
 'ATC code L',
 'ATC code M',
 'ATC code N',
 'ATC code P',
 'ATC code R',
 'ATC code S',
 'ATC code V',
 'ATCvet code QI',
 'Acellular',
 'Addiction medicine',
 'Adenoviridae',
 'Adenovirus vaccine',
 'Adhesive bandage',
 'Adjuvant',
 'Advisory Committee on Immunization Practices',
 'Afghanistan',
 'Alfalfa',
 'Alice Miles Woodruff',
 'Alkylating antineoplastic agent',
 'Allergy',
 'Alternative vaccination schedule',
 'Alum',
 'Aluminum',
 'Anabolic steroid',
 'Analgesic',
 'Anatomical Therapeutic Chemical Classification System',
 'Androvax',
 'Anesthetic',
 'Angiotensin II receptor antagonist',
 'Anorectic',
 'Antacid',
 'Anthelmintic',
 'Anthony Fauci',
 'Anthrax',
 'Anthrax v

In [95]:
pd_network = pd.DataFrame([{'page_ID': v.node_ID, 'title': v.node_title, 'community': 'TODO', 'centrality': 'TODO'} for v in wiki_page.network_nodes.values()])

In [96]:
titles_pages = [v.node_title for v in wiki_page.network_nodes.values()]
list_communities = wiki_page.getCommunities()

In [97]:
pd_network

Unnamed: 0,page_ID,title,community,centrality
0,en23224587,2009 swine flu pandemic vaccine,TODO,TODO
1,en54039081,2017 Democratic Republic of the Congo Ebola vi...,TODO,TODO
2,en4797328,ABC News (Australia),TODO,TODO
3,en2767,ACE inhibitor,TODO,TODO
4,en36056314,AIDS,TODO,TODO
...,...,...,...,...
603,en34367,Yersinia pestis,TODO,TODO
604,en24831215,Zika virus,TODO,TODO
605,en53907564,Zika virus vaccine,TODO,TODO
606,en8125462,Zoster vaccine,TODO,TODO


In [98]:
for title in titles_pages:
    index_title = pd_network.loc[pd_network['title'] == title].index
    try: 
        pd_network.at[index_title, 'community'] = str([i for i in range(len(list_communities)) if title in list_communities[i]])
    except: 
        pd_network.at[index_title, 'community'] = '[nan]'
    
    

In [99]:
pd_network

Unnamed: 0,page_ID,title,community,centrality
0,en23224587,2009 swine flu pandemic vaccine,[0],TODO
1,en54039081,2017 Democratic Republic of the Congo Ebola vi...,[0],TODO
2,en4797328,ABC News (Australia),[3],TODO
3,en2767,ACE inhibitor,[1],TODO
4,en36056314,AIDS,[0],TODO
...,...,...,...,...
603,en34367,Yersinia pestis,[0],TODO
604,en24831215,Zika virus,[0],TODO
605,en53907564,Zika virus vaccine,[0],TODO
606,en8125462,Zoster vaccine,[0],TODO


In [111]:
node_community = '[] str(pd_network.loc[pd_network['title'] == 'ACE inhibitor']['community'])[0]

In [112]:
node_community

'3'

In [102]:
subset = pd_network.loc[pd_network['community'] == node_community]

ValueError: Can only compare identically-labeled Series objects

In [215]:
network_nodes = {}
network_links = []
lang = 'en'
saved_network_data = pd.read_json((path + data_file), orient='split')

for link in test.node_links:
    node2 = WikiNode(node_title = link, lang = lang, network_data = saved_network_data) # NB: the links are not always in the same language as the network. - for now it just skips. 
    # try: 
    network_links = network_links + node2.node_links  #  [x for x in node2.node_links if x in test.node_links]
    # except: 
        
    # purged_edges = []
    # for purged_link in purged_links:
    #    purged_edges.append((link,purged_link))  
    network_nodes[node2.node_ID] = node2

TypeError: can only concatenate list (not "NoneType") to list

In [208]:
test2 = WikiNode('Cushion plant', 'en',  network_data = saved_network_data)

In [209]:
test2.node_links

['Alpine climate',
 'Apiaceae',
 'Apical dominance',
 'Arctic climate',
 'Arid',
 'Asteraceae',
 'Azorella compacta',
 'Caryophyllaceae',
 'Climax community',
 'Convergent evolution',
 'Donatia novae-zelandiae',
 'Donatiaceae',
 'Ecosystem engineer',
 'Endemism',
 'Epidermis (botany)',
 'Family (biology)',
 'Feldmark',
 'Fen',
 'Flower',
 'ISBN (identifier)',
 'Keystone species',
 'Moss campion',
 'Mulinum leptacanthum',
 'Myosotis alpestris',
 'New Zealand',
 'Nutrient',
 'Oreopolus glacialis',
 'Parallel evolution',
 'Peat',
 'Peru',
 'Photosynthesis',
 'Plant',
 'Pollinator',
 'Primary succession',
 'Rosette (botany)',
 'Senescence',
 'Silene acaulis',
 'Soil',
 'Species richness',
 'Stylidiaceae',
 'Subalpine',
 'Subantarctic',
 'Subarctic climate',
 'Svalbard',
 'Tap root',
 'Taproot',
 'Tasmania',
 'Tasmanian cushion plants',
 'Tierra del Fuego',
 'Transpiration',
 'Wood']

In [212]:
saved_network_data.iloc[1300:1410,]

Unnamed: 0,title,lang,pageid,uniqueid,lastrevid,links,langlinks
1300,Scottish Parliament,en,61188,en61188,1036072476,"[Solicitor General for Scotland, South Scotlan...",
1301,Valentyna Shevchenko (politician),en,2223869,en2223869,1001782074,,
1302,"List of members of the parliament of Ukraine, ...",en,67510936,en67510936,1020082802,"[List of members of the parliament of Ukraine,...",
1303,Dáil Éireann (Irish Republic),en,765649,en765649,1034555395,"[Grand and General Council, Head of state, Hea...",
1304,Osnova (political party),en,59190554,en59190554,999931294,"[2019 Ukrainian parliamentary election, 2019 U...",
...,...,...,...,...,...,...,...
1405,Parliament of Sierra Leone,en,3654549,en3654549,1000330561,"[2018 Sierra Leonean general election, Abass B...",
1406,Single-member constituency,en,5046791,en5046791,441852629,[Single-member district],
1407,Kyivan Rus,en,601925,en601925,103240156,[Kievan Rus'],
1408,National Assembly (Botswana),en,3458580,en3458580,1035097650,"[1st Parliament of Botswana, 2009 Botswana gen...",


In [194]:
node2.node_links

In [140]:
G = nx.Graph()
G.add_edges_from(test.getEdges(type = 'networkx', threshold=0))

In [141]:
degree_centrality_nodes = degreeCentrality(G)
eccentricity_nodes = eccentricity(G) 

In [142]:
degree_centrality_nodes

{}

In [11]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [10]:
all_networks = getDownloadedNetworks()

In [19]:
# test download. 
test = downloadNetworks(node_title="Cambridge", original_lang="fr")

Downloading Wiki network name: Cambridge in language: fr. Please note that this can take a while.
Data succesfully saved. Wiki node name: Cambridge; downloaded in language: fr.
Downloading Wiki network name: Cambridge (Verenigd Koninkrijk) in language: nl. Please note that this can take a while.
Data succesfully saved. Wiki node name: Cambridge (Verenigd Koninkrijk); downloaded in language: nl.
Download of additional languages finished.
Download of additional languages finished.


In [130]:
# turning wikipedia API output into a list of panda dataframes 
df = [pd.DataFrame.from_dict(item['query']['pages']).transpose() for item in test]

In [138]:
# merging all these dataframes into one, using 'update' method. 
# note that does not work yet for intiial node, because different data (columns) were downloaded. I need to download langlinks and info for all nodes. Using zip to make files smaller.  
for item in df: df[1].update(item)

In [141]:
# the resulting df is the one on which the 'update' method was run. 
df_result = df[1]

ns                                                         0
title                               Chichester (Royaume-Uni)
missing                                                  NaN
pageid                                                769145
links      [{'ns': 0, 'title': 'Aberdeen'}, {'ns': 0, 'ti...
Name: 769145, dtype: object

In [88]:
result = pd.concat(test3)

In [89]:
result

Unnamed: 0,contentmodel,langlinks,lastrevid,length,links,ns,pageid,pagelanguage,pagelanguagedir,pagelanguagehtmlcode,title,touched,missing
15789,wikitext,"[{'lang': 'af', '*': 'Cambridge'}, {'lang': 'a...",179254172,17453,"[{'ns': 0, 'title': '1025'}, {'ns': 0, 'title'...",0,15789,fr,ltr,fr,Cambridge,2021-08-02T19:50:20Z,
-1,,,,,,0,,,,,Cambridge City F.C.,,
-2,,,,,,0,,,,,"Chesterton, Cambridge",,
-3,,,,,,0,,,,,City Ground (Cambridge),,
-4,,,,,,0,,,,,Gerri Bird,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
40398,,,,,,0,40398,,,,York,,
10300,,,,,,0,10300,,,,Écosse,,
27816,,,,,,0,27816,,,,Édimbourg,,
11495371,,,,,"[{'ns': 0, 'title': 'Accolade (architecture)'}...",0,11495371,,,,Église Saint-Bene't,,


In [118]:
result2 = result.groupby(['pageid','title'])['links'].apply(lambda x: x + x).reset_index()

In [119]:
result2

Unnamed: 0,index,links
0,15789,"[{'ns': 0, 'title': '1025'}, {'ns': 0, 'title'..."
1,15789,
2,15789,
3,15789,
4,15789,
...,...,...
23216,1377936,
23217,1377936,
23218,1377936,
23219,1377936,


In [112]:
result2.iloc[50]['links']

nan

In [100]:
if 'nan' in result2: result2.remove('nan')

In [101]:
result2

Unnamed: 0,pageid,title,links
0,269,Alphabet phonétique international,"[{'ns': 0, 'title': '1886'}, {'ns': 0, 'title'..."
1,1348,Héraldique,"[{'ns': 0, 'title': '1806'}, {'ns': 0, 'title'..."
2,1367,Hongrie,"nan,[{'ns': 0, 'title': '.hu'}, {'ns': 0, 'tit..."
3,1490,Irlande du Nord,"nan,nan,nan,nan,[{'ns': 0, 'title': '12 juille..."
4,1712,Liste des pays du monde,"nan,nan,nan,nan,nan,[{'ns': 0, 'title': 'Abkha..."
...,...,...,...
210,13226163,Sawtry,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
211,13230180,Yaxley (Cambridgeshire),"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
212,13793011,Kettle's Yard,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
213,14131764,Lord Lieutenant du Cambridgeshire,"nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,nan,na..."
