In [20]:
# Setup packages
import os
from itertools import chain
from collections import Counter
import collections 
import networkx as nx
import requests
import json

# all these network algorithms are currently being tried out. 
from networkx.algorithms import approximation
from networkx.algorithms import community
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms.community import greedy_modularity_communities

# For network visualization: 
from pyvis.network import Network
# TODO: consider dynetx for dynamic network visualization once revision history can be downloaded and incorporated.

# Location of the JSON file that caches everything downloaded from Wikipedia.
# NOTE: the rest of this notebook builds the file name as PATH + DATA_FILE (plain
# string concatenation), so PATH must keep its trailing "/".
PATH = "/home/teijehidde/Documents/Git Blog and Coding/Comparing Wikipedia Knowledge Networks (Network Analysis Page links)/Code/"
DATA_FILE = "DATA.json" 

# Loading JSON file. On the very first run no data file exists yet; start with an
# empty data set instead of failing, so the download functions can create the file.
if os.path.exists(PATH + DATA_FILE):
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
else:
    network_data = {}

In [21]:
# Function: download data on links, background data and available language for ONE node: 'node_title' using Wikimedia API and save to json file. 
def downloadNode(node_title, language = "en"): 
    """Download ONE Wikipedia page (its links, page info and language links) and save it.

    The page is merged into the module-level ``network_data`` dictionary under the
    key ``language + page_id`` and the whole dictionary is then written to
    ``PATH + DATA_FILE``.

    Parameters
    ----------
    node_title : str
        Title of the Wikipedia page to download. It is also recorded in the
        node's ``'ego'`` list.
    language : str, optional
        Language code used as the Wikipedia sub-domain (default ``"en"``).

    Returns
    -------
    None
        Progress and errors are reported with ``print``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # setup and load existing data node.
    API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    PARAMS = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }

    # step 1: download data on the node of the network.
    # Every response that is not the last one carries a 'continue' dictionary. All of
    # its keys (plcontinue, llcontinue, continue, ...) have to be sent back together
    # with the ORIGINAL parameters; building a fresh dict each time guarantees that
    # no token of an earlier batch lingers in the request.
    wiki_data = []
    continuation = {}
    with requests.Session() as S:
        while True:
            response = S.get(url=API_ENDPOINT, params={**PARAMS, **continuation})
            response.raise_for_status()
            wiki_data.append(response.json())
            if 'continue' not in wiki_data[-1]:
                break
            continuation = wiki_data[-1]['continue']

    # step 2: transforming data from API into unified dictionary -- in this case this should just be 1 node.
    fresh_langlinks = {}  # item_name -> language links gathered during THIS download
    for item in wiki_data:
        # A batch may come without 'query'/'pages'; treat it as empty.
        for node, page in item.get('query', {}).get('pages', {}).items():
            # Pages that do not exist are flagged by the API and have no usable data.
            if 'missing' in page or 'invalid' in page:
                print("Wiki node name: " + node_title + " was not found in language: " + language + ".")
                continue

            item_name = language + node
            node_data = network_data.setdefault(
                item_name,
                {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': language, 'AvailableLanguages': []}
            )
            # Nodes first stored by downloadNetwork() have no 'AvailableLanguages' key yet.
            node_data.setdefault('AvailableLanguages', [])
            node_data['title'] = page['title']

            # Merge links without duplicating the ones already stored (e.g. on re-download).
            known_links = set(node_data['links'])
            for link in page.get('links', []):
                if link['title'] not in known_links:
                    known_links.add(link['title'])
                    node_data['links'].append(link['title'])

            # Language links can be spread over several batches: collect, do not overwrite.
            fresh_langlinks.setdefault(item_name, []).extend(page.get('langlinks', []))

            if node_title not in node_data['ego']:
                node_data['ego'].append(node_title)

    for item_name, langlinks in fresh_langlinks.items():
        if langlinks:
            network_data[item_name]['AvailableLanguages'] = langlinks

    # Step 3: Saving data to json file.
    try: 
        # Serialise first: if this fails the existing data file is not truncated.
        payload = json.dumps(network_data)
        with open(PATH + DATA_FILE, 'w') as outfile:
            outfile.write(payload)
        print("Data succesfully saved. Wiki node name: " + node_title + "; downloaded in language: " + language + ".")

    except (OSError, TypeError, ValueError) as error: 
        print("Something went wrong. Check code. (" + str(error) + ")")

# optional, for debugging: 
#   finally:
#       return wiki_data


In [30]:
# Function: download data on links from ALL PAGES linked to 'node_title' (but excluding node_title itself!) from Wikimedia API and save to json file.  
def downloadNetwork(node_title, language = "en"): 
    """Download all pages linked from ``node_title`` (not the page itself) and save them.

    Uses the ``links`` generator of the MediaWiki API, so every page linked from
    ``node_title`` is returned together with its own links. Each page is merged
    into the module-level ``network_data`` dictionary under the key
    ``language + page_id`` and the whole dictionary is then written to
    ``PATH + DATA_FILE``.

    Parameters
    ----------
    node_title : str
        Title of the central (ego) page. It is recorded in the ``'ego'`` list of
        every downloaded node.
    language : str, optional
        Language code used as the Wikipedia sub-domain (default ``"en"``).

    Returns
    -------
    None
        Progress and errors are reported with ``print``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    # setup and load existing data node.
    API_ENDPOINT = "https://" + language + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org

    # step 1: use generator to download data on all additional nodes. 
    # setup API query for first generator API call (used to download data on all pages that are linked to node_title) 
    print("Downloading Wiki network name: " + node_title + " in language: " + language + ". Please note that this can take a while.")
    PARAMS = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }

    # Continue API call until all data on network is downloaded.
    # The 'continue' dictionary of a response alternates between plcontinue and
    # gplcontinue. It must be sent back as a whole on top of the ORIGINAL
    # parameters; a fresh dict per request prevents a plcontinue token of an
    # earlier batch from being re-sent once the generator has moved on.
    wiki_data = []
    continuation = {}
    with requests.Session() as S:
        while True:
            response = S.get(url=API_ENDPOINT, params={**PARAMS, **continuation})
            response.raise_for_status()
            wiki_data.append(response.json())
            if 'continue' not in wiki_data[-1]:
                break
            continuation = wiki_data[-1]['continue']

    # step 2: transforming data from API into unified dictionary. 
    for item in wiki_data:
        # A batch may come without 'query'/'pages'; treat it as empty.
        for node, page in item.get('query', {}).get('pages', {}).items():
            # Links to pages that do not exist are flagged by the API and numbered with
            # placeholder ids (-1, -2, ...) that repeat between batches. Storing them
            # would merge unrelated pages under one key, and they carry no links anyway.
            if 'missing' in page or 'invalid' in page:
                continue

            node_data = network_data.setdefault(
                language + node,
                {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': language}
            )
            node_data['title'] = page['title']

            # Merge links without duplicating the ones already stored (e.g. on re-download).
            known_links = set(node_data['links'])
            for link in page.get('links', []):
                if link['title'] not in known_links:
                    known_links.add(link['title'])
                    node_data['links'].append(link['title'])

            if node_title not in node_data['ego']:
                node_data['ego'].append(node_title)

    # Step 3: Saving data to json file. 
    try: 
        # Serialise first: if this fails the existing data file is not truncated.
        payload = json.dumps(network_data)
        with open(PATH + DATA_FILE, 'w') as outfile:
            outfile.write(payload)
        print("Data succesfully saved. Wiki network name: " + node_title + "; downloaded in language: " + language + ".")

    except (OSError, TypeError, ValueError) as error: 
        print("Something went wrong. Check code. (" + str(error) + ")")

# optional, for debugging: 
#   finally:
#       return wiki_data


In [23]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_language = "en", requested_languages = None): # "de", "fr", "nl"
    """List or download other language versions of an already downloaded ego network.

    Parameters
    ----------
    node_title : str
        Title of the ego page as stored for ``original_language``.
    original_language : str, optional
        Language in which ``node_title`` was downloaded (default ``"en"``).
    requested_languages : list of str, optional
        Language codes to download. When empty or omitted, the function only
        prints the languages in which the page is available.

    Returns
    -------
    None
        Results are reported with ``print``; downloads are done by
        ``downloadNode()`` and ``downloadNetwork()``.
    """
    # load data from JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        saved_data = json.load(json_file)

    # find the stored node for the requested page.
    matching_nodes = [
        v for v in saved_data.values()
        if v['title'] == node_title and v['language'] == original_language
    ]
    if not matching_nodes:
        print("Node not available. Download using downloadNode() function.")
        return

    # make a list of the languages that are available for the requested page.
    # Nodes stored by downloadNetwork() only have no 'AvailableLanguages' entry.
    available_languages = matching_nodes[0].get('AvailableLanguages', [])
    list_available_languages = [item['lang'] for item in available_languages]

    # If no languages are requested, the function shows available languages. 
    if not requested_languages:
        print('The wikipedia page is available in the following languages:')         
        print(list_available_languages)
        return

    # Tell the user about requested languages that cannot be served.
    unavailable = [lang for lang in requested_languages if lang not in list_available_languages]
    if unavailable:
        print('The wikipedia page is not available in the following requested languages:')
        print(unavailable)

    # Goes through available languages of a wikipedia page, and downloads those that were requested.
    for item in available_languages: 
        if item['lang'] in requested_languages: 
            # item['*'] is the title of the page in that language. The page itself is
            # downloaded too: downloadNetwork() only stores the pages it links to, and
            # WikiNetwork needs the central node to be present in the data.
            downloadNode(node_title = item['*'], language = item['lang'])
            downloadNetwork(node_title = item['*'], language = item['lang'])

    print("Download of additional languages finished.") 


In [24]:
# function: provide titles of networks that are saved in the JSON file. Also provides the language they were saved in. 
def printDownloadedNetworks(): 
    """Print the ego networks stored in the JSON file and the languages they were saved in.

    The overview is a dictionary ``{ego_title: [language, ...]}``. A language is
    listed for an ego title when at least one stored node of that language names
    the title in its ``'ego'`` list. Matching on ego membership (rather than on
    the node's own title) avoids listing a language merely because a page with
    the same title happens to exist in another language's data, and also covers
    networks whose central page was not stored itself.

    Returns
    -------
    None
        The overview is printed.
    """
    # load data from JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        saved_data = json.load(json_file)

    # single pass over all nodes: collect, per ego network name, the languages seen.
    overview_downloaded_networks = {}
    for node in saved_data.values():
        for title in node.get('ego', []):
            langs = overview_downloaded_networks.setdefault(title, [])
            if node['language'] not in langs:
                langs.append(node['language'])

    print(overview_downloaded_networks)

In [25]:
# Initiate class Node. 
class WikiNode:
    """A single Wikipedia page taken from the module-level ``network_data`` dictionary.

    Attributes
    ----------
    node_title : str
        Title of the page.
    node_ID : str
        Value stored under ``'node_ID'`` for the page.
    node_links : list
        Titles stored under ``'links'`` for the page.
    node_language : str
        Language code stored under ``'language'`` for the page.
    """

    def __init__(self, node_title, language):
        """Look up the node by title and language and copy its data onto the instance.

        Raises
        ------
        IndexError
            If ``network_data`` holds no node with this title in this language.
        """
        # Select node in the loaded data (by title and language); stop at the first match.
        node_data = next(
            (v for v in network_data.values()
             if v['title'] == node_title and v['language'] == language),
            None
        )
        if node_data is None:
            # IndexError is kept as the exception type so existing handlers keep working.
            raise IndexError(
                "No node with title '" + str(node_title) + "' in language '" + str(language) + "' in network_data."
            )

        # Extract data and place in instance of WikiNode class. 
        self.node_title = node_data['title']
        self.node_ID = node_data['node_ID']
        self.node_links = node_data['links']
        self.node_language = node_data['language']


In [26]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
    """Ego network around one Wikipedia page, built from the module-level ``network_data``.

    The network consists of the pages linked from the central page. An edge
    ``(a, b)`` is stored when page ``a`` (linked from the central page) links to
    page ``b`` and ``b`` is itself linked from the central page.

    Attributes
    ----------
    network_nodes : list
        Target titles of all stored edges (one entry per edge, so with repeats).
    network_edges : list of tuple
        ``(source_title, target_title)`` pairs.
    network_status : list
        Initialised empty; not filled by this class.
    nodes_count : collections.Counter
        How often each title occurs in ``network_nodes``.
    """

    def __init__(self,node_title,language):
        """Load the central node and build the edges between the pages it links to.

        When ``node_title`` is not the name of a downloaded network a message is
        printed and the network is left empty.
        """
        # Always start from an empty network, so the accessor methods also work when
        # the network is unavailable or the central node has no links.
        self.network_nodes = []
        self.network_edges = [] 
        self.network_status = []
        self.nodes_count = Counter()

        # make set of available ego networks
        available_networks = set(chain.from_iterable(v['ego'] for v in network_data.values()))

        if node_title not in available_networks:
            print("Node not available. Download using downloadNetwork() function.") 
            return

        # initiate the central node of the network as class WikiNode
        WikiNode.__init__(self, node_title, language)

        # Go through node_links of the central node (node_title) to build network.
        ego_links = set(self.node_links)  # set: constant-time membership tests
        for link in self.node_links:
            try:     
                Node2 = WikiNode(link, language)                
            except LookupError: 
                # Linked page is not in the data (IndexError) or its record is
                # incomplete (KeyError): report it and carry on with the rest.
                print('Loading of node ' + link + ' failed.')
                continue
            purged_nodes = [x for x in Node2.node_links if x in ego_links]
            self.network_nodes.extend(purged_nodes)
            self.network_edges.extend((link, purged_node) for purged_node in purged_nodes)

        # Count once, after all nodes have been processed.
        self.nodes_count = Counter(self.network_nodes)
        print("Data Succesfully loaded.")

    def _selectedNodes(self, threshold):
        """Return the set of titles whose count in ``nodes_count`` is at least ``threshold``."""
        return {k for k, v in self.nodes_count.items() if float(v) >= threshold}

    def getNodes(self,threshold=0):
        """Return ``(title, {"name": title})`` tuples for nodes counted at least ``threshold`` times.

        Only titles that are the target of at least one edge appear in
        ``nodes_count``, so other pages are not returned even with ``threshold=0``.
        """
        return [(k, {"name": k}) for k, v in self.nodes_count.items() if float(v) >= threshold]

    def getEdges(self,threshold=0):
        """Return the edges whose two end points are both counted at least ``threshold`` times."""
        selected_nodes = self._selectedNodes(threshold)
        return [(a, b) for a, b in self.network_edges if a in selected_nodes and b in selected_nodes]

    def getStatsNetwork(self): 
        """Placeholder: currently only prints 'WIP'."""
        # TODO: return a dictionary with stats on network: 
        # - triangles
        # - degree_centrality 
        # - ... 

        print('WIP')

    def getStatsNodes(self, nodes):
        """Placeholder: currently only prints 'WIP'; ``nodes`` is not used yet."""
        # TODO: return a numpy array with stats per node: 
        # - triangles
        # - degree_centrality 
        # - ... 
        # if nodes == None: 
          #  node = self.node_links

        print('WIP')
    
    def getNetworkCommunities(self,threshold=0):
        """Return the result of ``greedy_modularity_communities`` on the thresholded edge graph."""
        # TODO: return a numpy array with stats per community. Add in an overall library.  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(threshold))
        
        return greedy_modularity_communities(G)
    
    def drawGraph(self,threshold=0,name='no_name'):
        """Draw the thresholded edge graph with pyvis and show it as ``name + ".html"``."""
        G = nx.Graph()
        G.add_edges_from(self.getEdges(threshold))

        netdraw = Network('2000px', '2000px')
        netdraw.from_nx(G)
        netdraw.barnes_hut()

        title = name + ".html"
        netdraw.show(title)


In [27]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [28]:
# Download the 'Royston' page itself (language defaults to "en") and save it to the JSON data file.
downloadNode('Royston')

Data succesfully saved. Wiki Node name: Royston; downloaded in language: en.


In [31]:
# Download the pages linked from 'Royston' (language defaults to "en") and save them to the JSON data file.
downloadNetwork('Royston')

Downloading Wiki network name: Roystonin language: en. This can take a while.
Data succesfully saved. Wiki network name: Royston; downloaded in language: en.


In [8]:
# No languages requested: this only prints the languages in which the 'Flask' page is available.
downloadAdditionalLanguage(node_title = 'Flask')

The wikipedia page is available in the following languages:
['ar', 'ceb', 'de', 'es', 'eu', 'fr', 'it', 'ko', 'pl', 'ru', 'sk']


In [9]:
# Download the German ("de") version of the 'Flask' network, using the page title stored for that language.
downloadAdditionalLanguage(node_title = 'Flask', original_language = "en", requested_languages = ["de"])  

Data succesfully saved. Wiki network name: Kolba; downloaded in language: de.
Download of additional languages finished.


In [7]:
# Print an overview {network name: [languages]} of what is stored in the JSON data file.
printDownloadedNetworks()

{'إرهاب': ['ar'], 'علمانية': ['ar'], 'Laïcité': ['fr'], 'دين (معتقد)': ['ar'], 'Terrorism': ['en'], 'Religion': ['de', 'fr', 'en'], 'Secularisme': ['nl'], 'England': ['en'], 'Vaccin': ['fr'], 'Säkularismus': ['de'], 'إيثريوم': ['ar'], 'Terrorisme': ['fr'], 'Religie': ['nl'], 'Ethereum': ['en', 'de', 'fr', 'ru', 'tr'], 'Aşı (tıp)': ['tr'], 'لقاح': ['ar'], 'Вакцина': ['ru'], 'Secularism': ['en'], 'Vaccine': ['en', 'fr'], 'Terrorismus': ['de']}


In [8]:
# Build the French ego network around 'Laïcité'; linked pages that cannot be loaded are reported and skipped.
network = WikiNetwork('Laïcité', language='fr')

Loading of node Church of Scotland Act 1921 failed.
Loading of node Critique de la Scientologie failed.
Loading of node Gennade II Scholarius failed.
Loading of node Iamdudum failed.
Loading of node Leyla Sahin contre Turquie failed.
Loading of node Parlement brésilien failed.
Loading of node UVV failed.
Loading of node Église de Google failed.
Data Succesfully loaded.


In [9]:
# Draw the network with threshold 1; drawGraph() appends ".html" to the given name.
network.drawGraph(1,name="Network_Graph")

In [17]:
# Build an undirected graph from the edges that pass threshold 1.
# G is reused by the degree-centrality cell below.
G = nx.Graph()
G.add_edges_from(network.getEdges(threshold = 1))

# Print the triangle count computed by networkx for every node of G.
print(nx.triangles(G))




{"'Pataphysique": 3248, '1948': 1150, 'International Standard Book Number': 7928, 'Librairie Arthème Fayard': 392, 'Online Computer Library Center': 733, 'Paronymie': 2, 'Presses universitaires de France': 499, '10 novembre': 2158, '11 avril': 1755, '12 décembre': 1813, '16 janvier': 1968, '1830': 586, '1859': 1105, '1863': 936, '1870': 1246, '1871': 1253, '1901': 445, '1917': 1014, '1918': 1258, '1919': 1073, '1921': 1055, '1922': 1124, '1923': 1084, '1924': 890, '1925': 831, '1926': 848, '1927': 782, '1928': 688, '1929': 747, '1937': 961, '1938': 978, '1939': 1192, '1940': 938, '1946': 1238, '1947': 1270, '1950': 930, '1953': 675, '1956': 920, '1971': 570, '1976': 536, '1979': 750, '1983': 685, '1984': 672, '1985': 728, '1989': 1148, '1990': 928, '1991': 1002, '1997': 910, '1999': 1186, '1er décembre': 2448, '1er novembre': 2173, '2002': 793, '2003': 723, '2004': 883, '2005': 1097, '2009': 1138, '2010': 966, '2012': 807, '2015': 1029, '2016': 874, '2017': 822, '20 avril': 1706, '20 j

In [20]:
# Degree centrality per node of G (built in the previous cell); shown as the cell's output.
nx.degree_centrality(G)

.023564064801178203,
 'Jean-Paul Sartre': 0.27245949926362295,
 "Loi fondamentale de la République fédérale d'Allemagne": 0.025036818851251842,
 'Nouvelle-Calédonie': 0.1251840942562592,
 'Éditions Ellipses': 0.011782032400589101,
 "Constitution de l'Inde": 0.020618556701030927,
 'Tribunal constitutionnel fédéral': 0.0029455081001472753,
 'Nicolae Ceaușescu': 0.048600883652430045,
 'Sōka Gakkai': 0.007363770250368188,
 'Jean-Paul II': 0.10751104565537555,
 'Novembre 1990': 0.030927835051546393,
 'Charia': 0.07511045655375552,
 'Bharatiya Janata Party': 0.041237113402061855,
 'Révolution roumaine de 1989': 0.010309278350515464,
 'Union européenne': 0.0898379970544919,
 'Homosexualité': 0.11929307805596465,
 'Canton de Genève': 0.027982326951399114,
 'Canton de Neuchâtel': 0.020618556701030927,
 'Canton de Vaud': 0.03829160530191458,
 "Traité d'Amsterdam": 0.005891016200294551,
 'Dollar américain': 0.060382916053019146,
 'Février 2002': 0.029455081001472753,
 'Janvier 1979': 0.0279823269