In [1]:
# Setup packages
import os
from itertools import chain
from collections import Counter
import collections 
import networkx as nx
import requests
import json
import pandas as pd
import numpy as np

# all these network algorithms are currently being tried out. 
from networkx.algorithms import approximation
from networkx.algorithms import community
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms.community import greedy_modularity_communities

# For network visualization: 
from pyvis.network import Network
# use dynetx for dynamic network visualization? -- when I can download and incoporate revision history? 

PATH = "/home/teijehidde/Documents/Git Blog and Coding/data_dump/"
DATA_FILE = "DATA.json" 

# Loading JSON file: 
with open(PATH + DATA_FILE) as json_file:
    network_data = json.load(json_file)

In [2]:
def SaveData(wiki_data, node_title, lang):
    # step 1: transforming data from API into unified dictionary. 
    # 1a: creating list of available nodes. 
    all_nodes = []

    for item in wiki_data:
        all_nodes = all_nodes + list(item['query']['pages'].keys())
    all_nodes = list(set(all_nodes))

    # 1b: Using all_nodes to go through raw data from API -- in this case this should just by 1 node. 
    for node in all_nodes:
        node_data = {'node_ID': node, 'title': '', 'links': [], 'ego': [], 'language': lang, 'AvailableLanguages': []}
        
        item_name = lang + node
        if item_name in network_data.keys():
            node_data = network_data[item_name]
        
        for item in wiki_data:
            if node in item['query']['pages'].keys(): 
                node_data['title'] = item['query']['pages'][node]['title']

                if 'links' in item['query']['pages'][node].keys():
                    for link in item['query']['pages'][node]['links']: 
                        node_data['links'].append(link['title'])

                if 'langlinks' in item['query']['pages'][node].keys():
                    node_data['AvailableLanguages'] = item['query']['pages'][node]['langlinks']

                node_data['ego'].append(node_title)
        
        node_data['ego'] = list(set(node_data['ego']))
        
        network_data[lang + node] = node_data

    # Step 2: Saving data to json file. 
    try: 
        with open(PATH + DATA_FILE, 'w') as outfile:
            json.dump(network_data, outfile)
            print("Data succesfully saved. Wiki node name: " + node_title + "; downloaded in language: " + lang + ".")

    except: 
        print("Something went wrong. Check code.")

# optional, for debugging: 
#   finally:
#       return wiki_data


In [3]:
# Function: download additional languages of ego network.
def downloadAdditionalLanguage(node_title, original_lang, additional_langs = []): # "de", "fr", "nl"

    # download data fro JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # make a list of the language that are available for requested page. 
    available_languages = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == original_lang][0]
    list_available_languages = []
    for item in available_languages: 
         list_available_languages.append(item['lang'])

    # If no languages are requested, the function shows available languages. 
    if additional_langs == []:
        pass

    if additional_langs == ["available_langs"]:
        print('The wikipedia page is available in the following languages:')         
        print(list_available_languages)
    
    # Goes through avialble languages of a wikipedia page, and downloads those that were requested (using the downloadNetwork() function). 
    else:
        for item in available_languages: 
            if item['lang'] in additional_langs:
                downloadNetworks(node_title = item['*'], original_lang = item['lang'], additional_langs = [])
    
        print("Download of additional languages finished.") 


In [4]:
# Function: download data on links from ALL PAGES linked to 'node_title' (but excluding node_title itself!) from Wikimedia API and save to json file.  
def downloadNetworks(node_title, original_lang = "en", additional_langs = ["ar" "de", "fr", "nl"]): 

    # setup and load existing data node.
    API_ENDPOINT = "https://" + original_lang + ".wikipedia.org/w/api.php" # fr.wikipedia.org; https://en.wikipedia.org
    wiki_data = []

    # step 1: download data on the central node of the network (incl. available languages). 
    # setup API query and initial API call 
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "titles": node_title,
        "prop": "links|info|langlinks",
        "plnamespace": 0, 
        "pllimit": 500,
        "lllimit": 500, 
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())
    
    # Continue API call until all data on ego node has been downloaded. 
    while 'continue' in wiki_data[-1].keys():
        
        PARAMS_CONT = PARAMS
        PARAMS_CONT["plcontinue"] = wiki_data[-1]['continue']['plcontinue']

        response = S.get(url=API_ENDPOINT, params=PARAMS_CONT)
        wiki_data.append(response.json())

    # step 2: use generator to download data on all additional nodes. 
    # setup API query for first generator API call (used to download data on all pages that are linked to node_title) 
    print("Downloading Wiki network name: " + node_title + " in language: " + original_lang + ". Please note that this can take a while.")
    S = requests.Session()
    PARAMS = {
        "action": "query",
        "generator": "links",
        "titles": node_title,
        "gplnamespace": 0, 
        "gpllimit": 500, 
        "plnamespace": 0,
        "pllimit": 500, 
        "prop": "links",
        "format": "json"
    }
    response = S.get(url=API_ENDPOINT, params=PARAMS)
    wiki_data.append(response.json())

    # Continue API call until all data on network is downloaded. 
    while 'continue' in wiki_data[-1].keys():

        PARAMS_CONT = PARAMS
        if 'plcontinue' in wiki_data[-1]['continue']:
            PARAMS_CONT["plcontinue"] = wiki_data[-1]['continue']['plcontinue'] 

        if 'gplcontinue' in wiki_data[-1]['continue']: 
            PARAMS_CONT["gplcontinue"] = wiki_data[-1]['continue']['gplcontinue']

        response = S.get(url=API_ENDPOINT, params = PARAMS_CONT)
        wiki_data.append(response.json())

    # step 3: transform and save data:  
    SaveData(wiki_data, node_title=node_title, lang=original_lang)

    # step 4: download additional languages: 
    downloadAdditionalLanguage(node_title = node_title, original_lang = original_lang, additional_langs = additional_langs)



In [5]:
# function: provide titles of networks that are saved in the JSON file. Also provides the language they were saved in. 
def getDownloadedNetworks(): 

    # download data from JSON file. 
    with open(PATH + DATA_FILE) as json_file:
        network_data = json.load(json_file)
    
    # create set of ego network names.  
    downloaded_networks = [(v['ego']) for (k,v) in network_data.items()]
    downloaded_networks = set(list(chain(*downloaded_networks)))
    downloaded_networks = [v for (k,v) in network_data.items() if v['title'] in downloaded_networks]

    # print names of ego networks and language that they have been downloaded in. 
    items = {}  
    for network in downloaded_networks: 
        items[network['title'] + ' (' + network['language'] + ')'] = {'lang':  network['language'], '*': network['title']}    
    return(items)

In [6]:
# Initiate class Node. 
class WikiNode:
    def __init__(self, node_title, lang):
        
        # Select node in JSON file (by title and language). 
        node_data = [v for (k,v) in network_data.items() if v['title'] == node_title if v['language'] == lang][0]
        
        # Extract data and place in instance of Wikinode class. 
        self.node_title = node_data['title']
        self.node_ID = node_data['node_ID']
        self.node_links = node_data['links']
        self.node_lang = node_data['language']


In [7]:
# Initiate class WikiNetwork
class WikiNetwork(WikiNode):
   
    def __init__(self,node_title, lang):
        
        # initiate the central node of the network as class WikiNode, add additional attributes for class WikiNetwork 
        WikiNode.__init__(self, node_title, lang)
        self.network_nodes = {}
        self.network_links = []
        self.network_edges = [] 
        self.network_status = []
    
        # Go through node_links of the central node (node_title) to build network.
        for link in self.node_links:
            try:     
                Node2 = WikiNode(link, lang)             
                purged_links = [x for x in Node2.node_links if x in self.node_links]
                purged_edges = []
                for purged_link in purged_links:
                    purged_edges.append((link,purged_link))  
                self.network_nodes[Node2.node_ID] = Node2
                self.network_links = self.network_links + purged_links
                self.network_edges = self.network_edges + purged_edges
            except: 
                print('Loading of node ' + link + ' failed.')
            self.links_count = Counter(self.network_links)

    def getNodes(self, type="cytoscape", threshold=0):
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        
        if type == 'networkx':
            return [(i, {"name": i}) for i in selected_nodes]

        if type == 'cytoscape':
            return [{'data': {'id': i, "label": i}} for i in selected_nodes]

    def getEdges(self,type="cytoscape", threshold=0):  
        selected_nodes = [k for k,v in self.links_count.items() if float(v) >= threshold]
        edges_network = [(a,b) for a,b in self.network_edges if a in selected_nodes and b in selected_nodes]

        if type == 'networkx':
            return edges_network

        if type == 'cytoscape':
            return [{'data': {'source': a, "target": b}} for a,b in edges_network]

    def getStatsNetwork(self): 
        return("WIP")
        # TODO: return a dictionary with stats on network: 
        # - triangles
        # - degree_centrality 
        # - ...

    def getStatsNodes(self, nodes):
        # TODO: return an numpy array with stats per node: 
        # - triangles
        # - degree_centrality 
        # - ... 
        # if nodes == None: 
          #  node = self.node_links
        return('WIP')
    
    def getNetworkCommunities(self,threshold=0):
        # TODO: return an numpy array with stats per community. Add in an overall library.  
        G = nx.Graph()
        G.add_edges_from(self.getEdges(type = 'networkx', threshold= threshold))
        
        return greedy_modularity_communities(G)
    
    def drawGraph(self,threshold=0,name='no_name'):
        G = nx.Graph()
        G.add_edges_from(self.getEdges(threshold))

        netdraw = Network('2000px', '2000px')
        netdraw.from_nx(G)
        netdraw.barnes_hut()

        title = name + ".html"
        netdraw.show(title)


In [8]:
# Class: a collection of wiki networks of the same topic, in different languages. Automatically takes all languages that have been downloaded before. 
class WikiNetworkCollection():

        def __init__(self,chosen_networks):
        
                # make list available ego networks
                all_downloaded_networks = getDownloadedNetworks()
                available_languages = [v['AvailableLanguages'] for (k,v) in network_data.items() if v['title'] == node_title][0]
                
                topic_networks = []
                topic_networks = [{'lang': original_language, '*': node_title}] + [v for v in available_languages if v in all_downloaded_networks]

                # initiate tclass WikiNetwork for each available language.  
                self.networks = {}
                if topic_networks is not []:
                        for network in topic_networks:
                                self.networks[network['lang'] + '_' + network['*']] = WikiNetwork(node_title = network['*'], language = network['lang'])

        def getStatsIsomorphism(self):
        # TODO: return an numpy array with stats per network: 
        # - ... related to similarities / difference of network to other networks in collection. 
                print('WIP')


In [9]:
########################################
########################################
# FROM HERE RUN TIME STARTS # 

In [10]:
getDownloadedNetworks()

{'Flask (en)': {'lang': 'en', '*': 'Flask'},
 'Kolba (de)': {'lang': 'de', '*': 'Kolba'},
 'Royston (en)': {'lang': 'en', '*': 'Royston'},
 'Oxford (en)': {'lang': 'en', '*': 'Oxford'},
 'Flask (fr)': {'lang': 'fr', '*': 'Flask'},
 'Oxford (de)': {'lang': 'de', '*': 'Oxford'},
 'Oxford (fr)': {'lang': 'fr', '*': 'Oxford'},
 'Vaccine (en)': {'lang': 'en', '*': 'Vaccine'},
 'Vaccine (fr)': {'lang': 'fr', '*': 'Vaccine'},
 'Vaccine (de)': {'lang': 'de', '*': 'Vaccine'},
 'Secularism (en)': {'lang': 'en', '*': 'Secularism'},
 'علمانية (ar)': {'lang': 'ar', '*': 'علمانية'},
 'Laïcité (fr)': {'lang': 'fr', '*': 'Laïcité'},
 'Secularisme (nl)': {'lang': 'nl', '*': 'Secularisme'}}

In [11]:
test = WikiNetwork('Oxford', lang = 'en')

Loading of node C. S. Lewis Nature Reserve failed.
Loading of node Labstep failed.


In [14]:
network_communities = test.getNetworkCommunities()


In [15]:
community_colours = ['red', 'blue', 'yellow', 'orange', 'green', 'purple', 'olive', 'brown', 'maroon', 'lime', 'teal'] 

In [142]:
def layout_communities(network_communities):
    
    colour_count = -1 
    result = []
    for community in network_communities: 
        list_selectors = ''.join([('[label = "{}"],'.format(i)) for i in community])
        colour_count = colour_count + 1 
        # result.append(
            {'selector': list_selectors.rstrip(list_selectors[-1]),
             'style': { 'background-color': community_colours[colour_count] } },
        # )
    
    return result

In [145]:
layout_communities(network_communities)

[{'selector': '[label = "Seacourt"],[label = "University College, Oxford"],[label = "Trinity College, Oxford"],[label = "Grandpont"],[label = "A34 road (England)"],[label = "A40 road"],[label = "Tom Tower"],[label = "Ipswich railway station"],[label = "Stagecoach bus route X5"],[label = "Oxford Saints"],[label = "University Church of St Mary the Virgin"],[label = "Balliol College, Oxford"],[label = "Imran Khan"],[label = "Southampton Airport Parkway railway station"],[label = "VIAF (identifier)"],[label = "The Oxford Times"],[label = "Phoenix Picturehouse"],[label = "Oxford Stadium"],[label = "South Oxfordshire"],[label = "East Anglia"],[label = "Sunnymead"],[label = "Zuleika Dobson"],[label = "Reading railway station"],[label = "His Dark Materials"],[label = "Park & ride"],[label = "Morris Motors"],[label = "Christ Church Picture Gallery"],[label = "Oxford University Boat Club"],[label = "Deborah Harkness"],[label = "Concert hall"],[label = "Oxford University RFC"],[label = "Waterways

In [137]:
temp

<generator object layout_communities.<locals>.<genexpr> at 0x7f4691c02ba0>

In [117]:
final

{'selector': '[label = "Travis (band)"],[label = "Foals (band)"]',
 'style': {'background-color': 'maroon'}}

In [26]:
list_selectors_complete[9]

IndexError: list index out of range

In [56]:
selected_network = 'Oxford (en)'

wiki_page_options = [v['AvailableLanguages'] for v in network_data.values() if v['title'] == all_networks[selected_network]['*'] if v['language'] == all_networks[selected_network]['lang']]
language_options = [selected_network] + [k for k,v in all_networks.items() if {'lang': v['lang'], '*': v['*']} in wiki_page_options[0]]


In [57]:
[{'label': i, 'value': i} for i in language_options] 

[{'label': 'Oxford (en)', 'value': 'Oxford (en)'},
 {'label': 'Oxford (de)', 'value': 'Oxford (de)'},
 {'label': 'Oxford (fr)', 'value': 'Oxford (fr)'}]

In [54]:
all_keys = set().union(*(d.keys() for d in language_options))


AttributeError: 'tuple' object has no attribute 'keys'

In [45]:
[{'label': a, 'value': a} for a in language_options] 

[{'label': {'Oxford (en)': {'lang': 'en', '*': 'Oxford'}},
  'value': {'Oxford (en)': {'lang': 'en', '*': 'Oxford'}}},
 {'label': ('Oxford (de)', {'lang': 'de', '*': 'Oxford'}),
  'value': ('Oxford (de)', {'lang': 'de', '*': 'Oxford'})},
 {'label': ('Oxford (fr)', {'lang': 'fr', '*': 'Oxford'}),
  'value': ('Oxford (fr)', {'lang': 'fr', '*': 'Oxford'})}]

In [41]:
[v for v in language_options] 

[{'Oxford (en)': {'lang': 'en', '*': 'Oxford'}},
 ('Oxford (de)', {'lang': 'de', '*': 'Oxford'}),
 ('Oxford (fr)', {'lang': 'fr', '*': 'Oxford'})]

In [65]:
def test_fun(input):
    
    text = WikiNode('Oxford', 'en')
    if input > 1:
        return text.node_ID

    

In [67]:
test_fun(10)

'22308'

In [1]:
import networkx as nx
G = nx.Graph()
G.add_edge(0,1,weight=.1)
G.add_edge(2,1,weight=.2)
nx.write_gml(G,'g.gml')
nx.write_graphml(G,'g.xml')

In [2]:
G.

<networkx.classes.graph.Graph at 0x7ff998081d90>