In this notebook, we calculate some generalized network statistics about the input network, and save the output as a .csv. 

Import the usual suspects for network / data analysis

In [1]:
import networkx as nx
import os, sys, time
import pandas as pd
import numpy as np

### Defining the Main Function
In the next block, we define the main function. It takes the path to a pickled input network, loads it, and performs all of the calculations we are interested in. See the in-line comments for specifics. 

In [6]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _timed(label, func, *args):
    """Call func(*args), print how long the call took, and return its result."""
    began = time.time()
    outcome = func(*args)
    print('\t\t%s: %s seconds' % (label, time.time() - began))
    return outcome


def _store_distribution(results, graph_rank, measure, values):
    """Record the mean and the three quartiles of a per-node measure in results."""
    values = list(values)
    results['G%s_av_%s' % (graph_rank, measure)] = np.mean(values)
    for pct in (25, 50, 75):
        # keys come out as G<rank>_0.25_<measure>, G<rank>_0.50_<measure>, G<rank>_0.75_<measure>
        results['G%s_0.%s_%s' % (graph_rank, pct, measure)] = np.percentile(values, pct)


def NetStats(g, ISO):
    """Compute summary statistics for a pickled network and return them as a table.

    Parameters
    ----------
    g : str
        Path to a pickled networkx graph. The graph is expected to be directed,
        since it is split into strongly connected components.
    ISO : str
        ISO-3 country code. It is only used to label the output rows.

    Returns
    -------
    pandas.DataFrame
        One row per statistic, with the columns 'var_name', 'value' and 'country'.
        The whole-network counts are stored as 'number_of_edges' and
        'number_of_nodes'. Statistics of each analysed sub-graph are prefixed
        with 'G<rank>_', where rank 0 is the sub-graph with the most edges.
    """
    # all results will be loaded into a results dictionary
    results = {}

    # load the Graph
    # NOTE(review): unpickling can execute arbitrary code - only load pickles you trust.
    if hasattr(nx, 'read_gpickle'):
        G = nx.read_gpickle(g)
    else:
        # newer networkx releases no longer ship read_gpickle; fall back to the
        # standard pickle module (assumes the file is an uncompressed pickle)
        import pickle
        with open(g, 'rb') as handle:
            G = pickle.load(handle)

    # the first of our results - the number of nodes and edges of the whole network
    results['number_of_edges'] = G.number_of_edges()
    results['number_of_nodes'] = G.number_of_nodes()

    # generate the list of strongly connected sub-graphs - usually 1, but often more.
    if hasattr(nx, 'strongly_connected_component_subgraphs'):
        Gs = list(nx.strongly_connected_component_subgraphs(G))
    else:
        # the helper above is missing from newer networkx releases
        Gs = [G.subgraph(c).copy() for c in nx.strongly_connected_components(G)]

    # Each sub-graph is summarized as a line in this df; 'id' is its position in Gs
    df = pd.DataFrame({'id': list(range(len(Gs))),
                       'edges': [sub.number_of_edges() for sub in Gs],
                       'nodes': [sub.number_of_nodes() for sub in Gs]})

    # we sort by the number of edges, largest graph first
    df = df.sort_values(by = 'edges', ascending = False)

    # Keep only the sub-graphs worth analysing. The threshold is half the number
    # of edges of the largest sub-graph. An empty network has nothing to filter.
    if len(df) > 0:
        thresh = df.edges.iloc[0] * 0.5
        df = df.loc[df.edges >= thresh]
    print(df)

    # the graph IDs through which to iterate in the actual summary statistics stage
    id_list = list(df.id)

    # now, we are ready to calculate some stats. We do the following process for
    # each interesting subgraph. 'i' is the rank of the sub-graph (0 = most edges)
    # and is embedded in every result key to allow multiple results per network.
    for i, graph_id in enumerate(id_list):

        ### Section 1
        start = time.time()

        # pick out current graph from Gs, the list of graphs
        curr_G = Gs[graph_id]

        # generate an UNdirected graph from the current graph for this stage of calcs
        undirected_G = nx.Graph(curr_G)

        # calculate cyclomatic number
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.cycles.cycle_basis.html
        cyclomatic_number = len(nx.cycle_basis(undirected_G))
        results['G%s_cyclomatic_number' % i] = cyclomatic_number

        # get simple number of nodes and edges
        e = undirected_G.number_of_edges()
        v = undirected_G.number_of_nodes()
        results['G%s_number_of_edges'% i] = e
        results['G%s_number_of_nodes'% i] = v
        print('\tTime elapsed for Section 1: %s seconds' % (time.time() - start))

        ### Section 2
        start = time.time()

        # calculate the network's alpha, beta and gamma as derivatives from the
        # cyclomatic number + number of nodes and edges. A zero denominator
        # (e.g. gamma for a two-node graph) yields NaN instead of raising.
        results['G%s_alpha'% i] = _safe_ratio(cyclomatic_number, (2 * v) - 5)
        results['G%s_beta'% i] = _safe_ratio(e, v)
        results['G%s_gamma'% i] = _safe_ratio(e, 3 * (v - 2))
        print('\tTime elapsed for Section 2: %s seconds' % (time.time() - start))

        ### Section 3
        # each call is timed individually with plain Python, so this function
        # also runs outside of a jupyter env.
        start = time.time()

        # calculate eccentricity
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.distance_measures.eccentricity.html
        ecc = _timed('eccentricity', nx.eccentricity, undirected_G)

        # calculate network diameter (re-using the eccentricities computed above)
        # https://networkx.github.io/documentation/networkx-1.7/reference/generated/networkx.algorithms.distance_measures.diameter.html
        results['G%s_diameter' %i] = _timed('diameter', nx.diameter, undirected_G, ecc)

        # calculate network radius (re-using the eccentricities computed above)
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.distance_measures.radius.html
        results['G%s_radius' %i] = _timed('radius', nx.radius, undirected_G, ecc)

        # calculate average clustering
        # https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.cluster.average_clustering.html
        results['G%s_average_clustering' %i] = _timed(
            'average_clustering', nx.average_clustering, undirected_G)

        # calculate the degree assortativity coefficient
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.assortativity.degree_assortativity_coefficient.html
        results['G%s_degree_assortativity_coefficient' %i] = _timed(
            'degree_assortativity_coefficient', nx.degree_assortativity_coefficient, undirected_G)

        # nx.global_efficiency and nx.average_node_connectivity were very
        # time-expensive, so they are deliberately not calculated.
        print('\tTime elapsed for Section 3: %s seconds' % (time.time() - start))

        ### Section 4
        # here, we calculate some by-node stats: the mean, plus the
        # 1st quartile, median and 3rd quartile of the per-node values
        # https://networkx.github.io/documentation/networkx-1.9/reference/generated/networkx.algorithms.centrality.degree_centrality.html
        start = time.time()
        _store_distribution(results, i, 'degree_centrality',
                            nx.degree_centrality(undirected_G).values())
        print('\tTime elapsed for Section 4: %s seconds' % (time.time() - start))

        ### Section 5
        # same idea as for Section 4 here, but with closeness centrality.
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.closeness_centrality.html
        start = time.time()
        _store_distribution(results, i, 'closeness_centrality',
                            nx.closeness_centrality(undirected_G).values())
        print('\tTime elapsed for Section 5: %s seconds' % (time.time() - start))

        ### Section 6
        # same idea as for Section 4 here, but with betweenness centrality.
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.betweenness_centrality.html
        start = time.time()
        _store_distribution(results, i, 'betweenness_centrality',
                            nx.betweenness_centrality(undirected_G).values())
        print('\tTime elapsed for Section 6: %s seconds' % (time.time() - start))

        ### Section 7
        # same idea as for Section 4 here, but with eigenvector centrality.
        # This one is best-effort: if the calculation fails, the keys are simply
        # left out. Only Exception is caught, so Ctrl-C still interrupts the run.
        # https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.centrality.eigenvector_centrality.html
        start = time.time()
        try:
            eigen_values = list(nx.eigenvector_centrality(undirected_G).values())
        except Exception as err:
            print('\tEigenvector centrality skipped for G%s: %s' % (i, err))
        else:
            _store_distribution(results, i, 'eigenvector_centrality', eigen_values)
        print('\tTime elapsed for Section 7: %s seconds' % (time.time() - start))

        # Section 8 (nx.communicability_betweenness_centrality) is deliberately
        # not calculated, as it was slowing the code to a crawl.

    # generate a dataframe of the results we have calculated, by network
    df = pd.DataFrame(results, index = ['value'])

    # here the dataframe is transposed, the index reset, and the index
    # renamed to 'var_name' to make it a 2D data table that can be efficiently sliced
    df = df.transpose().reset_index().rename(columns = {'index':'var_name'})

    # we add the country of interest as an additional column - the only time we use the ISO variable
    df['country'] = ISO

    # return the results df
    return df

### Execution

In [7]:
# Root directory holding all of the networks which will be analyzed.
root = r'D:\Criticality II\country_networks'

# Directory into which one netstats .csv per country is written.
path = r'C:\Users\charl\Documents\CE\Criticality\Netstats'

# Walk the root path and keep every directory whose path ends in 'output'.
# This will look different depending on your file structure.
Q = [q for q, t, folder in os.walk(root) if q.endswith('output')]

# Process each of the valid paths in turn.
for q in Q:

    # The ISO code is the three characters that sit just before the final seven
    # characters of the path. Again, this is specific to the user's file structure.
    ISO = q[-10:-7]

    # These two countries are skipped (presumably handled separately - confirm).
    if ISO in ['ABW','AFG']:
        continue

    print('...processing %s' % ISO)

    # g is the path to the pickled networkx object for this country
    g = os.path.join(q, '{}_processed.pickle'.format(ISO))

    # D is the output dataframe from the NetStats function
    D = NetStats(g, ISO)

    # save the stats for this network down to the output location as a .csv
    D.to_csv(os.path.join(path, '%s_processed_netstats.csv' % ISO))

...processing AGO
   id  edges  nodes
0   0   7227   2674
	Time elapsed for Section 1: 0.07486414909362793 seconds
	Time elapsed for Section 2: 0.0 seconds
Wall time: 14.5 s
Wall time: 0 ns
Wall time: 0 ns
Wall time: 28 ms
Wall time: 25 ms
	Time elapsed for Section 3: 14.518999338150024 seconds
	Time elapsed for Section 4: 0.0010001659393310547 seconds
	Time elapsed for Section 5: 14.598000288009644 seconds
	Time elapsed for Section 6: 19.646999835968018 seconds
	Time elapsed for Section 7: 1.1510004997253418 seconds
...processing AIA
   id  edges  nodes
0   0     94     33
	Time elapsed for Section 1: 0.0 seconds
	Time elapsed for Section 2: 0.0 seconds
Wall time: 2 ms
Wall time: 0 ns
Wall time: 0 ns
Wall time: 0 ns
Wall time: 991 µs
	Time elapsed for Section 3: 0.0050008296966552734 seconds
	Time elapsed for Section 4: 0.0 seconds
	Time elapsed for Section 5: 0.0029993057250976562 seconds
	Time elapsed for Section 6: 0.0029702186584472656 seconds
	Time elapsed for Section 7: 0.014024

KeyboardInterrupt: 