# Graphs

This notebook is similar to the second part of the __Analysis notebook__. Its sole purpose is to create the same graphs as before, but in a format that can be used for the data story's moving network.

In [1]:
#imports

import json
import itertools
import collections

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from operator import itemgetter
from community import community_louvain
from networkx.readwrite import json_graph
from networkx.algorithms.community.centrality import girvan_newman

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

We only use degree 1, as larger degrees resulted in unmanageably large graphs. A degree (distance) of 1 is good enough for our purposes.

In [2]:
#Global variables 

DEGREE = 1

# Despite its name this is a directory prefix: the folder holding the files for DEGREE.
DEGREE_FILE = ''.join(['../generated/map/degree_', str(DEGREE), '/'])

def get_graph_elem_file(elem_type, leak):
    '''Return the path of the CSV holding one element type (e.g. 'nodes' or 'edges') of a leak.'''
    path_parts = [DEGREE_FILE, leak, '_', elem_type, '.csv']
    return ''.join(path_parts)



### Create the graphs

Note that, as some of the graphs still have very many nodes, we can choose to filter them down to only those nodes connected to more than one other node (unless a node is a match -- we still want to know who is not connected to anyone).

Note the special attention drawn to the "bahamas" leak, which has a slightly different schema.

In [3]:
def remove_loners(full_df, clusters_only, cluster_size=2):
    '''Drop the edges that hang off sparsely connected ("loner") nodes.

    A node's connection count is the number of times its id occurs in the
    START_ID and END_ID columns combined.

    Parameters
    ----------
    full_df : pandas.DataFrame
        One row per edge, with columns START_ID and END_ID, plus match_x and
        match_y (match flag of the start / end node) when clusters_only is false.
    clusters_only : bool
        If true, an edge is removed only when BOTH endpoints have fewer than
        cluster_size connections; the match flags are not consulted.
        If false, an edge is removed when EITHER endpoint is a non-match
        (flag == False) with fewer than cluster_size connections.
    cluster_size : int
        Minimum number of connections for a node not to count as a loner.

    Returns
    -------
    pandas.DataFrame
        The remaining edges with a fresh 0..n-1 index; full_df is not modified.
    '''

    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    node_counts = pd.concat([full_df['START_ID'], full_df['END_ID']]).value_counts()

    start_is_loner = full_df['START_ID'].map(node_counts) < cluster_size
    end_is_loner = full_df['END_ID'].map(node_counts) < cluster_size

    if clusters_only:
        to_remove = start_is_loner & end_is_loner
    else:
        # .eq(False) keeps the original "== False" semantics element-wise.
        to_remove = ((full_df['match_x'].eq(False) & start_is_loner)
                     | (full_df['match_y'].eq(False) & end_is_loner))

    # A boolean mask selects by row, so this also works when full_df does not
    # carry a default 0..n-1 index.
    return full_df[~to_remove].reset_index(drop = True)

#The moving network needs node ids starting at 0, so we reset them
def zero_id_nodes(full_df):
    '''Renumber the node ids in START_ID / END_ID as 0, 1, 2, ...

    New ids are handed out in order of first appearance, scanning the rows
    top to bottom and looking at START_ID before END_ID within each row.
    The frame is modified in place and also returned.
    '''
    new_id_of = {}

    for row_label, edge in full_df.iterrows():
        for column in ('START_ID', 'END_ID'):
            old_id = edge[column]
            # The next free id always equals the number of ids handed out so far.
            full_df.at[row_label, column] = new_id_of.setdefault(old_id, len(new_id_of))

    return full_df

def extract_nodes(full_df):
    '''Collect the distinct nodes that appear at either end of the edges.

    Parameters
    ----------
    full_df : pandas.DataFrame
        One row per edge with columns START_ID, name_x, match_x (start node)
        and END_ID, name_y, match_y (end node).

    Returns
    -------
    pandas.DataFrame
        Columns id, name and match, one row per distinct (id, name, match)
        combination. Row labels are carried over from full_df and may repeat.
        full_df is not modified.
    '''
    # rename() without inplace returns new frames, so the column slices of
    # full_df are never written to.
    start_nodes = full_df[['START_ID', 'name_x', 'match_x']].rename(
        columns={'START_ID':'id', 'name_x': 'name', 'match_x': 'match'})
    end_nodes = full_df[['END_ID', 'name_y', 'match_y']].rename(
        columns={'END_ID':'id', 'name_y': 'name', 'match_y': 'match'})

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    nodes = pd.concat([start_nodes, end_nodes]).drop_duplicates(['id', 'name', 'match'])

    return nodes
    
    

def graph_leak_matches(leak, dense = False, clusters_only = False, cluster_size = 2):
    '''Build the directed graph of matches found in a leak and export it as JSON.

    Reads the node and edge CSVs of the leak (paths from get_graph_elem_file),
    joins them into one row per edge carrying both endpoints' name and match
    flag, optionally filters the edges with remove_loners, renumbers the node
    ids with zero_id_nodes and writes the node-link JSON to
    '../results/graphs/graph_<leak>.json' ('graph_<leak>_small.json' when
    dense is true).

    Parameters
    ----------
    leak : str
        Leak name used in the file names. 'bahamas' selects different column
        labels for the node id and name.
    dense : bool
        If true, filter the edges with remove_loners, print the edge count
        before and after, and add the '_small' suffix to the output file.
    clusters_only : bool
        Forwarded to remove_loners; only used when dense is true.
    cluster_size : int
        Forwarded to remove_loners; only used when dense is true.

    Returns
    -------
    networkx.DiGraph
        The exported graph; every node carries a 'name' (str) and a 'match'
        (bool) attribute. End a notebook call with ';' to hide its repr.
    '''

    # The bahamas node file labels its id and name columns differently.
    if leak == 'bahamas':
        node_id, name_index = '4', '7'
    else:
        node_id, name_index = '0', '1'

    #Load the data
    nodes = (pd.read_csv(get_graph_elem_file('nodes', leak), index_col = 0)
               .reset_index(drop = True)[[node_id, name_index, 'Match']]
               .rename(columns={node_id:'node_id', name_index:'name', 'Match':'match'}))
    edges = pd.read_csv(get_graph_elem_file('edges', leak), index_col = 0).reset_index(drop = True)[['START_ID', 'END_ID']]

    #Format the data as a set of edges with information about the nodes
    # After the second merge the start node's columns end in _x and the end node's in _y.
    full_df = pd.merge(nodes, edges, left_on = 'node_id', right_on = 'START_ID')
    full_df = pd.merge(full_df, nodes, left_on = 'END_ID', right_on = 'node_id').drop(['node_id_x', 'node_id_y'], axis = 1)

    #filter too large graphs if told so
    if dense:
        print('Size before removing loners: ' + str(len(full_df)))
        full_df = remove_loners(full_df, clusters_only, cluster_size)
        print('Size after removing loners: ' + str(len(full_df)))

    full_df = zero_id_nodes(full_df)

    graph_nodes = extract_nodes(full_df)

    #create the graph
    # Edges are added in row order, so nodes enter the graph in the same order
    # in which zero_id_nodes numbered them.
    graph = nx.DiGraph()
    graph.add_edges_from((int(start), int(end))
                         for start, end in zip(full_df['START_ID'], full_df['END_ID']))

    # One pass over the node table instead of re-filtering it for every node;
    # if an id occurs more than once, its first row wins.
    node_attrs = {}
    for node, name, match in zip(graph_nodes['id'], graph_nodes['name'], graph_nodes['match']):
        node_attrs.setdefault(int(node), (name, match))

    for n in list(graph):
        name, match = node_attrs[n]
        # add_node on an existing node only updates its attributes; unlike
        # graph.node[n] (removed in networkx 2.4) it works across versions.
        # Missing names are kept as the text 'NaN', as Series.to_string rendered them.
        # bool() yields a plain Python bool that json can serialise
        # (Series.bool() is deprecated in recent pandas).
        graph.add_node(n, name = 'NaN' if pd.isna(name) else str(name), match = bool(match))

    #export as .json
    d = json_graph.node_link_data(graph)
    name = leak

    if dense:
        name = leak + "_small"
    # The context manager closes (and flushes) the file even if dumping fails.
    with open('../results/graphs/graph_'+ name +'.json','w') as out_file:
        json.dump(d, out_file)

    return graph
    

In [4]:
# Unfiltered graph of the 'panama' leak -> ../results/graphs/graph_panama.json
graph_leak_matches('panama')

In [5]:
# Unfiltered graph of the 'paradise' leak -> ../results/graphs/graph_paradise.json
graph_leak_matches('paradise')

In [6]:
# dense=True: filter out loner edges first -> ../results/graphs/graph_paradise_small.json
graph_leak_matches('paradise', True)

Size before removing loners: 234
Size after removing loners: 57


In [7]:
# Unfiltered graph of the 'offshore' leak -> ../results/graphs/graph_offshore.json
graph_leak_matches('offshore')

In [8]:
# dense=True: filter out loner edges first -> ../results/graphs/graph_offshore_small.json
graph_leak_matches('offshore', True)

Size before removing loners: 98
Size after removing loners: 33


In [9]:
# Unfiltered graph of the 'bahamas' leak (read with its own node column labels)
# -> ../results/graphs/graph_bahamas.json
graph_leak_matches('bahamas')