# Graphs

Using the previously collected data detailing how matches connect to the rest of the leaked information, we can create graphs and see the degree of interconnectivity between the matches.

In [1]:
#imports

import json
import itertools
import collections

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from operator import itemgetter
from community import community_louvain
from networkx.readwrite import json_graph
from networkx.algorithms.community.centrality import girvan_newman

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

We only use degree 1, as larger degrees resulted in unmanageably large graphs; a distance of 1 is good enough for our purposes.

In [3]:
#Global variables 

DEGREE = 1

# Path prefix (a directory, despite the name) that every graph CSV path starts with
DEGREE_FILE = ''.join(['../generated/map/degree_', str(DEGREE), '/'])

def get_graph_elem_file(elem_type, leak):
    '''Builds the CSV path for one element type of one leak.

    The result is DEGREE_FILE followed by "<leak>_<elem_type>.csv".
    '''
    pieces = [DEGREE_FILE, leak, '_', elem_type, '.csv']
    return ''.join(pieces)



### Create the graphs

Because some of the graphs still have very many nodes, we can optionally filter them down to only the nodes connected to more than one other node. Matches are exempt from this filter -- we still want to know which matches are not connected to anyone.

Note the special attention drawn to the "bahamas" leak, which has a slightly different schema.

In [4]:
def clean_nan(full_df, edge_df, node_df):

    '''Removes nodes whose name is missing (NaN), together with their edges.

    Parameters:
        full_df: edge table with endpoint info; must contain the columns
                 START_ID, END_ID, name_x (start node name) and
                 name_y (end node name).
        edge_df: edge table with the columns START_ID and END_ID.
        node_df: node table with a node_id column.

    Returns:
        (full_df_clean, edge_clean, node_clean): the edges joined with the
        info of both endpoints, the remaining edges and the remaining nodes.
        Both edge_clean and node_clean get a fresh 0..n-1 index, and
        node_clean['id'] is set to that new index.
    '''

    # Test for missing names directly: substituting a sentinel string such as
    # 'ERROR' would also drop a node that is genuinely called "ERROR".
    start_missing = full_df.loc[full_df['name_x'].isnull(), 'START_ID']
    end_missing = full_df.loc[full_df['name_y'].isnull(), 'END_ID']
    ids_to_remove = pd.concat([start_missing, end_missing]).unique()

    # Drop every edge touching a nameless node
    touches_removed = edge_df['START_ID'].isin(ids_to_remove) | edge_df['END_ID'].isin(ids_to_remove)
    edge_clean = edge_df[~touches_removed].reset_index(drop=True)

    # Keep only the nodes that still appear on at least one remaining edge
    still_used = node_df['node_id'].isin(edge_clean['START_ID']) | node_df['node_id'].isin(edge_clean['END_ID'])
    node_clean = node_df[still_used].reset_index(drop=True)
    node_clean['id'] = node_clean.index

    # Rebuild the edge table carrying the info of both endpoints
    full_df_clean = pd.merge(node_clean, edge_clean, left_on='node_id', right_on='START_ID')
    full_df_clean = pd.merge(full_df_clean, node_clean, left_on='END_ID', right_on='node_id')
    full_df_clean = full_df_clean.drop(['node_id_x', 'node_id_y'], axis=1)

    return full_df_clean, edge_clean, node_clean

def remove_loners(full_df, edge_df, node_df):

    '''Removes non-match nodes that have fewer than 2 connections.

    Parameters:
        full_df: edge table with endpoint info; must contain the columns
                 START_ID, END_ID, match_x (start node is a match) and
                 match_y (end node is a match).
        edge_df: edge table with the columns START_ID and END_ID.
        node_df: node table with a node_id column.

    Returns:
        (edge_lean, node_lean): the edges that touch no removed node, and
        the nodes still appearing on at least one of those edges. The
        original index of both frames is preserved (no reset).
    '''

    # Number of edge endpoints each node accounts for in full_df.
    # pd.concat replaces Series.append, which pandas 2.0 removed.
    endpoint_ids = pd.concat([full_df['START_ID'], full_df['END_ID']])
    node_counts = endpoint_ids.value_counts()

    # A loner is explicitly flagged as a non-match and has fewer than 2 connections
    start_is_loner = (full_df['match_x'] == False) & (full_df['START_ID'].map(node_counts) < 2)
    end_is_loner = (full_df['match_y'] == False) & (full_df['END_ID'].map(node_counts) < 2)

    ids_to_remove = pd.concat([
        full_df.loc[start_is_loner, 'START_ID'],
        full_df.loc[end_is_loner, 'END_ID'],
    ]).unique()

    # Drop every edge touching a loner
    touches_loner = edge_df['START_ID'].isin(ids_to_remove) | edge_df['END_ID'].isin(ids_to_remove)
    edge_lean = edge_df[~touches_loner]

    # Keep only the nodes that still appear on at least one remaining edge
    still_used = node_df['node_id'].isin(edge_lean['START_ID']) | node_df['node_id'].isin(edge_lean['END_ID'])
    node_lean = node_df[still_used]
    return edge_lean, node_lean

def _edges_with_node_info(nodes, edges):

    '''Joins every edge with the info of its start (`_x`) and end (`_y`) node'''

    merged = pd.merge(nodes, edges, left_on='node_id', right_on='START_ID')
    merged = pd.merge(merged, nodes, left_on='END_ID', right_on='node_id')
    return merged.drop(['node_id_x', 'node_id_y'], axis=1)


def graph_leak_matches(leak, dense=False, out_path='network/force.json'):

    '''Creates the graph of matches found in a given leak and exports it as JSON.

    Parameters:
        leak: name of the leak; selects the node/edge CSV files resolved by
              get_graph_elem_file.
        dense: when True, loners are filtered out with remove_loners before
               the graph is built.
        out_path: file the node-link JSON is written to; the default is the
                  location this function has always written to.

    Side effects: sets the global matplotlib figure size, writes `out_path`
    and prints a short summary of the graph. Returns nothing.
    '''

    # The "bahamas" CSV keeps the node id and name under different column labels
    if leak == 'bahamas':
        node_id, name_index = '4', '7'
    else:
        node_id, name_index = '0', '1'

    plt.rcParams["figure.figsize"] = (23,23)

    #Load the data
    nodes = (
        pd.read_csv(get_graph_elem_file('nodes', leak), index_col=0)
        .reset_index(drop=True)[[node_id, name_index, 'Match']]
        .rename(columns={node_id:'node_id', name_index:'name', 'Match':'match'})
    )
    # 'id' is the compact 0..n-1 identifier used as the node key in the graph
    nodes['id'] = nodes.index

    edges = pd.read_csv(get_graph_elem_file('edges', leak), index_col=0).reset_index(drop=True)[['START_ID', 'END_ID']]

    #Format the data as a set of edges with information about the nodes
    full_df = _edges_with_node_info(nodes, edges)

    #filter too large graphs if told so
    if dense:
        print('Size before removing loners: ' + str(len(full_df)))
        edges, nodes = remove_loners(full_df, edges, nodes)
        edges = edges.reset_index(drop=True)
        nodes = nodes.reset_index(drop=True)
        # Re-number the surviving nodes so ids stay compact
        nodes['id'] = nodes.index
        full_df = _edges_with_node_info(nodes, edges)
        print('Size after removing loners: ' + str(len(full_df)))

    # .tolist() yields plain Python ints, which json.dump can serialise
    graph = nx.DiGraph()
    graph.add_edges_from(zip(full_df['id_x'].astype(int).tolist(),
                             full_df['id_y'].astype(int).tolist()))

    # One-pass lookup tables instead of filtering the node table once per node
    node_ids = nodes['id'].tolist()
    name_by_id = dict(zip(node_ids, nodes['name'].tolist()))
    match_by_id = dict(zip(node_ids, nodes['match'].tolist()))

    # graph.nodes replaces graph.node, which networkx 2.4 removed.
    # str()/bool() store the raw values as JSON-serialisable Python types.
    for n in graph:
        graph.nodes[n]['name'] = str(name_by_id[n])
        graph.nodes[n]['match'] = bool(match_by_id[n])

    #export as .json (the context manager closes the file even on error)
    with open(out_path, 'w') as out_file:
        json.dump(json_graph.node_link_data(graph), out_file)

    print('Matches for ' + leak + ' papers: ')
    # nx.info was removed in networkx 3.0; fall back to the graph's own summary
    print(nx.info(graph) if hasattr(nx, 'info') else graph)

In [5]:
graph_leak_matches('panama', dense=True)

Size before removing loners: 37
Size after removing loners: 11
Matches for panama papers: 
Name: 
Type: DiGraph
Number of nodes: 14
Number of edges: 9
Average in degree:   0.6429
Average out degree:   0.6429
