# Graphs

Using the previously collected data detailing how matches connect to the rest of the leaked information, we can create graphs and see the degree of interconnectivity between the matches.

In [188]:
#imports

import json
import itertools
import collections
import http_server

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

from operator import itemgetter
from community import community_louvain
from networkx.readwrite import json_graph
from networkx.algorithms.community.centrality import girvan_newman

%matplotlib inline
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

We only use degree 1, as larger degrees resulted in unmanageably large graphs. Distance 1 is good enough for our purposes.

In [5]:
# Notebook-wide settings

# Degree of the exported neighbourhood around each match; it is only used
# here to pick the input directory.
DEGREE = 1

# Despite the name this is a directory prefix: the per-leak node/edge CSV
# file names are appended to it.
DEGREE_FILE = '../generated/map/degree_{}/'.format(DEGREE)

def get_graph_elem_file(elem_type, leak):
    '''Return the path of the CSV holding one element type of one leak.

    The path is DEGREE_FILE followed by "<leak>_<elem_type>.csv"; this
    notebook calls it with elem_type 'nodes' and 'edges'.
    '''
    pieces = [DEGREE_FILE, leak, '_', elem_type, '.csv']
    return ''.join(pieces)



### Create the graphs

Note that, as some of the graphs still have very many nodes, we can choose to filter them down to only those nodes connected to more than one other node (unless a node is a match -- we still want to know which matches are not connected to anyone).

Note the special attention drawn to the "bahamas" leak, which has a slightly different schema.

In [199]:
def remove_loners(full_df):
    '''Drop edges that touch a non-match node with fewer than 2 connections.

    Parameters
    ----------
    full_df : pandas.DataFrame
        One row per edge. Must contain the columns 'START_ID', 'END_ID',
        'match_x' (is the start node a match?) and 'match_y' (is the end
        node a match?).

    Returns
    -------
    pandas.DataFrame
        A new frame without the offending edges and with a fresh 0..n-1
        index. The input frame is left untouched.
    '''
    # How many edge endpoints each node id accounts for across the edge list.
    # pd.concat is used because Series.append was removed in pandas 2.0.
    node_counts = pd.concat([full_df['START_ID'], full_df['END_ID']]).value_counts()

    start_counts = full_df['START_ID'].map(node_counts)
    end_counts = full_df['END_ID'].map(node_counts)

    # An endpoint is a "loner" when it is not a match and appears only once.
    lonely_start = full_df['match_x'].eq(False) & start_counts.lt(2)
    lonely_end = full_df['match_y'].eq(False) & end_counts.lt(2)

    # Filtering with a boolean mask works for any index, whereas the earlier
    # label-as-position lookup was only valid for a default RangeIndex.
    keep = ~(lonely_start | lonely_end)
    return full_df.loc[keep].reset_index(drop=True)

def _graph_summary(graph):
    '''Return a short text summary of a directed graph.

    Stands in for nx.info, which is no longer available in networkx 3.x,
    and keeps the same line layout.
    '''
    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()
    lines = [
        'Name: ' + str(graph.name),
        'Type: ' + type(graph).__name__,
        'Number of nodes: ' + str(n_nodes),
        'Number of edges: ' + str(n_edges),
    ]
    if n_nodes > 0:
        # In a directed graph every edge adds one in-degree and one
        # out-degree, so both averages equal edges / nodes.
        average = n_edges / float(n_nodes)
        lines.append('Average in degree: %8.4f' % average)
        lines.append('Average out degree: %8.4f' % average)
    return '\n'.join(lines)


def graph_leak_matches(leak, dense=False, output_path='network/force.json'):
    '''Build the directed graph of a leak's matches and export it as JSON.

    Parameters
    ----------
    leak : str
        Leak name used to locate the node and edge CSV files through
        get_graph_elem_file, e.g. 'panama' or 'bahamas'.
    dense : bool, optional
        When True, the edge list is first filtered with remove_loners and
        its size before and after filtering is printed.
    output_path : str, optional
        Where the node-link JSON is written. Defaults to the location that
        was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The node table with columns 'node_id', 'name', 'match' and 'id',
        where 'id' is the row position used as the graph node key.

    Side effects: sets the global matplotlib figure size, prints a summary
    of the graph and overwrites the file at output_path.
    '''
    # The bahamas files store the node id and name under different columns.
    if leak == 'bahamas':
        node_id, name_index = '4', '7'
    else:
        node_id, name_index = '0', '1'

    # Kept for callers that plot after this call; nothing is drawn here.
    plt.rcParams["figure.figsize"] = (23, 23)

    # Load the data
    nodes = (
        pd.read_csv(get_graph_elem_file('nodes', leak), index_col=0)
        .reset_index(drop=True)[[node_id, name_index, 'Match']]
        .rename(columns={node_id: 'node_id', name_index: 'name', 'Match': 'match'})
    )
    nodes['id'] = nodes.index

    edges = (
        pd.read_csv(get_graph_elem_file('edges', leak), index_col=0)
        .reset_index(drop=True)[['START_ID', 'END_ID']]
    )

    # Format the data as a set of edges with information about both endpoints:
    # "_x" columns describe the start node, "_y" columns the end node.
    full_df = pd.merge(nodes, edges, left_on='node_id', right_on='START_ID')
    full_df = pd.merge(full_df, nodes, left_on='END_ID', right_on='node_id')
    full_df = full_df.drop(['node_id_x', 'node_id_y'], axis=1)

    # Filter too large graphs if told so
    if dense:
        print('Size before removing loners: ' + str(len(full_df)))
        full_df = remove_loners(full_df)
        print('Size after removing loners: ' + str(len(full_df)))

    # Create the graph. Ids are converted to plain Python ints because
    # numpy integers cannot be serialised by the json module.
    graph = nx.DiGraph()
    graph.add_edges_from(
        (int(start), int(end))
        for start, end in zip(full_df['id_x'], full_df['id_y'])
    )

    # Attach the display name to every node. A direct lookup by id replaces a
    # full boolean scan per node; graph.nodes replaces graph.node, which was
    # removed in networkx 2.4. Missing names are still exported as 'NaN'.
    names = nodes.set_index('id')['name']
    for n in graph:
        label = names.loc[n]
        graph.nodes[n]['name'] = 'NaN' if pd.isna(label) else str(label)

    print('Matches for ' + leak + ' papers: ')
    print(_graph_summary(graph))

    # Export in node-link form; the context manager closes the file handle.
    with open(output_path, 'w') as out_file:
        json.dump(json_graph.node_link_data(graph), out_file)

    return nodes

### Graphing

In [202]:
# Build the unfiltered Panama graph. The call writes network/force.json and
# the returned node table is kept in `nodes`.
nodes = graph_leak_matches('panama')

Matches for panama papers: 
Name: 
Type: DiGraph
Number of nodes: 111
Number of edges: 71
Average in degree:   0.6396
Average out degree:   0.6396


In [13]:
# NOTE(review): this opens temp/force/index.html, while graph_leak_matches
# writes network/force.json -- confirm the page reads the freshly written file.
http_server.load_url('temp/force/index.html')

127.0.0.1 - - [12/Dec/2018 20:21:50] "GET /temp/force/index.html HTTP/1.1" 200 -


Press <RETURN> to stop server

To restart server run: 
python -m http.server 8000


In [203]:
# Unfiltered Paradise graph. Every call writes to the same network/force.json,
# so this replaces the export of the previous leak.
graph_leak_matches('paradise')

Matches for paradise papers: 
Name: 
Type: DiGraph
Number of nodes: 467
Number of edges: 376
Average in degree:   0.8051
Average out degree:   0.8051


Unnamed: 0,node_id,name,match,id
0,81021085,"405 Lexington Avenue, Third Floor; New York Ne...",False,0
1,81024524,One Rotary Center; 1560 Sherman Avenue; Evanst...,False,1
2,81026273,"3rd Floor, 2431 – 37 Ave NE; Calgary T2E 3A8; ...",False,2
3,81027090,Canon's Court; 22 Victoria Street; Hamilton; H...,False,3
4,81027146,Clifton House; 75 Fort Street; Grand Cayman KY...,False,4
5,81029389,Argyle House; 41a Cedar Avenue; Hamilton HM 12...,False,5
6,81031545,46 Point Finger Road; Paget DV 04; Bermuda,False,6
7,81032909,225 N Michigan; Suite 1200; Chicago; IL 60601;...,False,7
8,81035645,One Robert Wood Johnson Place; New Brunswick; ...,False,8
9,81037035,"633 Third Avenue, 4th Floor; New York; New Yor...",False,9


In [204]:
# Unfiltered Offshore graph; the returned node table is shown as the cell output.
graph_leak_matches('offshore')

Matches for offshore papers: 
Name: 
Type: DiGraph
Number of nodes: 68
Number of edges: 58
Average in degree:   0.8529
Average out degree:   0.8529


Unnamed: 0,node_id,name,match,id
0,237148,,False,0
1,236724,,False,1
2,264050,,False,2
3,268974,,False,3
4,49684,Equity Trust (Samoa) Limited,False,4
5,54662,Portcullis TrustNet (BVI) Limited,False,5
6,123290,Company Incorporations Asia Limited,False,6
7,119709,ATC Primasia Limited,False,7
8,290197,NetIncorp.com Corporation,False,8
9,291482,"Gold-In Consulting Co., Ltd. ???????????????",False,9


In [None]:
# Unfiltered Bahamas graph; this leak takes the alternative column mapping
# inside graph_leak_matches.
graph_leak_matches('bahamas')

In [None]:
# dense=True: the edge list is filtered with remove_loners before the graph
# is built, and the sizes before/after are printed.
graph_leak_matches('panama', True)

In [None]:
# Paradise graph with loner filtering enabled (dense=True).
graph_leak_matches('paradise', True)

In [None]:
# Offshore graph with loner filtering enabled (dense=True).
graph_leak_matches('offshore', True)

In [201]:
# Bahamas graph with loner filtering enabled (dense=True); like every call,
# it overwrites network/force.json.
graph_leak_matches('bahamas', True)

Size before removing loners: 27
Size after removing loners: 7
Matches for bahamas papers: 
Name: 
Type: DiGraph
Number of nodes: 10
Number of edges: 7
Average in degree:   0.7000
Average out degree:   0.7000


Unnamed: 0,node_id,name,match,id
0,23000046,** DISABLED SUISSE SECURITY BANK & TRUST,False,0
1,23000047,GRAHAM COOPER,False,1
2,23000083,PRICEWATERHOUSECOOPERS (BAHAMAS) LIMITED,False,2
3,23000136,MOSSACK FONSECA & CO. (BAHAMAS) LIMITED,False,3
4,23000139,FIRST CHOICE SERVICES LTD.,False,4
5,23000147,UBS TRUSTEES (BAHAMAS) LTD.,False,5
6,23000164,MICHAEL HEPBURN,False,6
7,23000166,SOVEREIGN BAHAMAS LIMITED,False,7
8,23000198,LENNOX CORPORATE SERVICES LIMITED,False,8
9,23000228,H & J CORPORATE SERVICES LTD.,False,9
