## Imports

In [1]:
import pandas as pd
import numpy as np
import networkx as nx
import copy

import matplotlib.pyplot as plt

# https://www.occrp.org/en/panamapapers/database
# TRUMP OFFSHORE INC. is good example to see all entities interacting

## Filenames / paths

The data is separated by leak source. For each leak source there is a folder containing the nodes of the graph, which can be of different types: <i>intermediary, officer, entity, address</i> (and <i>other</i> for the paradise papers only). The folder also contains the edges of this graph.

In [2]:
# One folder per leak source (the trailing slash is part of the path prefix).
bahamas_folder = "bahamas/"
panama_folder = "panama/"
paradise_folder = "paradise/"
offshore_folder = "offshore/"

# Short source names: the folder names without their trailing slash.
sources_names = [
    folder[:-1]
    for folder in (bahamas_folder, panama_folder, paradise_folder, offshore_folder)
]

# Common filename prefix of every csv belonging to a given leak.
panama_name = f"{panama_folder}panama_papers"
paradise_name = f"{paradise_folder}paradise_papers"
offshore_name = f"{offshore_folder}offshore_leaks"
bahamas_name = f"{bahamas_folder}bahamas_leaks"

# Filename infixes distinguishing the edge file from the node files.
edges_name = ".edges"
nodes_name = ".nodes."

# Node types, used both as filename suffixes and as dictionary keys.
address_name = "address"
intermediary_name = "intermediary"
officer_name = "officer"
entity_name = "entity"
others_name = "other"  # Extra node type that exists only in the paradise papers

# Node types available in every leak source.
usual_entity_names = [address_name, intermediary_name, officer_name, entity_name]

## Build local storage

We store the data in dictionaries that map each leak source to its content, which is itself a dictionary mapping each type of entity to the DataFrame containing its values. For example <b>d_sources["bahamas"]["officer"]</b> is the DataFrame of officers coming from the bahamas leaks.

In [3]:
def my_read_csv(filename) :
    """
    Read a csv with the same rules for every file of the project.

    Every column is loaded as a string, so identifiers such as node_id
    keep their exact textual form (no numeric conversion).

    filename : path or file-like object accepted by pandas.read_csv
    Returns the content of the csv as a DataFrame.
    """
    # The builtin `str` replaces the `np.str` alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    return pd.read_csv(filename, dtype = str)

def build_dict(source_name):
    """
    Load every csv of one leak source into a dictionary.

    source_name : filename prefix of the leak (one of panama_name,
                  paradise_name, offshore_name, bahamas_name)

    Returns a dictionary mapping each node type of usual_entity_names
    (plus others_name for the paradise papers) to the DataFrame read from
    the matching nodes csv, and the key "edges" to the DataFrame read from
    the edges csv.
    """
    def nodes_csv(entity):
        # Path of the nodes file holding the given node type
        return source_name + nodes_name + entity + ".csv"

    entities = list(usual_entity_names)
    if source_name == paradise_name:
        # The paradise papers ship an extra "other" node file
        entities.append(others_name)

    frames = {entity: my_read_csv(nodes_csv(entity)) for entity in entities}

    # The edges file is read last and stored under its own key
    frames["edges"] = my_read_csv(source_name + edges_name + ".csv")
    return frames

Build the dictionary that maps each source to its content

In [4]:
# Content of every leak, keyed by the short name of its source.
d_sources = {
    "bahamas": build_dict(bahamas_name),
    "panama": build_dict(panama_name),
    "paradise": build_dict(paradise_name),
    "offshore": build_dict(offshore_name),
}

In [5]:
# Show which columns the panama entity table provides.
d_sources['panama']['entity'].columns

Index(['node_id', 'name', 'jurisdiction', 'jurisdiction_description',
       'country_codes', 'countries', 'incorporation_date', 'inactivation_date',
       'struck_off_date', 'closed_date', 'ibcRUC', 'status', 'company_type',
       'service_provider', 'sourceID', 'valid_until', 'note'],
      dtype='object')

## Getting familiar with the data format

### Define some coloring for printing

Keep the same coloring during the project, it makes data very easily readable once you get familiar with the coloring !

In [6]:
# ANSI escape sequences used to style the text printed by the notebook.
BOLD = '\033[1m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
END = '\033[0m'  # resets the styling

# Color associated with each leak source.
color_dict = {
    "bahamas": YELLOW,
    "paradise": GREEN,
    "panama": RED,
    "offshore": BLUE,
}

def color(str):
    """
    Wrap the given text in the color of the source it mentions.

    The first key of color_dict found inside the text decides the color;
    a text mentioning no known source is returned in bold instead.
    """
    # NOTE: the parameter shadows the builtin `str` inside this function;
    # the name is kept so existing callers are unaffected.
    matched_source = next((name for name in color_dict if name in str), None)

    if matched_source is None:
        return BOLD + str + END #Default color is BOLD

    return color_dict[matched_source] + str + END

# Preview the color of every known source, then the bold fallback.
for name in color_dict:
    print(color(name))
print(color("Unknown source"))

[93mbahamas[0m
[92mparadise[0m
[91mpanama[0m
[94moffshore[0m
[1mUnknown source[0m


### See what data source misses which column

In [7]:
# For every ordered pair of sources, list the columns that exist in `source`
# but are absent from `source_compare`, one line per node type.
for source, dict_data in d_sources.items():
    for source_compare, dict_data_compare in d_sources.items():
        print("\n", color(source_compare), "missing columns from source :", color(source))
        for entity in usual_entity_names:
            compared_columns = dict_data_compare[entity].columns
            missing_columns = [
                column
                for column in dict_data[entity].columns
                if column not in compared_columns
            ]
            # Node types with nothing missing are not reported
            if missing_columns:
                print("Node type", entity, "misses", len(missing_columns), "columns, namely : ", missing_columns)



 [93mbahamas[0m missing columns from source : [93mbahamas[0m

 [91mpanama[0m missing columns from source : [93mbahamas[0m
Node type address misses 10 columns, namely :  ['labels(n)', 'jurisdiction_description', 'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date', 'ibcRUC', 'type', 'status', 'company_type']
Node type intermediary misses 10 columns, namely :  ['labels(n)', 'address', 'jurisdiction_description', 'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date', 'ibcRUC', 'type', 'company_type']
Node type officer misses 11 columns, namely :  ['labels(n)', 'address', 'jurisdiction_description', 'service_provider', 'jurisdiction', 'closed_date', 'incorporation_date', 'ibcRUC', 'type', 'status', 'company_type']
Node type entity misses 3 columns, namely :  ['labels(n)', 'address', 'type']

 [92mparadise[0m missing columns from source : [93mbahamas[0m
Node type address misses 10 columns, namely :  ['labels(n)', 'jurisdiction_description', 

We see that <span style="color:orange">bahamas</span> is the most "complete" source, in the sense it is the one that has the biggest number of columns missing in the others. We will therefore use it to explore the content of columns. *'inactivation_date'* and  *'struck_off_date'* columns from entity will then be explored in <span style="color:red">panama</span>

#### Special case : Paradise paper, <i>other</i> entity

In [8]:
# Show the columns of the "other" node table, which only the paradise source has.
d_sources["paradise"]["other"].columns

Index(['node_id', 'name', 'country_codes', 'countries', 'sourceID',
       'valid_until', 'note'],
      dtype='object')

### SourceID in different sources

We see that the paradise papers are the only source with more than one distinct sourceID

In [9]:
# Count the distinct (non-missing) sourceID values of every node table.
for source, dict_data in d_sources.items():
    print("\nSource :", color(source))
    for entity in usual_entity_names:
        distinct_source_ids = dict_data[entity]["sourceID"].nunique()
        print("Node :", entity, distinct_source_ids, "different sourceID :")


Source : [93mbahamas[0m
Node : address 1 different sourceID :
Node : intermediary 1 different sourceID :
Node : officer 1 different sourceID :
Node : entity 1 different sourceID :

Source : [91mpanama[0m
Node : address 1 different sourceID :
Node : intermediary 1 different sourceID :
Node : officer 1 different sourceID :
Node : entity 1 different sourceID :

Source : [92mparadise[0m
Node : address 7 different sourceID :
Node : intermediary 5 different sourceID :
Node : officer 9 different sourceID :
Node : entity 9 different sourceID :

Source : [94moffshore[0m
Node : address 1 different sourceID :
Node : intermediary 1 different sourceID :
Node : officer 1 different sourceID :
Node : entity 1 different sourceID :


### Check if node_id is a good index for Nodes

In [10]:
# Check that node_id is unique inside every node table, and gather all the
# node_id columns so that uniqueness across sources can be checked afterwards.
node_id_columns = []

for source, dict_data in d_sources.items():
    for entity in usual_entity_names:
        node_ids = dict_data[entity]["node_id"]
        node_id_columns.append(node_ids)
        if not node_ids.is_unique:
            print("node_id isn't unique for source", source, "node", entity)

# Concatenate once at the end: Series.append was removed in pandas 2.0, and
# appending inside the loop re-copied the accumulated data at every step.
if node_id_columns:
    merged_node_id = pd.concat(node_id_columns, ignore_index = True)
else:
    # pd.concat refuses an empty list; fall back to an empty Series
    merged_node_id = pd.Series(dtype = object)

So for each source taken independently, node_id is a good index. Unfortunately, this no longer holds once we merge the sources

In [11]:
# Highest number of occurrences of a single node_id across all merged tables;
# any value above 1 means node_id is not unique once the sources are combined.
merged_node_id.value_counts().max()

2

### Look for NaN values

In [53]:
# NOTE(review): despite its name, this variable holds the bahamas *edges*
# table (the "edges" key), not the officer table. The name is left untouched
# in case other cells rely on it -- consider renaming.
bahamas_officer = d_sources["bahamas"]["edges"]
bahamas_officer

Unnamed: 0,node_1,rel_type,node_2,sourceID,valid_until,start_date,end_date
0,24000030,same_address_as,14035591,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
1,24000086,same_address_as,14077570,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
2,24000090,same_address_as,14077931,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
3,24000098,same_address_as,14037925,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
4,24000336,same_address_as,14049152,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
5,20010508,same_company_as,10212286,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
6,20010509,same_company_as,10178531,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
7,20010539,same_company_as,10172356,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
8,20010540,same_company_as,10178601,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,
9,20010541,same_company_as,10171021,Bahamas Leaks,The Bahamas Leaks data is current through earl...,,


In [64]:
# Columns kept in the cleaned version of each table. The selection is shared
# by every source unless an override is listed in clean_columns_overrides.
clean_columns = {
    'address': ['country_codes', 'node_id'],
    'entity': ['node_id', 'name', 'jurisdiction', 'country_codes', 'incorporation_date'],
    'intermediary': ['node_id', 'country_codes', 'name'],
    'officer': ['node_id', 'country_codes', 'name'],
    'other': ['node_id', 'country_codes', 'name'],
    'edges': ['START_ID', 'END_ID', 'TYPE', 'start_date', 'end_date'],
}

# Per-source exceptions to clean_columns.
clean_columns_overrides = {
    'bahamas': {
        # No country_codes is selected for bahamas entities
        # (presumably the column is absent from that table -- TODO confirm).
        'entity': ['node_id', 'name', 'jurisdiction', 'incorporation_date'],
        # The bahamas edges use their own names for the start node, the end
        # node and the relation type.
        'edges': ['node_1', 'node_2', 'rel_type', 'start_date', 'end_date'],
    },
}

def _clean_table(source, table, frame):
    """Return `frame` restricted to the columns kept for `table` of `source`."""
    columns = clean_columns_overrides.get(source, {}).get(table, clean_columns.get(table))
    if columns is None:
        # Table without a column selection: keep an independent full copy
        return frame.copy()
    return frame[columns]

# Same layout as d_sources (source -> table -> DataFrame), restricted to the
# useful columns. Selecting columns already yields new DataFrames, so there is
# no need to deep-copy the whole of d_sources first.
d_clean = {
    source: {table: _clean_table(source, table, frame) for table, frame in tables.items()}
    for source, tables in d_sources.items()
}

In [41]:
# Map every country code found in the address tables to its country name.
# Sources are visited in a fixed order; on a repeated code the later source wins.
countries = {}
for leak in ('bahamas', 'panama', 'paradise', 'offshore'):
    addresses = d_sources[leak]['address']
    countries.update(zip(addresses['country_codes'], addresses['countries']))

In [44]:
# Map every jurisdiction code found in the entity tables to its description.
# Sources are visited in a fixed order; on a repeated code the later source wins.
jurisdictions = {}
for leak in ('bahamas', 'panama', 'paradise', 'offshore'):
    entities = d_sources[leak]['entity']
    jurisdictions.update(zip(entities['jurisdiction'], entities['jurisdiction_description']))

In [49]:
# Display the country code -> country name mapping built from the address tables.
countries

{'BHS': 'Bahamas',
 'NLD': 'Netherlands',
 'ESP': 'Spain',
 'SVN': 'Slovenia',
 'PAN': 'Panama',
 'JOR': 'Jordan',
 'HUN': 'Hungary',
 'GRC': 'Greece',
 'VEN': 'Venezuela',
 'GBR': 'United Kingdom',
 'USA': 'United States',
 'SGP': 'Singapore',
 'KAZ': 'Kazakhstan',
 'GGY': 'Guernsey',
 'KEN': 'Kenya',
 'MCO': 'Monaco',
 'CHE': 'Switzerland',
 'BMU': 'Bermuda',
 'MLT': 'Malta',
 'IRL': 'Ireland',
 'MDG': 'Madagascar',
 'PHL': 'Philippines',
 'CAN': 'Canada',
 'CHN': 'China',
 'JPN': 'Japan',
 'KOR': 'South Korea',
 'MYS': 'Malaysia',
 'VNM': 'Viet Nam',
 'AUS': 'Australia',
 'THA': 'Thailand',
 'KHM': 'Cambodia',
 'TWN': 'Taiwan',
 'HKG': 'Hong Kong',
 'AGO': 'Angola',
 'BLZ': 'Belize',
 'AIA': 'Anguilla',
 nan: nan,
 'ATG': 'Antigua and Barbuda',
 'VGB': 'British Virgin Islands',
 'MNG': 'Mongolia',
 'ARE': 'United Arab Emirates',
 'CAF': 'Central African Republic',
 'IDN': 'Indonesia',
 'CRI': 'Costa Rica',
 'IND': 'India',
 'PAK': 'Pakistan',
 'RUS': 'Russia',
 'JEY': 'Jersey',
 'PE

In [50]:
# Display the jurisdiction code -> description mapping built from the entity tables.
jurisdictions

{'BAH': 'Bahamas',
 'SAM': 'Samoa',
 'PMA': 'Panama',
 'NEV': 'Nevada',
 'UK': 'United Kingdom',
 'SGP': 'Singapore',
 'RAK': 'Ras Al Khaimah',
 'IOM': 'Isle Of Man',
 'ANG': 'British Anguilla',
 'SEY': 'Seychelles',
 'NIUE': 'Niue',
 'UY': 'Uruguay',
 'BVI': 'British Virgin Islands',
 'HK': 'Hong Kong',
 'NZL': 'New Zealand',
 'CYP': 'Cyprus',
 'MLT': 'Malta',
 'BLZ': 'Belize',
 'JSY': 'Jersey',
 'WYO': 'Wyoming',
 'CRI': 'Costa Rica',
 'AW': 'Aruba',
 'XX': 'Undetermined',
 'AE': 'United Arab Emirates',
 'AG': 'Antigua and Barbuda',
 'AI': 'Anguilla',
 'AN': 'Netherlands Antilles',
 'BB': 'Barbados',
 'BM': 'Bermuda',
 'BS': 'Bahamas',
 'BZ': 'Belize',
 'CK': 'Cook Islands',
 'CY': 'Cyprus',
 'BRB': 'Barbados',
 'LBN': 'Lebanon',
 'CYM': 'Cayman Islands',
 'GHA': 'Ghana',
 'USA': 'United States Of America',
 'GD': 'Grenada',
 'GG': 'Guernsey',
 'GI': 'Gibraltar',
 'IM': 'Isle of Man',
 'JE': 'Jersey',
 'KN': 'Saint Kitts and Nevis',
 'KY': 'Cayman Islands',
 'LI': 'Liechtenstein',
 '