# Distances

Starting from the matches found during data extraction, follow the edges of the leak graph to find every node connected to the matching shell companies, up to a given distance (number of hops). The resulting edges and nodes are stored as CSV files and later used to build graphs that visualize the degree of connectivity between the matched ("hit") nodes.

In [1]:
#Imports
import pandas as pd

#spark
import findspark
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()

import networkx as nx

from operator import itemgetter
import matplotlib.pyplot as plt
import collections
from community import community_louvain
from networkx.algorithms.community.centrality import girvan_newman
import itertools
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

### Load Data

In [2]:
#Constants

# Number of expansion rounds passed to get_map_of_degree; it is also embedded
# in the names of the exported CSV files.
DISTANCE_DEGREE = 1

def get_match_file(leak, charity):
    '''Return the relative path of the matches CSV for a leak/charity pair.

    The path has the form ``../generated/matches/<leak>_<charity>_matches.csv``.
    '''
    pieces = ['../generated/matches/', leak, '_', charity, '_matches.csv']
    return ''.join(pieces)


In [3]:
#Load matches: one CSV per charity source, all matched against the Panama leak
panama_wiki_matches, panama_ingo_matches, panama_forbes_matches = (
    spark.read.csv(get_match_file('panama', charity_source), header=True)
    for charity_source in ('wikipedia', 'INGO', 'forbes')
)

# Stack the three match tables (union is positional) and drop the '_c0' column
matches = (
    panama_wiki_matches
    .union(panama_forbes_matches)
    .union(panama_ingo_matches)
    .drop('_c0')
)

#Load edges
edges = spark.read.csv('../data/panama/panama_papers.edges.csv', header=True)

#Load nodes: one CSV per node type
address_nodes, intermediary_nodes, officer_nodes, entity_nodes = (
    spark.read.csv(node_csv_path, header=True)
    for node_csv_path in (
        '../data/panama/panama_papers.nodes.address.csv',
        '../data/panama/panama_papers.nodes.intermediary.csv',
        '../data/panama/panama_papers.nodes.officer.csv',
        '../data/panama/panama_papers.nodes.entity.csv',
    )
)

### Filter down huge leaks datasets to smaller match datasets

In [4]:
def filter_edges(edges, nodes):
    '''Return the edges that touch at least one of the given nodes.

    Parameters
    ----------
    edges : object exposing ``.rdd``
        Edge table. Rows are read positionally: field 0 is compared as the
        start id and field 2 as the end id.
    nodes : RDD
        Node rows; field 0 of each row is used as the node id.

    Returns
    -------
    RDD
        The rows of ``edges.rdd`` whose field 0 or field 2 is one of the
        collected node ids.
    '''
    # Collect into a set: the predicate runs once per edge, and testing
    # membership in a list would rescan every id for every edge.
    ids = set(nodes.map(lambda r: r[0]).collect())
    return edges.rdd.filter(lambda r: r[0] in ids or r[2] in ids)

def filter_nodes(nodes, edges):
    '''Return the nodes that are an endpoint of at least one given edge.

    Parameters
    ----------
    nodes : object exposing ``.rdd``
        Node table; field 0 of each row is used as the node id.
    edges : RDD
        Edge rows; field 0 is read as the start id and field 2 as the end id.

    Returns
    -------
    RDD
        The rows of ``nodes.rdd`` whose field 0 appears as a start or end id
        in ``edges``.
    '''
    # Gather both endpoints in a single pass over the edges (one collect
    # instead of two) and keep them in a set so that the per-node membership
    # test does not rescan a list.
    endpoint_ids = set(edges.flatMap(lambda r: (r[0], r[2])).collect())
    return nodes.rdd.filter(lambda r: r[0] in endpoint_ids)

def get_map_of_degree(degree):
    '''Expand the matched nodes along the edges for ``degree`` rounds.

    Starts from the module-level ``matches`` and, in each round, keeps the
    edges touching the current node set (``filter_edges``) and then replaces
    the node set with every address, intermediary, entity and officer node
    that is an endpoint of those edges (``filter_nodes``).

    Parameters
    ----------
    degree : int
        Number of expansion rounds; must be at least 1.

    Returns
    -------
    tuple
        ``(edges_rdd, nodes_rdd)`` produced by the last round. The node RDD
        is a plain union of the four node tables, so rows keep the fields of
        their own node type.

    Raises
    ------
    ValueError
        If ``degree`` is smaller than 1. Without this check the loop body
        never runs and the return statement fails with an
        ``UnboundLocalError``.
    '''
    if degree < 1:
        raise ValueError('degree must be at least 1, got {!r}'.format(degree))

    degree_i_nodes = matches.rdd

    for _ in range(degree):
        # The same edge RDD is consumed by each of the four node filters
        # below; caching it avoids re-filtering the full edge table each time.
        degree_i_edges = filter_edges(edges, degree_i_nodes).cache()

        degree_i_addresses = filter_nodes(address_nodes, degree_i_edges)
        degree_i_intermediary = filter_nodes(intermediary_nodes, degree_i_edges)
        degree_i_entities = filter_nodes(entity_nodes, degree_i_edges)
        degree_i_officers = filter_nodes(officer_nodes, degree_i_edges)

        degree_i_nodes = degree_i_addresses\
                                .union(degree_i_intermediary)\
                                .union(degree_i_entities)\
                                .union(degree_i_officers)

    return degree_i_edges, degree_i_nodes
        
    

In [5]:
# Build the edge and node RDDs for the configured number of expansion rounds.
filtered_edges, filtered_nodes = get_map_of_degree(DISTANCE_DEGREE)

In [6]:
# Peek at the first five node rows to sanity-check the result.
filtered_nodes.take(5)

[Row(node_id='14002637', name=None, address='125 Main Street; P.O. Box 144; Road Town; Tortola; British Virgin Islands; VG1100', country_codes='VGB', countries='British Virgin Islands', sourceID='Panama Papers', valid_until='The Panama Papers  data is current through 2015', note=None),
 Row(node_id='14003360', name=None, address='13505 BELL AVENUE; OKLAHOMA CITY; OKLAHOMA 73142; USA', country_codes='USA', countries='United States', sourceID='Panama Papers', valid_until='The Panama Papers  data is current through 2015', note=None),
 Row(node_id='14023876', name=None, address='8th Floor; Max City Building; Remi Ollier Street; Port Louis; Mauritius', country_codes='MUS', countries='Mauritius', sourceID='Panama Papers', valid_until='The Panama Papers  data is current through 2015', note=None),
 Row(node_id='14037497', name=None, address='Dixcart House; Fort Charles; Charlestown; Nevis; St. Kitts & Nevis', country_codes='KNA', countries='Saint Kitts and Nevis', sourceID='Panama Papers', val

In [7]:
# Bring both RDDs to the driver as pandas DataFrames. Edge columns are named
# explicitly; node columns are left with pandas' default integer labels.
graph_edges = pd.DataFrame(
    data=filtered_edges.collect(),
    columns=[
        'START_ID',
        'TYPE',
        'END_ID',
        'link',
        'start_date',
        'end_date',
        'sourceID',
        'valid_until',
    ],
)
graph_nodes = pd.DataFrame(data=filtered_nodes.collect())

In [8]:
# Row count of column 0 followed by its number of distinct values, one per line.
print(len(graph_nodes[0]), len(graph_nodes[0].unique()), sep='\n')

13472
13472


In [9]:
# Preview the first five rows of the collected node table.
graph_nodes.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,14002637,,125 Main Street; P.O. Box 144; Road Town; Tort...,VGB,British Virgin Islands,Panama Papers,The Panama Papers data is current through 2015,,,,,,,,,,
1,14003360,,13505 BELL AVENUE; OKLAHOMA CITY; OKLAHOMA 731...,USA,United States,Panama Papers,The Panama Papers data is current through 2015,,,,,,,,,,
2,14023876,,8th Floor; Max City Building; Remi Ollier Stre...,MUS,Mauritius,Panama Papers,The Panama Papers data is current through 2015,,,,,,,,,,
3,14037497,,Dixcart House; Fort Charles; Charlestown; Nevi...,KNA,Saint Kitts and Nevis,Panama Papers,The Panama Papers data is current through 2015,,,,,,,,,,
4,14041972,,Flat D; 7/F.; Block 16; Provident Centre; 51 W...,HKG,Hong Kong,Panama Papers,The Panama Papers data is current through 2015,,,,,,,,,,


In [12]:
# Write the edge and node tables to CSV; the file name carries the degree used.
graph_edges.to_csv(
    path_or_buf='../generated/map/panama_edges_degree_' + str(DISTANCE_DEGREE) + '.csv'
)
graph_nodes.to_csv(
    path_or_buf='../generated/map/panama_nodes_degree_' + str(DISTANCE_DEGREE) + '.csv'
)