# Distances

Using the matches found during data extraction, follow the leak edges to find every node connected to the matching shell companies within a given distance. The resulting nodes and edges are stored as CSV files and later used to build graphs that visualize how strongly the matched nodes are connected.

In [24]:
#Imports
import pandas as pd

#spark
import findspark
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

# Reuse the active SparkSession if there is one, otherwise create a new one.
# Every loader below reads its CSV files through this module-level `spark`.
spark = SparkSession.builder.getOrCreate()

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

### Load Data

In [25]:
#Constants

# Distance, in edges, to explore around the matched nodes.
# NOTE(review): presumably meant as the degree argument of get_distance_from — TODO confirm.
DISTANCE_DEGREE = 1

def get_match_file(leak, charity, node_type):
    '''Build the path of the inspected-matches CSV for one leak, charity source and node type.

    The file lives in a folder named after the node type and is itself named
    `<node_type>_<leak>_<charity>_matches.csv`.
    '''
    file_name = '_'.join([node_type, leak, charity, 'matches.csv'])
    return '/'.join(['../generated/inspected_matches', node_type, file_name])

def get_matches(leak):
    '''Load every inspected-match CSV of a leak and stack them into one DataFrame.

    One file is read per (node type, charity source) pair, officers first and
    entities second, each with the charity sources in the order wikipedia,
    INGO, forbes. The frames are unioned in that order and the `_c0` column
    is dropped from the result.
    '''
    combined = None

    for node_type in ('officer', 'entity'):
        for charity_type in ('wikipedia', 'INGO', 'forbes'):
            frame = spark.read.csv(get_match_file(leak, charity_type, node_type), header=True)
            # The first file seeds the result; every later one is appended to it.
            combined = frame if combined is None else combined.union(frame)

    return combined.drop('_c0')
    
    

class Leak_Nodes:
    '''Holds the four node tables of one leak, each read from CSV as a Spark DataFrame.

    Attributes: address_nodes, intermediary_nodes, officer_nodes, entity_nodes.
    '''

    def __init__(self, leak):
        # All node files of a leak share this prefix; the `*` is passed to
        # spark.read.csv as part of the path.
        prefix = '../data/' + leak + '/' + leak + '*.nodes.'

        def load(kind):
            # Read the node file of the given kind, using its first line as header.
            return spark.read.csv(prefix + kind + '.csv', header=True)

        self.address_nodes = load('address')
        self.intermediary_nodes = load('intermediary')
        self.officer_nodes = load('officer')
        self.entity_nodes = load('entity')



### Filter the huge leak datasets down to smaller match datasets

In [26]:
def filter_edges(edges, nodes):
    '''Given a set of nodes, returns the edges connected to those nodes.

    edges: DataFrame of edge rows; index 0 is read as the start id and
           index 2 as the end id.
    nodes: RDD of rows whose first field (index 0) is the node id.

    Returns an RDD with every edge row that has at least one endpoint among
    the node ids.
    '''
    # Collect the ids into a set: the predicate below runs once per edge row,
    # and a list would make every membership test a linear scan.
    ids = set(nodes.map(lambda r: r[0]).collect())
    return edges.rdd.filter(lambda r: r[0] in ids or r[2] in ids)

def filter_nodes(nodes, edges, bahamas):
    '''Given a set of edges, return the nodes connected to those edges.

    nodes:   DataFrame of node rows.
    edges:   RDD of edge rows; index 0 is read as the start id and index 2
             as the end id.
    bahamas: when truthy the node id is read from index 4 of each node row,
             otherwise from index 0.

    Returns an RDD with every node row whose id is the start or the end of
    at least one of the edges.
    '''
    index_id = 4 if bahamas else 0

    # Gather both endpoints in a single pass over the edges and keep them in a
    # set, so the per-node membership test below is constant time.
    endpoint_ids = set()
    for start_id, end_id in edges.map(lambda r: (r[0], r[2])).collect():
        endpoint_ids.add(start_id)
        endpoint_ids.add(end_id)

    return nodes.rdd.filter(lambda r: r[index_id] in endpoint_ids)

def get_map_of_degree(degree, matches, leak_nodes, edges, bahamas):
    '''Given a degree, gets all the nodes connected to matches by degree edges (and the edges too)

    degree:     number of expansion rounds, at least 1.
    matches:    DataFrame of matched nodes; the node id is read from the
                first column of each row.
    leak_nodes: object exposing address_nodes, intermediary_nodes,
                entity_nodes and officer_nodes DataFrames.
    edges:      DataFrame with all the edges of the leak.
    bahamas:    forwarded to filter_nodes, which then reads node ids from
                index 4 instead of index 0.

    Returns a tuple (edges, nodes) of RDDs holding the result of the last round.
    Raises ValueError if degree is smaller than 1.
    '''
    # Without at least one round there is nothing to return; fail with a clear
    # message instead of an unbound-variable error at the return statement.
    if degree < 1:
        raise ValueError('degree must be at least 1, got ' + str(degree))

    node_id_index = 4 if bahamas else 0

    # Rows whose first field is a node id: the shape filter_edges expects.
    frontier = matches.rdd

    for i in range(degree):
        degree_i_edges = filter_edges(edges, frontier)

        degree_i_addresses = filter_nodes(leak_nodes.address_nodes, degree_i_edges, bahamas)
        degree_i_intermediary = filter_nodes(leak_nodes.intermediary_nodes, degree_i_edges, bahamas)
        degree_i_entities = filter_nodes(leak_nodes.entity_nodes, degree_i_edges, bahamas)
        degree_i_officers = filter_nodes(leak_nodes.officer_nodes, degree_i_edges, bahamas)

        degree_i_nodes = degree_i_addresses\
                                .union(degree_i_intermediary)\
                                .union(degree_i_entities)\
                                .union(degree_i_officers)

        # filter_edges always reads the id from index 0, but bahamas node rows
        # keep it at index 4; re-shape them so the next round matches on the
        # real id. The returned node rows are left untouched.
        if bahamas:
            frontier = degree_i_nodes.map(lambda r: (r[node_id_index],))
        else:
            frontier = degree_i_nodes

    return degree_i_edges, degree_i_nodes
        
    

In [27]:
def get_distance_from(distance_degree, leak):
    '''Export the nodes and edges found around the matches of one leak.

    Loads the edges, the node tables and the inspected matches of `leak`,
    expands the matches with get_map_of_degree and writes the resulting
    edges and nodes as CSV files under
    `../generated/map/degree_<distance_degree>/`.

    distance_degree: degree handed to get_map_of_degree.
    leak:            name of the leak; 'bahamas' selects the bahamas-specific
                     node id position and edge column names.
    '''
    import os  # local import: only needed to prepare the output folder

    bahamas = (leak == 'bahamas')

    edges = spark.read.csv('../data/' + leak +'/' + leak + '*.edges.csv', header=True)
    nodes = Leak_Nodes(leak)
    matches = get_matches(leak)

    filtered_edges, filtered_nodes = get_map_of_degree(distance_degree, matches, nodes, edges, bahamas)

    # Column names given to the exported edges: seven for bahamas, eight
    # (with an extra 'link' column) for every other leak.
    if bahamas:
        edge_columns = ['START_ID', 'TYPE', 'END_ID', 'sourceID', 'valid_until', 'start_date', 'end_date']
    else:
        edge_columns = ['START_ID', 'TYPE', 'END_ID', 'link', 'start_date', 'end_date', 'sourceID', 'valid_until']

    graph_edges = pd.DataFrame(filtered_edges.collect(), columns=edge_columns)
    # The node rows are exported without column names.
    graph_nodes = pd.DataFrame(filtered_nodes.collect())

    output_dir = '../generated/map/degree_'+ str(distance_degree)
    # to_csv does not create missing folders, so make sure the target exists
    # (e.g. the first time a new degree is exported).
    os.makedirs(output_dir, exist_ok=True)

    graph_edges.to_csv(output_dir +'/'+ leak +'_edges.csv')
    graph_nodes.to_csv(output_dir +'/'+ leak +'_nodes.csv')
        
    
    

In [28]:
# Export the neighbourhood of the panama matches at the configured distance.
get_distance_from(DISTANCE_DEGREE, 'panama')

192


In [6]:
# Export the neighbourhood of the paradise matches at the configured distance.
get_distance_from(DISTANCE_DEGREE, 'paradise')

officer -  wikipedia  - size:  73
officer -  INGO  - size:  6
new - size:  79
officer -  forbes  - size:  31
new - size:  110
entity -  wikipedia  - size:  56
new - size:  166
entity -  INGO  - size:  19
new - size:  185
entity -  forbes  - size:  31
new - size:  216
total - size:  216
[Row(node_id='80121170', ShellName='Save the Children', CharityName='Free the Children', CharityHeadquarters='Toronto, Ontario, Canada')]


In [7]:
# Export the neighbourhood of the offshore matches at the configured distance.
get_distance_from(DISTANCE_DEGREE, 'offshore')

officer -  wikipedia  - size:  9
officer -  INGO  - size:  4
new - size:  13
officer -  forbes  - size:  7
new - size:  20
entity -  wikipedia  - size:  7
new - size:  27
entity -  INGO  - size:  10
new - size:  37
entity -  forbes  - size:  2
new - size:  39
total - size:  39
[Row(node_id='109972', ShellName='World Vision                            ', CharityName='World Vision International', CharityHeadquarters=None)]


In [29]:
# Export the neighbourhood of the bahamas matches at the configured distance.
get_distance_from(DISTANCE_DEGREE, 'bahamas')

45
