# Connections

Using the matches found during data extraction, we follow the edges to find the nodes connected to the matching shell companies, up to a given distance. These nodes and edges are then stored as csv files and used to create graphs that visualize the degree of connectivity between the matched ("hit") nodes.

In [1]:
#Imports
import pandas as pd

#spark
import findspark
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

# Session shared by every cell below (all the csv reads go through it).
spark = SparkSession.builder.getOrCreate()

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas warnings raised by the cells below.
warnings.filterwarnings('ignore')

### Load Data

Until now the matches were spread across several csv files. We group them all in one dataset for easier use.

We also create a class that loads the four node types of a leak (address, intermediary, officer and entity) so that they can be passed around together.

In [2]:
#Constants

# NOTE(review): presumably the default number of edge hops to explore around
# the matches; the calls visible in this notebook pass 1 literally instead of
# using this name - confirm before relying on it.
DISTANCE_DEGREE = 1

def get_match_file(leak, charity, node_type):
    '''Builds the path of the inspected-matches csv for one leak, charity source and node type'''
    # files live in one folder per node type and are named <node_type>_<leak>_<charity>_matches.csv
    directory = '../generated/inspected_matches/' + node_type + '/'
    stem = '_'.join((node_type, leak, charity))
    return directory + stem + '_matches.csv'

def get_matches(leak):
    '''Loads every inspected-matches csv of a leak and groups them in one dataset

    For each node type ('officer', 'entity') the matches of the three charity
    sources ('wikipedia', 'INGO', 'forbes') are read and unioned. Two kinds of
    csv are written as a side effect:

        - one file per node type, holding only the matches of that node type
        - one file holding all the matches of the leak

    Returns the spark DataFrame with all the matches, without the '_c0' column.
    '''

    node_types = ['officer', 'entity']
    charity_types = ['wikipedia', 'INGO', 'forbes']

    matches = None

    for node_type in node_types:

        # matches of this node type only, so that the per-type file does not
        # also contain the rows of the node types processed before it
        type_matches = None

        for charity_type in charity_types:
            new_matches = spark.read.csv(get_match_file(leak, charity_type, node_type), header=True)

            # NOTE(review): union is positional, this assumes all the match
            # files share the same column order - confirm
            if type_matches is None:
                type_matches = new_matches
            else:
                type_matches = type_matches.union(new_matches)

        # save this for later
        pd.DataFrame(type_matches.drop('_c0').collect())\
                            .to_csv('../generated/map/connections/' + node_type +'/'+ node_type +'_' + leak +'_matches.csv')

        if matches is None:
            matches = type_matches
        else:
            matches = matches.union(type_matches)

    # '_c0' is presumably the unnamed index column written with the match files
    matches_clean = matches.drop('_c0')

    #save these in case we need them later
    pd.DataFrame(matches_clean.collect()).to_csv('../generated/map/connections/all_'+ leak +'_matches.csv')

    return matches_clean
    
    

class Leak_Nodes:
    '''Groups the four node tables of one leak: address, intermediary, officer and entity

    Every table is read with spark from '../data/<leak>/' using the first line
    of the csv as header.
    '''

    def __init__(self, leak):
        # part of the path shared by the four node files of the leak
        leak_prefix = '../data/'+ leak +'/'+ leak

        def load(suffix):
            return spark.read.csv(leak_prefix + suffix, header=True)

        self.address_nodes = load('*.nodes.address.csv')
        self.intermediary_nodes = load('*.nodes.intermediary.csv')
        self.officer_nodes = load('*.nodes.officer.csv')
        self.entity_nodes = load('*.nodes.entity.csv')



### Filtering for connectivity

In order to find connections, we do the following:

    1. Start with the ids of the shells matching charities
    2. Filter the edges dataset. Keep the ones expanding from the current set of nodes
    3. Filter the nodes datasets. Keep the ones at the ends of the edges found in step 2 and add them to the nodes from step 1
    4. Repeat steps 2-3 until the desired distance from the start has been reached.



In [3]:
def filter_edges(edges, nodes):
    '''Given a set of nodes, returns the edges connected to those nodes

    edges: dataset of edges (its .rdd is filtered); the start id is read at
           position 0 and the end id at position 2 of each row
    nodes: rdd of rows whose node id is at position 0

    Returns an rdd with the edges that start or end at one of the nodes.
    '''
    # a set gives a constant-time membership test for every edge row,
    # instead of scanning a list of ids twice per row
    ids = set(nodes.map(lambda r: r[0]).collect())
    return edges.rdd.filter(lambda r: r[0] in ids or r[2] in ids)

def filter_nodes(nodes, edges, bahamas):
    '''Given a set of edges, return the nodes connected to those edges

    nodes:   dataset of nodes (its .rdd is filtered)
    edges:   rdd of edge rows; the start id is read at position 0 and the
             end id at position 2
    bahamas: True when the nodes follow the bahamas schema, where the node id
             is read at position 4 instead of position 0

    Returns an rdd with the nodes found at either end of one of the edges.
    '''
    # one pass over the edges gathers both endpoints; the set makes the
    # membership test for every node row constant-time
    endpoint_ids = set(edges.flatMap(lambda r: (r[0], r[2])).collect())

    index_id = 4 if bahamas else 0

    return nodes.rdd.filter(lambda r: r[index_id] in endpoint_ids)

def get_map_of_degree(degree, matches, leak_nodes, edges, bahamas):
    '''Given a degree, gets all the nodes connected to matches by degree edges (and the edges too)

    degree:     number of expansion rounds, must be at least 1
    matches:    dataset of matching nodes; its rows are used as the starting
                set, with the id read at position 0
    leak_nodes: object exposing address_nodes, intermediary_nodes,
                entity_nodes and officer_nodes
    edges:      dataset of edges of the leak
    bahamas:    True when the nodes follow the bahamas schema (node id at
                position 4 instead of position 0)

    Returns (edges, nodes): the rdd of edges kept in the last round and the
    rdd of nodes found at the ends of those edges.
    Raises ValueError when degree is smaller than 1.
    '''
    # without at least one round there is no edge set to return
    if degree < 1:
        raise ValueError('degree must be at least 1, got {}'.format(degree))

    # position of the node id in a leak node row, same convention as filter_nodes
    index_id = 4 if bahamas else 0

    frontier = matches.rdd

    for _ in range(degree):
        degree_i_edges = filter_edges(edges, frontier)

        degree_i_addresses = filter_nodes(leak_nodes.address_nodes, degree_i_edges, bahamas)
        degree_i_intermediary = filter_nodes(leak_nodes.intermediary_nodes, degree_i_edges, bahamas)
        degree_i_entities = filter_nodes(leak_nodes.entity_nodes, degree_i_edges, bahamas)
        degree_i_officers = filter_nodes(leak_nodes.officer_nodes, degree_i_edges, bahamas)

        degree_i_nodes = degree_i_addresses\
                                .union(degree_i_intermediary)\
                                .union(degree_i_entities)\
                                .union(degree_i_officers)

        # filter_edges reads the id at position 0, so the next round starts
        # from rows that carry the id there, whatever the node schema is
        frontier = degree_i_nodes.map(lambda r: (r[index_id],))

    return degree_i_edges, degree_i_nodes
        
    

### Apply the filtering to all the datasets

Note that the 'bahamas' dataset follows a slightly different schema from the others and requires special treatment. 

We also mark which nodes were original matches for later use in the graphs.

In [4]:
def get_distance_from(distance_degree, leak):
    '''Builds the map around the matches of a leak and stores it as csv files

    distance_degree: number of expansion rounds around the matches
    leak:            name of the leak, used to locate its files under
                     '../data/<leak>/'; 'bahamas' selects the bahamas schema

    Writes '<leak>_edges.csv' and '<leak>_nodes.csv' under
    '../generated/map/degree_<distance_degree>/'. The nodes file gets an extra
    'Match' column telling whether the node is one of the original matches.
    Returns nothing.
    '''

    bahamas = (leak == 'bahamas')

    edges = spark.read.csv('../data/' + leak +'/' + leak + '*.edges.csv', header=True)

    nodes = Leak_Nodes(leak)

    matches = get_matches(leak)

    filtered_edges, filtered_nodes = get_map_of_degree(distance_degree, matches, nodes, edges, bahamas)

    # the bahamas files have their own edge columns and keep the node id at
    # position 4 of a node row instead of position 0
    if bahamas:
        edge_columns = ['START_ID', 'TYPE', 'END_ID', 'sourceID', 'valid_until', 'start_date', 'end_date']
        id_index = 4
    else:
        edge_columns = ['START_ID', 'TYPE', 'END_ID', 'link', 'start_date', 'end_date', 'sourceID', 'valid_until']
        id_index = 0

    graph_edges = pd.DataFrame(filtered_edges.collect(), columns=edge_columns)

    # built from plain rows, so the columns are labelled by position (0, 1, ...)
    graph_nodes = pd.DataFrame(filtered_nodes.collect())

    match_ids = set(matches.select('node_id').rdd.flatMap(lambda r: r).collect())

    if graph_nodes.empty:
        # no rows means no id column to look at; still create the column
        graph_nodes['Match'] = True
    else:
        # one vectorised assignment instead of writing row by row through
        # graph_nodes['Match'][index], which pandas does not guarantee to
        # write back into the frame
        graph_nodes['Match'] = graph_nodes[id_index].isin(match_ids)

    graph_edges.to_csv('../generated/map/degree_'+ str(distance_degree) +'/'+ leak +'_edges.csv')
    graph_nodes.to_csv('../generated/map/degree_'+ str(distance_degree) +'/'+ leak +'_nodes.csv')
        
    
    

### Get the data for each set of leaks

In [5]:
# Write the degree-1 edges and nodes csv files for the 'panama' leak.
get_distance_from(1, 'panama')

In [None]:
# Write the degree-1 edges and nodes csv files for the 'paradise' leak.
get_distance_from(1, 'paradise')

In [None]:
# Write the degree-1 edges and nodes csv files for the 'offshore' leak.
get_distance_from(1, 'offshore')

In [None]:
# Write the degree-1 edges and nodes csv files for the 'bahamas' leak
# (handled with the bahamas-specific schema inside get_distance_from).
get_distance_from(1, 'bahamas')