# Notebook to find scraped charity names in Leaked Papers using Spark

Using the names of charities from wikipedia and forbes (see web_scraping section), we filter the leaked nodes for shell companies with names of a certain degree of similarity to those of the charities.

In [1]:
# Imports
import re
import nltk
import json
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#stop words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

#spark
import findspark

#Ruijia
#findspark.init(r'C:\Users\Ruijia\Spark')

#Sabrina
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')


from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()


### Stopwords

Part of our filtering algorithm checks nltk's list of stopwords, augmented with handpicked results from trial-and-error, to discount matches between shell companies and charities that may have occurred without good reason. (For example matching only on words like "of" and "the" is clearly not valid.)

In [2]:
# Stop words

def init_stopwords():
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    stop_words.add('&')
    stop_words.add('co')
    stop_words.add('co.')
    stop_words.add('co.,')
    stop_words.add('co.,ltd.')
    stop_words.add('corp')
    stop_words.add('corp.')
    stop_words.add('corp.,')
    stop_words.add('de')
    stop_words.add('foundation')
    stop_words.add('inc')
    stop_words.add('inc.')
    stop_words.add('limited')
    stop_words.add('international')
    stop_words.add('ltd')
    stop_words.add('ltd.')
    stop_words.add('s.a.')
    stop_words.add('world')
    stop_words.add('global')

    stop_words = list(stop_words)
    return stop_words

stop_words = init_stopwords()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### The Matching Algorithm

Here is our actual matching algorithm, improved mostly by trial and error (we attempted to use idf scores, but this had a terrible performance).


Given a shell name and charity name, we do as follows:

 1. Count the number of words in common between the two names, and the number of words in common that are stopwords
 2. If tuning is set to occur (this is optimized for wikipedia charities)
     1. If all the matches are stopwords -> not a match
     2. If "family foundation" are the only matching words (these occur very, very often, so we make a special case) for them -> not a match
     3. If one of the names is only one word:
         1. Does the word occur in the other name (and is it not a stopword)? -> match
         2. Else not a match
     4. If the percentage of matches over the total length of the names is above a certain threshold (we found 0.6 to be optimal) -> match
     5. Else not a match

In [14]:


def check_for_words(charity, shell, stop_words, tuning):
    
    percentage = 0.6
    
    if charity is None or shell is None:
        return False
    
    charity_words = [x.lower() for x in charity.split()]
    shell_words = [x.lower() for x in shell.split()]
    len_charity = len(charity_words)
    len_shell = len(shell_words)
    
    count_random_matches = 0
    stop_word_random_matches = 0
    
    if len_charity == 0 or len_shell == 0:
        return False
    
    for i in range(len_charity):
        word = charity_words[i]
        if word in shell_words:
            count_random_matches += 1
            
            if word in stop_words:
                stop_word_random_matches += 1
                
    if tuning:
        #if only stopwords match, not valid
        if count_random_matches - stop_word_random_matches < 1:
            return False

        #"Family foundations are tricky -> make sure they are not the only matching parts"
        if ('family' in shell_words 
            and 'foundation' in shell_words 
            and 'family' in charity_words 
            and 'foundation' in charity_words 
            and count_random_matches < 3 
            and len_shell > 2 
            and len_charity > 2):
            return False

    if len_charity == 1 or len_shell == 1:
        return (np.abs(len_charity - len_shell) < 2  and count_random_matches == 1)
        
    return ((count_random_matches/len_charity >= percentage) 
            and (count_random_matches/len_shell >= percentage))
        

### Applying the Algorithm

With this function, we can search a specific set of leaked papers (panama, paradise, bahamas or offshore), a specific source of charities (wikipedia, forbes or INGO) and a specific type of node (officer or entity) for matches.

In [4]:
def extract_matches_between(leak, charity, sharp, node_type):
    
    
    stop_words = init_stopwords()
    
    #load data
    charity_location = '../generated/scraping/' + charity + '/' + charity + '_charity_info.csv'
    leak_location = '../data/' + leak + '/' + leak + '*.nodes.'+ node_type +'.csv'
    
    leak_data = spark.read.csv(leak_location, header=True)

    charity_data = spark.read.csv(charity_location, header=True)
    
    #select columns
    charity_names = charity_data.select('name', 'Headquarters').withColumnRenamed('name', 'CharityName')
    shell_names = leak_data.select('node_id','name').withColumnRenamed('name', 'ShellName')
    
    #crossjoin lists of names
    shells_vs_charities = shell_names.crossJoin(charity_names)
    
    #filter the crossjoined names using the algorithm
    filtered_names = shells_vs_charities.rdd.filter(lambda r: check_for_words(r[1], r[2], stop_words, sharp) == True)
    
    #collect and write to file the results
    matches = pd.DataFrame(filtered_names.collect(), columns=['node_id','ShellName','CharityName','CharityHeadquarters'])

    
    matches.to_csv('../generated/matches/' + node_type +'/'+ node_type +'_'+ leak +'_'+ charity +'_matches.csv')
    
    

### Entity nodes

We search all combinations of leaks and charity files according to nodes of type entity, as these are the ones that usually represent shell companies and most of the charities are to be found here, if at all.

In [5]:
# Extraction of possible charity names in the Panama Papers
extract_matches_between('panama', 'forbes', False, 'entity')
extract_matches_between('panama', 'wikipedia', True, 'entity')
extract_matches_between('panama', 'INGO', False, 'entity')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Extraction of possible charity names in the Paradise Papers
extract_matches_between('paradise', 'forbes', False,'entity')
extract_matches_between('paradise', 'wikipedia', True, 'entity')
extract_matches_between('paradise', 'INGO', False, 'entity')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Extraction of possible charity names in the Bahamas Leaks
extract_matches_between('bahamas', 'forbes', False, 'entity')
extract_matches_between('bahamas', 'wikipedia', True, 'entity')
extract_matches_between('bahamas', 'INGO', False, 'entity')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Extraction of possible charity names in the Offshore Leaks
extract_matches_between('offshore', 'forbes', False, 'entity')
extract_matches_between('offshore', 'wikipedia', True, 'entity')
extract_matches_between('offshore', 'INGO', False, 'entity')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Officer nodes

We also search "officer" type nodes, as data exploration showed that these can also contain the names of charities.

In [9]:
# Extraction of possible charity names in the Panama Papers
extract_matches_between('panama', 'forbes', False, 'officer')
extract_matches_between('panama', 'wikipedia', True, 'officer')
extract_matches_between('panama', 'INGO', False, 'officer')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Extraction of possible charity names in the Paradise Papers
extract_matches_between('paradise', 'forbes', False, 'officer')
extract_matches_between('paradise', 'wikipedia', True, 'officer')
extract_matches_between('paradise', 'INGO', False, 'officer')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Extraction of possible charity names in the Bahamas Leaks
extract_matches_between('bahamas', 'forbes', False, 'officer')
extract_matches_between('bahamas', 'wikipedia', True, 'officer')
extract_matches_between('bahamas', 'INGO', False, 'officer')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Extraction of possible charity names in the Offshore Leaks
extract_matches_between('offshore', 'forbes', False, 'officer')
extract_matches_between('offshore', 'wikipedia', True, 'officer')
extract_matches_between('offshore', 'INGO', False, 'officer')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
