# Notebook to find scraped charity names in the Leaked Papers using Spark

In [9]:
# Imports
import sys
import re
import nltk
import json
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#stop words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

#spark
import findspark
#Ruijia
#findspark.init(r'C:\Users\Ruijia\Spark')

#Sabrina
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()


### Compute word weights for the matching algorithm

We use the "Inverse Document Frequency" to give words appearing in charity names weights. (Here the names are the documents). This will allow us to reduce the weight of hits based only on very general words ("foundation x" is different from "foundation y" and should not be marked as a hit, even though 50% of the words in the name are the same!)

In [10]:
#Get names

def get_word_weights():
    def get_charity_info_location(charity):
        return '../generated/' + charity + '/' + charity + '_charity_info.csv'

    forbes_names = pd.read_csv(get_charity_info_location('forbes'))['name'].tolist()
    wikipedia_names = pd.read_csv(get_charity_info_location('wikipedia'))['name'].tolist()
    INGO_names = pd.read_csv(get_charity_info_location('INGO'))['name'].tolist()

    names = forbes_names + wikipedia_names + INGO_names

    nb_names = len(names)

    #Get list of individual words
    words = " ".join(names).split()
    nb_words = len(words)

    #Get word frequency
    word_freq = {} 
    for word in words: 
        word = word.lower()
        if (word in word_freq): 
            word_freq[word] += 1
        else: 
            word_freq[word] = 1

    #Compute weights: IDF(t) = log_e(Total number of documents (here names) / Number of documents/names with term t in it). 
    def idf(word_count):
        return np.log(nb_names/word_count)

    word_weights = {k: idf(v) for k, v in word_freq.items()}
    
    return word_weights

word_weights = get_word_weights()
print('Weight of word "foundation": ', word_weights.get('foundation'))
print('Weight of word "amnesty": ', word_weights.get('amnesty'))
print('Weight of word "the": ', word_weights.get('the'))

Weight of word "foundation":  1.252762968495368
Weight of word "amnesty":  6.150602768446279
Weight of word "the":  2.321961371957184


As can be seen, very common words, like "the" and "foundation" have a much weaker weight than rare words like "amnesty".

In [11]:
#Note redefined because py4J is interfering with the predefined functions
def get_average_weight(word_weights):
    average = 0
    for weight in word_weights.values(): 
        average += weight

    return average/len(word_weights)

def get_min_weight(word_weights):
    min_weight = 100
    for weight in word_weights.values(): 
        if weight < min_weight:
            min_weight = weight

    return min_weight

def reweigh_stopwords(word_weights, stoplist):
    min_weight = get_min_weight
    for word, weight in word_weights.items():
        if word in stoplist:
            word_weights[word] = weight/2
            

In [12]:
# Addition of english stop words

def init_stopwords():
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    stop_words.add('&')
    stop_words.add('co')
    stop_words.add('co.')
    stop_words.add('co.,')
    stop_words.add('co.,ltd.')
    stop_words.add('corp')
    stop_words.add('corp.')
    stop_words.add('corp.,')
    stop_words.add('de')
    stop_words.add('entertainment')
    stop_words.add('family')
    stop_words.add('foundation')
    stop_words.add('inc')
    stop_words.add('inc.')
    stop_words.add('limited')
    stop_words.add('industries')
    stop_words.add('international')
    stop_words.add('ltd')
    stop_words.add('ltd.')
    stop_words.add('s.a.')
    stop_words.add('world')
    stop_words.add('global')

    stoplist = list(stop_words)
    return stoplist

stops = init_stopwords()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def check_for_words(charity, shell, word_weights, average_weight, stoplist):
    
    
    if charity is None or shell is None:
        return False
    
    charity_words = [x.lower() for x in charity.split()]
    shell_words = [x.lower() for x in shell.split()]
    len_charity = len(charity_words)
    len_shell = len(shell_words)
    
    threshold = average_weight
    
    nb_matches = 0
    weight_matches = 0
    nb_stopwords = 0
    
    for i in range(len_charity):
        word = charity_words[i]
        if word in shell_words and word in word_weights.keys():
            weight_matches += word_weights.get(word)
            nb_matches += 1
            
            if word in stoplist:
                nb_stopwords += 1
                
    #note this also takes care of 0 matches
    if nb_matches == nb_stopwords:
            return False

    if len_charity == 1 or len_shell == 1:
        return (np.abs(len_charity - len_shell) < 2  and nb_matches == 1)
            
    if (weight_matches/nb_matches) >= threshold and (float(nb_matches)/float(len_shell)) > 0.6:
        print('Charity: ', charity, ' Shell: ', shell)
        print('Thresh: ', threshold, ' weight: ', weight_matches/nb_matches)
            
    return (weight_matches/nb_matches) >= threshold and (float(nb_matches)/float(len_shell)) > 0.6
        

In [22]:
def extract_matches_between(leak, charity, sharp):
    
    stoplist = init_stopwords()
    weights = get_word_weights()
    reweigh_stopwords(word_weights, stoplist)
    average_weight = get_average_weight(weights)
    
        
    charity_location = '../generated/' + charity + '/' + charity + '_charity_info.csv'
    leak_location = '../data/' + leak + '/' + leak + '*.nodes.entity.csv'
    
    leak_data = spark.read.csv(leak_location, header=True)

    charity_data = spark.read.csv(charity_location, header=True)
    
    charity_names = charity_data.select('name', 'Headquarters').withColumnRenamed('name', 'CharityName')
    shell_names = leak_data.select('node_id','name').withColumnRenamed('name', 'ShellName')
    
    shells_vs_charities = shell_names.crossJoin(charity_names)
    
    filtered_names = shells_vs_charities.rdd.filter(lambda r: check_for_words(r[1], r[2], weights,
                                                                              average_weight, stoplist) == True)
    
    
    
    matches = pd.DataFrame(filtered_names.collect(), columns=['node_id','ShellName','CharityName','CharityHeadquarters'])

    
    matches.to_csv('../generated/matches/' + leak +'_'+ charity +'_matches.csv')
    
    
    

In [23]:
extract_matches_between('panama', 'forbes', False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Extraction of possible charity names in the Panama Papers
extract_matches_between('panama', 'forbes', False)
extract_matches_between('panama', 'wikipedia', True)
extract_matches_between('panama', 'INGO', False)

In [None]:
# Extraction of possible charity names in the Paradise Papers
extract_matches_between('paradise', 'forbes', False)
extract_matches_between('paradise', 'wikipedia', True)
extract_matches_between('paradise', 'INGO', False)

In [None]:
# Extraction of possible charity names in the Bahamas Leaks
extract_matches_between('bahamas', 'forbes', False)
extract_matches_between('bahamas', 'wikipedia', True)
extract_matches_between('bahamas', 'INGO', False)

In [None]:
# Extraction of possible charity names in the Offshore Leaks
extract_matches_between('offshore', 'forbes', False)
extract_matches_between('offshore', 'wikipedia', True)
extract_matches_between('offshore', 'INGO', False)