# Notebook to find scraped charity names in the Panama Papers using Spark

In [339]:
# Imports
import re
import nltk
import json
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#stop words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

#spark
import findspark
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()


In [340]:
# File paths
DATA_FOLDER = '../data'
PANAMA_DATA_FOLDER = DATA_FOLDER + '/panama'

GENERATED_FOLDER = '../generated'
CHARITY_GENERATED_FOLDER = GENERATED_FOLDER + '/charities'

In [341]:
# Loading and creation of dataframes
pp_edges = spark.read.csv(PANAMA_DATA_FOLDER + '/panama_papers.edges.csv', header=True)
pp_adress = spark.read.csv(PANAMA_DATA_FOLDER + '/panama_papers.nodes.address.csv', header=True)
pp_entity = spark.read.csv(PANAMA_DATA_FOLDER + '/panama_papers.nodes.entity.csv', header=True)
pp_intermediary = spark.read.csv(PANAMA_DATA_FOLDER + '/panama_papers.nodes.intermediary.csv', header=True)
pp_officer = spark.read.csv(PANAMA_DATA_FOLDER + '/panama_papers.nodes.officer.csv', header=True)

wiki_info = spark.read.csv(CHARITY_GENERATED_FOLDER + '/wikipedia_charity_info.csv', header=True)
wiki_links =charities_info = spark.read.csv(CHARITY_GENERATED_FOLDER + '/wikipedia_charity_links.csv', header=True)

In [342]:
# Addition of english stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

stop_words.add('&')
stop_words.add('co')
stop_words.add('co.')
stop_words.add('co.,')
stop_words.add('co.,ltd.')
stop_words.add('corp')
stop_words.add('corp.')
stop_words.add('corp.,')
stop_words.add('de')
stop_words.add('inc.')
stop_words.add('foundation')
stop_words.add('inc')
stop_words.add('limited')
stop_words.add('ltd')
stop_words.add('ltd.')
stop_words.add('s.a.')

stop_words = list(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/sabrina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [343]:
# Splitting each charity name and cleaning stop words

def to_lower_parens_less(word):
    return word.lower().replace('(', '').replace(')', '')

charity_name = wiki_info.select('Name')
def remove_stop(word_list):
    return [to_lower_parens_less(w) for w in word_list if w.lower() not in stop_words]
    
#charity_name.replace(stop_words,None,inplace=True)
charity_name_basic = charity_name.rdd.map(lambda r: remove_stop(r.Name))

# Extracting company shell names

shell_name=pp_entity.select('name')

In [346]:
charities_vs_shells = charity_name.crossJoin(shell_name)

In [353]:
def check_for_words(charity, shell, percentage, lower_bound):
    if charity is None or shell is None:
        return False
    
    charity_words = charity.split()
    shell_words = shell.split()
    len_charity = len(charity_words)
    len_shell = len(shell_words)
    
    count = 0
    for word in charity_words:
        if word in shell_words and word not in stop_words:
            count += 1
    return ((count/len_charity >= percentage) and (count/len_shell >= percentage))
        

In [354]:
charities_vs_shells

DataFrame[Name: string, name: string]

In [355]:
filtered = charities_vs_shells.filter(lambda r: check_for_words(r.Name, r.name, 0.5, 3) == True)
filtered.take(10)

TypeError: condition should be string or Column

__This is the name cross_checking function in progress that will be loaded into the cluster. (Yes, we will improve on the for-loops. Probably.)__

In [263]:
# Search of charity names in all shell names
'''
Matching names are found by accumuation of evidences.
When two words of a charity name is found in a shell name,
there is enough evidences that the name are at least very similar.
Furthuer visual inspection may be required to eliminate false positives.
'''
for i in range(0,len(shell_name)):
    for j in range(0, charity_name.shape[0]):
        counter = 0
        for k in range(0,charity_name.shape[1]):
            if charity_name[k][j] is not None:
                if re.search(' ' + charity_name[k][j] + ' ', shell_name[i]):
                    counter = counter + 1
                    if counter == 2:
                        print('(' + str([i,j,k]) + ') ' + "IT'S A MATCH ! WE'VE GOT THESE BASTARDS ! > " + shell_name[i])
                        break

TypeError: object of type 'PipelinedRDD' has no len()