# Word frequency

Create a dictionary of words appearing in charity names mapped to their inverse logarithmic frequency.

-> The more often a word appears in a charity names, the less weight we attribute to it.

In [49]:
# Imports
import sys
import re
import nltk
import json
import folium
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#stop words
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

#spark
import findspark
#Ruijia
#findspark.init(r'C:\Users\Ruijia\Spark')

#Sabrina
findspark.init('/opt/spark/spark-2.3.2-bin-hadoop2.7/')

from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import min
from pyspark.sql.functions import udf
from pyspark.sql.functions import split
from pyspark.sql.functions import explode

from pyspark.sql.types import StringType
from pyspark.sql.types import TimestampType

from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession.builder.getOrCreate()


In [21]:
#Load data
def get_charity_info_location(charity):
    return '../generated/' + charity + '/' + charity + '_charity_info.csv'
    
forbes_names = pd.read_csv(get_charity_info_location('forbes'))['name'].tolist()
wikipedia_names = pd.read_csv(get_charity_info_location('wikipedia'))['name'].tolist()
INGO_names = pd.read_csv(get_charity_info_location('INGO'))['name'].tolist()

names = forbes_names + wikipedia_names + INGO_names

In [31]:
print('Forbes length: ', len(forbes_names))
print('Wikipedia length: ', len(wikipedia_names))
print('INGO length: ', len(INGO_names))

print('Total number of names: ', len(names))

nb_names = len(names)

Forbes length:  100
Wikipedia length:  324
INGO length:  45
Total number of names:  469


In [30]:
#Get list of individual words
words = " ".join(names).split()
nb_words = len(words)

In [39]:
word_freq = {} 
for word in words: 
    word = word.lower()
    if (word in word_freq): 
        word_freq[word] += 1
    else: 
        word_freq[word] = 1

In [40]:
# IDF(t) = log_e(Total number of documents (here names) / Number of documents/names with term t in it). 
def idf(word_count):
    return np.log(nb_names/word_count)
    

word_weights = {k: idf(v) for k, v in word_freq.items()}

In [57]:
print('Weight of word "foundation": ', word_weights.get('foundation'))
print('Weight of word "amnesty": ', word_weights.get('amnesty'))
print('Weight of word "the": ', word_weights.get('of'))

Weight of word "foundation":  1.252762968495368
Weight of word "amnesty":  6.150602768446279
Weight of word "the":  2.389402652752717


As can be seen, common words like "foundation" and "the" have waker weights than rare words like "amnesty"

In [56]:
max_weight = 0
for weight in word_weights.values():
    if weight > max_weight:
        max_weight = weight

max_weight

6.150602768446279