# Scraping the Forbes list of the largest US charities


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

Fetch the HTML of the Forbes article.

In [2]:
# Forbes article (published 2016-12-14) listing the largest U.S. charities for 2016.
URL = 'https://www.forbes.com/sites/williampbarrett/2016/12/14/the-largest-u-s-charities-for-2016/#5ca92a8d4abb'

In [3]:
# Download the article. The timeout keeps a stalled connection from hanging
# the kernel indefinitely, and raise_for_status() stops here on an HTTP error
# instead of letting later cells parse an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()
page_body = r.text

In [4]:
# Parse the downloaded HTML using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(page_body, 'html.parser')

The article contains a list of the 100 largest US charities, each with a link to a page with more information. Extract these list items.

In [5]:
# Ordered lists have the html tag 'ol'; find() returns the first one in the page.
charity_list_section = soup.find('ol')
if charity_list_section is None:
    # find() returns None when nothing matches; fail with a clear message
    # rather than an AttributeError on the lookup below.
    raise ValueError("No <ol> element found in the article; the page layout may have changed")
# find_all is the current spelling of the legacy findAll alias.
charity_list = charity_list_section.find_all('li')
print("Nb charities: ",len(charity_list))
print(charity_list[0:3])

Nb charities:  100
[<li><a data-ga-track="InternalLink:http://www.forbes.com/companies/united-way-worldwide/" href="http://www.forbes.com/companies/united-way-worldwide/" target="_self"><strong data-ga-track="InternalLink:http://www.forbes.com/companies/united-way-worldwide/">United Way Worldwide</strong></a>, $3.708 billion.</li>, <li><a data-ga-track="InternalLink:http://www.forbes.com/companies/task-force-for-global-health/" href="http://www.forbes.com/companies/task-force-for-global-health/" target="_self"><strong data-ga-track="InternalLink:http://www.forbes.com/companies/task-force-for-global-health/">Task Force for Global Health</strong></a>, $3.154 billion.</li>, <li><a data-ga-track="InternalLink:http://www.forbes.com/companies/feeding-america/" href="http://www.forbes.com/companies/feeding-america/" target="_self"><strong data-ga-track="InternalLink:http://www.forbes.com/companies/feeding-america/">Feeding America</strong></a>, $2.150 billion.</li>]


In [11]:
def value_to_number(value):
    """Expand an abbreviated amount such as '$3.708B' into a string of digits.

    The first character of ``value`` is discarded (the currency symbol) and
    the last character is read as a magnitude suffix: 'B' stands for 9 zeros,
    'M' for 6, and any other character for none. Digits after a decimal point
    use up part of those zeros, so '$3.708B' becomes '3708000000'.

    Returns None when ``value`` is None; otherwise returns a str, not an int.
    """
    if value is None:
        return None

    digits = value[1:-1]
    suffix = value[len(value) - 1]
    exponent = {'B': 9, 'M': 6}.get(suffix, 0)

    # partition() yields an empty fraction when there is no decimal point.
    whole, _, fraction = digits.partition('.')

    # A non-positive repeat count produces no padding at all.
    padding = '0' * (exponent - len(fraction))
    return whole + fraction + padding

def parse_revenue(value):
    """Convert revenue text such as 'United Way Worldwide, $3.708 billion.' to digits.

    Everything up to and including the first '$' is ignored. The number is the
    text up to the next space, and the remainder is searched for the word
    'billion' (9 zeros) or 'million' (6 zeros). A trailing period after the
    unit is optional. Digits after a decimal point use up part of those zeros.

    Parameters
    ----------
    value : str or None
        Text containing a dollar amount.

    Returns
    -------
    str or None
        The amount written out as a string of digits (e.g. '3708000000'),
        or None when ``value`` is None.

    Notes
    -----
    Commas are not interpreted: an entry such as '$1,904 billion.' keeps its
    comma in the result and needs to be corrected by hand.
    """
    if value is None:
        return None

    amount = value[value.find('$') + 1:]

    # partition() leaves the unit empty when no space follows the number, so
    # a bare amount such as '$5' keeps all of its digits.
    digits, _, unit = amount.partition(' ')

    if 'billion' in unit:
        nb_zeros = 9
    elif 'million' in unit:
        nb_zeros = 6
    else:
        nb_zeros = 0

    whole, _, fraction = digits.partition('.')
    # Each fractional digit already accounts for one of the zeros.
    return whole + fraction + '0' * (nb_zeros - len(fraction))
    
    
def get_or_none(type_dict, entry_list, values):
    """Return the ``.string`` of the value matching the first known label.

    ``entry_list`` is scanned in order; the first label that is a key of
    ``type_dict`` selects an index, and the ``.string`` attribute of
    ``values[index]`` is returned. Returns None when no label is a key of
    ``type_dict``.
    """
    positions = (type_dict[label] for label in entry_list if label in type_dict)
    # Only the first match matters, so return from the first iteration.
    for position in positions:
        return values[position].string
    return None
    
    

def extract_data_from_elem(list_elem, timeout=30):
    """Scrape one charity from its list item and its linked profile page.

    The revenue text is taken from the list item itself. The remaining fields
    come from the page the item links to, where labels are looked up in
    ``span.profile-row--type`` elements and their values in the
    ``span.profile-row--value`` element at the same position.

    Parameters
    ----------
    list_elem : bs4 Tag
        A ``<li>`` element holding a link and text such as '$3.708 billion.'.
    timeout : float, optional
        Seconds to wait for the profile page; passed to ``requests.get``.

    Returns
    -------
    list
        ``[name, money, category, leader, last_date, hq, country]``. A field
        that is not found on the page is None.

    Raises
    ------
    ValueError
        If the list item contains no link to follow.
    requests.HTTPError
        If the profile page responds with an HTTP error status.
    """
    item_text = list_elem.text
    dollar_at = item_text.find('$')
    # Without a '$', find() gives -1 and slicing from it would keep only the
    # last character of the text; report a missing amount instead.
    money = item_text[dollar_at:] if dollar_at != -1 else None

    anchor = list_elem.find('a')
    link = anchor.get('href') if anchor is not None else None
    if not link:
        raise ValueError("List item has no link to a charity page: %r" % item_text)

    # Bound the wait and stop on HTTP errors rather than parsing an error page.
    r_char = requests.get(link, timeout=timeout)
    r_char.raise_for_status()
    soup_char = BeautifulSoup(r_char.text, 'html.parser')

    heading = soup_char.find('h1')
    name_line = heading.string if heading is not None else None
    if name_line is None:
        # No usable <h1> text: fall back to the link text from the list item.
        name = anchor.get_text(strip=True)
    else:
        # Drop the first word of the heading (presumably a rank prefix --
        # TODO confirm against a live page).
        name = name_line[name_line.find(' ') + 1:]

    # Map each profile label to its position so the matching value can be
    # picked from the parallel list of value spans.
    types = soup_char.find_all('span', class_='profile-row--type')
    type_dict = {label.string: position for position, label in enumerate(types)}

    values = soup_char.find_all('span', class_='profile-row--value')
    category = get_or_none(type_dict, ['Category', 'Industries'], values)
    country = get_or_none(type_dict, ['Country'], values)
    leader = get_or_none(type_dict, ['Top Person', 'CEO'], values)
    last_date = get_or_none(type_dict, ['Fiscal Year End'], values)
    hq = get_or_none(type_dict, ['Headquarters'], values)
    return [name, money, category, leader, last_date, hq, country]

    
    
def get_charity_data(charity_list):
    """Build a DataFrame with one row per element of ``charity_list``.

    Each element is passed to ``extract_data_from_elem`` and the returned
    rows are assembled under fixed column names.
    """
    column_names = ['Name', 'Income($)', 'Field', 'Top Person', 'Fiscal Year End', 'Headquarters', 'Country']
    rows = [extract_data_from_elem(item) for item in charity_list]
    return pd.DataFrame(rows, columns=column_names)
        
        


In [12]:
# Scrape every charity in the list; extract_data_from_elem issues one HTTP
# request per list item.
df = get_charity_data(charity_list)

In [13]:
# Display the scraped table.
df

Unnamed: 0,Name,Income($),Field,Top Person,Fiscal Year End,Headquarters,Country
0,United Way Worldwide,$3.708 billion.,Domestic Needs,Brian Gallagher,"Jun 30, 2017","Alexandria, Virginia",United States
1,Task Force for Global Health,$3.154 billion.,International Needs,David Ross,"Aug 31, 2017","Decatur, Georgia",United States
2,Feeding America,$2.150 billion.,Domestic Needs,Diana Aviv,"Jun 30, 2016","Chicago, Illinois",United States
3,Salvation Army,"$1,904 billion.",Domestic Needs,David Jeffrey,"Sep 30, 2016","Alexandria, Virginia",United States
4,YMCA,$1.202 billion.,Travel & Leisure,Kevin Washington,,"Chicago, Illinois",United States
5,St. Jude Children's Research Hospital,$1.181 billion,Healthcare & Social,"James Downing, MD",,"Memphis, Tennessee",United States
6,Food for the Poor,$1.156 billion.,International Needs,Robin Mahfood,"Dec 31, 2016","Coconut Creek, Florida",United States
7,Boys & Girls Clubs of America,$923 million.,Youth,James Clark,"Dec 31, 2016","Atlanta, Georgia",United States
8,Catholic Charities USA,$921 million.,Domestic Needs,Donna Markham,"Jun 30, 2016","Alexandria, Virginia",United States
9,Goodwill Industries International,$902 million.,Domestic Needs,James Gibbons,"Dec 31, 2016","Rockville, Maryland",United States


In [17]:
# Save the table; the DataFrame index is written as the first, unnamed column.
df.to_csv('../../generated/charities/Forbes_top_100_US_Charities.csv')

In [18]:
# Read the file back to check the export; index_col=0 uses the first column
# (the index written by to_csv) as the index again.
df_test = pd.read_csv('../../generated/charities/Forbes_top_100_US_Charities.csv', index_col=0)
df_test

Unnamed: 0,Name,Income($),Field,Top Person,Fiscal Year End,Headquarters,Country
0,United Way Worldwide,$3.708 billion.,Domestic Needs,Brian Gallagher,"Jun 30, 2017","Alexandria, Virginia",United States
1,Task Force for Global Health,$3.154 billion.,International Needs,David Ross,"Aug 31, 2017","Decatur, Georgia",United States
2,Feeding America,$2.150 billion.,Domestic Needs,Diana Aviv,"Jun 30, 2016","Chicago, Illinois",United States
3,Salvation Army,"$1,904 billion.",Domestic Needs,David Jeffrey,"Sep 30, 2016","Alexandria, Virginia",United States
4,YMCA,$1.202 billion.,Travel & Leisure,Kevin Washington,,"Chicago, Illinois",United States
5,St. Jude Children's Research Hospital,$1.181 billion,Healthcare & Social,"James Downing, MD",,"Memphis, Tennessee",United States
6,Food for the Poor,$1.156 billion.,International Needs,Robin Mahfood,"Dec 31, 2016","Coconut Creek, Florida",United States
7,Boys & Girls Clubs of America,$923 million.,Youth,James Clark,"Dec 31, 2016","Atlanta, Georgia",United States
8,Catholic Charities USA,$921 million.,Domestic Needs,Donna Markham,"Jun 30, 2016","Alexandria, Virginia",United States
9,Goodwill Industries International,$902 million.,Domestic Needs,James Gibbons,"Dec 31, 2016","Rockville, Maryland",United States
