# Web scraping Wikipedia's list of charitable foundations

In [71]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

### Get the Wikipedia list of charities as HTML

In [2]:
# Download the Wikipedia list page and parse it into a BeautifulSoup tree
URL = 'https://en.wikipedia.org/wiki/List_of_charitable_foundations'
# A timeout keeps the cell from hanging indefinitely on a stalled connection
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
r.raise_for_status()
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

### Extract each charity's name and the link to its Wikipedia page from the list

In [52]:
# Every multi-column <div> on the page that holds a list of charity links
link_sections = soup.find_all(
    'div',
    class_="div-col columns column-width",
)

In [59]:
# Display the matched <div> sections for inspection
link_sections

[<div class="div-col columns column-width" style="-moz-column-width: 30em; -webkit-column-width: 30em; column-width: 30em;">
 <ul><li><a href="/wiki/AARP#The_AARP_Foundation" title="AARP">AARP Foundation</a></li>
 <li><a href="/wiki/Acorns_Children%27s_Hospice" title="Acorns Children's Hospice">Acorns Children's Hospice</a></li>
 <li><a href="/wiki/Action_Against_Hunger" title="Action Against Hunger">Action Against Hunger</a></li>
 <li><a href="/wiki/Action_Deafness" title="Action Deafness">Action Deafness</a></li>
 <li><a href="/wiki/ActionAid" title="ActionAid">ActionAid</a></li>
 <li><a href="/wiki/Acumen_(organization)" title="Acumen (organization)">Acumen</a></li>
 <li><a href="/wiki/Adelson_Foundation" title="Adelson Foundation">Adelson Foundation</a></li>
 <li><a href="/wiki/Adventist_Development_and_Relief_Agency" title="Adventist Development and Relief Agency">Adventist Development and Relief Agency</a></li>
 <li><a href="/wiki/Aerospace_Heritage_Foundation_of_Canada" title="A

In [84]:
# One list of <a> tags per section ...
links_lists = [section.find_all('a') for section in link_sections]
# ... then flattened into a single list of <a> tags, keeping page order
flat_links_list = [
    anchor
    for section_links in links_lists
    for anchor in section_links
]

In [88]:
# Display the flattened list of <a> tags for inspection
flat_links_list

[<a href="/wiki/AARP#The_AARP_Foundation" title="AARP">AARP Foundation</a>,
 <a href="/wiki/Acorns_Children%27s_Hospice" title="Acorns Children's Hospice">Acorns Children's Hospice</a>,
 <a href="/wiki/Action_Against_Hunger" title="Action Against Hunger">Action Against Hunger</a>,
 <a href="/wiki/Action_Deafness" title="Action Deafness">Action Deafness</a>,
 <a href="/wiki/ActionAid" title="ActionAid">ActionAid</a>,
 <a href="/wiki/Acumen_(organization)" title="Acumen (organization)">Acumen</a>,
 <a href="/wiki/Adelson_Foundation" title="Adelson Foundation">Adelson Foundation</a>,
 <a href="/wiki/Adventist_Development_and_Relief_Agency" title="Adventist Development and Relief Agency">Adventist Development and Relief Agency</a>,
 <a href="/wiki/Aerospace_Heritage_Foundation_of_Canada" title="Aerospace Heritage Foundation of Canada">Aerospace Heritage Foundation of Canada</a>,
 <a href="/wiki/Aleh_Negev" title="Aleh Negev">Aleh Negev</a>,
 <a href="/wiki/Alex%27s_Lemonade_Stand_Foundatio

In [102]:
# Build a table of charity names and absolute Wikipedia URLs.
# The column labels are passed as a list: a set has no defined order, so
# 'Name' and 'Link' could otherwise be attached to the wrong columns.
charities = pd.DataFrame(
    [(x.text, 'https://en.wikipedia.org' + x.get('href')) for x in flat_links_list],
    columns=['Name', 'Link'],
)

In [104]:
# Save the charity name/link table as a CSV file
charities.to_csv('../../generated/charities/wikipedia_charity_links.csv')

### Extract information about individual charities from the table on their linked pages

In [139]:
def list_features(charities):
    """Collect the distinct infobox header labels found on the charity pages.

    For every row of ``charities`` the page at ``row['Link']`` is downloaded
    and its ``infobox vcard`` table (if any) is scanned; the text of each
    ``<th>`` cell found in a table row is recorded.

    Parameters
    ----------
    charities : pandas.DataFrame
        Frame with a 'Link' column holding the page URLs to fetch.

    Returns
    -------
    numpy.ndarray
        The unique header texts, in order of first appearance.
    """
    header_texts = []

    for _, charity in charities.iterrows():
        response = requests.get(charity['Link'])
        page = BeautifulSoup(response.text, 'html.parser')
        infobox = page.find('table', class_="infobox vcard")

        # Pages without an infobox contribute nothing
        if infobox is None:
            continue

        for table_row in infobox.find_all('tr'):
            header = table_row.find('th')
            if header is not None:
                header_texts.append(header.text)

    return pd.Series(header_texts).unique()
                    

In [140]:
# Get the list of features we can potentially extract about a charity.
# Note: this issues one HTTP request per row of `charities`.
list_features(charities)

array(['Formation', 'Founder', 'Type', 'Tax ID no. ', 'Headquarters',
       'Location', 'Members  ', 'CEO', 'Board Chair', 'Subsidiaries',
       'Revenue  ', 'Expenses', 'Staff  ', 'Volunteers  ', 'Website',
       'Formerly called', 'Motto', 'Purpose', 'Region served ',
       'Legal status', 'Membership  ', 'Secretary General', 'Founded',
       'Focus', 'Area served ', 'Method', 'Endowment', 'Product', 'Owner',
       'Key people', 'Employees  ', 'Services', 'Chair',
       'Co-executive director',
       'Revenue .mw-parser-output .nobold{font-weight:normal}(2014) ',
       'Expenses (2014)', 'Employees (2014) ', 'Volunteers (2014) ',
       'Alpha Sigma Tau', 'Scope', 'Colors', 'Symbol', 'Flower', 'Jewel',
       'Publication', 'Philanthropy', 'Chapters', 'Established',
       'Director', 'President', 'Main organ',
       'Revenue .mw-parser-output .nobold{font-weight:normal}(2015) ',
       'Expenses (2015)', 'Affiliations', 'Founders', 'Coordinates',
       'Origins', 'Chief E

In [153]:
# Using the feature list above, go through a charity's Wikipedia infobox and
# attempt to draw out any possible information.

def _normalized_labels(labels):
    """Return the labels as a frozenset with surrounding whitespace removed."""
    return frozenset(label.strip() for label in labels)


# Maximum number of leader entries kept per charity (Leader 1 .. Leader 4).
_MAX_LEADERS = 4

# Infobox header labels that are treated as naming a leader of the charity.
_LEADER_LABELS = _normalized_labels([
    'CEO', 'Secretary General', 'Owner', 'Key people',
    'Chair', 'Co-executive director', 'President', 'Board Chair',
    'Chief Executive Officer', 'Board of Directors', 'Executive Director',
    'National President & CEO', 'Chairman', 'Chief Executive',
    'Deputy Secretary General', 'Chair of the Supervisory Board',
    'Chairman of the Governing Body', 'President/CEO', 'President and CEO',
    'Board\xa0of directors', 'President of the Board', 'Board of Trustees',
    'Chair, Adult Advisory Council', 'Leader', 'President, Treasurer',
    'President & CEO', 'Executive director', 'Vice president', 'Predecessor',
    'Chairman of Governors', 'Notable Board Members[1]',
    'Superior General', 'Honorary President', 'Co-Chairman', 'Managing Director',
    'Chair World Board', 'President Emeritus', 'Director General',
])

# (output field, header labels feeding it), in the order the fields are returned.
_DETAIL_LABELS = (
    ('revenue', _normalized_labels([
        'Revenue',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2014) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2015) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2016) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2017) ',
        'Net income', 'Total assets', 'Operating income',
        'Revenue (FY 2015)',
    ])),
    ('hq', _normalized_labels(['Headquarters', 'Address'])),
    ('location', _normalized_labels(['Location', 'Chapters', 'Region ', 'Country'])),
    ('other_names', _normalized_labels(['Formerly called', 'Abbreviation',
                                        'Parent organization'])),
    ('subsidiaries', _normalized_labels(['Subsidiaries', 'Affiliation'])),
    ('purpose', _normalized_labels(['Purpose', 'Focus', 'Product', 'Services',
                                    'Industry', 'Fields'])),
)


def extract_info_from_table(table):
    """Extract leader, revenue, location and purpose details from an infobox.

    Each table row that has both a ``<th>`` (feature label) and a ``<td>``
    (value) is inspected. The label is compared, ignoring leading/trailing
    whitespace, against the known label groups above.

    Parameters
    ----------
    table : object or None
        A parsed infobox table exposing ``find_all('tr')``, whose rows expose
        ``find('th')`` / ``find('td')`` returning objects with a ``.text``
        attribute (or None). ``None`` means the page had no infobox.

    Returns
    -------
    list
        ``[leader1, leader2, leader3, leader4, revenue, hq, location,
        other_names, subsidiaries, purpose]``. Missing entries are ``None``.
        Up to four leaders are kept, in table order; for every other field
        the last matching row wins.
    """
    leaders = []
    details = dict((field, None) for field, _ in _DETAIL_LABELS)

    if table is not None:
        for table_row in table.find_all('tr'):
            header = table_row.find('th')
            cell = table_row.find('td')
            if header is None or cell is None:
                continue

            # Infobox headers often carry trailing spaces (e.g. 'Revenue  '),
            # so match on the stripped label.
            feature = header.text.strip()
            value = cell.text

            if feature in _LEADER_LABELS:
                # Fill the leader slots in order; ignore any beyond the fourth
                if len(leaders) < _MAX_LEADERS:
                    leaders.append(value)
                continue

            for field, labels in _DETAIL_LABELS:
                if feature in labels:
                    details[field] = value
                    break

    # Pad so the result always has exactly four leader slots
    leaders.extend([None] * (_MAX_LEADERS - len(leaders)))
    return leaders + [details[field] for field, _ in _DETAIL_LABELS]
    
        

def extract_info_from_link(link):
    """Download the page at ``link`` and extract details from its infobox.

    The page's ``infobox vcard`` table (or ``None`` when the page has no such
    table) is handed to ``extract_info_from_table``, whose result is returned.
    """
    response = requests.get(link)
    page = BeautifulSoup(response.text, 'html.parser')
    infobox = page.find('table', class_="infobox vcard")
    return extract_info_from_table(infobox)

In [154]:
# One record per charity: its name followed by the details scraped from its page
detailed_charities = []
for _, charity in charities.iterrows():
    record = [charity['Name']] + extract_info_from_link(charity['Link'])
    detailed_charities.append(record)

In [155]:
# Assemble the scraped records into a table; the column order mirrors the
# layout of each record (name first, then the extracted details).
charities_detailed = pd.DataFrame(
    detailed_charities,
    columns=[
        'Name',
        'Leader 1',
        'Leader 2',
        'Leader 3',
        'Leader 4',
        'Revenue',
        'Headquarters',
        'Location',
        'Other names',
        'Subsidiaries',
        'Purpose',
    ],
)

In [158]:
# Save the detailed per-charity table as a CSV file
charities_detailed.to_csv('../../generated/charities/wikipedia_charity_info.csv')