# Web scraping Wikipedia's list of INGOs (international non-governmental organizations)

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

### Get the Wikipedia list of INGOs as HTML

In [2]:
# Download the Wikipedia article that contains the list of INGOs and parse it.
URL = 'https://en.wikipedia.org/wiki/International_non-governmental_organization'
# A timeout keeps the notebook from hanging forever on a stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error: parsing an error page would silently produce
# garbage once the hard-coded slice below is applied to its links.
r.raise_for_status()
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

### Extract the name of each INGO and the link to its Wikipedia page from the list

In [3]:
# Every <div class="mw-parser-output"> element found in the parsed page.
link_sections = soup.find_all('div', class_="mw-parser-output")

In [4]:
# For each content section, gather the anchors whose href contains "/w"
# (the pattern is searched anywhere inside the attribute value).
links_lists = [
    section.find_all('a', attrs={'href': re.compile("/w")})
    for section in link_sections
]

# Flatten the per-section lists into one list of (link text, href) tuples.
flat_links_list = []
for anchors in links_lists:
    flat_links_list.extend((anchor.text, anchor.get('href')) for anchor in anchors)

In [5]:
# Keep only positions 44..96 of the scraped links and drop every tuple that has
# 'edit' as one of its two elements (link text or href).
# NOTE(review): the slice bounds are hard-coded positions, presumably chosen by
# inspecting the page at scrape time - confirm they still frame the INGO list.
# NOTE(review): this cell reassigns flat_links_list from itself, so running it
# twice slices the already-sliced list.
tag_list = 'edit'
flat_links_list = [tag for tag in flat_links_list[44:97] if tag_list not in tag]

flat_links_list

[('khalsa Aid', '/wiki/Khalsa_Aid'),
 ('ADRA', '/wiki/ADRA'),
 ('CAFOD', '/wiki/CAFOD'),
 ('CARE', '/wiki/CARE_(relief_agency)'),
 ('Oxfam', '/wiki/Oxfam'),
 ('Lakshyam ngo', '/w/index.php?title=Lakshyam_ngo&action=edit&redlink=1'),
 ("SOS Children's Villages", '/wiki/SOS_Children%27s_Villages'),
 ('World Vision International', '/wiki/World_Vision_International'),
 ('Mercy Corps', '/wiki/Mercy_Corps'),
 ('Save the Children', '/wiki/Save_the_Children'),
 ('Good Neighbors International', '/wiki/Good_Neighbors_(NGO)'),
 ('Doctors Without Borders', '/wiki/M%C3%A9decins_Sans_Fronti%C3%A8res'),
 ('HealthRight International', '/wiki/HealthRight_International'),
 ('charity: water', '/wiki/Charity:_water'),
 ('Lakshyam NGO', '/w/index.php?title=Lakshyam_ngo&action=edit&redlink=1'),
 ('Compassion International', '/wiki/Compassion_International'),
 ('Plan', '/wiki/Plan_(aid_organisation)'),
 ('World Association of Girl Guides and Girl Scouts',
  '/wiki/World_Association_of_Girl_Guides_and_Girl_Sc

In [6]:
# Two-column frame: the link text as 'Name' and the href prefixed with the
# Wikipedia host as 'Link'.
INGO = pd.DataFrame(
    [(name, 'https://en.wikipedia.org' + href) for name, href in flat_links_list],
    columns=('Name', 'Link'),
)

In [7]:
# Write the name/link table to disk as CSV (the DataFrame index is written too).
INGO.to_csv(path_or_buf='../../generated/INGO/wikipedia_INGO_links.csv')

### Extract information about each individual INGO from the infobox table on its linked page

In [8]:
def list_features(INGO):
    """Collect the distinct infobox header labels used across the INGO pages.

    Each URL in the 'Link' column of ``INGO`` is downloaded, the first
    ``<table class="infobox vcard">`` on the page is located, and the text of
    the first ``<th>`` of every table row is recorded exactly as it appears
    (no whitespace stripping).

    Parameters
    ----------
    INGO : pandas.DataFrame
        Frame with a 'Link' column holding the page URLs to visit.

    Returns
    -------
    numpy.ndarray
        The unique header labels, in order of first appearance.
    """
    header_labels = []

    for _, record in INGO.iterrows():
        response = requests.get(record['Link'])
        page = BeautifulSoup(response.text, 'html.parser')
        infobox = page.find('table', class_="infobox vcard")

        # Pages without an infobox contribute nothing.
        if infobox is None:
            continue

        for table_row in infobox.find_all('tr'):
            header = table_row.find('th')
            if header is not None:
                header_labels.append(header.text)

    return pd.Series(header_labels).unique()
                    

In [9]:
# List the distinct infobox header labels found across the INGO pages; these are
# the candidate fields that could potentially be extracted about an INGO.
list_features(INGO)

array(['Founded', 'Founder', 'Founded at', 'Type', 'Legal status',
       'Focus', 'Headquarters', 'Area served ', 'CEO', 'Revenue  ',
       'Employees  ', 'Volunteers  ', 'Website', 'Location', 'Product',
       'Owner', 'Key people', 'Formation', 'Founders', 'Fields',
       'Secretary General', 'Deputy Secretary General',
       'Chair of the Supervisory Board', 'Named after', 'Director',
       'Abbreviation', 'Purpose', 'President', 'Honorary President',
       'Method', 'Expenses', 'Staff  ', 'Registration\xa0no.', 'Origins',
       'Motto', 'Tax ID no. ',
       'World Association of Girl Guides and Girl Scouts', 'Country',
       'Membership', 'Chair World Board', 'Coordinates',
       'World Organization of the Scout Movement',
       'World Scout Committee Chairman', 'Membership  ', 'Presidents',
       'Praeses', 'Secretary-General', 'Region served ', 'Industry',
       'Services', 'Members  ', 'Official language ',
       'Executive Director', 'Main organ', 'Budget  ', 'Re

In [10]:
#using the list above, go through the INGO wikipedia pages and attempt to draw out any possible information
# Infobox header labels whose value is recorded as one of the (up to four) leaders.
# Labels are compared exactly, including non-breaking spaces and trailing blanks.
_LEADER_LABELS = frozenset([
    'CEO', 'Secretary General', 'Owner', 'Key people',
    'Chair', 'Co-executive director', 'President', 'Board Chair',
    'Chief Executive Officer', 'Board of Directors', 'Executive Director',
    'National President & CEO', 'Chairman', 'Chief Executive',
    'Deputy Secretary General', 'Chair of the Supervisory Board',
    'Chairman of the Governing Body', 'President/CEO', 'President and CEO',
    'Board\xa0of directors', 'President of the Board', 'Board of Trustees',
    'Chair, Adult Advisory Council', 'Leader', 'President, Treasurer',
    'President & CEO', 'Executive director', 'Vice president', 'Predecessor',
    'Chairman of Governors', 'Notable Board Members[1]',
    'Superior General', 'Honorary President', 'Co-Chairman', 'Managing Director',
    'Chair World Board', 'President Emeritus', 'Director General',
])

# (output field, header labels feeding it), listed in the order the fields
# appear in the returned list after the four leader slots.
_FIELD_LABELS = (
    ('revenue', frozenset([
        'Revenue',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2014) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2015) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2016) ',
        'Revenue .mw-parser-output .nobold{font-weight:normal}(2017) ',
        'Net income', 'Total assets', 'Operating income',
        'Revenue (FY 2015)',
    ])),
    ('hq', frozenset(['Headquarters', 'Address'])),
    ('location', frozenset(['Location', 'Chapters', 'Region ', 'Country'])),
    ('other_names', frozenset(['Formerly called', 'Abbreviation', 'Parent organization'])),
    ('subsidiaries', frozenset(['Subsidiaries', 'Affiliation'])),
    ('purpose', frozenset(['Purpose', 'Focus', 'Product', 'Services', 'Industry', 'Fields'])),
)

# Number of leader slots in the returned list.
_MAX_LEADERS = 4


def extract_info_from_table(table):
    """Pull leader, revenue, location and purpose details out of an infobox table.

    Every ``<tr>`` of ``table`` that has both a ``<th>`` and a ``<td>`` is
    examined; the header text selects which output field the cell text goes to.
    The first four leader-type rows fill the leader slots in order (further
    ones are ignored). For every other field a later matching row overwrites
    an earlier one.

    Parameters
    ----------
    table : bs4 Tag or None
        The infobox table element, or ``None`` when the page has no infobox.

    Returns
    -------
    list
        ``[leader1, leader2, leader3, leader4, revenue, hq, location,
        other_names, subsidiaries, purpose]``; entries with no matching row
        are ``None``.
    """
    leaders = []
    fields = {name: None for name, _ in _FIELD_LABELS}

    if table is not None:
        for row in table.find_all('tr'):
            header = row.find('th')
            cell = row.find('td')
            if header is None or cell is None:
                continue

            label = header.text
            value = cell.text

            if label in _LEADER_LABELS:
                # Store the value in the next free leader slot. The original
                # code used `==` here for slots 2-4, so they were never filled.
                if len(leaders) < _MAX_LEADERS:
                    leaders.append(value)
                continue

            for name, labels in _FIELD_LABELS:
                if label in labels:
                    fields[name] = value
                    break

    # Pad unused leader slots so the result always has the same length.
    leaders.extend([None] * (_MAX_LEADERS - len(leaders)))
    return leaders + [fields[name] for name, _ in _FIELD_LABELS]
    
        

def extract_info_from_link(link):
    """Download the page at ``link`` and extract details from its infobox.

    The first ``<table class="infobox vcard">`` on the page (or ``None`` when
    there is none) is handed to ``extract_info_from_table``, whose result is
    returned unchanged.
    """
    response = requests.get(link)
    page = BeautifulSoup(response.text, 'html.parser')
    return extract_info_from_table(page.find('table', class_="infobox vcard"))

In [11]:
# One record per INGO: its name followed by the details scraped from its page.
detailed_INGO = [
    [ingo_name] + extract_info_from_link(ingo_link)
    for ingo_name, ingo_link in zip(INGO['Name'], INGO['Link'])
]

In [12]:
# Tabulate the scraped records; the column order mirrors the record layout
# (name first, then the values returned by extract_info_from_table).
INGO_detailed = pd.DataFrame(
    data=detailed_INGO,
    columns=[
        'Name',
        'Leader 1', 'Leader 2', 'Leader 3', 'Leader 4',
        'Revenue', 'Headquarters', 'Location',
        'Other names', 'Subsidiaries', 'Purpose',
    ],
)

In [13]:
# Persist the detailed INGO table as a CSV file (the index is written as well).
INGO_detailed.to_csv(path_or_buf='../../generated/INGO/wikipedia_INGO_info.csv')