# Starter for the EIB website


In [202]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [203]:
# Index page of the EIB complaints cases; it is fetched below as the starting point of the scrape.
base_url = "http://www.eib.org/about/accountability/complaints/cases/index.htm"

In [204]:
# Fetch and parse the index page once: `html` is the parsed page, `sc` the HTTP status code.
# NOTE(review): get_page_content is defined in a later cell, so that cell must be run first.
html, sc = get_page_content(base_url)

In [205]:
def get_page_content(url, timeout=30):
    """
        Given a url, return the html content of the page parsed by BeautifulSoup.

        Parameters
        ----------
        url : str
            Address of the page to download.
        timeout : float or tuple, optional
            Seconds to wait for the server before giving up (passed straight
            to ``requests.get``). Without it a stalled server would block the
            caller forever, because requests has no default timeout.

        Returns
        -------
        tuple
            ``(page_content, status_code)`` - the parsed document and the HTTP
            status code of the response. Error pages (e.g. 404) are still
            parsed and returned; it is up to the caller to check the status.

        Raises
        ------
        requests.exceptions.RequestException
            On connection problems or when the timeout expires.
    """
    page = requests.get(url, timeout=timeout)
    page_content = BeautifulSoup(page.content, 'html.parser')
    return page_content, page.status_code

In [206]:
def get_project_table(html):
    """
        Grab the table with project information.

        Parameters
        ----------
        html : BeautifulSoup
            Parsed index page; it must contain ``<div id="consultationsList">``.

        Returns
        -------
        pandas.DataFrame
            The first table found inside the container, with its first row
            used as the column header.

        Raises
        ------
        ValueError
            If the container div is missing, or if it holds no table
            (the latter is raised by ``pandas.read_html``).
    """
    # Imported locally so the function does not depend on the file's import cell.
    from io import StringIO

    maindiv = html.find("div", {"id": "consultationsList"})
    if maindiv is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError('No <div id="consultationsList"> found in the page')

    tables = maindiv.find_all('table')
    # Newer pandas deprecates passing literal HTML text to read_html, so hand
    # it a file-like object; this also works on older pandas versions.
    frames = pd.read_html(StringIO(str(tables)), header=0)
    return frames[0]

In [207]:
def get_project_urls(html):
    """
    Retrieve the urls from the onclick js function
    """
    maindiv = html.find("div", {"id": "consultationsList"})
    trs = maindiv.find_all('tr')
    urls = []
    for i in trs:
        if i.get('onclick'):
            url = i.get('onclick').split(',')[0].replace("window.open('/"'',"").strip('\'').strip()
            url = 'http://' + url
            urls.append(url)
    return urls

**Scrape**

In [220]:
def _text_after_label(section, label):
    """
    Return the cleaned text that follows ``<strong>label</strong>`` inside
    *section*, or None when the section, the label or the text is missing.
    """
    if section is None:
        return None
    marker = section.find('strong', text=label)
    if marker is None:
        return None
    value = marker.next_sibling
    # A text node is a str subclass; a tag or a missing sibling carries no
    # plain text to clean, so report the field as absent instead of crashing.
    if not isinstance(value, str):
        return None
    return value.replace(':', '').strip()


def scrape():
    """
    Build the table of 'E' type cases enriched with data from each case page.

    Reads the module-level ``html`` (the parsed index page), attaches the case
    url to every row, keeps only rows whose ``Type`` is 'E', then downloads
    each remaining case page to extract its reference and complainant.

    Returns
    -------
    tuple
        ``(df, errors)`` where ``df`` is the filtered table with the extra
        columns ``urls``, ``project id`` and ``Filer(s)``, and ``errors`` is
        ``{'url404': [...], 'count404': n}`` describing the pages that
        answered with HTTP 404. Fields that cannot be found on a page are
        stored as None rather than aborting the whole scrape.
    """
    ## GET PROJECT TABLE
    df = get_project_table(html)
    ## GET URLS
    df['urls'] = get_project_urls(html)

    ## Limit to E type
    df = df[df.Type == 'E']

    ## Store the project specific data
    ## Only grabbing Filer/ID right now - but should be expanded
    project_data = []
    url404 = []

    ## Iterate over urls - controlling for 404 errors
    for idx, url in zip(df.index, df.urls):
        page, sc = get_page_content(url)
        if sc == 404:
            url404.append(url)
            project_data.append([idx, None, None])
            continue
        main_section = page.find('div', {'id': 'consultations'})
        project_data.append([
            idx,
            _text_after_label(main_section, 'Reference'),
            _text_after_label(main_section, 'Complainant'),
        ])

    count404 = len(url404)
    print('Number of 404 Responses', count404)

    ## Merge into DF and return
    # Index the page data by the original row label so concat aligns the rows.
    project_data = pd.DataFrame(
        project_data, columns=['idx', 'project id', 'Filer(s)']
    ).set_index('idx')
    df = pd.concat([df, project_data], axis=1)
    return df, {'url404': url404, 'count404': count404}

In [222]:
# scrape() returns a (DataFrame, 404-summary dict) pair; unpack it so that
# `df` is the DataFrame itself rather than the whole tuple.
df, scrape_errors = scrape()

In [223]:
# Bare expression so the notebook renders `df` as the cell output.
df

Unnamed: 0_level_0,Received Date,Type,Case Name,Country/Territory,Allegations,Last Stage Completed,Current Status,urls,project id,Filer(s)
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,28/09/2009,E,Gazela Bridge Rehabilitation,Serbia,Social impact of the project,16/07/2010,Closed,http://eib.org/about/accountability/complaints...,,
2,07/11/2008,E,Transport Lending Policy,Unknown,Failure to consult the public on EIB Transport...,15/04/2009,Closed,http://eib.org/about/accountability/complaints...,SG/E/2008/06,Mrs. Anna Roggenbuck (CEE Bankwatch) and Mrs. ...
7,19/05/2009,E,South Sinai Power Plant,Egypt,Negative environmental and social impacts,27/04/2010,Closed,http://eib.org/about/accountability/complaints...,SG/E/2009/05,Members of the Nuweiba Community - managers/ow...
8,13/02/2012,E,Autoroute Sfax-Gabes,Tunisia,Violation of the legal expropriation procedure,25/11/2013,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/01,Abdelhamid Bouchanoua
9,26/10/2010,E,MOZAL II,Mozambique,Breach of EIB environmental policies,04/05/2012,Closed,http://eib.org/about/accountability/complaints...,SG/E/2010/16,Coalition of Mozambican NGOs (Justiça Ambienta...
15,05/01/2012,E,Cairo Metro Line 3 (Phase 3),Egypt,The promoter ignores the complainants' request...,09/03/2014,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/12,
32,12/07/2012,E,DTS Expressway,Poland,Failure to comply with environmental law,04/03/2014,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/10,The association ‘Stowarzyszenie Gliwiczanie dl...
33,02/05/2012,E,Eastern Poland Roads Ten-T,Poland,Failure to mitigate project impacts regarding ...,18/01/2013,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/06,Confidential
49,19/10/2012,E,ICL Specialty Chemicals R&D (RSFF),Spain,Negative environmental impacts of the subsidia...,06/09/2013,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/11,
58,26/05/2012,E,Larnaca Sewerage and Drainage I,Cyprus,Environmental compliance of the project; misma...,04/03/2014,Closed,http://eib.org/about/accountability/complaints...,SG/E/2012/07,Antonio Serraino
