In [56]:
import os
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import numpy as np
import json

In [443]:
# CSV with one row per country; read by make_country_names_thesaurus.
COUNTRY_INFORMATION_FILE = "country_information_updated.csv"
# Encoding passed to pandas when that CSV is read.
UNICODE_ENCODING = 'utf-8'
# Text of the table option the scraper selects on the page.
TOPIC = "Electricity"
# Highest attempt number run_scraper_iteratively will still retry from.
ATTEMPTS_LIMIT = 20
# Number of failed fetches at which run_scraper aborts its pass.
FAIL_LIMIT = 5

In [444]:
def make_country_names_thesaurus(country_names_thesaurus_file=COUNTRY_INFORMATION_FILE):
    """
    Build a lookup from each primary country name to its alternate names.

    Parameters
    ----------
    country_names_thesaurus_file : str
        Path of the CSV to read. It must contain the columns
        'primary_country_name', 'geo_country_name', 'carma_country_name'
        and 'iea_country'.

    Returns
    -------
    dict
        {primary_country_name: [geo_country_name, carma_country_name, iea_country]},
        one entry per CSV row. Cells that are empty in the CSV come back as
        whatever pandas puts in them (NaN for missing values).
    """
    alias_columns = ('geo_country_name', 'carma_country_name', 'iea_country')
    table = pd.read_csv(country_names_thesaurus_file, encoding=UNICODE_ENCODING)

    # read_csv is called without index_col, so row labels are 0..n-1 and
    # label-based .loc lookups line up with range(n).
    return {
        table.loc[row, 'primary_country_name']: [
            table.loc[row, column] for column in alias_columns
        ]
        for row in range(table.shape[0])
    }

In [445]:
class IEAGenerationScraper:
    """
    Browser-driven scraper that reads generation-by-fuel figures from a web page.

    Creating an instance opens a Chrome session on the module-level ``URL``,
    switches the page to its table view and selects the requested year and
    topic. After that, choose a country with ``country_selecter`` and read the
    table that is displayed with ``retrieve_data``. Call ``close`` when done so
    the browser is shut down.

    Element lookups use ``find_element(By.<kind>, value)`` /
    ``find_elements(...)``: the ``find_element_by_*`` shortcuts no longer exist
    in Selenium 4, while the generic form works on both old and new releases.

    NOTE(review): ``URL`` is not defined in the visible part of this notebook;
    it has to exist as a global before an instance is created.

    Parameters
    ----------
    topic : str
        Text to look for among the options of the ``table`` selector.
    year : int or str
        Text to look for among the options of the ``year`` selector.
    """

    # Seconds to wait for a page element before reporting a timeout.
    WAIT_SECONDS = 5

    # Keys of the result dict, in the order they are reported.
    _FUEL_KEYS = (
        'Coal',
        'Oil',
        'Natural gas',
        'Biofuels',
        'Waste',
        'Nuclear',
        'Hydro',
        'Geothermal',
        'Solar PV',
        'Solar thermal',
        'Wind',
        'Tide',
        'Other sources',
        'Total production',
    )

    def __init__(self, topic, year):
        self.year = year
        self.topic = topic
        self.driver = self.driver_setup()
        try:
            self.table_view = self.reach_table_panel()
            self.configure_table()
        except BaseException:
            # Setup failed after the browser was started: shut it down instead
            # of leaving an orphaned Chrome process, then report the error.
            self.close()
            raise

    def close(self):
        """Quit the browser session. Calling it more than once is harmless."""
        if getattr(self, '_closed', False):
            return
        self._closed = True
        self.driver.quit()

    def driver_setup(self):
        """Start Chrome, load ``URL`` and return the driver."""
        driver = webdriver.Chrome()
        try:
            driver.get(URL)
        except BaseException:
            # The caller never receives the driver in this case, so it must be
            # released here.
            driver.quit()
            raise
        return driver

    def reach_table_panel(self):
        """
        Wait for the element with id 'app', click its second ``<li>`` and
        return the 'app' element.

        A timeout while waiting is only reported on stdout; the lookup that
        follows raises if the element is really absent.
        """
        driver = self.driver

        try:
            WebDriverWait(driver, self.WAIT_SECONDS).until(
                EC.presence_of_element_located((By.ID, 'app')))
        except Exception:
            print("Running over time, unable to locate the table mode button")

        table_view = driver.find_element(By.ID, 'app')
        table_view.find_elements(By.TAG_NAME, 'li')[1].click()

        return table_view

    def __select_parameters(self, header, selection = None):
        """
        Click the first ``<option>`` under the element named ``header`` whose
        text contains ``str(selection)``. Nothing happens when no option
        matches.
        """
        assert selection is not None,"an option must be provided in the 'select' field"

        cell = self.__find_element_by_header(header)
        wanted = str(selection)

        for option in cell.find_elements(By.TAG_NAME, 'option'):
            if wanted in option.text:
                option.click()
                return

    def __fill_parameters(self, header, value = None):
        """Type ``value`` followed by a newline into the ``<input>`` under ``header``."""
        assert value is not None, 'a value must be provided'

        cell = self.__find_element_by_header(header)

        input_cell = cell.find_element(By.TAG_NAME, 'input')
        input_cell.click()
        input_cell.send_keys(str(value) + '\n')
        return

    def __find_element_by_header(self, header):
        """Return the element inside the table view whose ``name`` is ``header``."""
        return self.table_view.find_element(By.NAME, header)

    def year_selecter(self, selection):
        """Select ``selection`` in the 'year' selector."""
        return self.__select_parameters('year', selection = selection)

    def topic_selecter(self, selection):
        """Select ``selection`` in the 'table' selector."""
        return self.__select_parameters('table', selection = selection)

    def country_selecter(self, value):
        """Enter ``value`` in the 'country' input."""
        return self.__fill_parameters('country', value = value)

    def configure_table(self):
        """Apply the year and topic given to the constructor."""
        self.year_selecter(self.year)
        self.topic_selecter(self.topic)

    def __locate_table(self):
        """
        Wait for, then return, the element with class 'stats-table'.

        A timeout is only reported on stdout; the lookup that follows raises if
        the table is really absent.
        """
        view = self.table_view

        try:
            WebDriverWait(view, self.WAIT_SECONDS).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'stats-table')))
        except Exception:
            print("Running over time, unable to locate the table")

        return view.find_element(By.CLASS_NAME, 'stats-table')

    def retrieve_data(self):
        """
        Read the displayed table into a ``{fuel: generation}`` dict.

        Returns
        -------
        dict
            The dict from ``json_setup`` with every fuel found in the table
            replaced by its value; fuels not found keep -1. Reading stops after
            the 'Total production' row.

        Raises
        ------
        AssertionError
            If the table has no row besides the first one.
        """
        table = self.__locate_table()
        generation_by_fuel = self.json_setup()
        rows = table.find_elements(By.TAG_NAME, 'tr')

        assert len(rows) > 1, 'the table has no data rows'

        # rows[0] is skipped, matching the layout the scraper was written for.
        for row in rows[1:]:
            (fuel, gen) = self.parse_row(row)

            if fuel in generation_by_fuel:
                generation_by_fuel[fuel] = gen

            if fuel == 'Total production':
                break

        return generation_by_fuel

    def parse_row(self, row):
        """
        Return ``(fuel, generation)`` for one table row.

        The fuel is the text of the first ``<td>`` with any '*' removed. The
        generation is the second ``<td>`` parsed as an int after stripping the
        U+202F separators the page puts between digit groups; an empty cell
        gives 0.
        """
        fields = row.find_elements(By.TAG_NAME, 'td')
        fuel = fields[0].text.replace('*','')
        generation = fields[1].text
        generation = 0 if generation == '' else int(generation.replace('\u202f',''))

        return (fuel, generation)

    def json_setup(self):
        """Return a fresh result dict with every fuel set to -1 ("not read")."""
        return dict.fromkeys(self._FUEL_KEYS, -1)


In [446]:
def run_scraper(total_gen_by_country, year):
    """
    Make one pass over ``country_names`` and scrape every country still missing.

    A new ``IEAGenerationScraper`` (and therefore a new browser) is created for
    the pass and is shut down again when the pass ends, whether it finished or
    was aborted.

    Parameters
    ----------
    total_gen_by_country : dict
        Results so far, ``{country: {fuel: generation}}``. Countries already
        present are skipped; newly scraped ones are added in place.
    year : int or str
        Year to select on the page.

    Raises
    ------
    KeyboardInterrupt
        When ``FAIL_LIMIT`` countries in a row could not be fetched. This is the
        signal ``run_scraper_iteratively`` reacts to by starting a new pass.
        A genuine Ctrl-C / kernel interrupt also propagates unchanged.
    """
    scraper = IEAGenerationScraper(TOPIC, year)
    consecutive_fails = 0

    try:
        for country, country_aliases in country_names.items():

            # third alias is the name used on the scraped site
            iea_country = country_aliases[2]

            if country in total_gen_by_country:
                print("Already scraped for {}, skipping.".format(country))
                continue

            # empty CSV cells are read as NaN
            if pd.isna(iea_country):
                print("No IEA alias for {}, skipping.".format(country))
                continue

            print("Retrieving data for {}...".format(country))

            try:
                scraper.country_selecter(iea_country)
                # random pause so the page can refresh and requests are spaced out
                time.sleep(np.random.randint(1,5))

                fetched_data = scraper.retrieve_data()

                # -1 means the 'Total production' row was never read
                if fetched_data['Total production'] == -1:
                    raise ValueError('no total production found for {}'.format(country))

                total_gen_by_country[country] = fetched_data

            except Exception:
                # Only ordinary errors count as a failed fetch; an interrupt
                # from the user is not swallowed here.
                consecutive_fails += 1
                print('unable to fetch data for {0} \n Consecutive fail: {1}'.format(country, consecutive_fails))

                if consecutive_fails >= FAIL_LIMIT:
                    print('Reached the limit of failed fetching: {} \n Exiting program.'.format(FAIL_LIMIT))
                    raise KeyboardInterrupt

            else:
                # a success ends the streak, so the counter really is "consecutive"
                consecutive_fails = 0
                print('Success!')

            print('\n')
            time.sleep(np.random.randint(1,5))
    finally:
        # every pass starts its own browser; close it so retries do not pile up
        # Chrome processes
        scraper.driver.quit()

In [447]:


def run_scraper_iteratively(attempts, total_gen_by_country, year, target_count=139):
    """
    Call ``run_scraper`` repeatedly until a pass completes or attempts run out.

    Parameters
    ----------
    attempts : int
        Attempt number to start counting from (normally 0).
    total_gen_by_country : dict
        Results accumulated so far; filled in place by ``run_scraper``.
    year : int or str
        Year to scrape.
    target_count : int, optional
        Stop without scraping once this many countries are present.
        Defaults to 139, the value that used to be hard-coded here.

    Raises
    ------
    KeyboardInterrupt
        When a pass fails while ``attempts`` has already reached
        ``ATTEMPTS_LIMIT``.
    """
    while True:
        print('\n\n Attempt: {0} Scraped countries: {1} \n\n'.format(attempts, len(total_gen_by_country)))

        if len(total_gen_by_country) >= target_count:
            return

        try:
            run_scraper(total_gen_by_country, year)
        except (Exception, KeyboardInterrupt):
            # run_scraper reports "too many failures" by raising
            # KeyboardInterrupt, so that type has to be caught as well.
            # NOTE(review): as a consequence a real Ctrl-C during a pass also
            # triggers a retry, not a stop.
            if attempts >= ATTEMPTS_LIMIT:
                raise KeyboardInterrupt
            attempts += 1
        else:
            # the pass went through every country without aborting
            return

        

In [448]:
def transform(df,year):
    """
    Reshape a wide country-by-fuel table into long form and save it as CSV.

    The input frame is left untouched, so the cell calling this function can be
    re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per country. Must contain the country column 'Unnamed: 0' and
        the columns 'Solar PV' and 'Solar thermal'; every other column is
        treated as a fuel.
    year : int or str
        Used in the name of the value column and of the output file.

    Returns
    -------
    pandas.DataFrame
        Columns 'country', 'fuel' and 'generation_gwh_<year>', sorted by
        country and then fuel. 'Solar PV' and 'Solar thermal' are merged into
        'Solar'; 'Petcoke' and 'Cogeneration' are added with value 0.

    Notes
    -----
    The result is also written to
    'generation_by_country_by_fuel_<year>_final.csv' in the working directory.

    NOTE(review): if the input still carries -1 placeholders for fuels that
    were not read, the two solar columns add up to -2 — confirm how missing
    values should be handled.
    """
    id_column = 'Unnamed: 0'

    # drop() without inplace returns a new frame, so the caller's df is not modified
    wide = df.drop(['Solar PV','Solar thermal'],axis=1)
    wide['Solar'] = df['Solar PV'] + df['Solar thermal']
    wide['Petcoke'] = 0
    wide['Cogeneration'] = 0

    # pick fuel columns by name; do not assume the country column comes first
    fuel_columns = [column for column in wide.columns if column != id_column]

    long = pd.melt(wide,id_vars=id_column,value_vars=fuel_columns)
    long = long.sort_values([id_column,'variable'])
    long.columns = ['country','fuel','generation_gwh_{}'.format(year)]
    long = long.reset_index(drop=True)

    long.to_csv('generation_by_country_by_fuel_{}_final.csv'.format(year),index=False)

    return long

In [449]:
# Build the {primary country name: [geo, carma, iea aliases]} lookup.
# run_scraper reads it as the global `country_names`.
country_names = make_country_names_thesaurus()

In [450]:
# Scraped results keyed by country. It lives outside the scraping functions so
# that countries fetched in an earlier pass are skipped when a pass is retried.
total_gen_by_country = {}
# Attempt number handed to run_scraper_iteratively as its starting count.
attempts = 0

In [None]:
# Scrape the 2015 table; results accumulate in total_gen_by_country.
run_scraper_iteratively(attempts,total_gen_by_country, 2015)



 Attempt: 0 Scraped countries: 0 


No IEA alias for Aruba, skipping.
No IEA alias for Afghanistan, skipping.
Retrieving data for Angola...
Success!


Retrieving data for Albania...
Success!


Retrieving data for United Arab Emirates...
Success!


Retrieving data for Argentina...
Success!


Retrieving data for Armenia...
Success!


No IEA alias for American Samoa, skipping.
No IEA alias for Antarctica, skipping.
No IEA alias for Antigua and Barbuda, skipping.
Retrieving data for Australia...
Success!


Retrieving data for Austria...
Success!


Retrieving data for Azerbaijan...
Success!


No IEA alias for Burundi, skipping.
Retrieving data for Belgium...
Success!


Retrieving data for Benin...
Success!


No IEA alias for Burkina Faso, skipping.
Retrieving data for Bangladesh...
Success!


Retrieving data for Bulgaria...
Success!


Retrieving data for Bahrain...
Success!


No IEA alias for Bahamas, skipping.
Retrieving data for Bosnia and Herzegovina...
Success!


Retrieving data for B

In [425]:
# {country: {fuel: generation}} becomes one column per country; transpose so
# that each country is a row and each fuel a column.
df = pd.DataFrame(total_gen_by_country).T

In [426]:
# The year in the file name is hard-coded; keep it in step with the year passed
# to run_scraper_iteratively. The country index is written as an unnamed column.
df.to_csv('generation_by_country_by_fuel_2015.csv')

In [46]:
# Filled by the cells below with one entry per year:
# {year string: {country: {column: value}}}.
gen_by_year_country_fuel = {}

In [47]:
# Load the per-year tables. The files are read without index_col, so the
# unnamed country column written by to_csv comes back as 'Unnamed: 0'.
# NOTE(review): only the 2015 file is produced by a visible cell of this
# notebook; the other three are assumed to exist already.
gen_2014 = pd.read_csv('generation_by_country_by_fuel_2014.csv')
gen_2015 = pd.read_csv('generation_by_country_by_fuel_2015.csv')
gen_2016 = pd.read_csv('generation_by_country_by_fuel_2016.csv')
gen_2017 = pd.read_csv('generation_by_country_by_fuel_2017.csv')

In [48]:
# The 2014 and 2016 tables share one column layout, so both get the same
# treatment: harmonise the column names, drop 'Solar Thermal' and 'Total',
# index by country and store the result under the year. The frames are
# modified in place, so this cell cannot be run twice on the same frames.
for _year_key, _frame in (('2014', gen_2014), ('2016', gen_2016)):
    _frame.rename(
        columns={
            'Solar PV': 'Solar',
            'Wave_and_Tidal': 'Wave and Tidal',
            'Unnamed: 0': 'Country',
        },
        inplace=True,
    )
    _frame.drop(['Solar Thermal', 'Total'], axis=1, inplace=True)
    _frame.set_index('Country', inplace=True)
    gen_by_year_country_fuel[_year_key] = _frame.to_dict('index')

In [49]:
# The 2015 and 2017 tables use the column names produced by the scraper. Map
# them onto the names used for 2014/2016, drop 'Solar thermal' and
# 'Total production', index by country and store the result under the year.
# The frames are modified in place, so this cell cannot be run twice on the
# same frames.
for _year_key, _frame in (('2015', gen_2015), ('2017', gen_2017)):
    _frame.rename(
        columns={
            'Solar PV': 'Solar',
            'Biofuels': 'Biomass',
            'Natural gas': 'Gas',
            'Other sources': 'Other',
            'Tide': 'Wave and Tidal',
            'Unnamed: 0': 'Country',
        },
        inplace=True,
    )
    _frame.drop(['Solar thermal', 'Total production'], axis=1, inplace=True)
    _frame.set_index('Country', inplace=True)
    gen_by_year_country_fuel[_year_key] = _frame.to_dict('index')

In [58]:
# Save the nested {year: {country: {column: value}}} mapping as JSON in the
# working directory.
with open('gen_by_year_country_fuel.json','w') as file:
    json.dump(gen_by_year_country_fuel,file)