Question 1.

In [1]:
def answer_one():
    """Build the top-15 ranking table joined with energy and GDP data.

    Reads three files from the current working directory:

    * ``'Energy Indicators.xls'`` - energy
    * ``'world_bank.csv'``        - GDP
    * ``'scimagojr-3.xlsx'``      - ScimEn

    Returns:
        pandas.DataFrame indexed by country name. It holds the first 15 rows
        of the ScimEn sheet, left-joined with the columns ``'Energy Supply'``,
        ``'Energy Supply per Capita'`` and ``'% Renewable'`` and with the GDP
        columns ``'2006'`` .. ``'2015'``, sorted by ``'Rank'``.
    """
    import numpy as np
    import pandas as pd

    #------------------------------Data Loading------------------------------#

    energy = pd.read_excel('Energy Indicators.xls')
    # Rows 16..242 are kept (header and footer removed). The explicit copy
    # makes the frame independent of the parent, so later writes are neither
    # ambiguous (SettingWithCopy) nor lost under copy-on-write.
    energy = energy[16:243].copy()

    GDP = pd.read_csv('world_bank.csv', header=4)

    ScimEn = pd.read_excel('scimagojr-3.xlsx')

    #------------------------------Data Cleaning------------------------------#

    ## Energy Data:

    # Drop the first two (unnamed) columns and name the remaining four.
    energy = energy.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
    energy.columns = ['Country', 'Energy Supply', 'Energy Supply per Capita', '% Renewable']
    energy = energy.reset_index(drop=True)

    # Convert missing values to NaN: any text found in a numeric column is a
    # placeholder. A single .loc write replaces the former chained indexing.
    for column_name in energy.columns[1:]:
        is_text = energy[column_name].apply(lambda value: isinstance(value, str))
        if is_text.any():
            energy.loc[is_text, column_name] = np.nan

    # Scale Energy Supply by one million (presumably a unit conversion such as
    # petajoules -> gigajoules; confirm against the data source).
    energy['Energy Supply'] = energy['Energy Supply'] * 1000000

    # Country name cleaning: strip footnote digits, then parenthesised
    # qualifiers. regex=True is explicit because pandas >= 2.0 treats the
    # pattern as a literal string by default, which left the parentheses in.
    country = energy['Country'].str.replace(r'\d+', '', regex=True)
    country = country.str.replace(r'\(.*\)', '', regex=True).str.strip()

    # Rename countries so the three datasets agree on spelling.
    energy['Country'] = country.replace({
        'Republic of Korea': 'South Korea',
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
    })

    ## GDP Data:

    GDP['Country Name'] = GDP['Country Name'].replace({
        'Korea, Rep.': 'South Korea',
        'Iran, Islamic Rep.': 'Iran',
        'Hong Kong SAR, China': 'Hong Kong',
    })
    gdp_years = [str(year) for year in range(2006, 2016)]
    GDP = GDP[['Country Name'] + gdp_years]

    ## ScimEn Data: keep only the first 15 rows.
    ScimEn = ScimEn[0:15]

    # Index every frame by country so the joins can align on the index.
    energy = energy.set_index('Country')
    ScimEn = ScimEn.set_index('Country')
    GDP = GDP.set_index('Country Name')

    #------------------------------Data Joining------------------------------#
    output_step1 = pd.merge(ScimEn, energy, how='left', left_index=True, right_index=True)
    output_step2 = pd.merge(output_step1, GDP, how='left', left_index=True, right_index=True)
    return output_step2.sort_values('Rank')

Question 2.

In [2]:
def answer_two():
    """Count the countries lost when the three datasets are intersected.

    Loads the same three files as ``answer_one`` and applies the same
    country-name cleaning, but does not cut the ScimEn sheet down to 15 rows.
    Only the country names are needed, so the numeric energy columns are left
    untouched.

    Returns:
        int: number of rows in the outer join of the three country lists minus
        the number of rows in their inner join.
    """
    import pandas as pd

    #------------------------------Data Loading------------------------------#
    energy = pd.read_excel('Energy Indicators.xls')
    energy = energy[16:243]  # Remove header and footer
    GDP = pd.read_csv('world_bank.csv', header=4)
    ScimEn = pd.read_excel('scimagojr-3.xlsx')

    #------------------------------Data Cleaning------------------------------#

    ## Energy Data: the first column left after dropping the two unnamed ones
    ## holds the country name.
    energy_names = energy.drop(columns=['Unnamed: 0', 'Unnamed: 1']).iloc[:, 0]

    # Strip footnote digits, then parenthesised qualifiers. regex=True is
    # explicit because pandas >= 2.0 defaults to literal matching.
    energy_names = energy_names.str.replace(r'\d+', '', regex=True)
    energy_names = energy_names.str.replace(r'\(.*\)', '', regex=True).str.strip()
    energy_names = energy_names.replace({
        'Republic of Korea': 'South Korea',
        'United States of America': 'United States',
        'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
        'China, Hong Kong Special Administrative Region': 'Hong Kong',
    })

    ## GDP Data:
    gdp_names = GDP['Country Name'].replace({
        'Korea, Rep.': 'South Korea',
        'Iran, Islamic Rep.': 'Iran',
        'Hong Kong SAR, China': 'Hong Kong',
    })

    # One single-column frame of country names per dataset.
    country_list_energy = pd.DataFrame({'Country': energy_names.reset_index(drop=True)})
    country_list_ScimEn = ScimEn[['Country']]
    country_list_GDP = pd.DataFrame({'Country': gdp_names.reset_index(drop=True)})

    # Outer joins: every country that appears in at least one dataset.
    country_all = pd.merge(country_list_energy, country_list_ScimEn, how='outer', on='Country')
    country_all = pd.merge(country_all, country_list_GDP, how='outer', on='Country')

    # Inner joins: only countries present in all three datasets.
    country_inter = pd.merge(country_list_energy, country_list_ScimEn, how='inner', on='Country')
    country_inter = pd.merge(country_inter, country_list_GDP, how='inner', on='Country')

    return country_all.shape[0] - country_inter.shape[0]

In [3]:
def answer_three():
    """Average GDP per country over the years 2006-2015.

    Returns:
        pandas.Series indexed by country with the row-wise mean of the GDP
        columns ``'2006'`` .. ``'2015'`` from ``answer_one()``, sorted in
        descending order.
    """
    Top15 = answer_one()
    # Select the GDP columns by label rather than by position, so the result
    # does not depend on how many columns precede them in the joined table.
    gdp_years = [str(year) for year in range(2006, 2016)]
    output = Top15[gdp_years].mean(axis=1).sort_values(ascending=False)
    return output

In [4]:
def answer_four():
    """GDP change between 2006 and 2015 for one selected country.

    The country is the one at position 5 (zero-based) of the index returned by
    ``answer_three()``; the result is its ``'2015'`` value minus its ``'2006'``
    value in the table from ``answer_one()``.
    """
    target_country = answer_three().index[5]
    country_row = answer_one().loc[target_country]
    gdp_start = country_row['2006']
    gdp_end = country_row['2015']
    return gdp_end - gdp_start

In [5]:
def answer_five():
    """Mean of the ``'Energy Supply per Capita'`` column of ``answer_one()``, as a float."""
    per_capita = answer_one()['Energy Supply per Capita']
    average = per_capita.mean()
    return float(average)

In [6]:
def answer_six():
    """Country with the largest ``'% Renewable'`` value.

    Returns:
        tuple: ``(country name, its '% Renewable' value)``.
    """
    Top15 = answer_one()
    # Sort descending and take the first index label.
    by_renewable = Top15.sort_values('% Renewable', ascending=False)
    leader = by_renewable.index[0]
    leader_row = Top15.loc[leader]
    return (leader, leader_row['% Renewable'])

In [7]:
def answer_seven():
    """Country with the highest ratio of self-citations to citations.

    Returns:
        tuple: ``(country name, its 'Self-citations' / 'Citations' ratio)``.
    """
    Top15 = answer_one()
    ratio = Top15['Self-citations'] / Top15['Citations']
    Top15['Citation Ratio'] = ratio
    # First label after a descending sort is the country with the top ratio.
    ranked = Top15.sort_values('Citation Ratio', ascending=False)
    leader = ranked.index[0]
    return (leader, Top15['Citation Ratio'].loc[leader])

In [8]:
def answer_eight():
    """Name of the country at position 2 (zero-based) when sorted by estimated population, descending.

    Population is estimated as ``'Energy Supply'`` divided by
    ``'Energy Supply per Capita'``.
    """
    Top15 = answer_one()
    estimated_population = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['Population'] = estimated_population
    by_population = Top15.sort_values('Population', ascending=False)
    return by_population.index[2]

In [9]:
def answer_nine():
    """Pearson correlation between citable documents per capita and energy supply per capita.

    Population is estimated as ``'Energy Supply'`` divided by
    ``'Energy Supply per Capita'``; citable documents per capita is
    ``'Citable documents'`` divided by that estimate.

    Returns:
        The Pearson correlation coefficient of the two per-capita series.
    """
    # Imported locally, like the other cells do: no visible cell imports
    # pandas at notebook level, so a bare ``pd`` would raise NameError.
    import pandas as pd

    Top15 = answer_one()
    population = Top15['Energy Supply'] / Top15['Energy Supply per Capita']

    # Convert to numeric type. Working on standalone Series (instead of
    # assigning into a column-slice of Top15) avoids SettingWithCopy.
    energy_per_capita = pd.to_numeric(Top15['Energy Supply per Capita'])
    documents_per_capita = pd.to_numeric(Top15['Citable documents'] / population)

    return documents_per_capita.corr(energy_per_capita, method='pearson')

In [10]:
def answer_ten():
    """Flag countries whose ``'% Renewable'`` is at or above the median.

    Returns:
        pandas.Series named ``'HighRenew'``: 1 where the country's value is
        greater than or equal to the median of the column, otherwise 0.
    """
    Top15 = answer_one()
    renewable = Top15['% Renewable']
    at_or_above_median = renewable >= renewable.median()
    Top15['HighRenew'] = at_or_above_median.astype('int')
    return Top15['HighRenew']

In [11]:
def answer_eleven():
    """Summarise estimated population by continent.

    Population is estimated as ``'Energy Supply'`` divided by
    ``'Energy Supply per Capita'``. Each country in ``answer_one()`` is mapped
    to a continent; countries without a mapping are left out of the groups.

    Returns:
        pandas.DataFrame indexed by ``'Continent'`` with the columns
        ``'size'`` (count of non-null populations), ``'sum'``, ``'mean'`` and
        ``'std'``.
    """
    import pandas as pd

    Top15 = answer_one()

    continent_of = {
        'China': 'Asia',
        'United States': 'North America',
        'Japan': 'Asia',
        'United Kingdom': 'Europe',
        # NOTE(review): the original key was 'Russian'. The ranking file most
        # likely labels the country 'Russian Federation' (confirm against the
        # data), in which case the old key never matched and the country was
        # silently dropped from the Europe group. Both spellings are mapped.
        'Russian': 'Europe',
        'Russian Federation': 'Europe',
        'Canada': 'North America',
        'Germany': 'Europe',
        'India': 'Asia',
        'France': 'Europe',
        'South Korea': 'Asia',
        'Italy': 'Europe',
        'Spain': 'Europe',
        'Iran': 'Asia',
        'Australia': 'Australia',
        'Brazil': 'South America',
    }

    # Coerce to a numeric dtype so the group-wise mean/std do not depend on
    # how pandas handles an object-dtype column.
    population = pd.to_numeric(Top15['Energy Supply'] / Top15['Energy Supply per Capita'])
    new_dat = pd.DataFrame({
        'Continent': Top15.index.to_series().map(continent_of),
        'Population': population,
    })

    # A list of aggregations plus a rename replaces the dict-renaming form of
    # SeriesGroupBy.agg, which pandas >= 1.0 rejects with SpecificationError.
    output_tab = (
        new_dat.groupby('Continent')['Population']
        .agg(['count', 'sum', 'mean', 'std'])
        .rename(columns={'count': 'size'})
    )

    return output_tab

In [None]:
def answer_thirteen():
    """Estimated population per country, formatted as text with thousands separators.

    Population is ``'Energy Supply'`` divided by ``'Energy Supply per Capita'``.

    Returns:
        pandas.Series named ``'Population'`` whose values are strings
        formatted with the ``','`` format specifier.
    """
    Top15 = answer_one()
    estimated_population = Top15['Energy Supply'] / Top15['Energy Supply per Capita']
    Top15['Population'] = estimated_population
    return Top15['Population'].apply(lambda value: format(value, ','))