# Calculate indicators

This notebook enables the user to use GDP, population, or other sets of indicators to calculate combined indicators such as per capita emissions or emissions / final energy use.

It should be used on data that is already pre-processed for this toolset to ensure efficiency and avoid errors.

One cell allows the user to adjust the units of the indicator. This should be done with care and the default option is not to do it! 

In [1]:
# import modules

# system
import sys, os, re
import time

# analytic
import pandas as pd
#import numpy as np

# open climate data
#import countrygroups

# plotting
#import seaborn
import matplotlib.pyplot as plt

# global stocktake tools
import gst_tools.gst_utils as utils


In [2]:
from a_parameters import *

In [3]:
# Look up the metadata of the emissions data set selected through the parameters
# (gas, sector, scenario, version). The helper returns three values, unpacked below.
(variable_name_to_display,
 proc_data_fname,
 source_name) = utils.get_primap_variable_and_and_file_name(
    gas_names[raw_entity], raw_sector, raw_scenario, version
)

# First data set (numerator of the new indicator).
# Examples of file names: 'PRIMAP-HISTCR_v2.3.1_CH4_total_excl_LULUCF.csv',
# 'R-Andrew-2018_cement-CO2.csv'
data_set_1 = proc_data_fname

# Second data set (denominator of the new indicator).
# Example of a file name: 'UN-2017-population.csv'
data_set_2 = raw_other_dataset

# Prefix of the output data file; it should contain the source names of the original data.
# The entities will automatically be added when running the script, and the file type
# ending is not needed here.
# Example: 'R-Andrew-2018-cement-CO2-UN-population'
# NOTE(review): this prefix is hard-coded and mentions CO2, while the gas is selected via
# gas_names[raw_entity] -- confirm that it matches the data set actually in use.
new_source_name = 'PRIMAP-hist_CO2_total_excl_LU_2017_population'



In [3]:
# get and clean data

# var1 is read from the 'proc-data' folder, var2 from the data folder of the toolset
# (alternative location for var2: os.path.join('proc-data', data_set_2))
fname_in1 = os.path.join('proc-data', data_set_1)
fname_in2 = os.path.join('gst_tools', 'data', data_set_2)

# read in the data
var1 = pd.read_csv(fname_in1)
# header=2: the column names of the second file are taken from its third line
var2 = pd.read_csv(fname_in2, header=2)

# EPO: make country column and variable name columns coincide
# (a new frame is assigned instead of renaming in place)
var2 = var2.rename(columns={'Country Code': 'country', 'Indicator Name': 'variable'})

# Population dataset: label every row with the unit 'Pers'
var2['unit'] = 'Pers'

# make sure that the same countries and years are available
var1, var2 = utils.ensure_common_years(var1, var2)
var1, var2 = utils.ensure_common_countries(var1, var2)

# check the data format
check1 = utils.verify_data_format(var1)
check2 = utils.verify_data_format(var2)

# Stop here if either frame fails the check: the cells below need the metadata that is
# extracted next, and would otherwise fail later with a less helpful NameError.
if not check1 or not check2:
    raise ValueError('One of the dataframes is not correct! Please check and try again!')

# get metadata for later use and checking (first unique entry of each column)
var1_name = var1['variable'].unique()[0]
var2_name = var2['variable'].unique()[0]

var1_unit = var1['unit'].unique()[0]
var2_unit = var2['unit'].unique()[0]


Common countries are: 
['NLD', 'SLB', 'TUR', 'GUY', 'MCO', 'KNA', 'OMN', 'USA', 'TUV', 'GNB', 'ZWE', 'NZL', 'AND', 'MEX', 'BDI', 'VNM', 'LCA', 'SLE', 'SMR', 'DEU', 'BLZ', 'BEL', 'MMR', 'CMR', 'MDG', 'MUS', 'NOR', 'UKR', 'SWE', 'TON', 'GEO', 'HUN', 'LSO', 'MLI', 'SLV', 'DMA', 'CYP', 'BWA', 'KGZ', 'LBR', 'PER', 'ITA', 'AFG', 'ALB', 'SAU', 'BLR', 'BTN', 'RWA', 'VUT', 'STP', 'SSD', 'IRL', 'MWI', 'JAM', 'LBY', 'FJI', 'ERI', 'ISL', 'SUR', 'BRA', 'ECU', 'HTI', 'LBN', 'WSM', 'BHR', 'SVK', 'VCT', 'SGP', 'AUS', 'BHS', 'GRD', 'HND', 'TTO', 'SWZ', 'PHL', 'TGO', 'FSM', 'UGA', 'LUX', 'CPV', 'NER', 'THA', 'PRY', 'GBR', 'TUN', 'MLT', 'CUB', 'AZE', 'NGA', 'IRQ', 'ARE', 'CHN', 'LIE', 'MDV', 'GIN', 'SOM', 'SYR', 'CAF', 'NAM', 'CRI', 'TKM', 'LAO', 'CHL', 'KIR', 'BRB', 'BIH', 'SDN', 'KOR', 'NRU', 'ATG', 'MDA', 'LVA', 'MAR', 'IDN', 'MOZ', 'ZAF', 'GHA', 'JPN', 'TZA', 'SYC', 'GTM', 'ESP', 'COM', 'TJK', 'EUU', 'BRN', 'PAK', 'TLS', 'KHM', 'BGR', 'VEN', 'ROU', 'MYS', 'GMB', 'TCD', 'BOL', 'MNG', 'ZMB', 'ETH', 'NI

In [5]:
# preview the first rows of var2 as a quick sanity check
var2.head()

Unnamed: 0,Country Name,unit,Unnamed: 66,variable,country,Indicator Code,1990,1991,1992,1993,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
2,Afghanistan,Pers,,"Population, total",AFG,SP.POP.TOTL,12412311.0,13299016.0,14485543.0,15816601.0,...,29185511.0,30117411.0,31161378.0,32269592.0,33370804.0,34413603.0,35383028.0,36296111.0,37171922.0,38041757.0
4,Angola,Pers,,"Population, total",AGO,SP.POP.TOTL,11848385.0,12248901.0,12657361.0,13075044.0,...,23356247.0,24220660.0,25107925.0,26015786.0,26941773.0,27884380.0,28842482.0,29816769.0,30809787.0,31825299.0
5,Albania,Pers,,"Population, total",ALB,SP.POP.TOTL,3286542.0,3266790.0,3247039.0,3227287.0,...,2913021.0,2905195.0,2900401.0,2895092.0,2889104.0,2880703.0,2876101.0,2873457.0,2866376.0,2854191.0
6,Andorra,Pers,,"Population, total",AND,SP.POP.TOTL,54508.0,56666.0,58882.0,60974.0,...,84454.0,83748.0,82427.0,80770.0,79213.0,77993.0,77295.0,76997.0,77008.0,77146.0
8,United Arab Emirates,Pers,,"Population, total",ARE,SP.POP.TOTL,1828437.0,1937159.0,2052892.0,2173135.0,...,8549998.0,8946778.0,9141598.0,9197908.0,9214182.0,9262896.0,9360975.0,9487206.0,9630966.0,9770526.0


In [6]:
# combine data...

# For all of these, it is always var1 divided by var2, and we want to ensure that the division is done per country.
# Everything else should be constant across the table.

def prep_df_for_division(df):
    """Reduce a dataframe to its year columns, indexed by country.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'country' column (or an index named 'country', e.g. when the
        frame has already been prepared by this function) and one column per year.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by 'country' that keeps only the columns whose label,
        converted to a string, consists of 4 to 7 digits. The column order is kept.

    Raises
    ------
    KeyError
        If 'country' is neither a column nor the name of the index.
    """
    if 'country' in df.columns:
        df = df.set_index('country')
    elif df.index.name != 'country':
        raise KeyError("None of ['country'] are in the columns")
    # else: already indexed by country, which makes re-running the calling cell safe

    # labels may be strings or integers, hence the str() before matching
    year_pattern = re.compile(r"[0-9]{4,7}$")
    is_year_col = [year_pattern.match(str(col)) is not None for col in df.columns]

    # a boolean mask keeps the original column order
    return df.loc[:, is_year_col]
    
# keep only the yearly values, indexed by country, for both data sets
var1 = prep_df_for_division(var1)
var2 = prep_df_for_division(var2)

# the new indicator is var1 divided by var2
# (pandas aligns the two frames on the country index and the year columns)
new_df = var1.div(var2)

# metadata of the new indicator: the name and the unit are generated automatically
# from the names and units of the two inputs
new_variable_name = '-per-'.join([var1_name, var2_name])
new_df['variable'] = new_variable_name
new_df['unit'] = ' / '.join([var1_unit, var2_unit])

# turn 'country' back into a regular column, then reorganise the dataframe
new_df = utils.check_column_order(new_df.reset_index())


In [10]:
# inspect var1 (the notebook shows a truncated view of the table)
var1

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFG,284.000,297.000,302.000,309.000,323.000,342.000,379.000,409.000,432.000,466.000,...,632.000,652.000,649.000,649.000,664.000,635.00,630.000,625.00,636.000,666.000
AGO,1180.000,1160.000,1110.000,1230.000,1630.000,1820.000,2020.000,2080.000,2260.000,2220.000,...,2190.000,2120.000,2130.000,2170.000,2110.000,2220.00,2190.000,2170.00,2110.000,2170.000
ALB,54.500,56.400,55.300,59.900,62.400,62.600,75.000,76.400,77.000,83.300,...,94.300,94.400,96.000,96.800,98.800,101.00,100.000,101.00,101.000,98.000
AND,0.667,0.692,0.696,0.675,0.651,0.638,0.637,0.616,0.603,0.594,...,0.541,0.524,0.507,0.513,0.512,0.50,0.493,0.49,0.493,0.484
ARE,457.000,481.000,501.000,555.000,553.000,645.000,745.000,799.000,836.000,895.000,...,1530.000,1600.000,1640.000,1670.000,1670.000,1780.00,1820.000,1820.00,1850.000,1900.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSM,5.840,5.840,5.700,5.830,5.920,5.860,5.900,5.900,5.970,6.000,...,6.880,7.100,7.150,7.300,7.590,7.96,8.150,7.98,8.160,8.300
YEM,125.000,122.000,125.000,129.000,130.000,133.000,149.000,166.000,184.000,199.000,...,340.000,360.000,374.000,396.000,389.000,334.00,319.000,320.00,311.000,328.000
ZAF,1880.000,1900.000,1890.000,1860.000,1820.000,1860.000,1900.000,1970.000,2030.000,2030.000,...,2390.000,2470.000,2530.000,2450.000,2480.000,2460.00,2450.000,2420.00,2410.000,2440.000
ZMB,517.000,524.000,529.000,526.000,522.000,519.000,490.000,502.000,544.000,524.000,...,645.000,625.000,692.000,705.000,703.000,714.00,710.000,689.00,698.000,713.000


In [7]:
# take a look at your new data frame: it holds the combined indicator
# together with its 'unit' and 'variable' columns

new_df

Unnamed: 0,country,unit,variable,1990,1991,1992,1993,1994,1995,1996,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,AFG,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000023,0.000022,0.000021,0.000020,0.000019,0.000019,0.000020,...,0.000022,0.000022,0.000021,0.000020,0.000020,0.000018,0.000018,0.000017,0.000017,0.000018
1,AGO,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000100,0.000095,0.000088,0.000094,0.000121,0.000131,0.000140,...,0.000094,0.000088,0.000085,0.000083,0.000078,0.000080,0.000076,0.000073,0.000068,0.000068
2,ALB,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000017,0.000017,0.000017,0.000019,0.000019,0.000020,0.000024,...,0.000032,0.000032,0.000033,0.000033,0.000034,0.000035,0.000035,0.000035,0.000035,0.000034
3,AND,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000012,0.000012,0.000012,0.000011,0.000010,0.000010,0.000010,...,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006,0.000006
4,ARE,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000250,0.000248,0.000244,0.000255,0.000241,0.000267,0.000293,...,0.000179,0.000179,0.000179,0.000182,0.000181,0.000192,0.000194,0.000192,0.000192,0.000194
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,WSM,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000036,0.000036,0.000034,0.000035,0.000035,0.000034,0.000034,...,0.000037,0.000038,0.000038,0.000038,0.000039,0.000041,0.000042,0.000041,0.000042,0.000042
190,YEM,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000011,0.000010,0.000010,0.000009,0.000009,0.000009,0.000010,...,0.000015,0.000015,0.000015,0.000016,0.000015,0.000013,0.000012,0.000011,0.000011,0.000011
191,ZAF,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000051,0.000050,0.000049,0.000047,0.000045,0.000045,0.000045,...,0.000047,0.000047,0.000048,0.000046,0.000045,0.000044,0.000044,0.000042,0.000042,0.000042
192,ZMB,Gg CH4 / yr / Pers,"CH4_total_excl_LULUCF-per-Population, total",0.000064,0.000064,0.000063,0.000061,0.000059,0.000057,0.000052,...,0.000047,0.000045,0.000048,0.000047,0.000046,0.000045,0.000043,0.000041,0.000040,0.000040


In [29]:
# If desired, you can set the unit of the new data here.

# ****** BE CAREFUL!!! *******
# When convert_unit is True, the data ARE multiplied by conversion_factor and the 'unit'
# column is overwritten with desired_unit. Nothing checks that the factor and the new
# label are consistent with the original unit, and the original unit is only printed
# below; it is not stored in the dataframe.
# Recommended option: set to False and keep automatic calculation of units.

convert_unit = True
desired_unit = 'ktCO2 / capita'
conversion_factor = 1000
# NOTE(review): the sample output of the combined data shows the unit 'Gg CH4 / yr / Pers';
# 'ktCO2 / capita' with a factor of 1000 does not look consistent with that -- confirm the
# label and the factor before using the converted data.

# For reference:
# * 1 Gg / Thousand Pers = 1 t / person

conv_df = new_df.copy()

# unique() returns an array; join its entries into a plain string so that the message
# below is a single line of text rather than an array of strings
org_unit = ', '.join(str(unit) for unit in new_df['unit'].unique())

if convert_unit:

    # only announce a conversion when one is actually applied
    print('*******************')
    print('Converting unit from "' + org_unit + '" to "' + desired_unit +
          '" using a conversion factor of ' + str(conversion_factor))
    print('*******************')

    conv_df['unit'] = desired_unit

    # convert the data: move the non-year columns into the index so that only the
    # yearly values are multiplied, then restore them as regular columns
    conv_df, other_cols = utils.set_non_year_cols_as_index(conv_df)
    conv_df = conv_df * conversion_factor
    conv_df = conv_df.reset_index()

    final_df = conv_df

else:

    # no conversion requested: keep the automatically generated unit
    final_df = new_df

*******************
['Converting unit from "kt / ThousandPers" to "ktCO2 / capita" using a conversion factor of 1000']
*******************


In [31]:
## write the data to file

# First ensure that 'country', 'unit' and 'variable' are all in the data that will be
# written (final_df). If they are, proceed to write the data.

required_cols = ['country', 'unit', 'variable']
missing_cols = [col for col in required_cols if col not in final_df.columns]

if missing_cols:

    print('Missing required information! Please check your input data and processing!')

else:

    out_dir = 'proc-data'

    # define filename as composite of variable and source name
    fname_out = new_source_name + '_' + new_variable_name + '.csv'
    fullfname_out = os.path.join(out_dir, fname_out)

    # make sure the output folder exists
    os.makedirs(out_dir, exist_ok=True)

    # if the file already exists, do not overwrite it: write to a date-stamped name instead
    # (a stamped file from the same hour would still be replaced)
    if os.path.exists(fullfname_out):
        print('WARNING: This file already exists! adding a date stamp to the file name.')
        fname_out = (new_source_name + '_' + new_variable_name + '_'
                     + time.strftime("%Y%m%d-%H") + '.csv')
        fullfname_out = os.path.join(out_dir, fname_out)

    # write to csv in proc data folder
    final_df.to_csv(fullfname_out, index=False)

    # celebrate success
    print('Processed data written to file!')
    print(fullfname_out)
    

Processed data written to file! - proc-data\R-Andrew-2018-cement-CO2-UN-population.csv_Cement-process-emissions-CO2-per-population.csv
