# Collective progress

**Example notebook for creating anonymised, collective information on progress**

* Before running this notebook, you need to prepare the data you want to assess. To do so, please use the notebooks  "prepare-PRIMAP-hist-data-for-collective-progress-plots.ipynb" or "prepare-PRIMAP-data.ipynb" in case the data you need is not yet available.
* For testing, some example data is available in the folder "proc_data"
* Enter the name of the file that you wish to use in the second cell and some of the plotting parameters. After that you can run the full notebook with minimal changes. 

In [1]:
# import modules

# system 
import re
import os

# calculation
import pandas as pd
import numpy as np

# plotting
%matplotlib inline
import seaborn
import matplotlib

# global stocktake tools
from gst_tools.make_plots import *
import gst_tools.gst_utils as utils

In [2]:
from a_parameters import *

In [3]:
# USER INPUT

# First, choose which file you want to plot the data for

variable_name_absolute, proc_data_fname, source_name = utils.get_primap_variable_and_and_file_name(gas_names[raw_entity], raw_sector, raw_scenario, version)


if absolute == True:
    data_file_name = proc_data_fname#'PRIMAP-hist_v2.3.1_CO2-total-excl-LU.csv'
    variable_name_to_display = variable_name_absolute
    data_source_to_display = source_name
else:
    with open('gst_tools/name_relative_dset.txt') as f:
        data_file_name = f.readline()
    with open('gst_tools/name_relative_variable.txt') as g:
        variable_name_to_display = g.readline()
    with open('gst_tools/combined_source.txt') as h:
        data_source_to_display = h.readline()


# other options include...
# 'PRIMAP-hist_v2.0_KyotoGHG-AR4-total-excl-LU.csv'
# 'UN-population-data-2017.csv'
# 'PRIMAP-hist_v2.0_Energy-CO2.csv'
# 'PRIMAP-hist_UN-2017_calc__CO2-per-population.csv'
# 'PRIMAP-hist_UN-2017_calc_CO2-total-excl-LU-per-population.csv'
# 'WDI2017_GDP-PPP.csv'

# Second, choose which years you are interested in analysing
#years_of_interest = ['1990', '2005', '2016']

# Third, update data description display names!
# TODO - default to automatic if not specified.
#variable_name_to_display = 'Energy CO2 emissions'#'Total CO2 emissions excl. LULUCF'
#data_source_to_display = #'PRIMAP-hist'

# Save plots?
# Set the following to True if plots should be saved. 
# If False, plots will be shown on screen but not saved to a file.
#save_opt = True

In [4]:
# DATA READING AND PREP

# read the data from file 
fname_in = os.path.join('proc-data', data_file_name)
data = pd.read_csv(fname_in)

# Check the data format
if not utils.verify_data_format(data):          # WARNING: Script from Utils used here.
    print('WARNING: The data is not correctly formatted! Please check before continuing!')

# extract the key information
variable = data['variable'].unique()[0]
unit = data['unit'].unique()[0]

# tidy up for next steps
data_years = utils.set_countries_as_index(data)
data_years = data_years.dropna(axis=1, how='all')
data_years = data_years.dropna(axis=0, how='any')

# remove comment below to display the data
data_years

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFG,0.226388,0.198511,0.109765,0.096102,0.084330,0.075646,0.068423,0.062509,0.058264,0.046354,...,0.296380,0.411722,0.336956,0.273632,0.243926,0.239150,0.200661,0.189001,0.189928,0.257086
AGO,0.911517,0.914368,0.916463,0.917779,0.712395,1.333792,1.361043,1.116224,1.119821,1.140742,...,1.006155,1.019791,1.055444,1.010925,1.410449,1.043595,1.050534,0.992730,0.895819,0.848382
ALB,1.299238,1.851971,1.241131,1.214643,1.309416,1.279886,1.259457,1.318180,1.563034,1.682333,...,2.025389,2.292445,2.203144,2.276266,2.526735,2.159195,2.159173,2.491772,2.326980,2.484066
AND,7.503486,7.358910,7.234809,7.150589,7.004276,7.297213,7.706291,7.898256,8.325538,8.653368,...,6.583466,6.268806,6.345008,6.338987,6.274223,6.372367,6.520474,6.571685,6.414918,6.092344
ARE,25.705015,27.204788,25.963373,28.070046,29.376166,27.493697,27.450444,25.679794,26.979817,25.050328,...,18.011700,18.107077,21.768623,21.417914,20.728915,22.779053,21.685775,18.024274,17.651397,17.399268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSM,0.638832,0.658537,0.670735,0.664205,0.681728,0.699794,0.736132,0.755528,0.769502,0.789143,...,1.011057,1.082846,1.052414,1.043458,1.092498,1.219575,1.269662,1.254108,1.284875,1.791033
YEM,0.829207,0.757593,0.779673,0.646175,0.706411,0.757712,0.769267,0.813737,0.851048,0.963291,...,0.967400,0.957678,0.956149,1.260582,1.188840,0.618917,0.485862,0.456263,0.712314,0.706401
ZAF,7.445550,7.529371,6.774821,7.039454,7.346404,7.650396,7.599251,7.955809,7.737695,7.578055,...,8.727577,8.864744,8.384965,8.475030,8.891874,8.305293,8.255104,8.279285,7.890294,7.940809
ZMB,0.309823,0.297090,0.293444,0.292266,0.275092,0.239650,0.201290,0.256314,0.235140,0.179477,...,0.194767,0.205374,0.248874,0.259939,0.277276,0.274570,0.289059,0.385674,0.405147,0.396954


In [5]:
# Plot 1 - make a histogram of absolute data

# how should the xaxis be labelled?
xaxlabel = variable_name_to_display

# make a plot for each year
for selected_year in years_of_interest:
    
    title = 'Distribution of ' + variable_name_to_display + ' in ' + str(selected_year)
    
    make_histogram(data_years[selected_year], unit,             # WARNING: This is a script from the make_plots.py tool.
                   xlabel=xaxlabel, title=title, 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3,
                   save_plot=save_opt, plot_name=(variable + '-' + 'absolute' + '-' + str(selected_year)))


---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-absolute-1990 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 1990, dtype: float64)
upper outliers are: 
country
LUX    30.902187
QAT    34.013963
Name: 1990, dtype: float64
---




---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-absolute-2005 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2005, dtype: float64)
upper outliers are: 
country
ARE    30.512909
LUX    26.012667
QAT    61.242648
TTO    39.491029
Name: 2005, dtype: float64
---
---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-absolute-2016 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper outliers are: 
country
BHR    24.337334
QAT    44.078106
TTO    39.054475
Name: 2016, dtype: float64
---
bins set to range(0, 23)


In [6]:
data_years

Unnamed: 0_level_0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFG,0.226388,0.198511,0.109765,0.096102,0.084330,0.075646,0.068423,0.062509,0.058264,0.046354,...,0.296380,0.411722,0.336956,0.273632,0.243926,0.239150,0.200661,0.189001,0.189928,0.257086
AGO,0.911517,0.914368,0.916463,0.917779,0.712395,1.333792,1.361043,1.116224,1.119821,1.140742,...,1.006155,1.019791,1.055444,1.010925,1.410449,1.043595,1.050534,0.992730,0.895819,0.848382
ALB,1.299238,1.851971,1.241131,1.214643,1.309416,1.279886,1.259457,1.318180,1.563034,1.682333,...,2.025389,2.292445,2.203144,2.276266,2.526735,2.159195,2.159173,2.491772,2.326980,2.484066
AND,7.503486,7.358910,7.234809,7.150589,7.004276,7.297213,7.706291,7.898256,8.325538,8.653368,...,6.583466,6.268806,6.345008,6.338987,6.274223,6.372367,6.520474,6.571685,6.414918,6.092344
ARE,25.705015,27.204788,25.963373,28.070046,29.376166,27.493697,27.450444,25.679794,26.979817,25.050328,...,18.011700,18.107077,21.768623,21.417914,20.728915,22.779053,21.685775,18.024274,17.651397,17.399268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WSM,0.638832,0.658537,0.670735,0.664205,0.681728,0.699794,0.736132,0.755528,0.769502,0.789143,...,1.011057,1.082846,1.052414,1.043458,1.092498,1.219575,1.269662,1.254108,1.284875,1.791033
YEM,0.829207,0.757593,0.779673,0.646175,0.706411,0.757712,0.769267,0.813737,0.851048,0.963291,...,0.967400,0.957678,0.956149,1.260582,1.188840,0.618917,0.485862,0.456263,0.712314,0.706401
ZAF,7.445550,7.529371,6.774821,7.039454,7.346404,7.650396,7.599251,7.955809,7.737695,7.578055,...,8.727577,8.864744,8.384965,8.475030,8.891874,8.305293,8.255104,8.279285,7.890294,7.940809
ZMB,0.309823,0.297090,0.293444,0.292266,0.275092,0.239650,0.201290,0.256314,0.235140,0.179477,...,0.194767,0.205374,0.248874,0.259939,0.277276,0.274570,0.289059,0.385674,0.405147,0.396954


In [7]:
# Plot 2 - trends

# Calculate trends from the absolute data
# trends - % change in any given year
# rolling_trends - % annual change averaged over the specified number of years (here 5 is recommended)
# trends_unit - unit of the trend, here % change
trends, rolling_trends, trends_unit = utils.calculate_trends(data_years, num_years_trend=5)     #WARNING: Utils here.

# define some labels for the plots
trends_variable = 'average annual change'
thistitle = "5-year rolling average trend in \n" + variable_name_to_display + "\nin " + str(data_years.columns[-1])

# make a plot showing the trend in the final year of available data
make_histogram(rolling_trends.iloc[:,-1], trends_unit, 
               xlabel=trends_variable,
               title=thistitle,
               remove_outliers=True, ktuk=2,
               sourcename=data_source_to_display, 
               save_plot=save_opt, plot_name=(variable + '-' + 'rolling-average'))


Averaging trend over 5 years.
---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-rolling-average plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2019, dtype: float64)
upper outliers are: 
country
KHM    21.702814
LAO    44.364364
MMR    16.182025
NPL    17.001685
TON    14.728429
Name: 2019, dtype: float64
---
bins set to range(-13, 13)




In [8]:
# Plot 3 - change since year X

# run calculations - dataframe of differences in all years relative to the specified year
# the function returns both absolute and relative (%) values
df_abs_diff_1990, df_perc_diff_1990 = utils.calculate_diff_since_yearX(data_years, '1990')
df_abs_diff_2005, df_perc_diff_2005 = utils.calculate_diff_since_yearX(data_years, '2005')

# make plots

# a few selected years; difference from 1990
for selected_year in years_of_interest:
    make_histogram(df_perc_diff_1990[selected_year], "%", 
                   xlabel='change since 1990', 
                   title=('change in ' + variable_name_to_display + '\n from 1990 to ' + str(selected_year)), 
                   sourcename=data_source_to_display,
                   remove_outliers=True, ktuk=3, 
                   save_plot=save_opt, plot_name=(variable + '-' + 'change-since-1990' + '-in-' + str(selected_year)))

    
# decrease in the last year as compared to 2005 
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=False, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005'))

# same plot, but removing outliers to show difference...
make_histogram(df_perc_diff_2005.iloc[:,-1], '%', 
               xlabel='change since 2005', 
               title=('change in ' + variable_name_to_display + '\n from 2005 to ' + str(df_perc_diff_2005.columns[-1])), 
               sourcename=data_source_to_display,
               remove_outliers=True, 
               save_plot=save_opt, plot_name=(variable + '-' + 'change-since-2005-excl-outliers'))


Calculating difference compared to 1990
Calculating difference compared to 2005
---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-change-since-1990-in-1990 plot.
---------
---------
All values in the series are the same! Exiting plotting routine for Total CO2 emissions (excl. LULUCF)-per-Population, total-change-since-1990-in-1990
---------
---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-change-since-1990-in-2005 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2005, dtype: float64)
upper outliers are: 
country
GNQ    7554.084615
MDV     348.340973
SYC     281.906108
VNM     311.161563
Name: 2005, dtype: float64
---
bins set to range(-260, 260, 20)
---------
Making  Total CO2 emissions (excl. LULUCF)-per-Population, total-change-since-1990-in-2016 plot.
---------
-----------
Identifying and removing outliers
lower outliers are:
Series([], Name: 2016, dtype: float64)
upper outliers are:

## Below here is space for code for testing and debugging!

In [9]:
# show example data
#data.columns


In [10]:
#data_1 = data
#data_1 = data_1.dropna(axis=1, how='all')
#data_1 = data_1.dropna(axis=0, how='any')
#data_1

In [11]:
#data.loc[data['country'] == 'AFG']
