In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Remove the column-display limit so wide frames are shown in full instead of
# being elided in the middle.
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw CO2 table and tidy it in a single chain: normalise the column
# names, discard the columns that are not used, shorten the US label, and keep
# only the rows that have no missing values.
df = (
    pd.read_csv('CO2_emission.csv')
    .rename(columns={'Country Name': 'country_name'})
    .drop(columns=['Indicator Name', 'country_code', '2019.1'])
    .rename(columns=str.lower)
    .assign(country_name=lambda raw: raw['country_name'].replace(['United States'], 'USA'))
    .dropna()
)
df.head()

Unnamed: 0,country_name,region,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
1,Afghanistan,South Asia,0.191745,0.167682,0.095958,0.084721,0.075546,0.068468,0.062588,0.056827,0.052691,0.040157,0.036574,0.033785,0.045574,0.051518,0.041655,0.060419,0.066583,0.065312,0.128417,0.171862,0.243614,0.296506,0.259295,0.185624,0.146236,0.172897,0.149789,0.131695,0.163295,0.159824
2,Angola,Sub-Saharan Africa,0.553662,0.544539,0.543557,0.708984,0.836804,0.912141,1.072168,1.086637,1.091825,1.10986,0.988077,0.941829,0.895578,0.924869,0.930263,0.813539,0.82184,0.811754,0.888658,0.939404,0.976184,0.985522,0.950696,1.036294,1.099779,1.135044,1.031811,0.813301,0.777675,0.792137
3,Albania,Europe & Central Asia,1.819542,1.24281,0.6837,0.638307,0.645355,0.605436,0.612367,0.466921,0.572154,0.955359,1.026213,1.055496,1.232379,1.338985,1.404059,1.338209,1.339996,1.393931,1.384311,1.441494,1.527624,1.669423,1.50324,1.53363,1.668337,1.603775,1.557664,1.788786,1.782739,1.692248
4,Andorra,Europe & Central Asia,7.521832,7.235379,6.963079,6.724178,6.541579,6.733479,6.991595,7.307441,7.639539,7.923192,7.952286,7.721549,7.56624,7.242416,7.344262,7.35378,6.790543,6.531047,6.439304,6.156687,6.157198,5.850886,5.944654,5.9428,5.807128,6.026182,6.0806,6.104134,6.362975,6.481217
5,United Arab Emirates,Middle East & North Africa,30.195189,31.778496,29.080926,29.275678,30.849333,31.125018,30.928026,30.486333,29.663581,28.887108,27.035159,29.43027,28.501462,27.96927,27.038938,25.382381,22.935104,21.370286,22.011469,19.832349,19.03977,18.509457,19.207801,20.055648,20.051698,21.077642,21.480669,20.769022,18.390678,19.329563


In [3]:
# Distinct country names remaining after cleaning.
pd.unique(df['country_name'])

array(['Afghanistan', 'Angola', 'Albania', 'Andorra',
       'United Arab Emirates', 'Argentina', 'Armenia',
       'Antigua and Barbuda', 'Australia', 'Austria', 'Azerbaijan',
       'Burundi', 'Belgium', 'Benin', 'Burkina Faso', 'Bangladesh',
       'Bulgaria', 'Bahrain', 'Bahamas, The', 'Bosnia and Herzegovina',
       'Belarus', 'Belize', 'Bolivia', 'Brazil', 'Barbados',
       'Brunei Darussalam', 'Bhutan', 'Botswana',
       'Central African Republic', 'Canada', 'Switzerland', 'Chile',
       'China', "Cote d'Ivoire", 'Cameroon', 'Congo, Dem. Rep.',
       'Congo, Rep.', 'Colombia', 'Comoros', 'Cabo Verde', 'Costa Rica',
       'Cuba', 'Cyprus', 'Czech Republic', 'Germany', 'Djibouti',
       'Dominica', 'Denmark', 'Dominican Republic', 'Algeria', 'Ecuador',
       'Egypt, Arab Rep.', 'Spain', 'Estonia', 'Ethiopia', 'Finland',
       'Fiji', 'France', 'Gabon', 'United Kingdom', 'Georgia', 'Ghana',
       'Guinea', 'Gambia, The', 'Guinea-Bissau', 'Equatorial Guinea',
       'Greec

In [4]:
# Distinct region labels remaining after cleaning.
pd.unique(df['region'])

array(['South Asia', 'Sub-Saharan Africa', 'Europe & Central Asia',
       'Middle East & North Africa', 'Latin America & Caribbean',
       'East Asia & Pacific', 'North America'], dtype=object)

In [5]:
# Countries kept for the visualisations.
countries = [
    'USA', 'China', 'Canada', 'Germany', 'Italy', 'France',
    'Mexico', 'Japan', 'India', 'Australia', 'Norway', 'Brazil',
]
# Keep only those countries, renumber the rows from 0, and round the numeric
# values to two decimals.
new_df = (
    df
    .loc[lambda cleaned: cleaned['country_name'].isin(countries)]
    .reset_index(drop=True)
    .round(2)
)
new_df.head()

Unnamed: 0,country_name,region,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Australia,East Asia & Pacific,15.45,15.32,15.34,15.46,15.69,16.06,16.43,16.63,17.56,17.63,17.72,17.8,17.98,17.72,18.17,18.15,18.14,18.52,18.3,18.22,17.59,17.3,17.02,16.44,15.83,15.86,15.91,15.82,15.49,15.24
1,Brazil,Latin America & Caribbean,1.33,1.35,1.35,1.37,1.4,1.49,1.59,1.68,1.71,1.75,1.79,1.8,1.77,1.71,1.79,1.78,1.78,1.85,1.95,1.81,2.03,2.12,2.28,2.42,2.52,2.37,2.17,2.2,2.07,2.06
2,Canada,North America,15.15,14.74,15.03,14.71,15.06,15.29,15.59,15.94,16.08,16.26,16.76,16.33,16.72,17.21,16.79,17.03,16.6,17.38,16.56,15.5,15.79,16.0,15.73,15.84,15.85,15.65,15.42,15.54,15.65,15.43
3,China,East Asia & Pacific,1.91,2.0,2.08,2.24,2.32,2.56,2.52,2.55,2.61,2.52,2.65,2.77,2.98,3.43,3.95,4.47,4.91,5.31,5.44,5.8,6.34,6.9,7.05,7.32,7.29,7.15,7.12,7.23,7.49,7.61
4,Germany,Europe & Central Asia,12.03,11.65,11.07,10.93,10.76,10.71,11.04,10.64,10.55,10.15,10.1,10.29,10.1,10.14,9.95,9.73,9.89,9.53,9.62,8.97,9.45,9.3,9.45,9.62,9.09,9.09,9.07,8.86,8.54,7.91


In [6]:
# Check which of the requested countries were actually found.
new_df['country_name'].unique()

array(['Australia', 'Brazil', 'Canada', 'China', 'Germany', 'France',
       'India', 'Italy', 'Japan', 'Mexico', 'Norway', 'USA'], dtype=object)

## Stacked Area Chart

In [7]:
# Long format: one (country_name, year, co2) row per country and year.
areachart_data_long = (
    new_df
    .drop(columns='region')
    .melt(id_vars='country_name', var_name='year', value_name='co2')
)

# Wide format: one row per year, one column per country. `values` is left
# unspecified, so the columns come back as a two-level index ('co2', country).
areachart_data = areachart_data_long.pivot(index='year', columns='country_name')
areachart_data.head()

Unnamed: 0_level_0,co2,co2,co2,co2,co2,co2,co2,co2,co2,co2,co2,co2
country_name,Australia,Brazil,Canada,China,France,Germany,India,Italy,Japan,Mexico,Norway,USA
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1990,15.45,1.33,15.15,1.91,6.12,12.03,0.65,7.15,8.83,3.21,6.91,19.41
1991,15.32,1.35,14.74,2.0,6.5,11.65,0.68,7.12,8.91,3.37,6.41,19.0
1992,15.34,1.35,15.03,2.08,6.26,11.07,0.69,7.08,8.96,3.35,7.05,19.02
1993,15.46,1.37,14.71,2.24,5.91,10.93,0.7,6.98,8.87,3.39,7.52,19.22
1994,15.69,1.4,15.06,2.32,5.81,10.76,0.73,6.9,9.27,3.62,7.93,19.26


In [8]:
# Expects the two-level ('co2', <country>) column index produced by the pivot;
# keep only the country level so each column is named after its country.
areachart_data.columns = areachart_data.columns.get_level_values(1)
# Copy the year labels out of the index into an ordinary column, then replace
# the index with a plain 0..n-1 range.
areachart_data['year'] = areachart_data.index
areachart_data = areachart_data.reset_index(drop=True)
# The year labels are bare 4-digit strings; parse them with an explicit format
# rather than relying on pandas' format inference.
areachart_data['date'] = pd.to_datetime(areachart_data['year'], format='%Y')
areachart_data.head()

country_name,Australia,Brazil,Canada,China,France,Germany,India,Italy,Japan,Mexico,Norway,USA,year,date
0,15.45,1.33,15.15,1.91,6.12,12.03,0.65,7.15,8.83,3.21,6.91,19.41,1990,1990-01-01
1,15.32,1.35,14.74,2.0,6.5,11.65,0.68,7.12,8.91,3.37,6.41,19.0,1991,1991-01-01
2,15.34,1.35,15.03,2.08,6.26,11.07,0.69,7.08,8.96,3.35,7.05,19.02,1992,1992-01-01
3,15.46,1.37,14.71,2.24,5.91,10.93,0.7,6.98,8.87,3.39,7.52,19.22,1993,1993-01-01
4,15.69,1.4,15.06,2.32,5.81,10.76,0.73,6.9,9.27,3.62,7.93,19.26,1994,1994-01-01


In [9]:
# Export the stacked-area table without the row index.
areachart_data.to_csv(path_or_buf='areachart_data.csv', index=False)

## Time Series Plot

Y: CO2 emissions  
X: Year 

- the country to display is selected via a dropdown

https://d3-graph-gallery.com/graph/line_filter.html

In [10]:
# Long format for the line chart: one (country_name, year, co2) row per
# country and year.
linechart_data = new_df.drop(columns='region').melt(
    id_vars='country_name', var_name='year', value_name='co2'
)
# The year labels are bare 4-digit strings; parse them with an explicit format
# rather than relying on pandas' format inference.
linechart_data['date'] = pd.to_datetime(linechart_data['year'], format='%Y')
linechart_data = linechart_data.sort_values(by=['country_name', 'year'])
# The string 'year' column is kept alongside 'date' on purpose: the
# small-multiples cell filters on it.
linechart_data

Unnamed: 0,country_name,year,co2,date
0,Australia,1990,15.45,1990-01-01
12,Australia,1991,15.32,1991-01-01
24,Australia,1992,15.34,1992-01-01
36,Australia,1993,15.46,1993-01-01
48,Australia,1994,15.69,1994-01-01
...,...,...,...,...
311,USA,2015,15.56,2015-01-01
323,USA,2016,15.15,2016-01-01
335,USA,2017,14.82,2017-01-01
347,USA,2018,15.22,2018-01-01


In [11]:
# Export the line-chart table without the row index.
linechart_data.to_csv(path_or_buf='linechart_data.csv', index=False)

## Bar Chart

Y: CO2 emissions  
X: Country (the data prepared below is aggregated per country, not per region)

- a dropdown selects which CO2 emissions metric (mean, max, or min) is shown on the Y-axis

https://d3-graph-gallery.com/graph/barplot_button_data_csv.html  
https://d3-graph-gallery.com/graph/lollipop_button_data_csv.html

In [12]:
# Disabled draft: a wide bar-chart table with 'co2_'-prefixed year columns,
# limited to 1990 and 2010. None of the lines below are executed.
# barchart_data = new_df.drop(['region'], axis=1)
# barchart_data.columns = barchart_data.columns.map(lambda x : 'co2_'+x if x != 'country_name' else x)
# barchart_data = barchart_data[['country_name', 'co2_1990', 'co2_2010']]
# barchart_data.head()

In [13]:
# Average CO2 value per country over every year in linechart_data.
# Only 'country_name' and 'co2' are handed to the groupby, matching the max and
# min cells. linechart_data also carries a string 'year' column and a datetime
# 'date' column; pandas >= 2.0 no longer drops non-numeric columns silently and
# raises a TypeError when asked to average the string column.
barchart_data_mean = (
    linechart_data[['country_name', 'co2']]
    .groupby('country_name', as_index=False)
    .mean()
)
barchart_data_mean['metric'] = 'mean'
barchart_data_mean

Unnamed: 0,country_name,co2,metric
0,Australia,16.826333,mean
1,Brazil,1.843,mean
2,Canada,15.921,mean
3,China,4.550667,mean
4,France,5.617,mean
5,Germany,9.941,mean
6,India,1.127333,mean
7,Italy,6.935667,mean
8,Japan,9.224,mean
9,Mexico,3.782333,mean


In [14]:
# Highest CO2 value per country, tagged with the metric it represents.
barchart_data_max = (
    linechart_data
    .loc[:, ['country_name', 'co2']]
    .groupby('country_name', as_index=False)
    .max()
    .assign(metric='max')
)
barchart_data_max

Unnamed: 0,country_name,co2,metric
0,Australia,18.52,max
1,Brazil,2.52,max
2,Canada,17.38,max
3,China,7.61,max
4,France,6.5,max
5,Germany,12.03,max
6,India,1.81,max
7,Italy,8.19,max
8,Japan,9.91,max
9,Mexico,4.19,max


In [15]:
# Lowest CO2 value per country, tagged with the metric it represents.
barchart_data_min = (
    linechart_data
    .loc[:, ['country_name', 'co2']]
    .groupby('country_name', as_index=False)
    .min()
    .assign(metric='min')
)
barchart_data_min

Unnamed: 0,country_name,co2,metric
0,Australia,15.24,min
1,Brazil,1.33,min
2,Canada,14.71,min
3,China,1.91,min
4,France,4.47,min
5,Germany,7.91,min
6,India,0.65,min
7,Italy,5.31,min
8,Japan,8.54,min
9,Mexico,3.21,min


In [16]:
# Stack the mean, max and min summaries into one long table (each block keeps
# its own row labels) and round the CO2 values to two decimals.
barchart_data2 = (
    pd.concat([barchart_data_mean, barchart_data_max, barchart_data_min])
    .assign(co2=lambda stacked: stacked['co2'].round(2))
)
barchart_data2.head()

Unnamed: 0,country_name,co2,metric
0,Australia,16.83,mean
1,Brazil,1.84,mean
2,Canada,15.92,mean
3,China,4.55,mean
4,France,5.62,mean


In [17]:
# Export the bar-chart table without the row index.
barchart_data2.to_csv(path_or_buf='barchart_data2.csv', index=False)

## Advanced, Parallel Coordinates

- One axis per year of CO2 emissions for a selected range of years (perhaps from 2005 onwards? The data prepared below uses 2010–2018)
- Focus on how CO2 emissions have changed within that specific timeframe, rather than across the entire timeframe in the dataset, shown at the country level
- Lines are color-coded by country

https://d3-graph-gallery.com/graph/parallel_custom.html

In [18]:
# Parallel-coordinates input: one row per country and one column per year for
# 2010 through 2018.
# The year columns are selected directly by their existing string labels.
# Prefixing every column with 'co2_' and then renaming the selection straight
# back was a no-op round trip that needed two hand-written year lists kept in sync.
parallel_years = [str(year) for year in range(2010, 2019)]
plot3_data = new_df[['country_name'] + parallel_years].copy()
plot3_data.head()

Unnamed: 0,country_name,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,Australia,17.59,17.3,17.02,16.44,15.83,15.86,15.91,15.82,15.49
1,Brazil,2.03,2.12,2.28,2.42,2.52,2.37,2.17,2.2,2.07
2,Canada,15.79,16.0,15.73,15.84,15.85,15.65,15.42,15.54,15.65
3,China,6.34,6.9,7.05,7.32,7.29,7.15,7.12,7.23,7.49
4,Germany,9.45,9.3,9.45,9.62,9.09,9.09,9.07,8.86,8.54


In [19]:
# Export the parallel-coordinates table without the row index.
plot3_data.to_csv(path_or_buf='parallel_coord_data.csv', index=False)

## Advanced, Small Multiples

In [20]:
# Restrict the line-chart table to 2010 onwards and renumber the rows.
# 'year' is compared against the string '2010', i.e. as text.
smallmult_data_country = (
    linechart_data
    .loc[lambda rows: rows['year'] >= '2010']
    .reset_index(drop=True)
)
smallmult_data_country

Unnamed: 0,country_name,year,co2,date
0,Australia,2010,17.59,2010-01-01
1,Australia,2011,17.30,2011-01-01
2,Australia,2012,17.02,2012-01-01
3,Australia,2013,16.44,2013-01-01
4,Australia,2014,15.83,2014-01-01
...,...,...,...,...
115,USA,2015,15.56,2015-01-01
116,USA,2016,15.15,2016-01-01
117,USA,2017,14.82,2017-01-01
118,USA,2018,15.22,2018-01-01


In [21]:
# Export the small-multiples table without the row index.
smallmult_data_country.to_csv(path_or_buf='smallmult_data_country.csv', index=False)