# Setup

Import common modules and verify that sufficiently recent versions of Python and Scikit-Learn are installed.

In [1]:
import re
import sys

# version_info is a tuple, so this comparison is numeric.
assert sys.version_info >= (3, 5), "Python 3.5 or newer is required"

import sklearn

# Compare the (major, minor) numbers as integers: comparing the raw version
# strings is lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass.
_sklearn_version = tuple(int(part) for part in re.findall(r"\d+", sklearn.__version__)[:2])
assert _sklearn_version >= (0, 20), "Scikit-Learn 0.20 or newer is required"

import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes: axis labels slightly larger than tick labels.
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12


# Download Data

In [2]:
import os
import tarfile
from urllib import request

DATA_DIR = "data"
COUNTRY_CODE_URL = "https://raw.githubusercontent.com/datasets/country-codes/master/data/country-codes.csv"

def fetch_countries_codes(country_code_url=COUNTRY_CODE_URL, data_dir=DATA_DIR):
    """Download the country-codes CSV into ``data_dir``.

    Parameters
    ----------
    country_code_url : str
        URL the CSV file is downloaded from.
    data_dir : str
        Directory the file is saved to; it is created if it does not exist.

    Returns
    -------
    str
        Path of the saved file, ``<data_dir>/country-codes.csv``.
    """
    os.makedirs(data_dir, exist_ok=True)
    # Build the target from the ``data_dir`` argument rather than the
    # module-level DATA_DIR, so a caller-supplied directory is honoured.
    csv_path = os.path.join(data_dir, "country-codes.csv")
    request.urlretrieve(country_code_url, csv_path)
    return csv_path
 

In [3]:
# Download the country codes file, then list the data directory it is
# saved to by default.
fetch_countries_codes()

os.listdir(DATA_DIR)

['.DS_Store', 'owid', 'country-codes.csv', 'income.json', 'nasa']

# Load Data

In [4]:
import pandas as pd

# Locations of the input CSV files, all below the local data directory.
DATASET_PATH = "./data"
COUNTRY_CODE_CSV_PATH = os.path.join(DATASET_PATH, "country-codes.csv")
OWID_DATASETS = os.path.join(DATASET_PATH, "owid")
CO2_EMISSION_CSV_PATH = os.path.join(OWID_DATASETS, "owid-co2-data.csv")

# Load both CSV files into DataFrames.
co2_emission = pd.read_csv(CO2_EMISSION_CSV_PATH)
country_codes = pd.read_csv(COUNTRY_CODE_CSV_PATH)

# Take a quick look at the data

In [5]:
# Preview the first rows of the emissions table.
co2_emission.head()

Unnamed: 0,country,year,iso_code,population,gdp,cement_co2,cement_co2_per_capita,co2,co2_growth_abs,co2_growth_prct,...,share_global_cumulative_other_co2,share_global_flaring_co2,share_global_gas_co2,share_global_luc_co2,share_global_oil_co2,share_global_other_co2,total_ghg,total_ghg_excluding_lucf,trade_co2,trade_co2_share
0,Afghanistan,1850,AFG,3752993.0,,,,,,,...,,,,0.121,,,,,,
1,Afghanistan,1851,AFG,3769828.0,,,,,,,...,,,,0.118,,,,,,
2,Afghanistan,1852,AFG,3787706.0,,,,,,,...,,,,0.116,,,,,,
3,Afghanistan,1853,AFG,3806634.0,,,,,,,...,,,,0.115,,,,,,
4,Afghanistan,1854,AFG,3825655.0,,,,,,,...,,,,0.114,,,,,,


In [6]:
# Summarise the columns: non-null counts and dtypes.
co2_emission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46523 entries, 0 to 46522
Data columns (total 74 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    46523 non-null  object 
 1   year                                       46523 non-null  int64  
 2   iso_code                                   39862 non-null  object 
 3   population                                 38574 non-null  float64
 4   gdp                                        14551 non-null  float64
 5   cement_co2                                 24974 non-null  float64
 6   cement_co2_per_capita                      22714 non-null  float64
 7   co2                                        31349 non-null  float64
 8   co2_growth_abs                             28944 non-null  float64
 9   co2_growth_prct                            25032 non-null  float64
 10  co2_including_luc     

# Cleaning the country codes dataset

In [7]:
# Keep only the rows that have a numeric ISO 3166-1 code and store that code
# as an integer.
country_codes = (
    country_codes
    .dropna(subset=['ISO3166-1-numeric'])
    .astype({'ISO3166-1-numeric': 'int'})
)

# Two-column lookup table: alpha-3 code -> numeric code, with the alpha-3
# column renamed to `iso_code`.
country_iso_codes = (
    country_codes
    .rename(columns={'ISO3166-1-numeric': 'iso_code_numeric', 'ISO3166-1-Alpha-3': 'iso_code'})
    .loc[:, ['iso_code', 'iso_code_numeric']]
)
country_iso_codes.head(5)

Unnamed: 0,iso_code,iso_code_numeric
0,TWN,158
1,AFG,4
2,ALB,8
3,DZA,12
4,ASM,16


# Enrich the CO2 emission dataset with numeric country codes

In [8]:
# Attach the numeric ISO code to every emission row. The left join leaves
# `iso_code_numeric` empty for rows whose `iso_code` has no match in
# `country_iso_codes`; those rows are dropped before the code is cast to int.
world_co2_emission = (
    pd.merge(left=co2_emission, right=country_iso_codes, on='iso_code', how='left')
    .dropna(subset=['iso_code_numeric'])
    .astype({'iso_code_numeric': 'int'})
)


# CO2 emissions between 1800 and 2020

In [11]:
import altair as alt
import janitor
import pandas as pd
from vega_datasets import data

def world_co2_emission_in_year(co2_emission, year):
    """Return the rows of ``co2_emission`` recorded for ``year``.

    Parameters
    ----------
    co2_emission : pandas.DataFrame
        Emission table with a ``year`` column.
    year : int
        Year to select.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``year`` equals ``year``; empty if there are none.
    """
    # Filter the frame that was passed in rather than a module-level global,
    # so the function works on any emission table.
    return co2_emission.loc[co2_emission['year'] == year]


# NOTE(review): `world_source` is not referenced by the charts below; it is
# kept in case a later cell relies on it.
world_source = world_co2_emission_in_year(world_co2_emission, 2000)
source = alt.topo_feature(data.world_110m.url, "countries")
# Plain light-grey world map drawn underneath the coloured layer.
background = alt.Chart(source).mark_geoshape(fill="#ececec")

charts = []
for year in [1800, 1850, 1900, 1950, 2000, 2020]:
    # Pass only the lookup key and the two displayed fields, so the other
    # columns of the frame are not carried into each chart.
    year_emission = world_co2_emission_in_year(world_co2_emission, year)[
        ["iso_code_numeric", "co2", "country"]
    ]

    # Join the emission rows onto the map shapes by numeric country code and
    # colour each country by its `co2` value.
    foreground = (
        alt.Chart(source)
        .mark_geoshape(stroke="black", strokeWidth=0.15)
        .encode(
            color=alt.Color("co2:Q"),
            tooltip=[
                alt.Tooltip("country:N", title="Country"),
                alt.Tooltip("co2:Q", title="CO2 emission"),
            ],
        )
        .transform_lookup(
            lookup="id",
            from_=alt.LookupData(year_emission, "iso_code_numeric", ["co2", "country"]),
        )
    )

    # Title each map with its year so the panels can be told apart.
    chart = (
        (background + foreground)
        .properties(width=400, height=250, title="CO2 emission in {}".format(year))
        .project("naturalEarth1")
    )
    charts.append(chart)

# Arrange the maps in a two-column grid.
alt.concat(*charts, columns=2)
