In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

## Data Acquisition and Cleaning

First, we grab the raw html from the website, and clean up the data to put it in a usable format.

In [2]:
# https://www.nu3.de/blogs/nutrition/food-carbon-footprint-index-2018
food = pd.read_html("https://www.nu3.de/blogs/nutrition/food-carbon-footprint-index-2018")[0]
food = food.droplevel([0,1], axis=1)

In [3]:
columns = ['#', 'Country', 'porksupply', 'porkco2', 'chickensupply', 'chickenco2', 'beefsupply', 'beefco2', 'lambsupply', 'lambco2', 
           'fishsupply', 'fishco2', 'eggsupply', 'eggco2', 'milkcheesesupply', 'milkcheeseco2', 'totalanimalco2', 
           'wheatprodsupply', 'wheatprodco2', 'ricesupply', 'riceco2', 'soybeansupply', 'soybeanco2', 
           'nutssupply', 'nutsco2', 'totalnonanimalco2', 'difftotalco2']

food.rename(columns=dict(zip(food.columns, columns)), inplace=True)
food.head()

Unnamed: 0,#,Country,porksupply,porkco2,chickensupply,chickenco2,beefsupply,beefco2,lambsupply,lambco2,...,wheatprodsupply,wheatprodco2,ricesupply,riceco2,soybeansupply,soybeanco2,nutssupply,nutsco2,totalnonanimalco2,difftotalco2
0,1,Argentina,10.51,37.2,38.66,41.53,55.48,1712.0,1.56,54.63,...,103.11,19.66,8.77,11.22,0.0,0.0,0.49,0.87,31.75,2108.9
1,2,Australia,24.14,85.44,46.12,49.54,33.86,1044.85,9.87,345.65,...,70.46,13.44,11.03,14.12,0.19,0.09,8.73,15.45,43.09,1852.46
2,3,Albania,10.88,38.51,13.23,14.21,22.5,694.3,15.32,536.5,...,138.64,26.44,7.78,9.96,0.0,0.0,4.36,7.72,44.11,1689.62
3,4,Iceland,21.69,76.77,26.87,28.86,13.36,412.26,21.12,739.62,...,72.92,13.91,3.89,4.98,0.11,0.05,3.88,6.87,25.8,1679.75
4,5,New Zealand,22.29,78.9,34.98,37.58,22.49,693.99,18.91,662.23,...,76.91,14.67,9.16,11.72,0.44,0.2,8.22,14.55,41.14,1668.67


In [4]:
# 130 different countries
food.shape

(130, 27)

In [5]:
[col for col in columns if re.findall('co2', col)]

['porkco2',
 'chickenco2',
 'beefco2',
 'lambco2',
 'fishco2',
 'eggco2',
 'milkcheeseco2',
 'totalanimalco2',
 'wheatprodco2',
 'riceco2',
 'soybeanco2',
 'nutsco2',
 'totalnonanimalco2',
 'difftotalco2']

In [6]:
food_co2 = food.drop(columns=['#', 'totalanimalco2', 'totalnonanimalco2', 'difftotalco2'] 
                               + [col for col in columns if re.findall('supply', col)])
food_co2['totalco2'] = food['totalanimalco2'] + food['totalnonanimalco2']
food_co2.head()

Unnamed: 0,Country,porkco2,chickenco2,beefco2,lambco2,fishco2,eggco2,milkcheeseco2,wheatprodco2,riceco2,soybeanco2,nutsco2,totalco2
0,Argentina,37.2,41.53,1712.0,54.63,6.96,10.46,277.87,19.66,11.22,0.0,0.87,2172.4
1,Australia,85.44,49.54,1044.85,345.65,28.25,7.82,334.01,13.44,14.12,0.09,15.45,1938.64
2,Albania,38.51,14.21,694.3,536.5,6.15,11.44,432.62,26.44,9.96,0.0,7.72,1777.84
3,Iceland,76.77,28.86,412.26,739.62,118.81,7.57,321.66,13.91,4.98,0.05,6.87,1731.35
4,New Zealand,78.9,37.58,693.99,662.23,32.51,9.1,195.5,14.67,11.72,0.2,14.55,1750.94


In [9]:
food_supply = food.drop(columns=['#', 'totalanimalco2', 'totalnonanimalco2', 'difftotalco2'] 
                               + [col for col in columns if re.findall('co2', col)][:-2])
columns_supply = food_supply.columns[1:-1]
food_supply['totalsupply'] = food_supply.loc[:, columns_supply].values.sum(axis=1)
food_supply.head()

Unnamed: 0,Country,porksupply,chickensupply,beefsupply,lambsupply,fishsupply,eggsupply,milkcheesesupply,wheatprodsupply,ricesupply,soybeansupply,nutssupply,totalsupply
0,Argentina,10.51,38.66,55.48,1.56,4.36,11.39,195.08,103.11,8.77,0.0,0.49,428.92
1,Australia,24.14,46.12,33.86,9.87,17.69,8.51,234.49,70.46,11.03,0.19,8.73,456.36
2,Albania,10.88,13.23,22.5,15.32,3.85,12.45,303.72,138.64,7.78,0.0,4.36,528.37
3,Iceland,21.69,26.87,13.36,21.12,74.41,8.24,225.82,72.92,3.89,0.11,3.88,468.43
4,New Zealand,22.29,34.98,22.49,18.91,20.36,9.91,137.25,76.91,9.16,0.44,8.22,352.7


In [11]:
# check that the totalco2 is roughly the sum of the co2 columns
max(food_co2.loc[:, food_co2.columns[1:-1]].values.sum(axis=1) - food_co2['totalco2'].values)

0.029999999999972715

#### Motivation for conversion of data to percentages:
We converted the supply columns to percentages so that it is more comparable to an individual's diet. People tend to have a better grasp of the relative amounts of each food group they eat rather than the raw amount, which is what the data was before cleaning. That way, our model can take inputs of the percent of each food group an individual consumes on an annual basis, and relate that directly to the total carbon emissions in kilograms per year (the units of `totalco2`).

In [12]:
# convert columns into percentages
food_pct = pd.DataFrame([])
for col in columns_supply:
    food_pct[col + '_pct'] = food_supply[col] / food_supply['totalsupply']
    
# add the y variable and country
food_pct['totalco2'] = food_co2['totalco2']
food_pct['Country'] = food['Country']
food_pct.head()

Unnamed: 0,porksupply_pct,chickensupply_pct,beefsupply_pct,lambsupply_pct,fishsupply_pct,eggsupply_pct,milkcheesesupply_pct,wheatprodsupply_pct,ricesupply_pct,soybeansupply_pct,totalco2,Country
0,0.024503,0.090133,0.129348,0.003637,0.010165,0.026555,0.454817,0.240394,0.020447,0.0,2172.4,Argentina
1,0.052897,0.101061,0.074196,0.021628,0.038763,0.018648,0.513827,0.154396,0.02417,0.000416,1938.64,Australia
2,0.020592,0.025039,0.042584,0.028995,0.007287,0.023563,0.574824,0.262392,0.014725,0.0,1777.84,Albania
3,0.046304,0.057362,0.028521,0.045087,0.15885,0.017591,0.482078,0.155669,0.008304,0.000235,1731.35,Iceland
4,0.063198,0.099178,0.063765,0.053615,0.057726,0.028098,0.389141,0.218061,0.025971,0.001248,1750.94,New Zealand


In [13]:
# food_pct.to_csv('../../data/nu3-pctfoodsupply-CO2-emissions.csv')

---

### Exploratory Data Analysis

We now have one main dataset that will be used for modelling: `food_pct`.

Even though we are not looking at country location in our model, we thought it'd be interesting to plot it spatially and see if there are any trends or clusters that have higher kgCO2/person/year.

Using this [website](https://towardsdatascience.com/using-python-to-create-a-world-map-from-a-list-of-country-names-cd7480d03b10) as a basis, we plot a world map of the KG CO2 emissions per person.

In [14]:
country_co2 = food_pct.loc[:, ['totalco2', 'Country']]
country_co2.rename(columns = {'Country' : 'CountryName'}, inplace=True)
country_co2.head()

Unnamed: 0,totalco2,CountryName
0,2172.4,Argentina
1,1938.64,Australia
2,1777.84,Albania
3,1731.35,Iceland
4,1750.94,New Zealand


In [63]:
#installation
# !pip install pycountry-convert

#function to convert to alpha2 country codes and continents
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2

def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(col)
    except:
        cn_a2_code = 'Unknown' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Unknown' 
    
    return (cn_a2_code, cn_continent)

In [16]:
codes = country_co2['CountryName'].apply(get_continent)

country_co2['Country'] = [code[0] for code in codes]
country_co2['Continent'] = [code[1] for code in codes]
country_co2.head()

Unnamed: 0,totalco2,CountryName,Country,Continent
0,2172.4,Argentina,AR,SA
1,1938.64,Australia,AU,OC
2,1777.84,Albania,AL,EU
3,1731.35,Iceland,IS,EU
4,1750.94,New Zealand,NZ,OC


In [17]:
#installation
# !pip install geopy

#function to get longitude and latitude data from country name
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="nu3-data-analysis")
def geolocate(country):
    try:
        # Geolocate the center of the country
        loc = geolocator.geocode(country)
        # And return latitude and longitude
        return (loc.latitude, loc.longitude)
    except:
        # Return missing value
        return (np.nan, np.nan)

In [18]:
latlons = country_co2['Country'].apply(geolocate)

country_co2['Latitude'] = [latlon[0] for latlon in latlons]
country_co2['Longitude'] = [latlon[1] for latlon in latlons]
country_co2.head()

Unnamed: 0,totalco2,CountryName,Country,Continent,Latitude,Longitude
0,2172.4,Argentina,AR,SA,-34.996496,-64.967282
1,1938.64,Australia,AU,OC,-24.776109,134.755
2,1777.84,Albania,AL,EU,33.258882,-86.829534
3,1731.35,Iceland,IS,EU,64.984182,-18.105901
4,1750.94,New Zealand,NZ,OC,-41.500083,172.834408


In [19]:
country_co2.loc[country_co2['Latitude'].isna(), :]

Unnamed: 0,totalco2,CountryName,Country,Continent,Latitude,Longitude
21,1405.7,Israel,IL,AS,,
32,1109.93,Armenia,AM,AS,,
107,241.92,Ethiopia,ET,AF,,
121,282.62,India,IN,AS,,


In [20]:
# temporarily removing them to test the world map
country_co2_clean = country_co2.loc[~country_co2['Latitude'].isna(), :]

In [25]:
#installation
# !pip install folium

# Create a world map to show distributions of users 
import folium
from folium.plugins import MarkerCluster

#empty map
world_map= folium.Map(tiles="cartodbpositron")
marker_cluster = MarkerCluster().add_to(world_map)
#for each coordinate, create circlemarker of user percent
for i in range(len(country_co2_clean)):
        lat = country_co2_clean.iloc[i]['Latitude']
        long = country_co2_clean.iloc[i]['Longitude']
        radius=5
        popup_text = """Country : {}<br>
                    kgCO2/person/year : {}<br>"""
        popup_text = popup_text.format(country_co2_clean.iloc[i]['CountryName'],
                                   country_co2_clean.iloc[i]['totalco2']
                                   )
        folium.CircleMarker(location = [lat, long], radius=radius, popup= popup_text, fill =True).add_to(marker_cluster)
#show the map
world_map

#### Another map:

In [74]:
# !pip install country-converter

import country_converter as coco

country_co2['CountryCode'] = coco.convert(names=country_co2['Country'].to_list(), to='ISO3')

Unknown not found in regex
Unknown not found in regex


In [75]:
import plotly.express as px

fig = px.choropleth(country_co2, locations="CountryCode",
                    color="totalco2", 
                    hover_name="Country", # column to add to hover information
                    color_continuous_scale=px.colors.sequential.Plasma)
fig.show()