In [5]:
import json
import os
from pprint import pprint

import pandas as pd

from util.managers.DataframeManager import DataframeManager
from util.managers.DataframeManager import ColumnName
from util.managers.CountryCodeManager import CountryCodeManager
from util.managers.UnitConverterManager import UnitConverterManager

# Variables used across the notebook and not changed

In [6]:
# Identifier of the dataset processed by this notebook; handed to the DataframeManager below.
DATASET_ID = "T006"
# Project helper used by later cells to load the CSV and to rename/insert columns.
dataframeManager = DataframeManager(DATASET_ID)
# Project helper used by later cells to look up ISO codes and ITEM regions.
countryCodeManager = CountryCodeManager()

# Opening the dataset

In [7]:
# Locate the raw CSV for this dataset. The directory defaults to the original
# author's local folder; set the ITEM_RAW_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
filename = "modal-split-of-freight-transport.csv"
raw_data_dir = os.environ.get(
    "ITEM_RAW_DATA_DIR", "/Users/hlinero/Desktop/iTEM Material/raw dataset"
)
# The dataset sub-folder is named after DATASET_ID ("T006"), so reuse the constant
# instead of repeating the literal.
path = os.path.join(raw_data_dir, DATASET_ID, filename)

# ";" is passed as the second argument — presumably the field delimiter of the file.
df = dataframeManager.get_dataframe_from_csv_file(path, ";")
df

Unnamed: 0,Date,Frequency,Geo,Measure,Tra Mode,Value
0,2012,Annual,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,2008,Annual,Croatia,Percentage,Roads,71.7
2,2009,Annual,Croatia,Percentage,Roads,71.8
3,2017,Annual,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,2009,Annual,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...,...
1401,2009,Annual,Norway,Percentage,Roads,83.6
1402,2011,Annual,Norway,Percentage,Roads,84.2
1403,2006,Annual,Switzerland,Percentage,Roads,65.0
1404,2008,Annual,Switzerland,Percentage,Inland waterways,0.1


# Dropping unnecessary columns

In [9]:
# Remove the "Frequency" column from df in place; the notebook treats it as unnecessary.
df.drop(labels="Frequency", axis="columns", inplace=True)
df

Unnamed: 0,Date,Geo,Measure,Tra Mode,Value
0,2012,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,2008,Croatia,Percentage,Roads,71.7
2,2009,Croatia,Percentage,Roads,71.8
3,2017,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,2009,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...
1401,2009,Norway,Percentage,Roads,83.6
1402,2011,Norway,Percentage,Roads,84.2
1403,2006,Switzerland,Percentage,Roads,65.0
1404,2008,Switzerland,Percentage,Inland waterways,0.1


# Renaming the "Geo" column to "Country"
    Rule: Renaming the column to comply with the template

In [10]:
# Give the "Geo" column the template's name for the country column.
dataframeManager.rename_column(
    df=df,
    current_name="Geo",
    new_name=ColumnName.COUNTRY.value,
)
df

Unnamed: 0,Date,Country,Measure,Tra Mode,Value
0,2012,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,2008,Croatia,Percentage,Roads,71.7
2,2009,Croatia,Percentage,Roads,71.8
3,2017,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,2009,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...
1401,2009,Norway,Percentage,Roads,83.6
1402,2011,Norway,Percentage,Roads,84.2
1403,2006,Switzerland,Percentage,Roads,65.0
1404,2008,Switzerland,Percentage,Inland waterways,0.1


# Renaming the "Date" column to "Year"
    Rule: Renaming the column to comply with the template

In [11]:
# Give the "Date" column the template's name for the year column.
dataframeManager.rename_column(
    df=df,
    current_name="Date",
    new_name=ColumnName.YEAR.value,
)
df

Unnamed: 0,Year,Country,Measure,Tra Mode,Value
0,2012,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,2008,Croatia,Percentage,Roads,71.7
2,2009,Croatia,Percentage,Roads,71.8
3,2017,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,2009,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...
1401,2009,Norway,Percentage,Roads,83.6
1402,2011,Norway,Percentage,Roads,84.2
1403,2006,Switzerland,Percentage,Roads,65.0
1404,2008,Switzerland,Percentage,Inland waterways,0.1


# Adding the "Source" column
    Rule: This dataset comes from Eurostat

In [12]:
# Tag the dataframe with its source: a "Source" column holding "Eurostat".
dataframeManager.simple_column_insert(
    dataframe=df,
    column_name=ColumnName.SOURCE.value,
    cell_value="Eurostat",
)
df

Unnamed: 0,Source,Year,Country,Measure,Tra Mode,Value
0,Eurostat,2012,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,Eurostat,2008,Croatia,Percentage,Roads,71.7
2,Eurostat,2009,Croatia,Percentage,Roads,71.8
3,Eurostat,2017,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,Eurostat,2009,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...,...
1401,Eurostat,2009,Norway,Percentage,Roads,83.6
1402,Eurostat,2011,Norway,Percentage,Roads,84.2
1403,Eurostat,2006,Switzerland,Percentage,Roads,65.0
1404,Eurostat,2008,Switzerland,Percentage,Inland waterways,0.1


# Getting the ISO code for each country
    Rule: For each country we need its ISO code

# Determining which countries do not have an ISO code
    Rule: As shown below, there are two countries which do not have an ISO code. The mapping that will be done in order to handle those cases is as follows:
       > Germany (until 1990 former territory of the FRG) ----> N/A
       > European Union (current composition) ----> N/A

In [14]:
# Reduce the country column to its distinct values, then ask the country-code
# manager which of those values it cannot resolve to an ISO code.
list_of_countries = list(set(df[ColumnName.COUNTRY.value]))
countries_with_no_iso_code = countryCodeManager.get_list_of_countries_with_no_iso_code(
    list_of_countries
)
countries_with_no_iso_code

['Germany (until 1990 former territory of the FRG)',
 'European Union (current composition)']

# Cleaning the list of countries to obtain their ISO code

In [15]:
# Eurostat labels that the country-code manager cannot resolve (they are the two
# entries reported by the check in the previous cell). Both receive the "N/A"
# placeholder, which the region-assignment cell later matches on.
# NOTE(review): "Germany (until 1990 former territory of the FRG)" looks like
# Eurostat's label for Germany itself, so "DEU" may be the better code — confirm
# before changing, since the region cell relies on the "N/A" placeholder.
countries_without_iso_code = {
    "Germany (until 1990 former territory of the FRG)",
    "European Union (current composition)",
}

country_column = list(df[ColumnName.COUNTRY.value])

# One ISO code per row, in row order
list_of_iso_code = [
    "N/A"
    if country in countries_without_iso_code
    else countryCodeManager.get_iso_code_for_country(country)
    for country in country_column
]

# Assert that the size of the list of iso codes is equivalent to the size of country column
assert len(list_of_iso_code) == len(country_column)

# Assert that all the iso codes are 3 characters long
# (the "N/A" placeholder is also 3 characters, so it passes this check)
assert all(len(iso_code) == 3 for iso_code in list_of_iso_code)

# Adding the ISO column to the dataframe.
# allow_duplicates is left at its default (False): re-running this cell now raises
# instead of silently inserting a second ISO column, which would turn
# df[ColumnName.ISO_CODE.value] into a DataFrame for the cells below.
df.insert(2, ColumnName.ISO_CODE.value, list_of_iso_code)
df

Unnamed: 0,Source,Year,ISO Code,Country,Measure,Tra Mode,Value
0,Eurostat,2012,CZE,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,Eurostat,2008,HRV,Croatia,Percentage,Roads,71.7
2,Eurostat,2009,HRV,Croatia,Percentage,Roads,71.8
3,Eurostat,2017,BEL,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,Eurostat,2009,GRC,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...,...,...
1401,Eurostat,2009,NOR,Norway,Percentage,Roads,83.6
1402,Eurostat,2011,NOR,Norway,Percentage,Roads,84.2
1403,Eurostat,2006,CHE,Switzerland,Percentage,Roads,65.0
1404,Eurostat,2008,CHE,Switzerland,Percentage,Inland waterways,0.1


# Getting the ITEM code for each ISO region
    Rule: For each country, we need to assign an ITEM region

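# Determining which countries do not have an ITEM region
    Rule: As seen below, the only ISO code without a region is the "N/A" placeholder given to the two entries that have no ISO code; it is assigned the "EU-27" region in the next cell.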

In [17]:
# Reduce the ISO code column to its distinct values, then ask the country-code
# manager which of those codes it cannot map to a region.
list_of_unique_iso_codes = list(set(df[ColumnName.ISO_CODE.value]))
iso_codes_with_no_region = countryCodeManager.get_list_of_iso_codes_with_no_region(
    list_of_unique_iso_codes
)
iso_codes_with_no_region

['N/A']

# Assigning the ITEM region column

In [18]:
# Getting the complete list of iso codes
list_of_all_codes = df[ColumnName.ISO_CODE.value]

# Getting the list of regions
item_regions = countryCodeManager.get_list_of_regions_for_iso_codes(list_of_all_codes)

# Hardcoding the region value for WLD ISO Code
for index in range(0, len(list_of_all_codes)):
    if list_of_all_codes[index] == "N/A":
        item_regions[index] = "EU-27"

# Adding the column to the dataframe
df.insert(3, ColumnName.ITEM_REGION.value, item_regions, True)
df

Unnamed: 0,Source,Year,ISO Code,Region,Country,Measure,Tra Mode,Value
0,Eurostat,2012,CZE,EU-27,Czech Republic,Percentage,"Railways, inland waterways - sum of available ...",30.6
1,Eurostat,2008,HRV,EU-27,Croatia,Percentage,Roads,71.7
2,Eurostat,2009,HRV,EU-27,Croatia,Percentage,Roads,71.8
3,Eurostat,2017,BEL,EU-27,Belgium,Percentage,"Railways, inland waterways - sum of available ...",26.3
4,Eurostat,2009,GRC,EU-27,Greece,Percentage,"Railways, inland waterways - sum of available ...",2.1
...,...,...,...,...,...,...,...,...
1401,Eurostat,2009,NOR,Non-EU Europe,Norway,Percentage,Roads,83.6
1402,Eurostat,2011,NOR,Non-EU Europe,Norway,Percentage,Roads,84.2
1403,Eurostat,2006,CHE,Non-EU Europe,Switzerland,Percentage,Roads,65.0
1404,Eurostat,2008,CHE,Non-EU Europe,Switzerland,Percentage,Inland waterways,0.1


# Adding the 'Variable' and 'Unit' column
    Rule: 