In [1]:
import os
from pprint import pprint

import pandas as pd

from util.DatasetManager import DatasetManager
from util.DataframeManager import DataframeManager
from util.CountryCodeManager import CountryCodeManager

# Variables used all over the notebook and *not changed*

In [2]:
# Identifier of the dataset processed in this notebook
DATASET_ID = "T004"
# Shared helper objects used by the cells below:
# `datasetManager` loads the raw CSV, `countryCodeManager` performs the ISO code / ITEM region lookups
datasetManager = DatasetManager(DATASET_ID)
countryCodeManager = CountryCodeManager()

# Opening the dataset and getting general info

In [3]:
# Root folder that holds one sub-folder per raw dataset.
# The original author's location is kept as the default so existing setups keep working;
# set the ITEM_RAW_DATASET_DIR environment variable to run the notebook on another machine.
RAW_DATASET_DIR = os.environ.get(
    "ITEM_RAW_DATASET_DIR",
    "/Users/hlinero/Desktop/iTEM Material/raw dataset",
)

filename = "T004_new-road-vehicle-registrations-by-vehicle-category-and-fuel-type.csv"

# The raw file lives in <RAW_DATASET_DIR>/<DATASET_ID>/<filename>
path = os.path.join(RAW_DATASET_DIR, DATASET_ID, filename)

# The raw file is read with ";" passed as the second argument (the field separator)
df = datasetManager.get_dataframe_from_csv_file(path, ";")
df

Unnamed: 0,Country,Date,Frequency,Fuel type,Type of vehicle,Value
0,Malta,2015,Annual,Diesel,New lorries (vehicle wt over 3500 kg),348.0
1,Malta,1998,Annual,Diesel,New passenger cars,2792.0
2,Malta,2002,Annual,Diesel,New passenger cars,5409.0
3,Malta,2002,Annual,Diesel,"New motor coaches, buses and trolley buses",37.0
4,Malta,2007,Annual,Diesel,"New motor coaches, buses and trolley buses",48.0
...,...,...,...,...,...,...
16145,Austria,2016,Annual,Alternative (total),New light goods vehicles,567.0
16146,Austria,2007,Annual,Alternative (total),New road tractors,0.0
16147,Austria,2011,Annual,Alternative (total),New road tractors,0.0
16148,Austria,2013,Annual,Alternative (total),New road tractors,0.0


# Removing unnecessary columns
    Rule: To comply with the latest template, we are dropping unnecessary columns.

In [4]:
# We are dropping the "Frequency" column because its value is not part of the template.
# errors="ignore" makes this cell safe to re-run: once the column is gone, a second
# execution is a no-op instead of raising a KeyError.
df.drop(columns=["Frequency"], inplace=True, errors="ignore")
df

Unnamed: 0,Country,Date,Fuel type,Type of vehicle,Value
0,Malta,2015,Diesel,New lorries (vehicle wt over 3500 kg),348.0
1,Malta,1998,Diesel,New passenger cars,2792.0
2,Malta,2002,Diesel,New passenger cars,5409.0
3,Malta,2002,Diesel,"New motor coaches, buses and trolley buses",37.0
4,Malta,2007,Diesel,"New motor coaches, buses and trolley buses",48.0
...,...,...,...,...,...
16145,Austria,2016,Alternative (total),New light goods vehicles,567.0
16146,Austria,2007,Alternative (total),New road tractors,0.0
16147,Austria,2011,Alternative (total),New road tractors,0.0
16148,Austria,2013,Alternative (total),New road tractors,0.0


# Adding the 'Source' column
    Rule: Add the same source to all rows since all data comes from the same source

In [5]:
# Every row comes from the same publisher, so one constant value fills the whole column
source_name = "United Nations Economic Commission for Europe"
DataframeManager.simple_column_insert(df, "Source", source_name)
df

Unnamed: 0,Source,Country,Date,Fuel type,Type of vehicle,Value
0,United Nations Economic Commission for Europe,Malta,2015,Diesel,New lorries (vehicle wt over 3500 kg),348.0
1,United Nations Economic Commission for Europe,Malta,1998,Diesel,New passenger cars,2792.0
2,United Nations Economic Commission for Europe,Malta,2002,Diesel,New passenger cars,5409.0
3,United Nations Economic Commission for Europe,Malta,2002,Diesel,"New motor coaches, buses and trolley buses",37.0
4,United Nations Economic Commission for Europe,Malta,2007,Diesel,"New motor coaches, buses and trolley buses",48.0
...,...,...,...,...,...,...
16145,United Nations Economic Commission for Europe,Austria,2016,Alternative (total),New light goods vehicles,567.0
16146,United Nations Economic Commission for Europe,Austria,2007,Alternative (total),New road tractors,0.0
16147,United Nations Economic Commission for Europe,Austria,2011,Alternative (total),New road tractors,0.0
16148,United Nations Economic Commission for Europe,Austria,2013,Alternative (total),New road tractors,0.0


# Renaming the column "Date" to "Year"
    Rule: To comply with the current version of the template, the columns showing year values must be called "Year"

In [6]:
# The template expects the year values under a column called "Year"
old_column_name, new_column_name = "Date", "Year"
DataframeManager.rename_column(df, old_column_name, new_column_name)
df

Unnamed: 0,Source,Country,Year,Fuel type,Type of vehicle,Value
0,United Nations Economic Commission for Europe,Malta,2015,Diesel,New lorries (vehicle wt over 3500 kg),348.0
1,United Nations Economic Commission for Europe,Malta,1998,Diesel,New passenger cars,2792.0
2,United Nations Economic Commission for Europe,Malta,2002,Diesel,New passenger cars,5409.0
3,United Nations Economic Commission for Europe,Malta,2002,Diesel,"New motor coaches, buses and trolley buses",37.0
4,United Nations Economic Commission for Europe,Malta,2007,Diesel,"New motor coaches, buses and trolley buses",48.0
...,...,...,...,...,...,...
16145,United Nations Economic Commission for Europe,Austria,2016,Alternative (total),New light goods vehicles,567.0
16146,United Nations Economic Commission for Europe,Austria,2007,Alternative (total),New road tractors,0.0
16147,United Nations Economic Commission for Europe,Austria,2011,Alternative (total),New road tractors,0.0
16148,United Nations Economic Commission for Europe,Austria,2013,Alternative (total),New road tractors,0.0


# Getting the ISO Code for each Country
    Rule: For each country, we have to assign its respective ISO code.

## Determining which countries do not appear in the list of ISO codes

### As seen from the cell below, "Czechia" and "The former Yugoslav Republic of Macedonia" are reported as missing from our list of ISO codes. However, both are present in that list under the following names:
    > Czechia ---> Czech Republic
    > The former Yugoslav Republic of Macedonia ---> Macedonia, The Former Yugoslav Republic of

In [7]:
# Distinct country names that occur in the dataset
distinct_countries = set(df["Country"])
list_of_countries = list(distinct_countries)

# Ask the country code helper which of those names it has no ISO code for
countries_with_no_ISO_code = countryCodeManager.get_list_of_countries_with_no_iso_code(list_of_countries)

# Show the names that still need to be mapped by hand
countries_with_no_ISO_code

['The former Yugoslav Republic of Macedonia', 'Czechia']

## Adding the ISO column to the dataset

In [8]:
# Country names that this dataset spells differently from our ISO code list,
# mapped to the spelling the ISO code list uses
country_name_fixes = {
    "Czechia": "Czech Republic",
    "The former Yugoslav Republic of Macedonia": "Macedonia, The Former Yugoslav Republic of",
}

# The country name of every row, exactly as it appears in the dataset
dirty_list_of_all_countries = df["Country"]

# Same rows with the names above replaced; every other name is left untouched
clean_list_of_all_countries = dirty_list_of_all_countries.replace(country_name_fixes).tolist()

# Ensuring the size of the clean list is equal to the old list
assert len(clean_list_of_all_countries) == len(dirty_list_of_all_countries)

# Getting the list of iso codes for the cleaned country names
list_of_iso_codes = countryCodeManager.get_list_of_iso_for_countries(clean_list_of_all_countries)

# Adding the column to the dataframe
df["ISO Code"] = list_of_iso_codes
df

Unnamed: 0,Source,Country,Year,Fuel type,Type of vehicle,Value,ISO Code
0,United Nations Economic Commission for Europe,Malta,2015,Diesel,New lorries (vehicle wt over 3500 kg),348.0,MLT
1,United Nations Economic Commission for Europe,Malta,1998,Diesel,New passenger cars,2792.0,MLT
2,United Nations Economic Commission for Europe,Malta,2002,Diesel,New passenger cars,5409.0,MLT
3,United Nations Economic Commission for Europe,Malta,2002,Diesel,"New motor coaches, buses and trolley buses",37.0,MLT
4,United Nations Economic Commission for Europe,Malta,2007,Diesel,"New motor coaches, buses and trolley buses",48.0,MLT
...,...,...,...,...,...,...,...
16145,United Nations Economic Commission for Europe,Austria,2016,Alternative (total),New light goods vehicles,567.0,AUT
16146,United Nations Economic Commission for Europe,Austria,2007,Alternative (total),New road tractors,0.0,AUT
16147,United Nations Economic Commission for Europe,Austria,2011,Alternative (total),New road tractors,0.0,AUT
16148,United Nations Economic Commission for Europe,Austria,2013,Alternative (total),New road tractors,0.0,AUT


# Getting the ITEM Region for each country
    Rule: For each country, we need to assign an ITEM region

## Determining which countries are missing an ITEM region
    As seen from the cell below, there is no country that does not have a respective ITEM region. Therefore, no further cleaning needs to be done to get the ITEM regions.

In [9]:
# Distinct ISO codes that were just added to the dataframe
distinct_iso_codes = set(df["ISO Code"])
list_of_iso_codes = list(distinct_iso_codes)

# Ask the country code helper which of those codes it has no ITEM region for
iso_code_with_no_region = countryCodeManager.get_list_of_iso_codes_with_no_region(list_of_iso_codes)

# Show the codes without a region
iso_code_with_no_region

[]

## Adding the ITEM region column to the dataset

In [10]:
# The full "ISO Code" column (not de-duplicated) is handed to the helper,
# whose result is then stored as a new column
list_of_all_codes = df["ISO Code"]
item_region = countryCodeManager.get_list_of_regions_for_iso_codes(list_of_all_codes)

# Store the regions next to the ISO codes and show the result
df["ITEM Region"] = item_region
df

Unnamed: 0,Source,Country,Year,Fuel type,Type of vehicle,Value,ISO Code,ITEM Region
0,United Nations Economic Commission for Europe,Malta,2015,Diesel,New lorries (vehicle wt over 3500 kg),348.0,MLT,EU-27
1,United Nations Economic Commission for Europe,Malta,1998,Diesel,New passenger cars,2792.0,MLT,EU-27
2,United Nations Economic Commission for Europe,Malta,2002,Diesel,New passenger cars,5409.0,MLT,EU-27
3,United Nations Economic Commission for Europe,Malta,2002,Diesel,"New motor coaches, buses and trolley buses",37.0,MLT,EU-27
4,United Nations Economic Commission for Europe,Malta,2007,Diesel,"New motor coaches, buses and trolley buses",48.0,MLT,EU-27
...,...,...,...,...,...,...,...,...
16145,United Nations Economic Commission for Europe,Austria,2016,Alternative (total),New light goods vehicles,567.0,AUT,EU-27
16146,United Nations Economic Commission for Europe,Austria,2007,Alternative (total),New road tractors,0.0,AUT,EU-27
16147,United Nations Economic Commission for Europe,Austria,2011,Alternative (total),New road tractors,0.0,AUT,EU-27
16148,United Nations Economic Commission for Europe,Austria,2013,Alternative (total),New road tractors,0.0,AUT,EU-27


# Determining the vehicle types available in the dataset


In [11]:
# Distinct values of the "Type of vehicle" column
vehicle_types = set(df["Type of vehicle"])
list(vehicle_types)

['New lorries (vehicle wt over 3500 kg)',
 'New road tractors',
 'New motor coaches, buses and trolley buses',
 'New light goods vehicles',
 'New passenger cars']

# Determining the available fuel types in the dataset

In [12]:
# Distinct values of the "Fuel type" column
fuel_types = set(df["Fuel type"])
list(fuel_types)

['Total',
 '- Biodiesel',
 '- LPG',
 '- Bi-fuel vehicles',
 '- Plug-in hybrid petrol-electric',
 '- Hybrid electric-diesel',
 '- Hydrogen and fuel cells',
 '- Electricity',
 'Diesel',
 '- Hybrid electric-petrol',
 '- Petrol (excluding hybrids)',
 '- Liquefied natural gas (LNG)',
 'Alternative (total)',
 '- Compressed natural gas (CNG)',
 '- Bioethanol',
 'Petrol',
 '- Plug-in hybrid diesel-electric',
 '- Diesel (excluding hybrids)']