In [27]:
import pandas as pd
import json
from pprint import pprint
from item.historical.scripts.util.managers.dataframe import DataframeManager
from item.historical.scripts.util.managers.dataframe import ColumnName
from item.historical.scripts.util.managers.country_code import CountryCodeManager
from item.common import paths

# Variables used across the notebook and not changed

In [28]:
# Identifier of the dataset processed by this notebook
DATASET_ID = "T007"
# Project helper bound to this dataset; used below to rename, insert and reorder columns and to export the result
dataframeManager = DataframeManager(DATASET_ID)
# Project helper used below to look up ISO codes and ITEM regions
countryCodeManager = CountryCodeManager()

# Opening the dataset

In [29]:
# Derive the input file name from DATASET_ID so it stays in sync with the dataset the managers were built for
path = paths['data'] / 'historical' / 'input' / '{}_input.csv'.format(DATASET_ID)

# The input file is semicolon-separated. The separator must be passed by keyword:
# pandas 2.0 made every read_csv argument after the path keyword-only, so the
# positional form raises a TypeError there.
df = pd.read_csv(path, sep=";")
df

Unnamed: 0,Date,Frequency,Geo,Measure,Value,Vehicle
0,2000,Annual,European Union (28 countries),Percentage,7.1,Trains
1,2007,Annual,European Union (28 countries),Percentage,7.1,Trains
2,2000,Annual,European Union (28 countries),Percentage,82.5,Passenger cars
3,2002,Annual,European Union (28 countries),Percentage,83.4,Passenger cars
4,2005,Annual,European Union (28 countries),Percentage,83.4,Passenger cars
...,...,...,...,...,...,...
2549,1990,Annual,Germany (until 1990 former territory of the FRG),Percentage,9.1,"Motor coaches, buses and trolley buses"
2550,1997,Annual,Germany (until 1990 former territory of the FRG),Percentage,7.1,"Motor coaches, buses and trolley buses"
2551,2004,Annual,Germany (until 1990 former territory of the FRG),Percentage,6.7,"Motor coaches, buses and trolley buses"
2552,2010,Annual,Germany (until 1990 former territory of the FRG),Percentage,6.0,"Motor coaches, buses and trolley buses"


# Dropping unnecessary columns

In [30]:
# Columns that are not needed for the rest of the transformation
unneeded_columns = ["Frequency", "Measure"]

# Remove them from the existing frame (mutates df in place)
df.drop(labels=unneeded_columns, axis="columns", inplace=True)
df

Unnamed: 0,Date,Geo,Value,Vehicle
0,2000,European Union (28 countries),7.1,Trains
1,2007,European Union (28 countries),7.1,Trains
2,2000,European Union (28 countries),82.5,Passenger cars
3,2002,European Union (28 countries),83.4,Passenger cars
4,2005,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...
2549,1990,Germany (until 1990 former territory of the FRG),9.1,"Motor coaches, buses and trolley buses"
2550,1997,Germany (until 1990 former territory of the FRG),7.1,"Motor coaches, buses and trolley buses"
2551,2004,Germany (until 1990 former territory of the FRG),6.7,"Motor coaches, buses and trolley buses"
2552,2010,Germany (until 1990 former territory of the FRG),6.0,"Motor coaches, buses and trolley buses"


# Renaming the "Geo" column to "Country"

In [31]:
# Replace Eurostat's "Geo" label with the project's standard country column name
dataframeManager.rename_column(
    df=df,
    current_name="Geo",
    new_name=ColumnName.COUNTRY.value,
)
df

Unnamed: 0,Date,Country,Value,Vehicle
0,2000,European Union (28 countries),7.1,Trains
1,2007,European Union (28 countries),7.1,Trains
2,2000,European Union (28 countries),82.5,Passenger cars
3,2002,European Union (28 countries),83.4,Passenger cars
4,2005,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...
2549,1990,Germany (until 1990 former territory of the FRG),9.1,"Motor coaches, buses and trolley buses"
2550,1997,Germany (until 1990 former territory of the FRG),7.1,"Motor coaches, buses and trolley buses"
2551,2004,Germany (until 1990 former territory of the FRG),6.7,"Motor coaches, buses and trolley buses"
2552,2010,Germany (until 1990 former territory of the FRG),6.0,"Motor coaches, buses and trolley buses"


# Renaming the "Date" column to "Year"

In [32]:
# Replace Eurostat's "Date" label with the project's standard year column name
dataframeManager.rename_column(
    df=df,
    current_name="Date",
    new_name=ColumnName.YEAR.value,
)
df

Unnamed: 0,Year,Country,Value,Vehicle
0,2000,European Union (28 countries),7.1,Trains
1,2007,European Union (28 countries),7.1,Trains
2,2000,European Union (28 countries),82.5,Passenger cars
3,2002,European Union (28 countries),83.4,Passenger cars
4,2005,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...
2549,1990,Germany (until 1990 former territory of the FRG),9.1,"Motor coaches, buses and trolley buses"
2550,1997,Germany (until 1990 former territory of the FRG),7.1,"Motor coaches, buses and trolley buses"
2551,2004,Germany (until 1990 former territory of the FRG),6.7,"Motor coaches, buses and trolley buses"
2552,2010,Germany (until 1990 former territory of the FRG),6.0,"Motor coaches, buses and trolley buses"


# Adding the "Source" column
    Rule: This dataset comes from Eurostat

In [33]:
# Every row of this dataset comes from Eurostat, so the source column holds a single constant value
dataframeManager.simple_column_insert(
    dataframe=df,
    column_name=ColumnName.SOURCE.value,
    cell_value="Eurostat",
)
df

Unnamed: 0,Source,Year,Country,Value,Vehicle
0,Eurostat,2000,European Union (28 countries),7.1,Trains
1,Eurostat,2007,European Union (28 countries),7.1,Trains
2,Eurostat,2000,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...
2549,Eurostat,1990,Germany (until 1990 former territory of the FRG),9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,Germany (until 1990 former territory of the FRG),7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,Germany (until 1990 former territory of the FRG),6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,Germany (until 1990 former territory of the FRG),6.0,"Motor coaches, buses and trolley buses"


# Renaming the "Germany" country name
    Rule: In this dataset, the country "Germany" appears as "Germany (until 1990 former territory of the FRG)". To make it consistent with the other datasets, we rename "Germany (until 1990 former territory of the FRG)" to just "Germany".

In [34]:
# Eurostat's long label for Germany, and the short name used by the other datasets
eurostat_germany_label = "Germany (until 1990 former territory of the FRG)"
short_germany_name = "Germany"

# variable holding the cleaned list of countries: only an exact match of the
# long label is rewritten, every other value is kept as it is
cleaned_list_of_countries = [
    short_germany_name if country == eurostat_germany_label else country
    for country in df[ColumnName.COUNTRY.value]
]

# Ensure the new list of countries is the same size as the df
assert len(df) == len(cleaned_list_of_countries)

# Overwrite the country column where it already sits. This avoids dropping the
# column and re-inserting it at a hard-coded position, uses the same
# ColumnName constant as the rest of the notebook, and is safe to re-run.
df[ColumnName.COUNTRY.value] = cleaned_list_of_countries
df

Unnamed: 0,Source,Year,Country,Value,Vehicle
0,Eurostat,2000,European Union (28 countries),7.1,Trains
1,Eurostat,2007,European Union (28 countries),7.1,Trains
2,Eurostat,2000,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...
2549,Eurostat,1990,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,Germany,6.0,"Motor coaches, buses and trolley buses"


# Getting the ISO code for each country

## Determining which countries do not have an ISO code
    Rule: As shown below, some countries do not have an ISO code. The mappings used to handle those cases are as follows:
       > European Union (28 countries) ----> EU28
       > European Union (27 countries) ----> EU27
       > Former Yugoslav Republic of Macedonia, the --> As in T005, we had a problem with this country, so we handle it the same way as in T005

In [35]:
# Distinct country names in order of first appearance. Series.unique() gives a
# stable order, whereas list(set(...)) over strings can come out in a different
# order in every kernel session (string hashing is randomised per process).
list_of_countries = df[ColumnName.COUNTRY.value].unique().tolist()

# Ask the country-code manager which of those names it cannot resolve, and display them
countries_with_no_iso_code = countryCodeManager.get_list_of_countries_with_no_iso_code(list_of_countries)
countries_with_no_iso_code

['European Union (27 countries)',
 'Former Yugoslav Republic of Macedonia, the',
 'European Union (28 countries)']

# Cleaning the list of countries to obtain their ISO code

In [36]:
# One country name per dataframe row
country_column = list(df[ColumnName.COUNTRY.value])

# The EU aggregates get a fixed pseudo-code instead of a manager lookup
eu_aggregate_codes = {
    "European Union (28 countries)": "EU28",
    "European Union (27 countries)": "EU27",
}

# Every other name is resolved by the country-code manager, one call per row
list_of_iso_code = [
    eu_aggregate_codes[country]
    if country in eu_aggregate_codes
    else countryCodeManager.get_iso_code_for_country(country)
    for country in country_column
]

# Assert that the size of the list of iso codes is equivalent to the size of country column
assert len(list_of_iso_code) == len(country_column)

# Adding the ISO column to the dataframe, right after the year column
# NOTE(review): allow_duplicates=True means re-running this cell adds a second "ISO Code" column instead of failing
df.insert(
    loc=2,
    column=ColumnName.ISO_CODE.value,
    value=list_of_iso_code,
    allow_duplicates=True,
)
df

Unnamed: 0,Source,Year,ISO Code,Country,Value,Vehicle
0,Eurostat,2000,EU28,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,Germany,6.0,"Motor coaches, buses and trolley buses"


# Getting the ITEM region for each ISO code

## Determining which countries do not have an ITEM region
    Rule: As seen below, EU27, EU28 and N/A (which comes from Macedonia) do not have a region. Therefore, the following mapping is applied to assign a region to the two EU codes:
    ISO code --> ITEM region
    EU27 --> EU-27
    EU28 --> EU-28

In [37]:
# Distinct ISO codes in order of first appearance. Series.unique() gives a
# stable order, whereas list(set(...)) over strings can come out in a different
# order in every kernel session (string hashing is randomised per process).
list_of_unique_iso_codes = df[ColumnName.ISO_CODE.value].unique().tolist()

# Ask the country-code manager which of those codes have no ITEM region, and display them
iso_codes_with_no_region = countryCodeManager.get_list_of_iso_codes_with_no_region(list_of_unique_iso_codes)
iso_codes_with_no_region

['N/A', 'EU28', 'EU27']

## Assigning the ITEM region column

In [38]:
# Getting the complete column of iso codes (one entry per dataframe row)
list_of_all_codes = df[ColumnName.ISO_CODE.value]

# Getting the list of regions
item_regions = countryCodeManager.get_list_of_regions_for_iso_codes(list_of_all_codes)

# The overrides below pair codes and regions by position, so both must have the same length
assert len(item_regions) == len(list_of_all_codes)

# Hardcoding the region value for the EU aggregate pseudo-codes (EU27 / EU28),
# which the cell above reports as having no ITEM region
eu_aggregate_regions = {"EU27": "EU-27", "EU28": "EU-28"}

# Walk the codes by position. Indexing the Series with list_of_all_codes[i]
# is a label lookup and only works while df keeps its default 0..n-1 index.
for position, iso_code in enumerate(list_of_all_codes.tolist()):
    if iso_code in eu_aggregate_regions:
        item_regions[position] = eu_aggregate_regions[iso_code]

# Adding the column to the dataframe
df.insert(3, ColumnName.ITEM_REGION.value, item_regions, True)
df

Unnamed: 0,Source,Year,ISO Code,Region,Country,Value,Vehicle
0,Eurostat,2000,EU28,EU-28,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,EU-28,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,EU-28,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,EU-28,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,EU-28,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,EU-27,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,EU-27,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,EU-27,Germany,6.0,"Motor coaches, buses and trolley buses"


# Adding the 'Variable' and 'Unit' columns
    Rule: This dataset is associated with passenger data. Therefore, the variable is "Passenger Activity" and the unit is '% in total inland passenger-km / yr'

## Variable column

In [39]:
# Constant "Variable" column; the last argument places it at column position 4
dataframeManager.simple_column_insert(
    df,
    ColumnName.VARIABLE.value,
    "Passenger Activity",
    4,
)
df

Unnamed: 0,Source,Year,ISO Code,Region,Variable,Country,Value,Vehicle
0,Eurostat,2000,EU28,EU-28,Passenger Activity,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,EU-28,Passenger Activity,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,EU-28,Passenger Activity,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,EU-28,Passenger Activity,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,EU-28,Passenger Activity,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Passenger Activity,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,EU-27,Passenger Activity,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,EU-27,Passenger Activity,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,EU-27,Passenger Activity,Germany,6.0,"Motor coaches, buses and trolley buses"


## Unit column

In [40]:
# Constant "Unit" column; the last argument places it at column position 5
dataframeManager.simple_column_insert(
    df,
    ColumnName.UNIT.value,
    "% in total inland passenger-km / yr",
    5,
)
df

Unnamed: 0,Source,Year,ISO Code,Region,Variable,Unit,Country,Value,Vehicle
0,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Germany,6.0,"Motor coaches, buses and trolley buses"


# Adding the 'Service' column
    Rule: Since the variable is always 'Passenger Activity', the service is 'Passenger'

In [41]:
# Constant "Service" column; the last argument places it at column position 6
dataframeManager.simple_column_insert(
    df,
    ColumnName.SERVICE.value,
    "Passenger",
    6,
)
df

Unnamed: 0,Source,Year,ISO Code,Region,Variable,Unit,Service,Country,Value,Vehicle
0,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Germany,6.0,"Motor coaches, buses and trolley buses"


# Adding the 'Technology' and 'Fuel' columns
    Rule: Since the dataset does not provide any information about 'technology' and 'fuel,' we set both values as 'All'

In [42]:
# Both columns carry the constant "All"; insert Technology first (position 7), then Fuel (position 8)
for insert_position, column_label in (
    (7, ColumnName.TECHNOLOGY.value),
    (8, ColumnName.FUEL.value),
):
    dataframeManager.simple_column_insert(df, column_label, "All", insert_position)
df

Unnamed: 0,Source,Year,ISO Code,Region,Variable,Unit,Service,Technology,Fuel,Country,Value,Vehicle
0,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,European Union (28 countries),7.1,Trains
1,Eurostat,2007,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,European Union (28 countries),7.1,Trains
2,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,European Union (28 countries),82.5,Passenger cars
3,Eurostat,2002,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,European Union (28 countries),83.4,Passenger cars
4,Eurostat,2005,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,European Union (28 countries),83.4,Passenger cars
...,...,...,...,...,...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,Germany,9.1,"Motor coaches, buses and trolley buses"
2550,Eurostat,1997,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,Germany,7.1,"Motor coaches, buses and trolley buses"
2551,Eurostat,2004,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,Germany,6.7,"Motor coaches, buses and trolley buses"
2552,Eurostat,2010,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,All,All,Germany,6.0,"Motor coaches, buses and trolley buses"


# Setting the 'Mode' and 'Vehicle Type' Columns based on the given data
    Rule: The following mapping is used to set the mode and the vehicle type:
    Vehicle --> Mode --> Vehicle Type
    Trains --> Rail --> All
    Passenger cars --> Road --> LDV
    Motor coaches, buses and trolley buses --> Road --> Bus

In [43]:
# Distinct vehicle categories in order of first appearance (stable across kernel sessions, unlike list(set(...)) over strings)
df["Vehicle"].unique().tolist()

['Passenger cars', 'Motor coaches, buses and trolley buses', 'Trains']

In [44]:
# Explicit mapping: Eurostat vehicle category -> (mode, vehicle type)
vehicle_to_mode_and_type = {
    "Trains": ("Rail", "All"),
    "Passenger cars": ("Road", "LDV"),
    "Motor coaches, buses and trolley buses": ("Road", "Bus"),
}

# Fail loudly on a category the mapping does not cover, instead of silently
# labelling it as Road / LDV through a catch-all branch
unmapped_vehicles = set(df["Vehicle"]) - set(vehicle_to_mode_and_type)
assert not unmapped_vehicles, "Unmapped vehicle categories: {}".format(unmapped_vehicles)

# variables holding the mode and vehicle type of each row
result_mode = [vehicle_to_mode_and_type[vehicle][0] for vehicle in df["Vehicle"]]
result_vehicle_type = [vehicle_to_mode_and_type[vehicle][1] for vehicle in df["Vehicle"]]

# Asserting that the results have one entry per row
assert len(df) == len(result_mode)
assert len(df) == len(result_vehicle_type)

# Adding the columns to the dataframe
df.insert(7, ColumnName.MODE.value, result_mode, True)
df.insert(8, ColumnName.VEHICLE_TYPE.value, result_vehicle_type, True)

# Removing the 'Vehicle' column, now fully represented by mode and vehicle type
df.drop(columns=["Vehicle"], inplace = True)
df

Unnamed: 0,Source,Year,ISO Code,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Country,Value
0,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Rail,All,All,All,European Union (28 countries),7.1
1,Eurostat,2007,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Rail,All,All,All,European Union (28 countries),7.1
2,Eurostat,2000,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,European Union (28 countries),82.5
3,Eurostat,2002,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,European Union (28 countries),83.4
4,Eurostat,2005,EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,European Union (28 countries),83.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,Eurostat,1990,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,Germany,9.1
2550,Eurostat,1997,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,Germany,7.1
2551,Eurostat,2004,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,Germany,6.7
2552,Eurostat,2010,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,Germany,6.0


# Reordering the columns
    Rule: To comply with the current template, the columns must be shown in a specific order. The dataframe manager specifies the order of the columns

In [45]:
# Let the dataframe manager put the columns in the order it defines;
# the call's return value is rebound to df and then displayed
df = dataframeManager.reorder_columns(df)
df

Unnamed: 0,Source,Country,ISO Code,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Value,Year
0,Eurostat,European Union (28 countries),EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Rail,All,All,All,7.1,2000
1,Eurostat,European Union (28 countries),EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Rail,All,All,All,7.1,2007
2,Eurostat,European Union (28 countries),EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,82.5,2000
3,Eurostat,European Union (28 countries),EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,83.4,2002
4,Eurostat,European Union (28 countries),EU28,EU-28,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,LDV,All,All,83.4,2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,Eurostat,Germany,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,9.1,1990
2550,Eurostat,Germany,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,7.1,1997
2551,Eurostat,Germany,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,6.7,2004
2552,Eurostat,Germany,DEU,EU-27,Passenger Activity,% in total inland passenger-km / yr,Passenger,Road,Bus,All,All,6.0,2010


# Exporting Results

In [68]:
# Programming Friendly View (the helper prints "> PF File saved at: ..." when it finishes)
dataframeManager.create_programming_friendly_file(df)

# User Friendly View (the helper prints "> UF File saved at: ..." when it finishes)
dataframeManager.create_user_friendly_file(df)

> PF File saved at: /Users/hlinero/Documents/database/item/historical/scripts
> UF File saved at: /Users/hlinero/Documents/database/item/historical/scripts
