In [26]:
from item.historical import input_file
import pandas as pd
from pprint import pprint

# Creating the dataframe and viewing the data

In [27]:
# Creating a dataframe from the csv data
data_id_int = 0
data_id = "T{:03}".format(data_id_int)

df = pd.read_csv(input_file(data_id_int))

In [28]:
# Viewing the data available
df.head(5)

Unnamed: 0,COUNTRY,Country,VARIABLE,Variable,YEAR,Year,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
0,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1990,1990,PKM,Passenger-kilometres,6,Millions,,,90323.0,,
1,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1991,1991,PKM,Passenger-kilometres,6,Millions,,,82691.0,,
2,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1992,1992,PKM,Passenger-kilometres,6,Millions,,,69357.0,,
3,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1993,1993,PKM,Passenger-kilometres,6,Millions,,,47142.0,,
4,UKR,Ukraine,T-PASS-RD-BUS,Road passenger transport by buses and coaches,1994,1994,PKM,Passenger-kilometres,6,Millions,,,39952.0,,


# Getting general info about the data

In [29]:
# Getting the shape of the df
rows, cols = df.shape
print(">> Number of Columns: {}".format(cols))
print(">> Number of Rows: {}".format(rows))

# Grouping the data by Countries
group_by_country = df.groupby(df.Country)
list_of_countries = list(group_by_country.groups.keys())
print(">> Number of countries: {}".format(len(list_of_countries)))

>> Number of Columns: 15
>> Number of Rows: 9226
>> Number of countries: 54


# Determining data consistency for the column "Variable"
### Rationale: The variable "Road Passenger Transport" is the sum of "RPT by buses and coaches" + "RPT by passenger car". Therefore, to we need to identify for each country in what years the variables "Road Passenger Transport" is present but one if its components is missing.

In [30]:
# Determining the available variables per country
dic_country_df = {}
for name, group in group_by_country:
    dic_country_df[name] = group
    
# For each country, store the years that generate problems
result_per_country = {}
for country in list_of_countries:
    
    # Get the dataframe corresponding to country X
    df_for_country_x = dic_country_df[country]
    
    # Get the years available to the country X
    years_for_country_x = list(set(df_for_country_x.Year))
    
    # group the dataframe of the country X based on year
    country_x_grouped_by_year = df_for_country_x.groupby(df_for_country_x.Year)
    
    # Creating a variable for holding the number of variables per year
    dic_problematic_years = []
    
    # For each year of country X, identify the number of variables
    for year in years_for_country_x:
        
        # get the group of year Y
        df_year_Y_country_X = country_x_grouped_by_year.get_group(year)
        
        # get the list of variables available in year Y
        variables_available_in_year_y = list(df_year_Y_country_X["Variable"])
        
        if "Road passenger transport" in variables_available_in_year_y:
            if "Road passenger transport by buses and coaches" in variables_available_in_year_y and "Road passenger transport by passenger cars" in variables_available_in_year_y:
                continue
            else:
                dic_problematic_years.append(year)
    
    # Save the problematic data for the year
    if len(dic_problematic_years) > 0:
        result_per_country[country] = dic_problematic_years

pprint(result_per_country)

{'Albania': [1970,
             1971,
             1972,
             1973,
             1974,
             1975,
             1976,
             1977,
             1978,
             1979,
             1980,
             1981,
             1982,
             1983,
             1984,
             1985,
             1986,
             1987,
             1988,
             1989,
             1990,
             1991],
 'Armenia': [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017],
 'Azerbaijan': [1970,
                1971,
                1972,
                1973,
                1974,
                1975,
                1976,
                1977,
                1978,
                1979,
                1980,
                1981,
                1982,
                1983,
                1984],
 'Belarus': [1970,
             1971,
             1972,
             1973,
             1974,
             1975,
             1976,
             1977,
             1978,
           

## The results from the above cell demonstrates that there are 22 countries that for certain years have missing data. Below are the rules on how to handle each country:

###  Rule #1: The following countries keep them as they are: Bosnia-Herzegovina, China, Croatia, Estonia, Georgia, India, Latvia, Mexico, Moldova, Montenegro, Republic of, Romania, Turkey, Ukraine.

### Rule #2: Perform the following operations for each country:
    - Albania: Remove the "Road passenger transport" & "Road passenger transport by buses and coaches" categories during the problematic time periods.
    - Armenia: Remove the "Road passenger transport" category during the problematic time periods.
    - Azerbaijan: Remove the "Road passenger transport" category during the problematic time periods.
    - Belarus: Remove the "Road passenger transport" category during the problematic time periods.
    - Bulgaria: Remove the "Road passenger transport" category during the problematic time periods.
    - Canada: Remove the "Road passenger transport" category during the problematic time periods.
    - Russian Federation: Remove the "Road passenger transport" category during the problematic time periods.
    - Switzerland: Remove the "Road passenger transport" category during the problematic time periods.
    - United States: Remove the "Road passenger transport" category during the problematic time periods.

## Applying Rule#2 to handle all the problematic countries

In [31]:
# Getting the countries to handle
list_of_countries_that_need_cleaning = ['Albania', 'Armenia', 'Azerbaijan', 'Belarus', 'Bulgaria', 'Canada', 'Russian Federation', 'Switzerland', 'United States']

# Storing the variable to erase
variable_to_erase = "Road passenger transport"

# For each country that needs cleaning, do the following
list_of_indices_to_erase = []
for country in list_of_countries_that_need_cleaning:
    
    # Get the dataframe for the country
    country_to_clean_df = group_by_country.get_group(country)
    
    # Get the list of "dirty" years for the given country
    list_of_dirty_years_for_country_x = result_per_country[country]
    
    # For each dirty year, perform the following code
    for year in list_of_dirty_years_for_country_x:
        filtered_for_desired_rows = country_to_clean_df[(country_to_clean_df.Variable == variable_to_erase) & (country_to_clean_df.Year == year)]
        list_of_indices_to_erase = list_of_indices_to_erase + list(filtered_for_desired_rows.index)
        
        if country == 'Albania':
            albania_specific_bus_row = country_to_clean_df[(country_to_clean_df.Variable == "Road passenger transport by buses and coaches") & (country_to_clean_df.Year == year)]
            list_of_indices_to_erase = list_of_indices_to_erase + list(albania_specific_bus_row.index)

In [32]:
# Drop all the indices
df.drop(list_of_indices_to_erase, inplace=True)

In [33]:
# Group by country since we recently erased rows
group_by_country = df.groupby(df.Country)

# Determining the available variables per country
dic_country = {}
for name, group in group_by_country:
    local = {}
    local["Variables"] = set(group.Variable)
    local["Number of Vars"] = len(set(group.Variable))
    dic_country[name] = local

In [34]:
# Five is the max number of variables a country can have. So,check what countries have less than 5
dic_special_countries = {}

# Filtering the countries
for country in dic_country.keys():
    # Getting the number of variables for each country
    if dic_country[country]["Number of Vars"] < 5:
        dic_special_countries[country] = dic_country[country]

# Printing the result
pprint(dic_special_countries)

{'Bosnia-Herzegovina': {'Number of Vars': 4,
                        'Variables': {'Rail passenger transport',
                                      'Road passenger transport',
                                      'Road passenger transport by buses and '
                                      'coaches',
                                      'Total inland passenger transport'}},
 'China': {'Number of Vars': 3,
           'Variables': {'Rail passenger transport',
                         'Road passenger transport',
                         'Total inland passenger transport'}},
 'Croatia': {'Number of Vars': 4,
             'Variables': {'Rail passenger transport',
                           'Road passenger transport',
                           'Road passenger transport by buses and coaches',
                           'Total inland passenger transport'}},
 'Estonia': {'Number of Vars': 4,
             'Variables': {'Rail passenger transport',
                           'Road passenger t

### NOTE:  Based on the analysis done above, we discovered that 17 countries are missing variables and thus it is necessary to handle such countries.

### Rule 1: Countries having 1 or 3 variables are left as is. Therefore, the following countries are left as is: 
    China, Georgia, India, Ireland, Israel, Liechtenstein, Luxembourg, Montenegro, Turkey.
    
### Rule 2: To countries containing 4 variables we will the following two variables:  <i>Road Passenger Transport</i>  & <i>Total Inland Passenger Transport</i>. Below are the counties to which such variables are erased:
    Bosnia-Herzegovina, Croatia, Estonia, Latvia, Mexico, Maldova, Romania, Ukraine



# Applying the Rules for data consistency

In [35]:
# Variables to erase
variables_to_erase = ["Road passenger transport", "Total inland passenger transport"]

# Gathering the list of countries to which we need to erase variables
countries_to_erase_variables = []
for key in dic_special_countries.keys():
    if dic_special_countries[key]["Number of Vars"] == 4:
        countries_to_erase_variables.append(key)

# List of index to erase
index_to_erase = []

# For each row of the countries of interest, determine the indices that need to be erased
for country in countries_to_erase_variables:
    df_for_X_country = group_by_country.get_group(country)
    filtered_for_desired_rows = df_for_X_country[(df_for_X_country.Variable == variables_to_erase[0])|(df_for_X_country.Variable == variables_to_erase[1])]
    index_to_erase = index_to_erase + list(filtered_for_desired_rows.index)

# Erasing from the dataframe the indices selected
df.drop(index_to_erase, inplace=True)

# Dropping repeated columns and renaming columns
### Rule: To comply with the latest template, we are droping repeated columns and renaming others.

In [36]:
# Droping the repeated columns
columns_to_delete = ["COUNTRY", "YEAR", "VARIABLE","Reference Period Code","Unit Code","Reference Period", "Flag Codes", "Flags", "PowerCode Code"]
df.drop(columns=columns_to_delete, inplace = True)

In [37]:
# Renaming the columns to the appropriate name
df.rename(columns={"Country":"Region"}, inplace = True)

# Viewing result
df

Unnamed: 0,Region,Variable,Year,Unit,PowerCode,Value
0,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000
1,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000
2,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000
3,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000
4,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000
...,...,...,...,...,...,...
9221,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068
9222,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265
9223,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431
9224,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347


# Adding the 'Source' column
### Rule: Add the same source to all rows since all data comes from same source

In [38]:
# Creating the values for the source column
source_column = ["International Transport Forum"] * len(df)

# Adding the column to the dataframe
df.insert(0,"Source", source_column, True)
df

Unnamed: 0,Source,Region,Variable,Year,Unit,PowerCode,Value
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1990,Passenger-kilometres,Millions,90323.00000
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1991,Passenger-kilometres,Millions,82691.00000
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1992,Passenger-kilometres,Millions,69357.00000
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1993,Passenger-kilometres,Millions,47142.00000
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,1994,Passenger-kilometres,Millions,39952.00000
...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,2013,Passenger-kilometres,Millions,47955.53068
9222,International Transport Forum,Argentina,Total inland passenger transport,2014,Passenger-kilometres,Millions,49094.90265
9223,International Transport Forum,Argentina,Total inland passenger transport,2015,Passenger-kilometres,Millions,51422.02431
9224,International Transport Forum,Argentina,Total inland passenger transport,2016,Passenger-kilometres,Millions,54904.03347


# Adding the 'Service' column
### Rule: Since all the data is associated to passenger data, the service for all rows corresponds to 'Passenger'

In [40]:
# Creating the values for the source column --> Based on the variables, all data is associated to Passengers
source_column = ["Passenger"] * len(df)

# Adding the column to the dataframe
df.insert(3,"Service", source_column, True)
df

Unnamed: 0,Source,Region,Variable,Service,Year,Unit,PowerCode,Value
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1990,Passenger-kilometres,Millions,90323.00000
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1991,Passenger-kilometres,Millions,82691.00000
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1992,Passenger-kilometres,Millions,69357.00000
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1993,Passenger-kilometres,Millions,47142.00000
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1994,Passenger-kilometres,Millions,39952.00000
...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2013,Passenger-kilometres,Millions,47955.53068
9222,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2014,Passenger-kilometres,Millions,49094.90265
9223,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2015,Passenger-kilometres,Millions,51422.02431
9224,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2016,Passenger-kilometres,Millions,54904.03347


# Managing the 'Mode' and 'Vehicle_type' Columns
### Rule: We use keywords from the values on the "Variable" column to determine the 'Mode' and 'Vehicle' type.

In [41]:
# For each row, we are going to read the value in the "Variable" column and decide on the "Mode" and "Vehicle_Type"
list_vehicle_type = []
list_mode = []

# Looping through each row, reading the value in 'Variable' column and deciding on the "Mode" and "Vehicle_type"
for index, row in df.iterrows():
    
    # Determining the mode and vehicle type
    if "Rail" in row.Variable:
        list_mode.append("Rail")
        list_vehicle_type.append("All")
        
    elif "Road" in row.Variable:
        list_mode.append("Road")
        
        if "by buses" in row.Variable:
            list_vehicle_type.append("Bus")
        elif "by passenger" in row.Variable:
            list_vehicle_type.append("LDV")
        else:
            list_vehicle_type.append("All")
        
    else:
        list_mode.append("All")
        list_vehicle_type.append("All")

# Adding the "Mode" and "Vehicle type" to the dataframe
df["Mode"] = list_mode
df["Vehicle Type"] = list_vehicle_type

In [42]:
df

Unnamed: 0,Source,Region,Variable,Service,Year,Unit,PowerCode,Value,Mode,Vehicle Type
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1990,Passenger-kilometres,Millions,90323.00000,Road,Bus
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1991,Passenger-kilometres,Millions,82691.00000,Road,Bus
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1992,Passenger-kilometres,Millions,69357.00000,Road,Bus
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1993,Passenger-kilometres,Millions,47142.00000,Road,Bus
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1994,Passenger-kilometres,Millions,39952.00000,Road,Bus
...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2013,Passenger-kilometres,Millions,47955.53068,All,All
9222,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2014,Passenger-kilometres,Millions,49094.90265,All,All
9223,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2015,Passenger-kilometres,Millions,51422.02431,All,All
9224,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2016,Passenger-kilometres,Millions,54904.03347,All,All


# Managing the 'Fuel' and 'Technology' column
### Rule: The dataset does not provide any data about those two columns, so we added the default value in both cases

In [43]:
# Adding the "Technology" and "Fuel" columns
df["Fuel"] = ["All"]*len(df) 
df["Technology"] = ["All"]*len(df)
df

Unnamed: 0,Source,Region,Variable,Service,Year,Unit,PowerCode,Value,Mode,Vehicle Type,Fuel,Technology
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1990,Passenger-kilometres,Millions,90323.00000,Road,Bus,All,All
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1991,Passenger-kilometres,Millions,82691.00000,Road,Bus,All,All
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1992,Passenger-kilometres,Millions,69357.00000,Road,Bus,All,All
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1993,Passenger-kilometres,Millions,47142.00000,Road,Bus,All,All
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1994,Passenger-kilometres,Millions,39952.00000,Road,Bus,All,All
...,...,...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2013,Passenger-kilometres,Millions,47955.53068,All,All,All,All
9222,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2014,Passenger-kilometres,Millions,49094.90265,All,All,All,All
9223,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2015,Passenger-kilometres,Millions,51422.02431,All,All,All,All
9224,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2016,Passenger-kilometres,Millions,54904.03347,All,All,All,All


# Managing the 'Unit' column
### Rule: The data provides values in Million so we are converting it to Billion. Additionally, we set the correct name for the unit based on the template

In [44]:
# Viewing the available units
group_unit = df.groupby(df["Unit"])
list(group_unit.groups.keys())

['Passenger-kilometres']

In [45]:
# Since the there is only one unit, drop the current 'Unit' column and add a new one
columns_to_delete = ["Unit","PowerCode"]
df.drop(columns=columns_to_delete, inplace = True)

df["Unit"] = ["10^9 passenger-km / yr"]*len(df)

In [46]:
# Transforming the current value in Million to Billion (1M = 0.001B)
for index, row in df.iterrows():
    current_value = row.Value
    new_value = current_value * float(0.001)
    df.Value[index] = new_value

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [47]:
df

Unnamed: 0,Source,Region,Variable,Service,Year,Value,Mode,Vehicle Type,Fuel,Technology,Unit
0,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1990,90.323000,Road,Bus,All,All,10^9 passenger-km / yr
1,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1991,82.691000,Road,Bus,All,All,10^9 passenger-km / yr
2,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1992,69.357000,Road,Bus,All,All,10^9 passenger-km / yr
3,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1993,47.142000,Road,Bus,All,All,10^9 passenger-km / yr
4,International Transport Forum,Ukraine,Road passenger transport by buses and coaches,Passenger,1994,39.952000,Road,Bus,All,All,10^9 passenger-km / yr
...,...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2013,47.955531,All,All,All,All,10^9 passenger-km / yr
9222,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2014,49.094903,All,All,All,All,10^9 passenger-km / yr
9223,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2015,51.422024,All,All,All,All,10^9 passenger-km / yr
9224,International Transport Forum,Argentina,Total inland passenger transport,Passenger,2016,54.904033,All,All,All,All,10^9 passenger-km / yr


# Managing the 'Variable' column
### Rule: To comply with the current template we are setting the correct value that pertains to passenger related activities.

In [48]:
# Since all variables available are related to passenger activity then drop the current column and add the correct on
columns_to_delete = ["Variable"]
df.drop(columns=columns_to_delete, inplace = True)

df["Variable"] = ["Passenger Activity"]*len(df)
df

Unnamed: 0,Source,Region,Service,Year,Value,Mode,Vehicle Type,Fuel,Technology,Unit,Variable
0,International Transport Forum,Ukraine,Passenger,1990,90.323000,Road,Bus,All,All,10^9 passenger-km / yr,Passenger Activity
1,International Transport Forum,Ukraine,Passenger,1991,82.691000,Road,Bus,All,All,10^9 passenger-km / yr,Passenger Activity
2,International Transport Forum,Ukraine,Passenger,1992,69.357000,Road,Bus,All,All,10^9 passenger-km / yr,Passenger Activity
3,International Transport Forum,Ukraine,Passenger,1993,47.142000,Road,Bus,All,All,10^9 passenger-km / yr,Passenger Activity
4,International Transport Forum,Ukraine,Passenger,1994,39.952000,Road,Bus,All,All,10^9 passenger-km / yr,Passenger Activity
...,...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Passenger,2013,47.955531,All,All,All,All,10^9 passenger-km / yr,Passenger Activity
9222,International Transport Forum,Argentina,Passenger,2014,49.094903,All,All,All,All,10^9 passenger-km / yr,Passenger Activity
9223,International Transport Forum,Argentina,Passenger,2015,51.422024,All,All,All,All,10^9 passenger-km / yr,Passenger Activity
9224,International Transport Forum,Argentina,Passenger,2016,54.904033,All,All,All,All,10^9 passenger-km / yr,Passenger Activity


# Reordering the columns positions
### Rule: The order of the columns is based on the order stated in the current template

In [49]:
columnsTitles = ['Source', 'Region', 'Variable','Unit','Service','Mode','Vehicle Type','Technology','Fuel','Value','Year']
df = df.reindex(columns=columnsTitles)
df

Unnamed: 0,Source,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Value,Year
0,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,90.323000,1990
1,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,82.691000,1991
2,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,69.357000,1992
3,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,47.142000,1993
4,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,39.952000,1994
...,...,...,...,...,...,...,...,...,...,...,...
9221,International Transport Forum,Argentina,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,47.955531,2013
9222,International Transport Forum,Argentina,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,49.094903,2014
9223,International Transport Forum,Argentina,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,51.422024,2015
9224,International Transport Forum,Argentina,Passenger Activity,10^9 passenger-km / yr,Passenger,All,All,All,All,54.904033,2016


# Exporting results - Programming Friendly View

In [51]:
# Setting the column id for the dataframe
df["ID"] = [data_id] *len(df)

# Exporting result
df.to_csv(raw_data_csv_path+"T000_cleaned_PF.csv",index=False)

# Exporting Result - Reader Friendly View

In [52]:
# Get the columns to preserve
columns_to_preserve = ['Source','Region','Variable','Unit','Service','Mode','Vehicle Type','Technology','Fuel']

# Grouping by country
group_by_country = df.groupby(df.Region)

# Getting the list of countries
list_of_countries = list(group_by_country.groups.keys())

# Saving the dict of all the final dataframes of each country
dict_of_final_dataframes_per_country = {}

# For each country, perform the following algorithm
for country in list_of_countries:
    
    # Get the df corresponding to the given country
    df_country_X = group_by_country.get_group(country)
    
    # Get the list of years available for the given year
    list_of_years_for_country_X = list(set(df_country_X["Year"]))

    # Group the data of country X by year
    group_by_year_country_X = df_country_X.groupby(df_country_X.Year)
    
    # Create a structure that will hold the dataframes of each year
    df_per_year_for_country_X = {}
    
    # Obtain the dataframe for each year
    for name, group in group_by_year_country_X:
        df_per_year_for_country_X[name] = group

    # Do the necessary processing required in the DF of each year
    for year in list_of_years_for_country_X:
        
        # Obtain the dataframe for country X in year Y
        df_country_X_in_year_Y = df_per_year_for_country_X[year]
        
        # Renaming and droping columns
        df_country_X_in_year_Y.rename(columns={"Value":year}, inplace = True)
        df_country_X_in_year_Y.drop(columns=["Year"], inplace = True)

    # Concatenating all the dataframes of a given country into a single dataframe
    list_of_all_df_for_country_X = list(df_per_year_for_country_X.values())
    df_concat_all_dfs_for_country_x = pd.concat(list_of_all_df_for_country_X,sort=False, verify_integrity=True,join='outer')

    # Creating the final df for country X by eliminating all NAN and combining rows
    final_df_for_country_x = df_concat_all_dfs_for_country_x.groupby(columns_to_preserve)[list_of_years_for_country_X].first().reset_index()

    # Saving the final df of country X in the list of all countries df
    dict_of_final_dataframes_per_country[country] = final_df_for_country_x

# Concatenate all the dataframes of the countries
list_df_for_all_countries_final = list(dict_of_final_dataframes_per_country.values())
df_with_all_countries_data = pd.concat(list_df_for_all_countries_final,sort=False, verify_integrity=True,join='outer',ignore_index=True)

# Setting the column id for the dataframe
df_with_all_countries_data["ID"] = [data_id] *len(df_with_all_countries_data)

# Exporting the final dataframe
df_with_all_countries_data.to_csv(raw_data_csv_path+"T000_cleaned_UF.csv",index=False)

# Final Note

### After analysing the final data result, we discovered that the values presented for India are not accurate. The argument is that India is not the country with the largest PKT.