In [2]:
from datetime import date
import pandas as pd
import os

In [3]:
# Defining the path to store the results.
# NOTE(review): this is an absolute, machine-specific path; adjust `root` when
# running the notebook on another machine.
root = "/Users/hlinero/Desktop/iTEM Material/raw dataset/"
path_to_store_results = root+"merged_results/"

# Create the output directory if it does not exist yet.
# The previous check, `os.path.isdir(not path_to_store_results)`, passed the
# boolean False to isdir instead of the path, so the directory was never created.
# `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(path_to_store_results, exist_ok=True)

# Generating the default filename format: "merged_<Mon>-<DD>", e.g. "merged_Mar-05".
# The formatted string gets its own name so the imported `date` class is not
# overwritten (rebinding `date` made this cell fail when executed a second time).
today = date.today()
date_label = today.strftime("%b-%d")
filename = "merged_{}".format(date_label)

# Creating a list with the paths to all datasets that are going to be merged
    
    Rule: You must specify the path of every dataset that is going to be merged. Each dataset must be in the "Programming Friendly View" (PF) format, which is one of the files produced by the cleaning scripts.

In [4]:
# Paths of the cleaned "PF" CSV files that will be merged
list_of_paths = [
    "/Users/hlinero/Desktop/iTEM Material/raw dataset/T000/T000_cleaned_PF.csv",
    "/Users/hlinero/Desktop/iTEM Material/raw dataset/T001/T001_cleaned_PF.csv",
    "/Users/hlinero/Desktop/iTEM Material/raw dataset/T002/T002_cleaned_PF.csv",
    "/Users/hlinero/Desktop/iTEM Material/raw dataset/T003/T003_cleaned_PF.csv",
]

# Load each CSV file into its own dataframe, in the same order as the paths
list_of_dataframes = [pd.read_csv(csv_path) for csv_path in list_of_paths]

# Stack the dataframes on top of each other with a fresh 0..n-1 index
df = pd.concat(list_of_dataframes, ignore_index=True, sort=False)
df

Unnamed: 0,Source,Region,Variable,Unit,Service,Mode,Vehicle Type,Technology,Fuel,Value,Year,ID
0,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,90.323,1990,T000
1,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,82.691,1991,T000
2,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,69.357,1992,T000
3,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,47.142,1993,T000
4,International Transport Forum,Ukraine,Passenger Activity,10^9 passenger-km / yr,Passenger,Road,Bus,All,All,39.952,1994,T000
...,...,...,...,...,...,...,...,...,...,...,...,...
27188,International Transport Forum,United States,Freight Activity,10^9 tonnes-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6770.271,2007,T003
27189,International Transport Forum,United States,Freight Activity,10^9 tonnes-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6998.545,2008,T003
27190,International Transport Forum,United States,Freight Activity,10^9 tonnes-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6292.634,2009,T003
27191,International Transport Forum,United States,Freight Activity,10^9 tonnes-km / yr,Freight,Inland (exl. Pipeline),All,All,All,6610.056,2010,T003


# Exporting Result - Programming Friendly View

In [4]:
# Write the merged dataframe to "<results dir><filename>_PF.csv", leaving out the index column
df.to_csv("{}{}_PF.csv".format(path_to_store_results, filename), index=False)

# Exporting Result - User Friendly View

In [5]:
# Columns that identify one data series; every other column of the result is a year
columns_to_preserve = ['Source','Region','Variable','Unit','Service','Mode','Vehicle Type','Technology','Fuel', 'ID']


def pivot_years_to_columns(long_df, key_columns, year_column="Year", value_column="Value"):
    """Reshape a long table (one row per series and year) into a wide one.

    The result has one row per distinct combination of `key_columns` and one
    column per distinct value of `year_column`. Each cell holds the first
    non-null `value_column` entry for that series and year, or NaN when there
    is none. `long_df` is not modified.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Must contain `key_columns`, `year_column` and `value_column`.
        Any other column is ignored.
    key_columns : list of str
        Columns that identify a series. They come first in the result,
        in the order given here.
    year_column, value_column : str
        Names of the column to spread into headers and of the column holding
        the values.

    Returns
    -------
    pandas.DataFrame
        Key columns followed by the year columns in ascending order. Rows are
        sorted by "Region" first (when it is a key column) and then by the
        remaining key columns, i.e. grouped country by country.

    Notes
    -----
    Rows with a missing value in any key column or in `year_column` are left
    out, because pandas' groupby drops NaN keys by default.
    """
    # Group on Region first so the rows come out country by country.
    group_keys = [name for name in key_columns if name == "Region"]
    group_keys += [name for name in key_columns if name != "Region"]

    wide = (
        long_df.groupby(group_keys + [year_column])[value_column]
        .first()               # first non-null value per series and year
        .unstack(year_column)  # one column per year, NaN where a series has no value
    )
    wide.columns.name = None   # drop the leftover "Year" label on the column axis
    wide = wide.reset_index()

    # Key columns in the requested order, then the years in ascending order
    year_columns = sorted(name for name in wide.columns if name not in key_columns)
    return wide.reindex(columns=list(key_columns) + year_columns)


# Build the "User Friendly" view in a single vectorised pass. This avoids the
# per-country / per-year loops and the in-place renaming of groupby slices.
df_with_all_countries_data = pivot_years_to_columns(df, columns_to_preserve)

# Exporting the final dataframe
df_with_all_countries_data.to_csv(path_to_store_results+filename+"_UF.csv",index=False)

KeyboardInterrupt: 