In [1]:
import numpy as np
import pandas as pd

In [2]:
# Build the paths of the raw World Bank tourism CSVs.
# Each file sits in a folder that carries the same name as the CSV itself.
csv_path = "../../db/data/raw"
world_bank_arrivals_csv     = csv_path + "/API_ST.INT.ARVL_DS2_en_csv_v2_424268/API_ST.INT.ARVL_DS2_en_csv_v2_424268.csv"
world_bank_departures_csv   = csv_path + "/API_ST.INT.DPRT_DS2_en_csv_v2_424601/API_ST.INT.DPRT_DS2_en_csv_v2_424601.csv"
# ST.INT.XPND.CD is the expenditures indicator and ST.INT.RCPT.CD is the receipts
# indicator. These two paths were previously assigned the other way round, which
# caused the receipts data to be labelled as expenditures (and vice versa).
world_bank_expenditures_csv = csv_path + "/API_ST.INT.XPND.CD_DS2_en_csv_v2_433832/API_ST.INT.XPND.CD_DS2_en_csv_v2_433832.csv"
world_bank_receipts_csv     = csv_path + "/API_ST.INT.RCPT.CD_DS2_en_csv_v2_422817/API_ST.INT.RCPT.CD_DS2_en_csv_v2_422817.csv"

In [3]:
# Source files in a fixed order: arrivals, departures, expenditures, receipts.
# Later cells pair each file with a column name by its position in this list.
world_bank_data_csv = [
    world_bank_arrivals_csv,
    world_bank_departures_csv,
    world_bank_expenditures_csv,
    world_bank_receipts_csv,
]

In [4]:
# Load every CSV into its own DataFrame, keeping the order of world_bank_data_csv.
# skiprows=3 skips the first three lines of each file before the header is parsed.
# .iloc[:, :-1] discards the last column of each frame
# (presumably an empty trailing column in the export -- TODO confirm).
world_bank_dfs = [
    pd.read_csv(csv_file, skiprows=3).iloc[:, :-1]
    for csv_file in world_bank_data_csv
]

In [5]:
# Year labels "1960" .. "2019" as strings; used as the value columns when melting.
years = [str(year_number) for year_number in range(1960, 2020)]

In [6]:
# Reshape each wide frame (one column per year) into long form:
# one row per country and year, with the measurement in a column named after the indicator.
# NOTE(review): values are carried over as read from the source files; no scaling is
# applied in this notebook, so confirm the unit suffixes in these names match the raw data.
value_names = ["Arrivals_in_Thousands", "Departures_in_Thousands", "Expenditures_in_USD_Millions", "Receipts_in_USD_Millions"]
id_columns = ['Country Name', 'Country Code']
melted_dfs = [
    wide_df.melt(
        id_vars=id_columns,
        value_vars=years,
        var_name="Year",
        value_name=value_names[position],  # names are matched to frames by position
    )
    for position, wide_df in enumerate(world_bank_dfs)
]

In [7]:
# Fold the long-form frames into a single table keyed by country and year.
# The first frame is the left side of every join, so only its key combinations are kept.
merge_keys = ['Country Name', 'Country Code', 'Year']
combined_df = melted_dfs[0]
for indicator_df in melted_dfs[1:]:
    combined_df = pd.merge(combined_df, indicator_df, how='left', on=merge_keys)

In [8]:
# Replace spaces in column names with underscores (letter case is left unchanged).
combined_df.columns = [column_name.replace(' ', '_') for column_name in combined_df.columns]

In [9]:
# Write the combined table to disk without the DataFrame index column.
output_csv = "../../db/data/world_bank_tourism.csv"
combined_df.to_csv(output_csv, index=False)