In [1]:
import pandas as pd

We will use two datasets provided by the World Bank:
    
* International Arrivals -- how many tourists have come to the country
 * https://data.worldbank.org/indicator/ST.INT.ARVL
* Receipts -- how much they spent in the country (how much the locals received)
 * https://data.worldbank.org/indicator/ST.INT.RCPT.CD

Both datasets mix data for countries with regional aggregates, e.g. `CEB` for Central Europe and the Baltics.
They also contain columns between `1960` and `2018`, but data are populated since `1995` only.
For that reason we filter out the regions and drop the empty columns.

In [10]:
def process_world_bank_dataset(path,
                               country_codes_path="Country_Codes.csv",
                               metadata_path="Metadata.csv"):
    """Load a World Bank indicator CSV and keep only countries that have data.

    Parameters
    ----------
    path : str or path-like
        Indicator CSV to load. The header row is expected on the fifth line
        of the file (the first four lines are skipped).
    country_codes_path : str or path-like, default "Country_Codes.csv"
        CSV with an ``ISO3`` column listing the codes of real countries
        (from https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes).
    metadata_path : str or path-like, default "Metadata.csv"
        CSV with ``Country Code`` and ``Region`` columns
        (part of the World Bank data package).

    Returns
    -------
    pandas.DataFrame
        One row per country with the columns ``Country Name``,
        ``Country Code``, the value columns that contain at least one
        observation, and ``Region``. Rows whose value columns are all
        empty are removed, and the index is renumbered from 0.
    """
    id_columns = ["Country Name", "Country Code", "Region"]

    df = pd.read_csv(path, skiprows=4)
    country_codes = pd.read_csv(country_codes_path)
    regions = pd.read_csv(metadata_path)

    # Keep only rows whose code is a known country code. A membership filter
    # (rather than a merge) cannot multiply rows if a code is listed twice.
    df = df[df["Country Code"].isin(country_codes["ISO3"])]

    # Attach the region of every country.
    df = df.merge(regions[["Country Code", "Region"]], on="Country Code")

    # Drop unnecessary columns.
    df = df.drop(columns=["Indicator Name", "Indicator Code"])

    # Drop columns without a single value.
    df = df.dropna(how="all", axis="columns")

    # Drop rows without a single value. The identifier columns are always
    # populated, so the emptiness check must look at the value columns only;
    # checking the whole row would never drop anything.
    value_columns = [col for col in df.columns if col not in id_columns]
    if value_columns:
        df = df.dropna(how="all", subset=value_columns)

    return df.reset_index(drop=True)

def melt_world_bank(df, value_name):
    """Reshape a wide frame (one column per year) into long form.

    The identifier columns ``Country Name``, ``Country Code`` and ``Region``
    are kept as they are; every other column name goes into a ``years``
    column and its values into a column called ``value_name``.
    """
    identifier_columns = ["Country Name", "Country Code", "Region"]
    long_form = df.melt(
        id_vars=identifier_columns,
        var_name="years",
        value_name=value_name,
    )
    return long_form

In [11]:
# Wide frames: one row per country, one column per year.
wide_arrivals = process_world_bank_dataset(
    "API_ST.INT.ARVL_DS2_en_csv_v2_1345483.csv"
)
wide_receipts = process_world_bank_dataset(
    "API_ST.INT.RCPT.CD_DS2_en_csv_v2_1351575.csv"
)

print(wide_arrivals.shape, wide_receipts.shape)

# Long frames: one row per (country, year) pair.
long_arr = melt_world_bank(wide_arrivals, "visitors")
long_receipts = melt_world_bank(wide_receipts, "receipts")

# Combine both indicators on the shared identifier columns.
# NOTE: fillna(0) replaces missing observations with 0, so a missing value
# cannot be told apart from a genuine zero afterwards.
long_df = pd.merge(
    long_arr,
    long_receipts,
    on=["Country Name", "Country Code", "years", "Region"],
).fillna(0)
long_df

(215, 27) (215, 27)


Unnamed: 0,Country Name,Country Code,Region,years,visitors,receipts
0,Aruba,ABW,Latin America & Caribbean,1995,619000.0,5.540000e+08
1,Afghanistan,AFG,South Asia,1995,0.0,0.000000e+00
2,Angola,AGO,Sub-Saharan Africa,1995,9000.0,2.700000e+07
3,Albania,ALB,Europe & Central Asia,1995,0.0,7.000000e+07
4,Andorra,AND,Europe & Central Asia,1995,0.0,0.000000e+00
...,...,...,...,...,...,...
5155,Samoa,WSM,East Asia & Pacific,2018,164000.0,1.913000e+08
5156,"Yemen, Rep.",YEM,Middle East & North Africa,2018,0.0,0.000000e+00
5157,South Africa,ZAF,Sub-Saharan Africa,2018,10472000.0,9.789000e+09
5158,Zambia,ZMB,Sub-Saharan Africa,2018,1072000.0,7.420000e+08


In [13]:
# Persist the two wide frames and the combined long frame as pickle files.
# NOTE(review): the ".plk" extension looks like a transposition of ".pkl";
# it is kept unchanged because other code may load the files by these names.
for frame, target in (
    (wide_arrivals, "arr.plk"),
    (wide_receipts, "rec.plk"),
    (long_df, "long.plk"),
):
    frame.to_pickle(target)