In [11]:
import pandas as pd
import sqlite3
import os
import re

This is where we will clean the data and use it to create a database of all the data to facilitate its use.

First we need to read the country data into a dataframe.

In [3]:
df_country = pd.read_csv('./data/raw-data/arrivals/Metadata_Country_API_ST.INT.ARVL_DS2_en_csv_v2_5994899.csv')

#df_country.rename(columns={''country_code', 'region', 'income_group', 'special_notes', 'country', 'unnamed'}], inplace=True)
df_country.head(5)

Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5
0,ABW,Latin America & Caribbean,High income,,Aruba,
1,AFE,,,"26 countries, stretching from the Red Sea in t...",Africa Eastern and Southern,
2,AFG,South Asia,Low income,The reporting period for national accounts dat...,Afghanistan,
3,AFW,,,"22 countries, stretching from the westernmost ...",Africa Western and Central,
4,AGO,Sub-Saharan Africa,Lower middle income,The World Bank systematically assesses the app...,Angola,


In [4]:
df_country.rename(columns={'Country Code': 'country_code', 
                           'Region': 'region',
                           'IncomeGroup': 'income_group',
                           'SpecialNotes': 'special_notes',
                           'TableName': 'country',
                           'Unnamed: 5': 'unnamed'}, inplace=True)

df_country.head()

Unnamed: 0,country_code,region,income_group,special_notes,country,unnamed
0,ABW,Latin America & Caribbean,High income,,Aruba,
1,AFE,,,"26 countries, stretching from the Red Sea in t...",Africa Eastern and Southern,
2,AFG,South Asia,Low income,The reporting period for national accounts dat...,Afghanistan,
3,AFW,,,"22 countries, stretching from the westernmost ...",Africa Western and Central,
4,AGO,Sub-Saharan Africa,Lower middle income,The World Bank systematically assesses the app...,Angola,


The next bit of code determines if the mysterious 'unnamed' field contains any data.

In [5]:
df_country[df_country.unnamed.notnull()]

Unnamed: 0,country_code,region,income_group,special_notes,country,unnamed


Since it does not contain anything, we can safely remove it. We can also remove the "special_notes" since they're not very useful for our purpose.

In [6]:
df_country.drop(columns=['special_notes', 'unnamed'], inplace=True)

df_country.head()

Unnamed: 0,country_code,region,income_group,country
0,ABW,Latin America & Caribbean,High income,Aruba
1,AFE,,,Africa Eastern and Southern
2,AFG,South Asia,Low income,Afghanistan
3,AFW,,,Africa Western and Central
4,AGO,Sub-Saharan Africa,Lower middle income,Angola


Next, let's reorder the columns to make the data easier for us to read and mentally process.

In [7]:
df_country = df_country[['country_code', 'country', 'region', 'income_group']]
df_country.head()

Unnamed: 0,country_code,country,region,income_group
0,ABW,Aruba,Latin America & Caribbean,High income
1,AFE,Africa Eastern and Southern,,
2,AFG,Afghanistan,South Asia,Low income
3,AFW,Africa Western and Central,,
4,AGO,Angola,Sub-Saharan Africa,Lower middle income


So far, it's looking much more readable; however, there are also aggregate regions that need to be removed for clarity, considering they are also listed under 'region' in the rows which are for an actual country.

In [34]:
df_country.drop(axis=0, index=df_country.index[df_country.region.isnull()], inplace=True)
df_country.reset_index(drop=True, inplace=True)
df_country.head()


Unnamed: 0,country_code,country,region,income_group
0,ABW,Aruba,Latin America & Caribbean,High income
1,AFG,Afghanistan,South Asia,Low income
2,AGO,Angola,Sub-Saharan Africa,Lower middle income
3,ALB,Albania,Europe & Central Asia,Upper middle income
4,AND,Andorra,Europe & Central Asia,High income


Now that we've cleaned this up, let's export it to a SQL database for ease of use.

In [36]:
connection = sqlite3.connect('./data/db/tourism.db')

df_country.to_sql('country', connection, if_exists='replace')

217

One data source down! Now let's repeat the process for the others.

In [46]:
df_arrivals = pd.read_csv('./data/raw-data/arrivals/API_ST.INT.ARVL_DS2_en_csv_v2_5994899.csv')

df_arrivals.drop(axis=1, labels=['Indicator Name', 'Indicator Code', 'Unnamed: 67'], inplace=True)
df_arrivals.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
df_arrivals.head()


Unnamed: 0,country_name,country_code,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,,,,,,,,,...,1667000.0,1739000.0,1832000.0,1758000.0,1863000.0,1897000.0,1951000.0,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,34426630.0,35738390.0,35318680.0,37645890.0,38258350.0,41189150.0,39826700.0,,,
2,Afghanistan,AFG,,,,,,,,,...,,,,,,,,,,
3,Africa Western and Central,AFW,,,,,,,,,...,10085820.0,10544620.0,13311680.0,13150780.0,,,,,,
4,Angola,AGO,,,,,,,,,...,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,218000.0,,,


It looks like there are some columns with no entries whatsoever. Let's get rid of those.

In [47]:
df_arrivals.dropna(axis=1, how='all', inplace=True)

df_arrivals.head()

Unnamed: 0,country_name,country_code,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,ABW,912000.0,957000.0,947000.0,906000.0,972000.0,1211000.0,1178000.0,1225000.0,...,1469000.0,1481000.0,1667000.0,1739000.0,1832000.0,1758000.0,1863000.0,1897000.0,1951000.0,
1,Africa Eastern and Southern,AFE,11583540.0,13088650.0,13456250.0,14403850.0,15309380.0,15353180.0,15854700.0,17383380.0,...,31650240.0,32748550.0,34426630.0,35738390.0,35318680.0,37645890.0,38258350.0,41189150.0,39826700.0,
2,Afghanistan,AFG,,,,,,,,,...,,,,,,,,,,
3,Africa Western and Central,AFW,2670706.0,3027135.0,3243144.0,3422652.0,3897975.0,4162850.0,4615887.0,4697120.0,...,8902380.0,10221030.0,10085820.0,10544620.0,13311680.0,13150780.0,,,,
4,Angola,AGO,9000.0,21000.0,45000.0,52000.0,45000.0,51000.0,67000.0,91000.0,...,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,218000.0,


Let's also move the 'country_code' column to the first row for consistency's sake.

In [51]:
arrival_columns = df_arrivals.columns.to_list()

arrival_columns[0] = 'country_code'
arrival_columns[1] = 'country_name'

df_arrivals = df_arrivals[arrival_columns]
df_arrivals.head()

Unnamed: 0,country_code,country_name,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,912000.0,957000.0,947000.0,906000.0,972000.0,1211000.0,1178000.0,1225000.0,...,1469000.0,1481000.0,1667000.0,1739000.0,1832000.0,1758000.0,1863000.0,1897000.0,1951000.0,
1,AFE,Africa Eastern and Southern,11583540.0,13088650.0,13456250.0,14403850.0,15309380.0,15353180.0,15854700.0,17383380.0,...,31650240.0,32748550.0,34426630.0,35738390.0,35318680.0,37645890.0,38258350.0,41189150.0,39826700.0,
2,AFG,Afghanistan,,,,,,,,,...,,,,,,,,,,
3,AFW,Africa Western and Central,2670706.0,3027135.0,3243144.0,3422652.0,3897975.0,4162850.0,4615887.0,4697120.0,...,8902380.0,10221030.0,10085820.0,10544620.0,13311680.0,13150780.0,,,,
4,AGO,Angola,9000.0,21000.0,45000.0,52000.0,45000.0,51000.0,67000.0,91000.0,...,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,218000.0,


This also has the aggregate regions that we can do away with.

In [80]:
country_codes = df_country.country_code.to_list()

df_arrivals = df_arrivals[df_arrivals.country_code.isin(country_codes)]

df_arrivals.reset_index(drop=True, inplace=True)

df_arrivals.head()

Unnamed: 0,country_code,country_name,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,912000.0,957000.0,947000.0,906000.0,972000.0,1211000.0,1178000.0,1225000.0,...,1469000.0,1481000.0,1667000.0,1739000.0,1832000.0,1758000.0,1863000.0,1897000.0,1951000.0,
1,AFG,Afghanistan,,,,,,,,,...,,,,,,,,,,
2,AGO,Angola,9000.0,21000.0,45000.0,52000.0,45000.0,51000.0,67000.0,91000.0,...,481000.0,528000.0,650000.0,595000.0,592000.0,397000.0,261000.0,218000.0,218000.0,
3,ALB,Albania,304000.0,287000.0,119000.0,184000.0,371000.0,317000.0,354000.0,470000.0,...,2932000.0,3514000.0,3256000.0,3673000.0,4131000.0,4736000.0,5118000.0,5927000.0,6406000.0,2658000.0
4,AND,Andorra,,,,,9422000.0,10991000.0,11351000.0,11507000.0,...,7983000.0,7900000.0,7676000.0,7797000.0,7850000.0,8025000.0,8152000.0,8328000.0,8235000.0,5207000.0


Now let's get this one saved as a database as well.

In [81]:
df_arrivals.to_sql('arrivals', connection, if_exists='replace')

217

Continuing on...

In [99]:
df_departures = pd.read_csv('./data/raw-data/departures/API_ST.INT.DPRT_DS2_en_csv_v2_5996775.csv')
df_expenditures = pd.read_csv('./data/raw-data/expenditures/API_ST.INT.XPND.CD_DS2_en_csv_v2_5996767.csv')
df_gdp = pd.read_csv('./data/raw-data/gdp/API_NY.GDP.MKTP.KD_DS2_en_csv_v2_5994841.csv')
df_gdp_per_capita = pd.read_csv('./data/raw-data/gdp-per-capita/API_NY.GDP.PCAP.KD_DS2_en_csv_v2_5994684.csv')
df_income = pd.read_csv('./data/raw-data/income/API_NY.ADJ.NNTY.PC.CD_DS2_en_csv_v2_5996043.csv')
df_receipts = pd.read_csv('./data/raw-data/receipts/API_ST.INT.RCPT.CD_DS2_en_csv_v2_5996774.csv')

In [100]:
df_departures.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_departures.dropna(axis=1, how='all', inplace=True)
df_departures.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
departure_columns = df_departures.columns.to_list()
departure_columns[0] = 'country_code'
departure_columns[1] = 'country_name'
df_departures = df_departures[departure_columns]
df_departures = df_departures[df_departures.country_code.isin(country_codes)]
df_departures.reset_index(drop=True, inplace=True)

df_departures.head()


Unnamed: 0,country_code,country_name,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,,,,,,,,,...,,,,,,,,,,
1,AFG,Afghanistan,,,,,,,,,...,,,,,,,,,,
2,AGO,Angola,3000.0,3000.0,,,,,,,...,,,,,,,,,,
3,ALB,Albania,,,,,,,955000.0,1303000.0,...,4120000.0,3959000.0,3928000.0,4146000.0,4504000.0,4852000.0,5186000.0,5415000.0,5922000.0,2907000.0
4,AND,Andorra,,,,,,,,,...,,,,,,,,,,


In [101]:
df_departures.to_sql('departures', connection, if_exists='replace')

217

In [102]:
df_expenditures.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_expenditures.dropna(axis=1, how='all', inplace=True)
df_expenditures.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
expenditure_columns = df_expenditures.columns.to_list()
expenditure_columns[0] = 'country_code'
expenditure_columns[1] = 'country_name'
df_expenditures = df_expenditures[expenditure_columns]
df_expenditures = df_expenditures[df_expenditures.country_code.isin(country_codes)]
df_expenditures.reset_index(drop=True, inplace=True)
df_expenditures.head()

Unnamed: 0,country_code,country_name,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,79000000.0,128000000.0,161000000.0,140000000.0,151000000.0,163000000.0,156000000.0,172000000.0,...,287000000.0,294000000.0,342000000.0,350000000.0,357000000.0,319000000.0,349000000.0,394000000.0,399000000.0,310000000.0
1,AFG,Afghanistan,,,,,,,,,...,255000000.0,110000000.0,138000000.0,140000000.0,151000000.0,89000000.0,130000000.0,226000000.0,168000000.0,49000000.0
2,AGO,Angola,113000000.0,110000000.0,125000000.0,96000000.0,142000000.0,146000000.0,80000000.0,52000000.0,...,323000000.0,288000000.0,316000000.0,505000000.0,389000000.0,823000000.0,1216000000.0,762000000.0,717000000.0,691000000.0
3,ALB,Albania,19000000.0,25000000.0,13000000.0,22000000.0,35000000.0,290000000.0,269000000.0,386000000.0,...,1677000000.0,1374000000.0,1567000000.0,1689000000.0,1311000000.0,1338000000.0,1473000000.0,1750000000.0,1852000000.0,805000000.0
4,AND,Andorra,,,,,,,,,...,,,,,,,,,187000000.0,


In [103]:
df_expenditures.to_sql('expenditures', connection, if_exists='replace')

217

In [104]:
df_gdp.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_gdp.dropna(axis=1, how='all', inplace=True)
df_gdp.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
gdp_columns = df_gdp.columns.to_list()
gdp_columns[0] = 'country_code'
gdp_columns[1] = 'country_name'
df_gdp = df_gdp[gdp_columns]
df_gdp = df_gdp[df_gdp.country_code.isin(country_codes)]
df_gdp.reset_index(drop=True, inplace=True)
df_gdp.head()

Unnamed: 0,country_code,country_name,1960,1961,1962,1963,1964,1965,1966,1967,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,ABW,Aruba,,,,,,,,,...,2862306000.0,2861720000.0,2963128000.0,3025850000.0,3191738000.0,3359555000.0,3380889000.0,2752412000.0,3225070000.0,
1,AFG,Afghanistan,,,,,,,,,...,19189240000.0,19712060000.0,19998140000.0,20450160000.0,20991480000.0,21241120000.0,22071990000.0,21553050000.0,17091570000.0,
2,AGO,Angola,,,,,,,,,...,82428840000.0,86404020000.0,87219300000.0,84968950000.0,84843910000.0,83727060000.0,83139070000.0,78451510000.0,79392310000.0,81810120000.0
3,ALB,Albania,,,,,,,,,...,10945470000.0,11139690000.0,11386850000.0,11764330000.0,12211680000.0,12702500000.0,12967700000.0,12539490000.0,13656580000.0,14318130000.0
4,AND,Andorra,,,,,,,,,...,2683235000.0,2750436000.0,2789881000.0,2893377000.0,2903390000.0,2949518000.0,3008967000.0,2672446000.0,2893917000.0,3148859000.0


In [105]:
df_gdp.to_sql('gdp', connection, if_exists='replace')

217

In [111]:
df_gdp_per_capita.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_gdp_per_capita.dropna(axis=1, how='all', inplace=True)
df_gdp_per_capita.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
gdp_columns = df_gdp_per_capita.columns.to_list()
gdp_columns[0] = 'country_code'
gdp_columns[1] = 'country_name'
df_gdp_per_capita = df_gdp_per_capita[gdp_columns]
df_gdp_per_capita = df_gdp_per_capita[df_gdp_per_capita.country_code.isin(country_codes)]
df_gdp_per_capita.reset_index(drop=True, inplace=True)
df_gdp_per_capita.head()

KeyError: "['Indicator Name', 'Indicator Code'] not found in axis"

In [112]:
df_gdp_per_capita.to_sql('gdp_per_capita', connection, if_exists='replace')

217

In [113]:
df_income.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_income.dropna(axis=1, how='all', inplace=True)
df_income.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
income_columns = df_income.columns.to_list()
income_columns[0] = 'country_code'
income_columns[1] = 'country_name'
df_income = df_income[income_columns]
df_income = df_income[df_income.country_code.isin(country_codes)]
df_income.reset_index(drop=True, inplace=True)
df_income.head()

Unnamed: 0,country_code,country_name,1970,1971,1972,1973,1974,1975,1976,1977,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,,,,,,,,,...,20090.351473,21165.636596,22168.224174,23163.637761,24192.616602,24118.031802,24565.934645,24994.147351,,
1,AFG,Afghanistan,154.284558,157.65087,134.379044,141.664823,171.38238,182.45692,192.466274,218.010807,...,556.163841,598.215245,586.613689,575.680796,526.930235,489.00974,494.93492,458.344566,467.186947,475.084737
2,AGO,Angola,,,,,,,,,...,2379.084597,2835.320063,3158.544114,3583.800549,3199.012702,2688.630467,2936.850718,2051.814728,1738.059253,1103.908798
3,ALB,Albania,,,,,,,,,...,3728.04779,3503.24351,3760.376907,3844.337544,3267.217515,3443.310224,3712.733017,4318.911492,4337.131083,4208.456611
4,AND,Andorra,,,,,,,,,...,,,,,,,,,,


In [114]:
df_income.to_sql('income', connection, if_exists='replace')

217

In [115]:
df_receipts.drop(columns=['Indicator Name', 'Indicator Code'], inplace=True)
df_receipts.dropna(axis=1, how='all', inplace=True)
df_receipts.rename(columns={'Country Name': 'country_name', 'Country Code': 'country_code'}, inplace=True)
receipts_columns = df_receipts.columns.to_list()
receipts_columns[0] = 'country_code'
receipts_columns[1] = 'country_name'
df_receipts = df_receipts[receipts_columns]
df_receipts = df_receipts[df_receipts.country_code.isin(country_codes)]
df_receipts.reset_index(drop=True, inplace=True)
df_receipts.head()

Unnamed: 0,country_code,country_name,1995,1996,1997,1998,1999,2000,2001,2002,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,ABW,Aruba,554000000.0,666000000.0,726000000.0,786000000.0,782000000.0,850000000.0,825000000.0,835000000.0,...,1358000000.0,1412000000.0,1506000000.0,1625000000.0,1659000000.0,1757000000.0,1855000000.0,2035000000.0,2109000000.0,1077000000.0
1,AFG,Afghanistan,,,,,,,,,...,165000000.0,167000000.0,179000000.0,121000000.0,86000000.0,62000000.0,16000000.0,50000000.0,85000000.0,75000000.0
2,AGO,Angola,27000000.0,38000000.0,24000000.0,39000000.0,31000000.0,34000000.0,35000000.0,51000000.0,...,653000000.0,711000000.0,1241000000.0,1597000000.0,1171000000.0,628000000.0,884000000.0,557000000.0,395000000.0,19000000.0
3,ALB,Albania,70000000.0,94000000.0,34000000.0,60000000.0,218000000.0,398000000.0,451000000.0,492000000.0,...,1833000000.0,1623000000.0,1670000000.0,1849000000.0,1613000000.0,1821000000.0,2050000000.0,2306000000.0,2458000000.0,1243000000.0
4,AND,Andorra,,,,,,,,,...,,,,,,,,,1910000000.0,


In [116]:
df_receipts.to_sql('receipts', connection, if_exists='replace')

217