## Page 2: Tourism, only need the tourism_arrival and tourism_departures

In [38]:
import pandas as pd
# !pip3 install pycountry_convert
import pycountry
import pycountry_convert as pc

In [39]:
years = [str(i) for i in range(1995, 2021)]
years.append('Country Name')
arrival = pd.read_csv('data/tourism/tourism_arrivals.csv', usecols=years, skiprows=4)
depart = pd.read_csv('data/tourism/tourism_departures.csv', usecols=years, skiprows=4)
print(arrival.shape, depart.shape)

(266, 27) (266, 27)


In [40]:
arrival.columns

Index(['Country Name', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019',
       '2020'],
      dtype='object')

In [41]:
# Melt the DataFrame to have years in rows
melted_arrival = pd.melt(arrival, id_vars=['Country Name'], var_name='Year', value_name='Arrivals')
melted_depart = pd.melt(depart, id_vars=['Country Name'], var_name='Year', value_name='Departures')
print(melted_arrival.shape, melted_depart.shape)

(6916, 3) (6916, 3)


In [42]:
melted_arrival.head()

Unnamed: 0,Country Name,Year,Arrivals
0,Aruba,1995,912000.0
1,Africa Eastern and Southern,1995,11583540.0
2,Afghanistan,1995,
3,Africa Western and Central,1995,2670706.0
4,Angola,1995,9000.0


In [43]:
melted_depart.head()

Unnamed: 0,Country Name,Year,Departures
0,Aruba,1995,
1,Africa Eastern and Southern,1995,
2,Afghanistan,1995,
3,Africa Western and Central,1995,
4,Angola,1995,3000.0


In [44]:
arr_depart = melted_arrival.merge(melted_depart)
arr_depart.shape

(6916, 4)

In [45]:
arr_depart.head()

Unnamed: 0,Country Name,Year,Arrivals,Departures
0,Aruba,1995,912000.0,
1,Africa Eastern and Southern,1995,11583540.0,
2,Afghanistan,1995,,
3,Africa Western and Central,1995,2670706.0,
4,Angola,1995,9000.0,3000.0


Append the country codes back into the dataframe for easier country identification

In [46]:
country_code = pd.read_csv('data/tourism/tourism_arrivals.csv', usecols=['Country Name', 'Country Code'], skiprows=4)
country_code.shape

(266, 2)

In [47]:
arr_depart = pd.merge(arr_depart, country_code, how='left')
arr_depart.head()

Unnamed: 0,Country Name,Year,Arrivals,Departures,Country Code
0,Aruba,1995,912000.0,,ABW
1,Africa Eastern and Southern,1995,11583540.0,,AFE
2,Afghanistan,1995,,,AFG
3,Africa Western and Central,1995,2670706.0,,AFW
4,Angola,1995,9000.0,3000.0,AGO


Calculate Departure / Arrival ratio for choropleth visualization

In [48]:
arr_depart['DA_Ratio'] = arr_depart['Departures'] / arr_depart['Arrivals']
arr_depart.rename(columns={'Country Name': 'Country', 'Country Code': 'Country_code'}, inplace=True)
arr_depart.head()

Unnamed: 0,Country,Year,Arrivals,Departures,Country_code,DA_Ratio
0,Aruba,1995,912000.0,,ABW,
1,Africa Eastern and Southern,1995,11583540.0,,AFE,
2,Afghanistan,1995,,,AFG,
3,Africa Western and Central,1995,2670706.0,,AFW,
4,Angola,1995,9000.0,3000.0,AGO,0.333333


Get the continent information so that we can categorize countries in continents.

In [49]:
def get_continent(country_code):
    try:
        country_alpha2 = pc.country_alpha3_to_country_alpha2(country_code)
        country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
        country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
        return country_continent_name
    except KeyError:
        return None

In [50]:
arr_depart['Continent'] = arr_depart['Country_code'].apply(get_continent)

In [51]:
arr_depart.head()

Unnamed: 0,Country,Year,Arrivals,Departures,Country_code,DA_Ratio,Continent
0,Aruba,1995,912000.0,,ABW,,North America
1,Africa Eastern and Southern,1995,11583540.0,,AFE,,
2,Afghanistan,1995,,,AFG,,Asia
3,Africa Western and Central,1995,2670706.0,,AFW,,
4,Angola,1995,9000.0,3000.0,AGO,0.333333,Africa


In [52]:
arr_depart['Continent'].unique()

array(['North America', None, 'Asia', 'Africa', 'Europe', 'South America',
       'Oceania'], dtype=object)

In [53]:
arr_depart.shape

(6916, 7)

There are some regions included in the data we downloaded, so it is expected that there will be rows that do not match to a specific continent. Since they're not countries, we want to drop them.

In [54]:
# Drop rows that cannot be matched to a continent
final = arr_depart.dropna(subset=['Continent'])
final.shape

(5538, 7)

In [55]:
# Combine the Americas (North and South America should count as one category)
final['Continent'] = final['Continent'].apply(lambda x: 'America' if 'America' in x else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['Continent'] = final['Continent'].apply(lambda x: 'America' if 'America' in x else x)


In [56]:
final.head()

Unnamed: 0,Country,Year,Arrivals,Departures,Country_code,DA_Ratio,Continent
0,Aruba,1995,912000.0,,ABW,,America
2,Afghanistan,1995,,,AFG,,Asia
4,Angola,1995,9000.0,3000.0,AGO,0.333333,Africa
5,Albania,1995,304000.0,,ALB,,Europe
6,Andorra,1995,,,AND,,Europe


Check datatypes of each column.

In [57]:
final.dtypes

Country          object
Year             object
Arrivals        float64
Departures      float64
Country_code     object
DA_Ratio        float64
Continent        object
dtype: object

Column 'Year' is an object, change it to integer instead.

In [58]:
final['Year'] = pd.to_numeric(final['Year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['Year'] = pd.to_numeric(final['Year'])


Output the file as JSON so d3 can read in data

In [59]:
final.to_json('data/tourism/tourism_arr_dep.json', orient='records')

In [13]:
final = pd.read_json('data/tourism/tourism_arr_dep.json')

In [15]:
final[final['Country'] == 'United States']

Unnamed: 0,Country,Year,Arrivals,Departures,DA_Ratio,Continent
183,United States,1995,79732000.0,74031000.0,0.928498,America
376,United States,1996,82756000.0,76803000.0,0.928066,America
569,United States,1997,82525000.0,78481000.0,0.950997,America
762,United States,1998,74767000.0,82758000.0,1.106879,America
955,United States,1999,75796000.0,84540000.0,1.115362,America
1148,United States,2000,78343000.0,87973000.0,1.122921,America
1341,United States,2001,70975000.0,84755000.0,1.194153,America
1534,United States,2002,64434000.0,80883000.0,1.255284,America
1727,United States,2003,62082000.0,75880000.0,1.222254,America
1920,United States,2004,67606000.0,79655000.0,1.178224,America


In [21]:
final[final['Year'] == 2020]['DA_Ratio'].describe()

count    67.000000
mean      0.934644
std       0.823490
min       0.031377
25%       0.276920
50%       0.706881
75%       1.376155
max       3.326730
Name: DA_Ratio, dtype: float64