In [26]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv(
    "../data/oecd_tourism.csv", usecols=["LOCATION", "SUBJECT", "TIME", "Value"]
)
df.head()

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,AUS,INT_REC,2008,31159.8
1,AUS,INT_REC,2009,29980.7
2,AUS,INT_REC,2010,35165.5
3,AUS,INT_REC,2011,38710.1
4,AUS,INT_REC,2012,38003.7


1. Find the five countries that received the greatest amount of tourist dollars, on average, across the years in the dataset.
2. Find the five countries whose citizens spent the least amount of tourist dollars, on averae, across the years in the dataset.
3. Join the data from `oecd_locations.csv`, index it using the abbreviated name as an index, and join the data with the tourism dataframe.
4. Run queries 1 and 2, displaying the name of each country rather than the abbreviation.
5. Ignoring the names, did we get the same results as before? Why or why not?

In [28]:
# five countries that received the greatest amount of tourist dollars on average
(
    df.loc[df["SUBJECT"] == "INT_REC"]
    .groupby("LOCATION")["Value"]
    .mean()
    .sort_values(ascending=False)
    .iloc[:5]
)

LOCATION
USA    201613.500000
ESP     69655.817364
FRA     65063.335727
DEU     53408.570636
GBR     51752.090909
Name: Value, dtype: float64

In [29]:
# countries whose citizens spent the least on average
(
    df.loc[df["SUBJECT"] == "INT-EXP"]
    .groupby("LOCATION")["Value"]
    .mean()
    .sort_values()
    .iloc[:5]
)

LOCATION
MLT     387.801667
CRI     867.075000
LVA     919.545455
ISL    1072.819636
HRV    1115.628083
Name: Value, dtype: float64

In [30]:
# load the location name data, specify columns, and set the index
df_locs = pd.read_csv(
    "../data/oecd_locations.csv", names=["Location", "Name"], header=None
).set_index("Location")
df_locs.head()

Unnamed: 0_level_0,Name
Location,Unnamed: 1_level_1
AUS,Australia
AUT,Austria
BEL,Belgium
CAN,Canada
DNK,Denmark


In [31]:
df_joined = df.set_index("LOCATION").join(df_locs)
df_joined.head()
# my guess is that it's dropped all the data from countries not in the locations dataframe

Unnamed: 0_level_0,SUBJECT,TIME,Value,Name
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,INT_REC,2008,31159.8,Australia
AUS,INT_REC,2009,29980.7,Australia
AUS,INT_REC,2010,35165.5,Australia
AUS,INT_REC,2011,38710.1,Australia
AUS,INT_REC,2012,38003.7,Australia


In [32]:
# most tourist income query
(
    df_joined.loc[df_joined["SUBJECT"] == "INT_REC"]
    .groupby("Name")["Value"]
    .mean()
    .sort_values(ascending=False)
    .iloc[:5]
)

Name
United States     201613.500000
France             65063.335727
Germany            53408.570636
United Kingdom     51752.090909
Italy              44930.211545
Name: Value, dtype: float64

In [33]:
# lowest spending citizens query
(
    df_joined.loc[df_joined["SUBJECT"] == "INT-EXP"]
    .groupby("Name")["Value"]
    .mean()
    .sort_values()
    .iloc[:5]
)

Name
Hungary     2918.390182
Finland     5877.080909
Israel      6726.524833
Denmark    11326.169636
Austria    11934.563636
Name: Value, dtype: float64

# Extension questions
1. What happens if you perform the join in the other direction? Do you get the same result?
2. Get the mean tourism income per year rather than by country. Do you see any evidence of less tourism incoming during the time of the Great Recession, which started in 2008?
3. Reset the index on the joined locations dataframe. Now run the join specifying the locations column rather than the index.

In [34]:
# 1. join in the opposite direction
df_opp_join = df_locs.join(df.set_index("LOCATION"))
df_opp_join.head()
# looks like the same result to me, and makes sense that this type of join would be the same both ways

Unnamed: 0_level_0,Name,SUBJECT,TIME,Value
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AUS,Australia,INT_REC,2008,31159.8
AUS,Australia,INT_REC,2009,29980.7
AUS,Australia,INT_REC,2010,35165.5
AUS,Australia,INT_REC,2011,38710.1
AUS,Australia,INT_REC,2012,38003.7


In [None]:
# 2. mean tourism income per year
(df[df["SUBJECT"] == "INT_REC"].groupby("TIME")["Value"].mean())
# does look like a significant dip from 2009-2010

TIME
2008    16841.151327
2009    15081.294774
2010    16003.938556
2011    17788.743759
2012    18216.112815
2013    19296.536037
2014    20198.824148
2015    19301.865907
2016    19574.941796
2017    20763.391981
2018    22436.338296
2019    23005.937500
Name: Value, dtype: float64

In [36]:
# 3. using join with args to specify the column to join on
alt_join = df.reset_index().join(df_locs, on="LOCATION")
alt_join
# interesting - in this case we haven't dropped rows that don't share the index

Unnamed: 0,index,LOCATION,SUBJECT,TIME,Value,Name
0,0,AUS,INT_REC,2008,31159.800,Australia
1,1,AUS,INT_REC,2009,29980.700,Australia
2,2,AUS,INT_REC,2010,35165.500,Australia
3,3,AUS,INT_REC,2011,38710.100,Australia
4,4,AUS,INT_REC,2012,38003.700,Australia
...,...,...,...,...,...,...
1229,1229,SRB,INT-EXP,2015,1253.644,
1230,1230,SRB,INT-EXP,2016,1351.098,
1231,1231,SRB,INT-EXP,2017,1549.183,
1232,1232,SRB,INT-EXP,2018,1837.317,
