In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lookup table from location code to full country name. Column names are
# supplied explicitly via `names`, and the code column becomes the index so
# the table can be joined on it later.
oecd_df = pd.read_csv(
    "../data/oecd_locations.csv",
    names=["country", "name"],
    index_col="country",
)
oecd_df.head()

Unnamed: 0_level_0,name
country,Unnamed: 1_level_1
AUS,Australia
AUT,Austria
BEL,Belgium
CAN,Canada
DNK,Denmark


In [10]:
# Tourism figures indexed by location code. Only the rows whose SUBJECT is
# "INT-EXP" are kept; after filtering, SUBJECT is constant and is dropped.
oecd_tourism_df = (
    pd.read_csv(
        "../data/oecd_tourism.csv",
        index_col="LOCATION",
        usecols=["LOCATION", "TIME", "SUBJECT", "Value"],
    )
    # `.loc` accepts a callable, which keeps the row filter inside the chain
    .loc[lambda frame: frame["SUBJECT"].eq("INT-EXP")]
    .drop(columns="SUBJECT")
)
oecd_tourism_df.head()

Unnamed: 0_level_0,TIME,Value
LOCATION,Unnamed: 1_level_1,Unnamed: 2_level_1
AUS,2008,27620.0
AUS,2009,25629.6
AUS,2010,31916.5
AUS,2011,39381.5
AUS,2012,41632.8


In [23]:
# Average tourism spending per country, keyed by the full country name
# instead of the location code.
#
# `DataFrame.join` is a left join on the index by default, so location codes
# that appear only in the tourism data (no entry in `oecd_df`) are dropped.
tourism_spending = (
    oecd_df.join(oecd_tourism_df)  # attach the country name to every tourism row
    .groupby("name")["Value"]  # pick the column first so only it is aggregated
    .mean()
)
tourism_spending

name
Australia          36727.966667
Austria            11934.563636
Belgium            20859.883455
Brazil             21564.351833
Canada             40984.633333
Denmark            11326.169636
Finland             5877.080909
France             51394.272273
Germany            96615.075545
Hungary             2918.390182
Israel              6726.524833
Italy              34148.908455
Japan              32197.925000
Korea              25573.509091
United Kingdom     75262.227273
United States     142080.666667
Name: Value, dtype: float64

In [24]:
# Sanity check: mean spending per location code, to compare against the
# per-name means computed above.
oecd_tourism_df.groupby(by="LOCATION")["Value"].agg("mean")

LOCATION
AUS     36727.966667
AUT     11934.563636
BEL     20859.883455
BGR      1562.641750
BRA     21564.351833
CAN     40984.633333
CHE     15775.966667
CHL      2409.591667
CHN    163217.404417
COL      4381.633333
CRI       867.075000
CZE      4898.278545
DEU     96615.075545
DNK     11326.169636
EGY      3202.420750
ESP     21601.955273
EST      1122.501909
FIN      5877.080909
FRA     51394.272273
GBR     75262.227273
GRC      3486.849818
HRV      1115.628083
HUN      2918.390182
IDN     10058.441667
IND     17050.493083
IRL      7030.703818
ISL      1072.819636
ISR      6726.524833
ITA     34148.908455
JPN     32197.925000
KAZ      2606.684083
KOR     25573.509091
LTU      1189.933333
LUX      3355.510636
LVA       919.545455
MAR      2299.033750
MEX     11469.027273
MLT       387.801667
NLD     22278.809091
NOR     15667.881818
NZL      4698.900000
PER      2236.278583
PHL      8961.027250
POL      9206.116364
PRT      5037.670182
ROU      3126.455250
RUS     39577.236364
SRB 

In [69]:
# Mean wine review score per country, highest-rated first.
wine_df = pd.read_csv(
    "../data/winemag-150k-reviews.csv",
    usecols=["country", "points"],
)
country_points = (
    wine_df.groupby("country")
    .mean()
    .sort_values(by="points", ascending=False)
)
country_points

Unnamed: 0_level_0,points
country,Unnamed: 1_level_1
England,92.888889
Austria,89.276742
France,88.92587
Germany,88.626427
Italy,88.413664
Canada,88.239796
Slovenia,88.234043
Morocco,88.166667
Turkey,88.096154
Portugal,88.057685


In [38]:
# Mean review score per country, with each country's average tourism spending
# attached. NOTE: despite the variable name, `DataFrame.join` defaults to a
# LEFT join, so every wine country is kept and unmatched ones get NaN. Those
# NaN values are either:
#  - countries that don't exist in the tourism spending country list
#  - countries whose name differs between the two sources
#    (e.g. US vs United States, or England vs United Kingdom)
mean_wine_df = wine_df.groupby("country").mean()
inner_joins = mean_wine_df.join(tourism_spending, how="left")["Value"]

In [None]:
# Outer join: the resulting index is the union of the wine countries and the
# tourism countries. Only "Value" is kept, so a NaN here marks a wine country
# with no tourism figure. Tourism-only countries appear as additional labels
# with a real Value (their NaN would be in the dropped "points" column).
outer_joins = mean_wine_df.join(tourism_spending, how="outer").loc[:, "Value"]

In [62]:
# Distinct country labels on each side of the join.
# `wine_df` already has a default integer index, so no reset is needed.
wine_countries = wine_df["country"].drop_duplicates()  # 48
tourism_countries = tourism_spending.reset_index()["name"].drop_duplicates()  # 16

# Labels present after the outer join. Read straight from the index instead of
# relying on the auto-generated "index" column name that reset_index() produces.
joined_country_count = outer_joins.index.nunique()  # 54

# now how many countries are in both?
common_country_count = wine_countries[
    wine_countries.isin(tourism_countries)
].count()  # 10 from wine countries are in tourism

# Inclusion-exclusion: |wine or tourism| = |wine| + |tourism| - |both|,
# so 54 = 48 + 16 - 10. `count()` skips a missing country label, matching
# the groupby that built the joined index.
assert joined_country_count == (
    wine_countries.count() + tourism_countries.count() - common_country_count
), "outer-join size does not match inclusion-exclusion count"

common_country_count

10

In [70]:
# Book solution: left-join the average tourism spending onto the mean wine
# scores and preview the first five rows.
mean_wine_df.join(tourism_spending, how="left").head(5)

Unnamed: 0_level_0,points,Value
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,88.0,
Argentina,85.996093,
Australia,87.892475,36727.966667
Austria,89.276742,11934.563636
Bosnia and Herzegovina,84.75,


In [68]:
# Pearson correlation between mean wine score and average tourism spending.
# `corr` excludes missing values, so only countries that have both a score and
# a spending figure contribute. The result (about 0.29) is a weak positive
# relationship - not nothing, but far from strong.
mean_wine_df.join(tourism_spending, how="left").corr(method="pearson")

Unnamed: 0,points,Value
points,1.0,0.288231
Value,0.288231,1.0


# Extension questions
1. Read in the three data frames as before but without setting an index. Use `abbrev`, `TIME`, and `Value` from `oecd_tourism_df`, and `Value` should be `np.int64`
2. Perform the same join as before but use `merge` instead of `join`
3. How is the default `merge` different from the default `join` when it comes to `NaN` values?

In [88]:
# Extension Q1: reload all three data sets without setting an index.
oecd_df = pd.read_csv(
    "../data/oecd_locations.csv",
    names=["abbrev", "country"],
)
oecd_tourism_df = (
    pd.read_csv(
        "../data/oecd_tourism.csv",
        usecols=["LOCATION", "TIME", "SUBJECT", "Value"],
    )
    # `.loc` accepts a callable, which keeps the row filter inside the chain
    .loc[lambda frame: frame["SUBJECT"].eq("INT-EXP")]
    .drop(columns="SUBJECT")
    .rename(columns={"LOCATION": "abbrev"})
    # The cast truncates the fractional part (e.g. 25629.6 becomes 25629), so
    # means computed from this frame differ slightly from the float version.
    .astype({"Value": np.int64})
)
wine_df = pd.read_csv(
    "../data/winemag-150k-reviews.csv",
    usecols=["country", "points"],
)
oecd_tourism_df.head()

Unnamed: 0,abbrev,TIME,Value
12,AUS,2008,27620
13,AUS,2009,25629
14,AUS,2010,31916
15,AUS,2011,39381
16,AUS,2012,41632


In [99]:
# Extension Q2: same per-country average as before, built with `merge`.
# `merge` defaults to an inner join on the shared "abbrev" column. The means
# are close to the earlier ones but not identical, because "Value" was
# truncated to int64 when the data was reloaded.
tourism_spending = (
    oecd_df.merge(oecd_tourism_df, how="inner", on="abbrev")
    .groupby("country")["Value"]
    .mean()
)

In [102]:
# Mean review score per wine country, as a Series named "points".
country_points = wine_df.groupby("country")["points"].mean()

# A Series has no `merge` method, hence `to_frame()`. With the default inner
# join only the countries present on both sides survive, so no NaN appears.
country_points.to_frame().merge(tourism_spending, how="inner", on="country")


Unnamed: 0_level_0,points,Value
country,Unnamed: 1_level_1,Unnamed: 2_level_1
Australia,87.892475,36727.5
Austria,89.276742,11934.0
Brazil,83.24,21563.833333
Canada,88.239796,40984.25
France,88.92587,51393.909091
Germany,88.626427,96614.636364
Hungary,87.329004,2918.090909
Israel,87.17619,6726.083333
Italy,88.413664,34148.272727
Japan,85.0,32197.5


In [104]:
# An outer merge keeps every country from either side; `count` then reports
# the non-null entries per column, i.e. how many countries each source covers.
country_points.to_frame().merge(tourism_spending, how="outer", on="country").count()


points    48
Value     16
dtype: int64

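`.merge` performs an inner join by default, whereas `.join` performs a left join by default. As a result, the default `merge` keeps only the countries present in both data sets and therefore introduces no `NaN` values, while the default `join` keeps every row of the left frame and fills the right-hand columns with `NaN` wherever there is no match.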