In [14]:
import pandas as pd
import numpy as np
from string import punctuation

In [40]:
# Load only the review columns that the questions below rely on.
review_columns = ["country", "province", "description", "variety", "points"]
df = pd.read_csv("../data/winemag-150k-reviews.csv", usecols=review_columns)
df.head()

Unnamed: 0,country,description,points,province,variety
0,US,This tremendous 100% varietal wine hails from ...,96,California,Cabernet Sauvignon
1,Spain,"Ripe aromas of fig, blackberry and cassis are ...",96,Northern Spain,Tinta de Toro
2,US,Mac Watson honors the memory of a wine once ma...,96,California,Sauvignon Blanc
3,US,"This spent 20 months in 30% new French oak, an...",96,Oregon,Pinot Noir
4,France,"This is the top wine from La Bégude, named aft...",95,Provence,Provence red blend


1. What are the 10 most common words containing 5 or more letters in the wine descriptions? Normalise to lower case and strip punctuation. Also remove `"flavors"`, `"aromas"`, `"finish"`, `"palate"`, `"drink"`.
2. What are the 10 most common words for non-California wines?
3. What are the 10 most common words for French wines?
4. What are the 10 most common words for white wines? (Chardonnay, Sauvignon Blanc, Riesling)
5. What are the 10 most common words for red wines? (Pinot Noir, Cabernet Sauvignon, Syrah, Merlot, Zinfandel)
6. What are the 10 most common words for rosé wines?
7. What are the 10 most common words for the five most common wine varieties?

In [23]:
# Generic tasting-note vocabulary that the word-count questions exclude.
common_wine_words = [
    "flavors",
    "aromas",
    "finish",
    "palate",
    "drink",
]

In [None]:
# One row per word: split each description on whitespace, lower-case it and
# trim ASCII punctuation from both ends. Then keep only words of 5+ characters
# that are not in the excluded vocabulary.
all_words = (
    df["description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
all_words.head(10)

0    tremendous
0      varietal
0         hails
0      oakville
0         three
Name: description, dtype: object

In [None]:
# 1. 10 most common 5+ letter words across every description
# (value_counts sorts by descending frequency, so the first ten rows are the answer)
all_words.value_counts().iloc[:10]

description
fruit      56496
acidity    32603
tannins    32186
cherry     30685
black      24599
Name: count, dtype: int64

In [None]:
# 2. 10 most common words for non-California wines
# Rows whose province is missing also count as "not California".
non_calif = (
    df.loc[df["province"] != "California", "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
non_calif.value_counts().head(10)

description
fruit      46517
acidity    22325
tannins    22007
cherry     19473
spice      18572
Name: count, dtype: int64

In [None]:
# 3. 10 most common words for French wines
is_french = df["country"] == "France"
french_words = (
    df.loc[is_french, "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
french_words.value_counts().head(10)

description
fruit      8722
acidity    8641
tannins    6509
fruits     5459
fresh      4217
Name: count, dtype: int64

In [None]:
# 4. 10 most common words for white wines
# "White" is defined by the three varieties named in the question.
white_varieties = ["Chardonnay", "Sauvignon Blanc", "Riesling"]
white_words = (
    df.loc[df["variety"].isin(white_varieties), "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
white_words.value_counts().head(10)

In [None]:
# 5. 10 most common words for red wines
# "Red" is defined by the five varieties named in the question.
red_varieties = ["Pinot Noir", "Cabernet Sauvignon", "Syrah", "Merlot", "Zinfandel"]
red_words = (
    df.loc[df["variety"].isin(red_varieties), "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
red_words.value_counts().head(10)

description
fruit         15081
cherry        14041
tannins       13173
black          9551
blackberry     6777
acidity        6353
pinot          6342
sweet          5984
cherries       5378
years          5370
Name: count, dtype: int64

In [None]:
# 6. 10 most common words for rosé wines
# Only rows whose variety is exactly "Rosé" are selected.
# NOTE(review): other labels for pink wines (e.g. regional spellings) would not
# match this equality test — confirm against df["variety"].unique() if they matter.
is_rose = df["variety"] == "Rosé"
rose_words = (
    df.loc[is_rose, "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
rose_words.value_counts().head(10)

description
acidity       1135
fruit          697
crisp          672
fresh          622
strawberry     534
light          518
raspberry      510
cherry         470
fruity         428
fruits         420
Name: count, dtype: int64

In [39]:
# 7. 10 most common words for the 5 most common wine varieties
# value_counts already sorts by descending frequency, so the first five index
# labels are the five most-reviewed varieties.
common_varieties = df["variety"].value_counts().head(5).index
topwine_words = (
    df.loc[df["variety"].isin(common_varieties), "description"]
    .str.split()
    .explode()
    .str.lower()
    .str.strip(punctuation)
    .loc[lambda words: ~words.isin(common_wine_words) & (words.str.len() >= 5)]
)
topwine_words.value_counts().head(10)

description
fruit       22859
tannins     16007
cherry      13991
acidity     12526
black       11236
cabernet     9480
spice        7910
sweet        7881
blend        7597
shows        7264
Name: count, dtype: int64

# Extension questions
1. Which country's wines got the highest average score?
2. Create a pivot table in which the index contains countries, the columns contain varieties, and the cells contain mean scores. Include only the top 10 varieties.
3. What is the correlation between the number of wines offered by a country and the mean score for that country? (i.e., does a country that offers more wines get a higher average score in competitions?)

In [43]:
# 1. Which country's wines got the highest average score?
# Shows the five best country means, highest first; the top row answers the question.
(
    df.groupby("country")["points"]
    .mean()
    .sort_values(ascending=False)
    .head(5)
)

country
England    92.888889
Austria    89.276742
France     88.925870
Germany    88.626427
Italy      88.413664
Name: points, dtype: float64

In [None]:
# 2. Pivot table: countries as the index, varieties as the columns, mean points
# in the cells, restricted to the 10 most-reviewed varieties.
# pivot_table aggregates with the mean by default; country/variety pairs with
# no reviews show as NaN.
top10_varieties = df["variety"].value_counts().head(10).index
df.loc[df["variety"].isin(top10_varieties)].pivot_table(
    index="country", columns="variety", values="points"
)

variety,Bordeaux-style Red Blend,Cabernet Sauvignon,Chardonnay,Merlot,Pinot Noir,Red Blend,Riesling,Sauvignon Blanc,Syrah,Zinfandel
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Argentina,89.575472,85.527745,84.177489,84.341969,85.058333,88.197059,85.0,83.295302,85.232394,
Australia,88.841463,88.115502,86.727952,85.258824,86.405263,87.816176,87.79021,86.62406,91.952381,88.2
Austria,91.625,87.75,90.016393,89.0,88.753846,88.890511,90.583955,88.694215,87.0,
Brazil,86.0,81.0,,83.2,,84.0,,,,
Bulgaria,,84.8125,86.875,84.363636,87.4,89.0,83.75,84.4,90.0,
Canada,89.0,88.666667,88.653846,87.875,89.111111,89.5,87.564516,87.75,89.666667,
Chile,89.754717,86.561564,85.246011,84.939189,85.827273,88.683168,85.714286,85.895805,87.506739,85.0
China,,,82.0,,,,,,,
Croatia,,,85.0,,84.0,86.0,,82.0,,
Cyprus,,88.0,,,,85.714286,,,,


In [None]:
# 3. Correlation between how many wines a country submitted and its mean score.
# Build the two per-country series separately, join them on the country label,
# then take the pairwise correlation matrix.
points_by_country = df.groupby("country")["points"]
mean_points = points_by_country.mean().rename("Points")
submission_counts = points_by_country.count().rename("Submissions")
pd.merge(mean_points, submission_counts, on="country").corr()

Unnamed: 0,Points,Submissions
Points,1.0,0.236117
Submissions,0.236117,1.0


In [55]:
# Book version of 3: compute count and mean in a single aggregation, then correlate.
country_stats = df.groupby("country")["points"].agg(["count", "mean"])
country_stats.corr()

Unnamed: 0,count,mean
count,1.0,0.236117
mean,0.236117,1.0


# My stuff
- Which wines from Australia perform above and below the mean?
- Are there any Australian wines that score more than 1 s.d. above the average?

In [None]:
# Top-performing Australian varieties: those whose mean score is more than one
# standard deviation above the mean of all reviews in the dataset. The cutoff
# uses the overall mean and s.d. of `points`, not per-variety statistics.
australian_reviews = df.loc[df["country"] == "Australia"]
ozwines = australian_reviews.groupby("variety")["points"].mean()

upper_cutoff = df["points"].mean() + df["points"].std()
bestoz = ozwines.loc[ozwines > upper_cutoff]
bestoz

variety
Cabernet-Shiraz    96.000000
Muscadel           94.142857
Muscat             93.100000
Shiraz-Malbec      92.000000
Syrah              91.952381
Tokay              93.294118
Name: points, dtype: float64

In [80]:
# Worst-performing Australian varieties: mean score more than one standard
# deviation below the mean of all reviews in the dataset.
lower_cutoff = df["points"].mean() - df["points"].std()
worstoz = ozwines.loc[ozwines < lower_cutoff]
worstoz

variety
Cabernet Sauvignon-Cabernet Franc    84.0
Cabernet Sauvignon-Malbec            82.0
Grenache-Syrah                       84.0
Merlot-Cabernet                      80.0
Muscat Blanc                         81.0
Muscat Blanc à Petit Grain           82.0
Name: points, dtype: float64

In [None]:
# How many reviews sit behind each of the top-performing Australian varieties?
is_best_oz = (df["country"] == "Australia") & df["variety"].isin(bestoz.index)
df.loc[is_best_oz].groupby("variety").count()
# So: a single very good Cab-Shiraz review, whereas Muscat, Syrah and Tokay have
# enough reviews that their high means are probably pretty solid.

Unnamed: 0_level_0,country,description,points,province
variety,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cabernet-Shiraz,1,1,1,1
Muscadel,7,7,7,7
Muscat,30,30,30,30
Shiraz-Malbec,1,1,1,1
Syrah,42,42,42,42
Tokay,17,17,17,17


In [None]:
# And the clangers: review counts behind the worst-performing Australian varieties.
is_worst_oz = (df["country"] == "Australia") & df["variety"].isin(worstoz.index)
df.loc[is_worst_oz].groupby("variety").count()
# As expected, at most 2 reviews each.

Unnamed: 0_level_0,country,description,points,province
variety,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Cabernet Sauvignon-Cabernet Franc,1,1,1,1
Cabernet Sauvignon-Malbec,1,1,1,1
Grenache-Syrah,1,1,1,1
Merlot-Cabernet,1,1,1,1
Muscat Blanc,2,2,2,2
Muscat Blanc à Petit Grain,1,1,1,1
