In [38]:
import pandas as pd
import numpy as np

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import plotly.express as px

import cufflinks as cf
# Switch cufflinks to offline plotting and initialise plotly's notebook mode
# so that the iplot() calls below can render inside the notebook.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather
# than embedding it in the notebook — confirm if offline viewing matters.
cf.go_offline(connected=True)
init_notebook_mode(connected=True)

In [39]:
# Select the 'polar' theme in the cufflinks configuration; it applies to the
# cufflinks .iplot() charts drawn later in the notebook.
cf.set_config_file(theme='polar')

## Beer Reviews Visualization

This dataset contains 1.5 million beer reviews sourced from BeerAdvocate. We will examine the distribution of the reviews, as well as the most popular (and least popular) beers, beer styles and breweries.

In [40]:
# Load the pre-cleaned BeerAdvocate reviews. The path is relative, so the CSV
# has to sit in the notebook's working directory.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview below
# look like index columns written out by earlier to_csv calls — confirm and
# consider dropping them.
df = pd.read_csv('beer_reviews_clean.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,0,10325,Vecchio Birraio,2009-02-16 20:57:03,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,1,10325,Vecchio Birraio,2009-03-01 13:44:57,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,2,10325,Vecchio Birraio,2009-03-01 14:10:04,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,3,10325,Vecchio Birraio,2009-02-15 19:12:25,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,4,1075,Caldera Brewing Company,2010-12-30 18:53:26,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [None]:
# Headline cardinalities: distinct brewery names, beer styles and beer names.
for label, column in (
    ('Number of unique breweries:', 'brewery_name'),
    ('Number of unique beer styles:', 'beer_style'),
    ('Number of beers reviewed:', 'beer_name'),
):
    print(label, df[column].nunique())

## Review Scores

In [None]:
# One histogram trace per review dimension, all on the same figure, so the
# score distributions can be compared side by side.
score_traces = (
    ('review_appearance', 'Review Appearance'),
    ('review_aroma', 'Review Aroma'),
    ('review_overall', 'Review Overall'),
    ('review_palate', 'Review Palate'),
    ('review_taste', 'Review Taste'),
)

fig = go.Figure()
for column, label in score_traces:
    fig.add_trace(go.Histogram(x=df[column], name=label))

# Slightly translucent bars for every trace.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

It appears that the users in this dataset are most likely to give a rating of 4 across all categories. There seem to be more good reviews than bad.

In [None]:
# Split the reviews by overall score: 3 and above versus 2.5 and below.
overall_score = df['review_overall']
dfhigh = df.loc[overall_score >= 3]
dflow = df.loc[overall_score <= 2.5]

In [None]:
# Same five-dimension histogram as for the full data, restricted to the
# reviews held in dfhigh.
score_traces = (
    ('review_appearance', 'Review Appearance'),
    ('review_aroma', 'Review Aroma'),
    ('review_overall', 'Review Overall'),
    ('review_palate', 'Review Palate'),
    ('review_taste', 'Review Taste'),
)

fig = go.Figure()
for column, label in score_traces:
    fig.add_trace(go.Histogram(x=dfhigh[column], name=label))

# Slightly translucent bars for every trace.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Highly Rated',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

In [None]:
# Same five-dimension histogram as for the full data, restricted to the
# reviews held in dflow.
score_traces = (
    ('review_appearance', 'Review Appearance'),
    ('review_aroma', 'Review Aroma'),
    ('review_overall', 'Review Overall'),
    ('review_palate', 'Review Palate'),
    ('review_taste', 'Review Taste'),
)

fig = go.Figure()
for column, label in score_traces:
    fig.add_trace(go.Histogram(x=dflow[column], name=label))

# Slightly translucent bars for every trace.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Low Ratings',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

Unsurprisingly, reviews with an overall rating under 3 also scored lower on all other categories. The biggest difference appears to be in the review_taste category, and the smallest is review_appearance. This suggests that taste is ultimately the most important feature in a beer.

## Alcohol By Volume

In [None]:
# Histogram of alcohol by volume over all reviews.
# The values are divided by 100 so the "%" tick/hover formats on the x axis
# display them as percentages (the sample rows show beer_abv stored as e.g.
# 5.0 for 5%).
# The previous descending sort of the whole column was dropped: the histogram
# bins its input, so ordering the values first has no effect on the figure.
abv_fraction = df['beer_abv'] / 100

fig = go.Figure()
fig.add_trace(go.Histogram(x=abv_fraction, nbinsx=100, histfunc="count", name='count'))

fig.update_layout(
    title_text='Beer ABV',
    bargap=0.2,
    xaxis=dict(title="Percent Alcohol by Volume",
            tickformat = "%",
            hoverformat = '.2%'),
    yaxis_title='Count')
fig.show()

The most represented alcohol by volume percentage is 5 - 5.99%. Most of the data falls between 4 and 10% ABV, with some outliers at the extreme ends of the spectrum. 

## Which features are most influential?

In [None]:
# Pairwise correlations (DataFrame.corr with its default method) between the
# five review scores and ABV, drawn as a heatmap.
corr_columns = [
    'review_overall',
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
    'beer_abv',
]
corr = df[corr_columns].corr()

fig = go.Figure()
fig.add_trace(go.Heatmap(
    z=corr.values,
    x=list(corr.columns),
    y=list(corr.index),
    colorscale='blues'
    ))

# Every other figure in the notebook carries a title; give this one a title
# too so it can be understood when skimmed on its own.
fig.update_layout(title_text='Correlation Between Review Scores and ABV')

fig.show()

review_taste displays the strongest correlation with review_overall, followed by review_palate. beer_abv does not seem to affect the review scores at all. 

## Beer Style Counts

In [None]:
# Review counts per beer style; the first 20 entries are the most reviewed.
style_counts = df['beer_style'].value_counts()
style_counts.head(20).iplot(kind='barh', title='20 Most Reviewed Beer Styles')

In [None]:
# Review counts per beer style; the last 20 entries are the least reviewed.
style_counts = df['beer_style'].value_counts()
style_counts.tail(20).iplot(kind='barh', title='20 Least Reviewed Beer Styles')

The most commonly reviewed beer style is clearly the American IPA, with over 100,000 reviews. Interestingly, over half of the styles in the top 20 are American. While we don't have data on user location, I would guess that most of the users in this dataset are American. The least reviewed beer styles are Kvass and Happoshu, both of which are beer-like beverages but arguably not actual beer. The next least reviewed is Roggenbier, a medieval-style rye beer.

## Top 10 Beers

In [None]:
# Ten most frequently reviewed beer names.
# value_counts() already returns counts in descending order, so the extra
# sort_values() was redundant. The chart is drawn as a side effect of iplot(),
# and the value previously bound to `y` was never read, so the assignment is
# dropped.
# NOTE(review): counts are keyed on beer_name, so distinct beers that share a
# name would be merged — presumably beer_beerid is the unique key; confirm.
df['beer_name'].value_counts().head(10).iplot(kind='barh', title='10 Most Reviewed Beers')

## Brewery Counts

In [None]:
# Review counts per brewery; the first 20 entries are the most reviewed.
brewery_counts = df['brewery_name'].value_counts()
brewery_counts.head(20).iplot(kind='barh', title='Top 20 Most Reviewed Breweries')

In [None]:
# Review counts per brewery; the last 20 entries are the least reviewed.
brewery_counts = df['brewery_name'].value_counts()
brewery_counts.tail(20).iplot(kind='barh', title='Least Reviewed Breweries')

Boston Beer Company (Samuel Adams) takes the prize for highest number of reviews with over 38,000 reviews. The Bottom 20 are all tied at one review each. 

## Strongest and Weakest Beers

In [None]:
# Strongest beers: order the reviews by ABV (highest first), keep the first
# row encountered for each beer name, then take the leading 20.
abv_columns = ['beer_name', 'brewery_name', 'beer_abv', 'beer_style']
top20abv = (
    df[abv_columns]
    .sort_values('beer_abv', ascending=False)
    .drop_duplicates('beer_name')
    .head(20)
)

# Horizontal bars labelled with the brewery name; ABV is divided by 100 so the
# "%" axis formats show percentages.
strongest_bar = go.Bar(
    x=top20abv['beer_abv'] / 100,
    y=top20abv['beer_name'],
    hoverinfo='x',
    text=top20abv['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Top 20 Strongest Beers by ABV',
    xaxis=dict(title="ABV", tickformat="%", hoverformat='.2%'),
    margin=dict(l=220),  # wide left margin leaves room for the beer names
)

fig = go.Figure(data=[strongest_bar], layout=layout)
py.offline.iplot(fig)

In [None]:
# Weakest beers: same descending ABV ordering and per-name de-duplication as
# for the strongest beers, but keeping the trailing 20 rows.
# NOTE(review): sort_values places NaN last by default, so rows with a missing
# ABV would land in this tail — presumably the cleaned file has none; confirm.
abv_columns = ['beer_name', 'brewery_name', 'beer_abv', 'beer_style']
btm20abv = (
    df[abv_columns]
    .sort_values('beer_abv', ascending=False)
    .drop_duplicates('beer_name')
    .tail(20)
)

# Horizontal bars labelled with the brewery name; ABV is divided by 100 so the
# "%" axis formats show percentages.
weakest_bar = go.Bar(
    x=btm20abv['beer_abv'] / 100,
    y=btm20abv['beer_name'],
    hoverinfo='x',
    text=btm20abv['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Top 20 Weakest Beers by ABV',
    xaxis=dict(title="ABV", tickformat="%", hoverformat='.2%'),
    margin=dict(l=220),  # wide left margin leaves room for the beer names
)

fig = go.Figure(data=[weakest_bar], layout=layout)
py.offline.iplot(fig)

The strongest beer reviewed is the Schorsbrau Schorsbock at a whopping 57% alcohol by volume, higher than most hard liquors. The weakest is Liber at 0.01%.

## Best and Worst Reviews

While before we viewed the most frequently reviewed beers, this section will examine both the best and worst reviews by beer, style, and brewery. First we will create a "review average" column, taking the average of all the review variables for each beer. 

In [None]:
# Per-review average of the five score columns.
# Computed with column-wise arithmetic instead of a row-wise
# df.apply(..., axis=1): the result is the same sum / 5 in the same order
# (missing values still propagate), but pandas evaluates it on whole columns
# rather than calling a Python lambda once per review.
df['review_average'] = (
    df['review_overall']
    + df['review_aroma']
    + df['review_appearance']
    + df['review_palate']
    + df['review_taste']
) / 5

In [None]:
# Placeholder column whose only job is to be counted per beer below.
df['total_reviews'] = 0

# One row per beer id: descriptive fields take the first value seen, the
# placeholder is counted to give the number of reviews, and the scores are
# averaged.
# NOTE(review): review_overall uses the median while every other score uses
# the mean — confirm this is intentional.
beer_aggregations = dict(
    beer_name='first',
    brewery_name='first',
    beer_style='first',
    total_reviews='count',
    review_appearance='mean',
    review_overall='median',
    review_taste='mean',
    review_aroma='mean',
    review_average='mean',
    review_palate='mean',
)
beers_grouped = (
    df.groupby(['beer_beerid'])
    .agg(beer_aggregations)
    .reset_index()
)

In [None]:
# Preview the first rows of the per-beer aggregate.
beers_grouped.head()

In [None]:
# Summary statistics of the per-beer aggregate (includes total_reviews).
beers_grouped.describe()

After grouping the beers so we can see how many times each has been reviewed, we can see that the average number of reviews for each beer is around 30, and the standard deviation is quite high in this category. Everything up to the 50% quantile is below 3 reviews per beer! For a more accurate representation of the highest reviewed beers, we will limit our data to only beers that have been reviewed at least 100 times.

In [None]:
# Keep only the beers that have at least 100 reviews.
has_enough_reviews = beers_grouped['total_reviews'] >= 100
top_reviews = beers_grouped.loc[has_enough_reviews]

In [None]:
# Preview the beers that passed the 100-review threshold.
top_reviews.head()

In [None]:
# Fifteen highest review averages among the well-reviewed beers.
ranked_by_average = top_reviews.sort_values(by='review_average', ascending=False)
top_beers = ranked_by_average.head(15)

In [None]:
# Fifteen lowest review averages among the well-reviewed beers (the ordering
# is still descending, so the very lowest comes last).
ranked_by_average = top_reviews.sort_values(by='review_average', ascending=False)
btm_beers = ranked_by_average.tail(15)

In [None]:
# Horizontal bar chart of the best-rated beers, each bar labelled with its
# brewery.
top_beer_bar = go.Bar(
    x=top_beers['review_average'],
    y=top_beers['beer_name'],
    hoverinfo='x',
    text=top_beers['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Top 15 Beers by Review Average',
    xaxis=dict(title="Review Average"),
    margin=dict(l=220),  # wide left margin leaves room for the beer names
)

fig = go.Figure(data=[top_beer_bar], layout=layout)
py.offline.iplot(fig)

In [None]:
# Horizontal bar chart of the worst-rated beers, each bar labelled with its
# brewery.
btm_beer_bar = go.Bar(
    x=btm_beers['review_average'],
    y=btm_beers['beer_name'],
    hoverinfo='x',
    text=btm_beers['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Bottom 15 Beers by Review Average',
    xaxis=dict(title="Review Average"),
    margin=dict(l=220),  # wide left margin leaves room for the beer names
)

fig = go.Figure(data=[btm_beer_bar], layout=layout)
py.offline.iplot(fig)

The thing that stands out to me most from these plots is that the bottom 15 is almost exclusively populated with American macro-brewed beers, particularly from Anheuser-Busch. The users here clearly prefer craft beer! Also of note is the fact that Russian River Brewing has not only the top-rated beer, but two other appearances in the top 15.

In [None]:
# Average the per-beer review_average within each brewery (each beer in
# top_reviews counts once), then rank breweries from best to worst.
brewery_aggregations = dict(brewery_name='first', review_average='mean')
top_breweries = (
    top_reviews
    .groupby('brewery_name')
    .agg(brewery_aggregations)
    .sort_values('review_average', ascending=False)
)
top_brew = top_breweries.head(15)
btm_brew = top_breweries.tail(15)

In [None]:
# Horizontal bar chart of the best-rated breweries; the bar text shows the
# average itself.
top_brewery_bar = go.Bar(
    x=top_brew['review_average'],
    y=top_brew['brewery_name'],
    hoverinfo='x',
    text=top_brew['review_average'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Top 15 Breweries by Review Average',
    xaxis=dict(title="Review Average"),
    margin=dict(l=220),  # wide left margin leaves room for the brewery names
)

fig = go.Figure(data=[top_brewery_bar], layout=layout)
py.offline.iplot(fig)

In [None]:
# Horizontal bar chart of the worst-rated breweries; the bar text shows the
# average itself.
btm_brewery_bar = go.Bar(
    x=btm_brew['review_average'],
    y=btm_brew['brewery_name'],
    hoverinfo='x',
    text=btm_brew['review_average'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Bottom 15 Breweries by Review Average',
    xaxis=dict(title="Review Average"),
    margin=dict(l=220),  # wide left margin leaves room for the brewery names
)

fig = go.Figure(data=[btm_brewery_bar], layout=layout)
py.offline.iplot(fig)

In [None]:
# Average the per-beer review_average within each beer style (each beer in
# top_reviews counts once), then rank styles from best to worst.
style_aggregations = dict(beer_style='first', review_average='mean')
top_styles = (
    top_reviews
    .groupby('beer_style')
    .agg(style_aggregations)
    .sort_values('review_average', ascending=False)
)
top_style = top_styles.head(15)
btm_style = top_styles.tail(15)

In [None]:
# Horizontal bar chart of the best-rated beer styles; the bar text shows the
# average itself.
top_style_bar = go.Bar(
    x=top_style['review_average'],
    y=top_style['beer_style'],
    hoverinfo='x',
    text=top_style['review_average'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker=dict(color='rgb(1, 87, 155)'),
)

layout = go.Layout(
    title='Top 15 Styles by Review Average',
    xaxis=dict(title="Review Average"),
    margin=dict(l=220),  # wide left margin leaves room for the style names
)

fig = go.Figure(data=[top_style_bar], layout=layout)
py.offline.iplot(fig)

In [None]:
# Horizontal bar chart of the worst-rated beer styles; the bar text shows the
# average itself.
x = btm_style['review_average']
y = btm_style['beer_style']

p = [go.Bar(x = x,
            y = y,
            hoverinfo = 'x',
            text=btm_style['review_average'],
            textposition = 'inside',
            orientation='h',
            opacity=0.7, 
            marker=dict(
                color='rgb(1, 87, 155)'
            ))]

# The title previously read "Bottom 15 Breweries", but this chart plots beer
# styles (the y axis is beer_style).
layout = go.Layout(title='Bottom 15 Styles by Review Average',
                   xaxis=dict(title="Review Average"),
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)

## User Data


In [None]:
# Ten profile names with the most reviews.
df.review_profilename.value_counts().head(10)

The most prolific reviewer in the dataset is northyorksammy with over 5,000 reviews. 

In [None]:
# Box plot of how many reviews each profile has written; boxmean='sd' also
# draws the mean and standard deviation.
reviews_per_user = df['review_profilename'].value_counts()

fig = go.Figure(data=[go.Box(y=reviews_per_user, boxmean='sd')])
fig.update_layout(title='Distribution of Reviews per User')
fig.show()

While there are some power users like northyorksammy in the dataset, the mean number of reviews per user is 45.4, and the median is 3. We clearly have a small percentage of prolific users pulling up the average, while the majority of users have far fewer reviews.

In [None]:
# Parse the review timestamps into datetimes so the .dt accessor can be used.
parsed_review_time = pd.to_datetime(df['review_time'])
df['review_time'] = parsed_review_time

In [None]:
# Count reviews per calendar day and plot the daily totals as a line.
review_dates = df['review_time'].dt.date
group_by_date = df[['review_time']].groupby(review_dates).agg(['count'])
group_by_date.iplot(kind='line', title='Reviews Over Time')