In [None]:
import pandas as pd
import numpy as np

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import plotly.express as px

import cufflinks as cf
# Put cufflinks and plotly into offline notebook mode before any chart is drawn.
# NOTE(review): connected=True presumably loads plotly.js from a CDN rather than
# embedding it, so rendering would need network access — confirm.
cf.go_offline(connected=True)
init_notebook_mode(connected=True)

In [None]:
# Select the 'polar' theme in the cufflinks configuration.
# NOTE(review): presumably this styles the .iplot() charts drawn later — confirm.
cf.set_config_file(theme='polar')

## Beer Reviews Visualization

Beer Reviews Visualization

This dataset contains 1.5 million beer reviews sourced from BeerAdvocate. We will examine the distribution of the reviews, as well as the most popular (and least popular) beer styles and breweries.

In [None]:
# Read the cleaned review table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='beer_reviews_clean.csv')
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
# Report how many distinct breweries, styles and beer names appear in the reviews.
for label, column in (
    ('Number of unique breweries:', 'brewery_name'),
    ('Number of unique beer styles:', 'beer_style'),
    ('Number of beers reviewed:', 'beer_name'),
):
    print(label, df[column].nunique())

## Review Scores

In [None]:
# One histogram trace per review category, built from every review in df.
# Mapping order is the order in which traces are added to the figure.
score_columns = {
    'Review Appearance': 'review_appearance',
    'Review Aroma': 'review_aroma',
    'Review Overall': 'review_overall',
    'Review Palate': 'review_palate',
    "Review Taste": 'review_taste',
}

fig = go.Figure()
for trace_name, column in score_columns.items():
    fig.add_trace(go.Histogram(x=df[column], name=trace_name))

# Lower the opacity of every trace, then label the figure and its axes.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

It appears that the users in this dataset are most likely to give a rating of 4 across all categories. There seem to be more good reviews than bad.

In [None]:
# Split the reviews on the overall score: 3 and above versus 2.5 and below.
overall_score = df['review_overall']
dfhigh = df.loc[overall_score.ge(3)]
dflow = df.loc[overall_score.le(2.5)]

In [None]:
# One histogram trace per review category, restricted to the dfhigh subset.
# Mapping order is the order in which traces are added to the figure.
score_columns = {
    'Review Appearance': 'review_appearance',
    'Review Aroma': 'review_aroma',
    'Review Overall': 'review_overall',
    'Review Palate': 'review_palate',
    "Review Taste": 'review_taste',
}

fig = go.Figure()
for trace_name, column in score_columns.items():
    fig.add_trace(go.Histogram(x=dfhigh[column], name=trace_name))

# Lower the opacity of all five traces, then label the figure and its axes.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Highly Rated',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

In [None]:
# One histogram trace per review category, restricted to the dflow subset.
# Mapping order is the order in which traces are added to the figure.
score_columns = {
    'Review Appearance': 'review_appearance',
    'Review Aroma': 'review_aroma',
    'Review Overall': 'review_overall',
    'Review Palate': 'review_palate',
    "Review Taste": 'review_taste',
}

fig = go.Figure()
for trace_name, column in score_columns.items():
    fig.add_trace(go.Histogram(x=dflow[column], name=trace_name))

# Lower the opacity of all five traces, then label the figure and its axes.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Low Ratings',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

Unsurprisingly, beers with ratings under 3 scored lower on all categories. The biggest difference appears to be in the review_taste category, and the smallest is review_appearance. This suggests that taste is ultimately the most important feature in a beer.

## Alcohol By Volume

In [None]:
# Histogram of alcohol by volume across all reviews.
# beer_abv is divided by 100 so the '%' tick and hover formats render it as a
# percentage (assumes the column holds percentage points — TODO confirm).
# The column is passed as-is: histogram binning does not depend on row order,
# so no sort is needed before plotting.
abv_fraction = df['beer_abv'] / 100

fig = go.Figure()
fig.add_trace(go.Histogram(x=abv_fraction, nbinsx=100, histfunc="count", name='count'))

fig.update_layout(
    title_text='Beer ABV',
    bargap=0.2,
    xaxis=dict(title="Percent Alcohol by Volume",
               tickformat="%",
               hoverformat='.2%'),
    yaxis_title='Count')
fig.show()

The most represented alcohol by volume percentage is 5 - 5.99%. Most of the data falls between 4 and 10% ABV, with some outliers at the extreme ends of the spectrum. 

## Beer Style Counts

In [None]:
# Horizontal bar chart of the 20 styles with the most reviews.
style_counts = df['beer_style'].value_counts()
style_counts.head(20).iplot(kind='barh', title='Top 20 Beer Styles')

In [None]:
# Horizontal bar chart of the 20 styles with the fewest reviews.
style_counts = df['beer_style'].value_counts()
style_counts.tail(20).iplot(kind='barh', title='Bottom 20 Beer Styles')

The most commonly reviewed beer is clearly the American IPA, with over 100,000 reviews. Interestingly, over half of the beers in the top 20 are American styles. While we don't have data on user location, I would guess that most of the users in this dataset are American. The least reviewed beer styles are Kvass and Happoshu, both of which are beer-like beverages but arguably not actual beer. The next least reviewed is Roggenbier, a medieval-style rye beer.

## Brewery Counts

In [None]:
# Horizontal bar chart of the 20 breweries with the most reviews.
brewery_counts = df['brewery_name'].value_counts()
brewery_counts.head(20).iplot(kind='barh', title='Top 20 Breweries')

In [None]:
# Horizontal bar chart of the 20 breweries with the fewest reviews.
brewery_counts = df['brewery_name'].value_counts()
brewery_counts.tail(20).iplot(kind='barh', title='Bottom 20 Breweries')

Boston Beer Company (Samuel Adams) takes the prize for highest number of reviews with over 38,000 reviews. The Bottom 20 are all tied at one review each. 

## Rating By ABV

In [None]:
# Scatter of overall rating (x) against ABV (y), coloured by beer style.
# ABV is divided by 100 so the '%' tick format on the y axis shows a percentage.
fig = px.scatter(
    x=df['review_overall'],
    y=df['beer_abv'] / 100,
    color=df['beer_style'],
)
fig.update_layout(
    title_text='Beer Ratings by ABV',
    xaxis=dict(title_text='Rating'),
    yaxis=dict(title_text='ABV', tickformat='%'),
)
fig.show()

There does not seem to be a correlation between overall rating and ABV.

## Strongest and Weakest Beers

In [None]:
# The 20 strongest beers: sort by ABV descending, keep the first (strongest)
# row for each beer name, then take the first 20 rows.
abv_cols = ['beer_name', 'brewery_name', 'beer_abv', 'beer_style']
abv_sorted = df[abv_cols].sort_values('beer_abv', ascending=False)
top20abv = abv_sorted.drop_duplicates('beer_name').head(20)

# Horizontal bars: ABV as a fraction on x, beer name on y, brewery name
# written inside each bar; hovering shows only the x value.
strongest_bar = go.Bar(
    x=top20abv['beer_abv'] / 100,
    y=top20abv['beer_name'],
    hoverinfo='x',
    text=top20abv['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker={'color': 'rgb(1, 87, 155)'},
)

# Wide left margin leaves room for the beer-name labels.
layout = go.Layout(
    title='Top 20 Strongest Beers by ABV',
    xaxis={'title': "ABV", 'tickformat': "%", 'hoverformat': '.2%'},
    margin={'l': 220},
)

fig = go.Figure(data=[strongest_bar], layout=layout)

py.offline.iplot(fig)

In [None]:
# The 20 weakest beers by ABV, one row per distinct beer name.
# Sorting ascending before drop_duplicates means the row kept for each name is
# its weakest entry, so a name that also has a stronger entry is still ranked
# by its lowest ABV. Rows with no ABV are dropped first because they cannot be
# ranked and would otherwise sort to the end of the frame.
abv_cols = ['beer_name', 'brewery_name', 'beer_abv', 'beer_style']
btm20abv = (
    df[abv_cols]
    .dropna(subset=['beer_abv'])
    .sort_values('beer_abv', ascending=True)
    .drop_duplicates('beer_name')
    .head(20)
    .iloc[::-1]  # back to strongest-to-weakest row order, weakest row last
)

# Horizontal bars: ABV as a fraction on x, beer name on y, brewery name
# written inside each bar; hovering shows only the x value.
p = [go.Bar(x=btm20abv['beer_abv'] / 100,
            y=btm20abv['beer_name'],
            hoverinfo='x',
            text=btm20abv['brewery_name'],
            textposition='inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'
            ))]

# Wide left margin leaves room for the beer-name labels.
layout = go.Layout(title='Top 20 Weakest Beers by ABV',
                   xaxis=dict(title="ABV",
                              tickformat="%",
                              hoverformat='.2%'),
                   margin=dict(l=220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)

The strongest beer reviewed is the Schorsbrau Schorsbock at a whopping 57% alcohol by volume, higher than most hard liquors. The lowest is Liber at 0.01%.

## Top/Bottom 50 reviews

In this section we will examine both the highest and lowest scoring 50 reviews for beer name, style and brewery.

In [None]:
# Take the first 50 rows after sorting reviews by overall score, highest first,
# then plot how often each beer name occurs among them.
# NOTE(review): the sort key is review_overall alone, so tied rows come out in
# unspecified order; if more than 50 reviews share the top score this keeps an
# arbitrary 50 of them rather than every top-scoring review — confirm intent.
rating_cols = ['beer_name', 'brewery_name', 'review_overall']
reviews_by_score = df[rating_cols].sort_values('review_overall', ascending=False)
top20beers = reviews_by_score.head(50)

top20beers['beer_name'].iplot(
    kind='hist',
    title='Top Rated Beers',
    yTitle='Number of 5 Star Reviews',
)

In [None]:
# Take the last 50 rows after sorting reviews by overall score, highest first,
# then plot how often each beer name occurs among them.
# NOTE(review): the sort key is review_overall alone, so tied rows come out in
# unspecified order; if more than 50 reviews share the lowest score this keeps
# an arbitrary 50 of them rather than every lowest-scoring review — confirm intent.
rating_cols = ['beer_name', 'brewery_name', 'review_overall']
reviews_by_score = df[rating_cols].sort_values('review_overall', ascending=False)
btm20beers = reviews_by_score.tail(50)

btm20beers['beer_name'].iplot(
    kind='hist',
    title='Worst Rated Beers',
    yTitle='Number of 1 Star Reviews',
)

The most highly reviewed beer is the Big Eye IPA, with 6 five star ratings. The lowest is the Bud Light Chelada, with 19 one star ratings. 

In [None]:
# Take the first 50 rows after sorting reviews by overall score, highest first,
# then plot how often each beer style occurs among them.
# NOTE(review): the sort key is review_overall alone, so tied rows come out in
# unspecified order; if more than 50 reviews share the top score this keeps an
# arbitrary 50 of them rather than every top-scoring review — confirm intent.
style_cols = ['beer_name', 'brewery_name', 'review_overall', 'beer_style']
reviews_by_score = df[style_cols].sort_values('review_overall', ascending=False)
top20styles = reviews_by_score.head(50)

top20styles['beer_style'].iplot(
    kind='hist',
    title='Top Rated Styles',
    yTitle='Number of 5 Star Reviews',
)

In [None]:
# Take the last 50 rows after sorting reviews by overall score, highest first,
# then plot how often each beer style occurs among them.
# NOTE(review): the sort key is review_overall alone, so tied rows come out in
# unspecified order; if more than 50 reviews share the lowest score this keeps
# an arbitrary 50 of them rather than every lowest-scoring review — confirm intent.
style_cols = ['beer_name', 'brewery_name', 'review_overall', 'beer_style']
reviews_by_score = df[style_cols].sort_values('review_overall', ascending=False)
btm20styles = reviews_by_score.tail(50)

btm20styles['beer_style'].iplot(
    kind='hist',
    title='Worst Rated Styles',
    yTitle='Number of 1 Star Reviews',
)

American IPA is unsurprisingly the style with the most five star reviews, and also the most frequently reviewed style. Fruit/Vegetable beer seems to be the least popular style, with 19 one star reviews. The very unpopular Bud Light Chelada from above is a Fruit/Vegetable beer. The mean rating for the style is 3.4, so this result may be unfairly dragged down by the unpopular Bud Light Chelada.

In [None]:
# Mean and median overall score for reviews whose style is 'Fruit / Vegetable Beer'.
is_fruit_veg = df['beer_style'] == 'Fruit / Vegetable Beer'
vegbeer = df.loc[is_fruit_veg]
fruit_veg_scores = vegbeer['review_overall']
print(np.mean(fruit_veg_scores), np.median(fruit_veg_scores))

In [None]:
# How often each brewery occurs among the rows of top20styles (built in an earlier cell).
top20styles['brewery_name'].iplot(
    kind='hist',
    title='Top Rated Breweries',
    yTitle='Number of 5 Star Reviews',
)

In [None]:
# How often each brewery occurs among the rows of btm20styles (built in an earlier cell).
btm20styles['brewery_name'].iplot(
    kind='hist',
    title='Worst Rated Breweries',
    yTitle='Number of 1 Star Reviews',
)

The Ballast Point Brewing Company and Southampton Publick House are tied for most five star reviews with 6 apiece. Anheuser-Busch is clearly the lowest rated brewery here with 32 one star reviews.