In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
import numpy as np


from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff
import plotly.express as px

import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)


## Beer Reviews Visualization

This dataset contains 1.5 million beer reviews sourced from BeerAdvocate. We will examine the distribution of the reviews, as well as the most popular (and least popular) beer styles and breweries.

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*'
# and 3.8 removed the old names. Prefer the renamed sheet when this Matplotlib
# ships it, and fall back to the legacy name on older releases.
# Order matters: 'poster' is applied after 'darkgrid', as before.
for sheet in ('darkgrid', 'poster'):
    renamed = f'seaborn-v0_8-{sheet}'
    style.use(renamed if renamed in style.available else f'seaborn-{sheet}')

In [None]:
# Read the cleaned review table from disk, then preview its first rows.
df = pd.read_csv(filepath_or_buffer='beer_reviews_clean.csv')
df.head(n=5)

In [None]:
# Print the number of distinct breweries, beer styles and beer names on one line.
summary_parts = [
    ('Number of unique breweries:', 'brewery_name'),
    ('Number of unique beer styles:', 'beer_style'),
    ('Number of beers reviewed:', 'beer_name'),
]
print(*(item
        for label, column in summary_parts
        for item in (label, df[column].nunique())))

In [None]:
# One histogram trace per review dimension, drawn from every row of `df`.
score_traces = [
    ('review_appearance', 'Review Appearance'),
    ('review_aroma', 'Review Aroma'),
    ('review_overall', 'Review Overall'),
    ('review_palate', 'Review Palate'),
    ('review_taste', 'Review Taste'),
]

fig = go.Figure()
for column, label in score_traces:
    fig.add_trace(go.Histogram(x=df[column], name=label))

# Slightly transparent bars, shared axis titles and a gap between bars.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

It appears that the users in this dataset are most likely to give a rating of 4 across all categories. There seem to be more good reviews than bad. 

In [None]:
# ABV values are divided by 100 so the '%' tick format renders them as percentages.
abv_fraction = df['beer_abv'].sort_values(ascending=False) / 100
abv_trace = go.Histogram(x=abv_fraction, nbinsx=100, histfunc="count", name='count')

fig = go.Figure()
fig.add_trace(abv_trace)

# Fixed-size square figure with a percentage-formatted x axis.
abv_xaxis = {
    'title': "Percent Alcohol by Volume",
    'tickformat': "%",
    'hoverformat': '.2%',
}
fig.update_layout(
    autosize=False,
    width=800,
    height=800,
    title_text='Beer ABV',
    bargap=0.2,
    xaxis=abv_xaxis,
    yaxis_title='Count',
)
fig.show()

In [None]:

# Horizontal histogram of review counts per beer style; the style names are
# passed in descending alphabetical order.
styles_desc = df['beer_style'].sort_values(ascending=False)

fig = go.Figure()
fig.add_trace(go.Histogram(y=styles_desc))

style_count_layout = {
    'title_text': 'Beer Style Count',
    'autosize': False,
    'width': 1200,
    'height': 1200,
    'bargap': 0.1,
    'uniformtext_minsize': 8,
    'uniformtext_mode': 'hide',
}
fig.update_layout(**style_count_layout)

fig.show()

In [None]:

# Horizontal histogram of review counts per brewery; the brewery names are
# passed in descending alphabetical order.
breweries_desc = df['brewery_name'].sort_values(ascending=False)
brewery_trace = go.Histogram(y=breweries_desc)

fig = go.Figure()
fig.add_trace(brewery_trace)

fig.update_layout(title_text='Brewery Count',
                  autosize=False,
                  width=1200, height=1200,
                  bargap=0.1)

fig.show()

In [None]:
# Scatter of overall review score (x) against ABV (y), coloured by beer style.
scatter_inputs = {
    'x': df['review_overall'],
    'y': df['beer_abv'],
    'color': df['beer_style'],
}
fig = px.scatter(**scatter_inputs)
fig.update_layout(title={'text': 'Beer Ratings by ABV'})
fig.show()

In [None]:
# Mean of the beer_abv column, taken over review rows.
mean_abv = np.mean(df['beer_abv'])
print(mean_abv)

In [None]:
# Rank rows by ABV (highest first), keep the first row seen for each beer
# name, and take the 20 strongest.
abv_columns = ['beer_name', 'brewery_name', 'beer_abv', 'beer_style']
top20abv = (
    df[abv_columns]
    .sort_values('beer_abv', ascending=False)
    .drop_duplicates('beer_name')
    .head(20)
)

# Horizontal bars: ABV as a fraction on x, beer name on y, brewery as bar text.
strongest_bars = go.Bar(
    x=top20abv['beer_abv'] / 100,
    y=top20abv['beer_name'],
    hoverinfo='x',
    text=top20abv['brewery_name'],
    textposition='inside',
    orientation='h',
    opacity=0.7,
    marker={'color': 'rgb(1, 87, 155)'},
)

# Wide left margin leaves room for the beer names.
strongest_layout = go.Layout(
    title='Top 20 Strongest Beers by ABV',
    xaxis={'title': "ABV", 'tickformat': "%", 'hoverformat': '.2%'},
    margin={'l': 220},
)

fig = go.Figure(data=[strongest_bars], layout=strongest_layout)

py.offline.iplot(fig)


In [None]:
# Select the 20 weakest beers by ABV.
# sort_values() places NaN last regardless of sort direction, so rows with a
# missing ABV would otherwise fill the tail and be reported as the "weakest"
# beers. Drop them before ranking.
btm20abv = (
    df[['beer_name', 'brewery_name', 'beer_abv', 'beer_style']]
    .dropna(subset=['beer_abv'])
    .sort_values('beer_abv', ascending=False)
    .drop_duplicates('beer_name')
    .tail(20)
)

# Horizontal bars: ABV as a fraction on x (formatted as a percentage by the
# layout), beer name on y, brewery name as in-bar text.
p = [go.Bar(x = btm20abv['beer_abv'] / 100,
            y = btm20abv['beer_name'],
            hoverinfo = 'x',
            text=btm20abv['brewery_name'],
            textposition = 'inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'
            ))]

# Wide left margin leaves room for the beer names.
layout = go.Layout(title='Top 20 Weakest Beers by ABV',
                   xaxis=dict(title="ABV",
                              tickformat = "%",
                              hoverformat = '.2%'),
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)

In [None]:
# The x axis is labelled "Number of 5 star reviews", so count the reviews whose
# overall score is exactly 5 for each beer. Previously the chart plotted 50
# individual review rows, which all carried the same score and repeated beer
# names, so it did not rank beers at all.
# Caveat: raw counts favour widely reviewed beers.
five_star_reviews = df.loc[df['review_overall'] == 5, ['beer_name', 'brewery_name']]
top50beers = (
    five_star_reviews
    .groupby(['beer_name', 'brewery_name'])
    .size()
    .reset_index(name='review_count')
    .nlargest(50, 'review_count')
    # Plotly places the first row at the bottom of a horizontal bar chart, so
    # sort ascending to put the beer with the most 5-star reviews on top.
    .sort_values('review_count')
)

# Horizontal bars: 5-star review count on x, beer name on y, brewery as bar text.
p = [go.Bar(x = top50beers['review_count'],
            y = top50beers['beer_name'],
            hoverinfo = 'x',
            text=top50beers['brewery_name'],
            textposition = 'inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'))]

# Fixed-size figure; wide left margin leaves room for the beer names.
layout = go.Layout(title='Highest Rated Beers',
                   xaxis_title_text = 'Number of 5 star reviews',
                   autosize=False,
                   width = 1000,
                   height = 1000,
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)


In [None]:
# The x axis is labelled "Number of 1 star reviews", so count the reviews whose
# overall score is exactly 1 for each beer. Previously the chart plotted the
# last 50 rows of a descending sort: individual reviews (or rows with a missing
# score, which sort_values() always places last), not a per-beer ranking.
# Caveat: raw counts favour widely reviewed beers.
one_star_reviews = df.loc[df['review_overall'] == 1, ['beer_name', 'brewery_name']]
btm50beers = (
    one_star_reviews
    .groupby(['beer_name', 'brewery_name'])
    .size()
    .reset_index(name='review_count')
    .nlargest(50, 'review_count')
    # Plotly places the first row at the bottom of a horizontal bar chart, so
    # sort ascending to put the beer with the most 1-star reviews on top.
    .sort_values('review_count')
)

# Horizontal bars: 1-star review count on x, beer name on y, brewery as bar text.
p = [go.Bar(x = btm50beers['review_count'],
            y = btm50beers['beer_name'],
            hoverinfo = 'x',
            text=btm50beers['brewery_name'],
            textposition = 'inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'))]

# Fixed-size figure; wide left margin leaves room for the beer names.
layout = go.Layout(title='Lowest Rated Beers',
                   xaxis_title_text = 'Number of 1 star reviews',
                   autosize=False,
                   width = 1000,
                   height = 1000,
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)

In [None]:
# The x axis is labelled "Number of 5 star Reviews", so count the reviews whose
# overall score is exactly 5 for each beer style. Previously the chart plotted
# 20 individual review rows, which all carried the same score and could repeat
# a style, so it did not rank styles at all.
# Caveat: raw counts favour widely reviewed styles.
top20styles = (
    df.loc[df['review_overall'] == 5, 'beer_style']
    .value_counts()
    .head(20)
    # Plotly places the first row at the bottom of a horizontal bar chart, so
    # sort ascending to put the style with the most 5-star reviews on top.
    .sort_values()
    # Name the index explicitly so the resulting column is 'beer_style'
    # regardless of how this pandas version names value_counts() output.
    .rename_axis('beer_style')
    .reset_index(name='review_count')
)

# Horizontal bars: 5-star review count on x, beer style on y.
p = [go.Bar(x = top20styles['review_count'],
            y = top20styles['beer_style'],
            hoverinfo = 'x',
            textposition = 'inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'))]

# Fixed-size figure; wide left margin leaves room for the style names.
layout = go.Layout(title='Highest Rated Beer styles',
                   xaxis_title_text = 'Number of 5 star Reviews',
                   autosize=False,
                   width = 1000,
                   height = 1000,
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)


In [None]:
# Rank beer styles by how many reviews gave them an overall score of exactly 1.
# Previously the chart plotted the last 20 rows of a descending sort:
# individual reviews (or rows with a missing score, which sort_values() always
# places last), and the x axis was mislabelled as counting 5-star reviews.
# Caveat: raw counts favour widely reviewed styles.
btm20styles = (
    df.loc[df['review_overall'] == 1, 'beer_style']
    .value_counts()
    .head(20)
    # Plotly places the first row at the bottom of a horizontal bar chart, so
    # sort ascending to put the style with the most 1-star reviews on top.
    .sort_values()
    # Name the index explicitly so the resulting column is 'beer_style'
    # regardless of how this pandas version names value_counts() output.
    .rename_axis('beer_style')
    .reset_index(name='review_count')
)

# Horizontal bars: 1-star review count on x, beer style on y.
p = [go.Bar(x = btm20styles['review_count'],
            y = btm20styles['beer_style'],
            hoverinfo = 'x',
            textposition = 'inside',
            orientation='h',
            opacity=0.7,
            marker=dict(
                color='rgb(1, 87, 155)'))]

# Fixed-size figure; wide left margin leaves room for the style names.
layout = go.Layout(title='Lowest Rated Beer styles',
                   xaxis_title_text = 'Number of 1 star Reviews',
                   autosize=False,
                   width = 1000,
                   height = 1000,
                   margin = dict(l = 220))

fig = go.Figure(data=p, layout=layout)

py.offline.iplot(fig)

In [None]:
# Split the reviews on overall score: 3 and above versus 2.5 and below.
overall_score = df['review_overall']
dfhigh = df.loc[overall_score >= 3]
dflow = df.loc[overall_score <= 2.5]

## High vs Low review distributions

In [None]:
# One histogram trace per review dimension, restricted to the rows in `dfhigh`.
high_columns = ['review_appearance', 'review_aroma', 'review_overall',
                'review_palate', 'review_taste']
high_labels = ['Review Appearance', 'Review Aroma', 'Review Overall',
               'Review Palate', 'Review Taste']

fig = go.Figure()
for column, label in zip(high_columns, high_labels):
    fig.add_trace(go.Histogram(x=dfhigh[column], name=label))

# Slightly transparent bars with a gap between them.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Highly Rated',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

In [None]:
# One histogram trace per review dimension, restricted to the rows in `dflow`.
low_score_traces = {
    'review_appearance': 'Review Appearance',
    'review_aroma': 'Review Aroma',
    'review_overall': 'Review Overall',
    'review_palate': 'Review Palate',
    'review_taste': 'Review Taste',
}

fig = go.Figure()
for column, label in low_score_traces.items():
    fig.add_trace(go.Histogram(x=dflow[column], name=label))

# Slightly transparent bars with a gap between them.
fig.update_traces(opacity=0.75)
fig.update_layout(
    title_text='Distribution of Reviews - Low Ratings',
    xaxis_title_text='Review Score',
    yaxis_title_text='Count',
    bargap=0.2,
)
fig.show()

Unsurprisingly, beers with ratings under 3 scored lower on all categories. The biggest difference appears to be in the review_taste category, and the smallest is review_appearance. This suggests that taste is ultimately the most important feature in a beer. 