Dataset last accessed: 12/03/2020

In [21]:
import pandas as pd
import numpy as np

In [22]:
# Download the raw beer-review table straight from data.world
file_name = pd.read_csv(
    filepath_or_buffer='https://query.data.world/s/gib6aa6n3tmtuvrqpbdlhcbgdg7tk4'
)

# Work on an independent (deep) copy so the raw download stays untouched
reviews = file_name.copy(deep=True)

# Quick look at the first five rows to confirm the load worked
reviews.head(5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


Creating New Features

In [23]:
# Order rows from the highest to the lowest overall rating, then keep only the
# first row for each (reviewer, beer name) pair.
reviews = (
    reviews
    .sort_values(by='review_overall', ascending=False)
    .drop_duplicates(subset=['review_profilename', 'beer_name'], keep='first')
)

In [24]:
# Keep only reviews whose overall AND appearance ratings are both at least 1.
# The previous `|` kept a row as soon as one of the two ratings passed, so a
# review with a single sub-1 rating slipped through the filter.
# NOTE(review): assumes ratings below 1 are invalid entries — confirm.
has_valid_overall = reviews['review_overall'] >= 1
has_valid_appearance = reviews['review_appearance'] >= 1
reviews = reviews[has_valid_overall & has_valid_appearance]

In [26]:
# Convert 'object' to 'category' 
#reviews[reviews.select_dtypes(['object']).columns] = reviews.select_dtypes(['object']).\
                                                         #apply(lambda x: x.astype('category'))

In [27]:
# Discard every row that has a missing value in any column
reviews = reviews.dropna(axis=0, how='any')
# Summarise column dtypes, non-null counts and memory footprint
reviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1495555 entries, 793307 to 587292
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1495555 non-null  int64  
 1   brewery_name        1495555 non-null  object 
 2   review_time         1495555 non-null  int64  
 3   review_overall      1495555 non-null  float64
 4   review_aroma        1495555 non-null  float64
 5   review_appearance   1495555 non-null  float64
 6   review_profilename  1495555 non-null  object 
 7   beer_style          1495555 non-null  object 
 8   review_palate       1495555 non-null  float64
 9   review_taste        1495555 non-null  float64
 10  beer_name           1495555 non-null  object 
 11  beer_abv            1495555 non-null  float64
 12  beer_beerid         1495555 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 159.7+ MB


In [28]:
# Composite score: the four sub-ratings added together and multiplied by 5
rating_total = (
    reviews['review_appearance']
    + reviews['review_aroma']
    + reviews['review_palate']
    + reviews['review_taste']
)
reviews['Score'] = rating_total * 5

In [29]:
# Half-open ABV bands [low, high); np.select checks them in order and the
# first matching condition wins.
conditions = [
    (reviews['beer_abv'] >= 0.0) & (reviews['beer_abv'] < 0.5),
    (reviews['beer_abv'] >= 0.5) & (reviews['beer_abv'] < 3.2),
    (reviews['beer_abv'] >= 3.2) & (reviews['beer_abv'] < 6.0),
    (reviews['beer_abv'] >= 6.0)
    ]

# Label assigned to each band, in the same order as `conditions`
values = ['NA', 'Low', 'Reg', 'High']

# np.select falls back to `default` for rows matching no condition (e.g. a
# negative ABV). Its implicit default is the integer 0, which has no common
# dtype with the string labels, so an explicit string default is supplied.
reviews['beer_strength'] = np.select(conditions, values, default='Unknown')

# display updated DataFrame
reviews.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,Score,beer_strength
793307,579,Kona Brewing Co.,1293809208,5.0,2.5,3.0,vfgccp,American Pale Ale (APA),4.0,3.5,Fire Rock Pale Ale,5.8,5753,65.0,Reg
591801,1549,Goose Island Beer Co.,1180667386,5.0,5.0,4.5,bort11,American Double / Imperial Stout,5.0,4.5,Bourbon County Brand Stout,14.5,10672,95.0,High
591818,1549,Goose Island Beer Co.,1175185922,5.0,5.0,5.0,Beezor,American Double / Imperial Stout,4.5,5.0,Bourbon County Brand Stout,14.5,10672,97.5,High
591819,1549,Goose Island Beer Co.,1175135314,5.0,4.5,4.0,zeff80,American Double / Imperial Stout,4.5,5.0,Bourbon County Brand Stout,14.5,10672,90.0,High
591824,1549,Goose Island Beer Co.,1173843941,5.0,4.5,3.5,klewis,American Double / Imperial Stout,5.0,5.0,Bourbon County Brand Stout,14.5,10672,90.0,High


In [30]:
#reviews = reviews.drop(['brewery_id', 'review_time', 'review_overall', 'review_aroma', 'review_appearance', 'review_profilename', 'review_palate', 'review_taste', 'beer_beerid'], axis=1)

In [31]:
import matplotlib.pyplot as plt
%matplotlib inline 

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_set, test_set = train_test_split(
    reviews,
    test_size=0.2,
    random_state=42,
)

In [57]:
## split the training set into target and features

# The overall rating is the prediction target; keep it as an independent copy
reviews_labels = train_set["review_overall"].copy()
# Every other column of the training set is kept as model input
reviews = train_set.drop(columns=["review_overall"])

In [58]:
from sklearn.impute import SimpleImputer

In [59]:
# Imputer that replaces NaN entries with the mean of their column
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [60]:
# Keep only the numeric feature columns. Selecting by "dtype is not object"
# would also let through non-numeric columns stored as category, string or
# datetime dtypes, which the mean imputer and scaler cannot process.
reviews_num = reviews.select_dtypes(include='number')

In [61]:
# Learn the per-column means and fill the missing values in one pass
temp_num_data = imputer.fit_transform(reviews_num)

# The imputer returns a bare array, so restore both the column names and the
# original row index. Without the index the frame gets a fresh 0..n-1 index and
# no longer lines up with `reviews` / `reviews_labels`.
reviews_tr = pd.DataFrame(
    temp_num_data,
    columns=reviews_num.columns,
    index=reviews_num.index,
)

In [62]:
from sklearn.preprocessing import LabelBinarizer

# Binarizer producing a dense 0/1 indicator matrix, one column per class
encoder = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [63]:
# Learn the set of beer styles, then expand each row into indicator columns
beer_styles = reviews['beer_style']
beer_style_1hot = encoder.fit(beer_styles).transform(beer_styles)

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill gaps with the column mean, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy="mean")),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Fit the pipeline on the numeric training columns and transform them
reviews_num_tr = num_pipeline.fit_transform(reviews_num)


In [65]:
try:
    from sklearn.compose import ColumnTransformer
except ImportError:
    from future_encoders import ColumnTransformer # Scikit-Learn < 0.20

In [66]:
from sklearn.preprocessing import OneHotEncoder

In [67]:
# Column names routed to each branch of the preprocessing
num_attribs = list(reviews_num)
# NOTE(review): beer_name is one-hot encoded here while an earlier cell
# binarises beer_style — confirm which categorical feature is intended.
cat_attribs = ["beer_name"]

full_pipeline = ColumnTransformer([
        # numeric columns: mean imputation followed by standard scaling
        ("num", num_pipeline, num_attribs),
        # handle_unknown="ignore" encodes categories not seen during fit as
        # all zeros instead of raising, so the fitted pipeline can later
        # transform test_set rows containing unseen values.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
    ])

# Fit on the training features and produce the model-ready matrix
reviews_prepared = full_pipeline.fit_transform(reviews)

In [68]:
from sklearn.linear_model import LinearRegression

# Fit on the pipeline output (imputed + scaled numerics plus the encoded
# categorical column) so the linear model sees the same features as tree_reg.
# It was previously fitted on the raw numeric columns, bypassing the pipeline.
lin_reg = LinearRegression()
lin_reg.fit(reviews_prepared, reviews_labels)

LinearRegression()

In [70]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with a fixed seed so repeated runs build the same tree
tree_reg = DecisionTreeRegressor(random_state=42)
# Train on the fully preprocessed training matrix
tree_reg.fit(X=reviews_prepared, y=reviews_labels)

DecisionTreeRegressor(random_state=42)

In [71]:
# mean_squared_error was used without ever being imported, so this cell
# raised NameError on a fresh kernel.
from sklearn.metrics import mean_squared_error

# RMSE on the very rows the tree was fitted on: this is a training error,
# not an estimate of how well the model generalises.
reviews_predictions = tree_reg.predict(reviews_prepared)
tree_mse = mean_squared_error(reviews_labels, reviews_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.00045711325656622235

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the decision tree on the prepared training data
scores = cross_val_score(
    tree_reg,
    reviews_prepared,
    reviews_labels,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer returns negated MSE values, so flip the sign before the root
tree_rmse_scores = np.sqrt(-scores)



def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` methods (for example a NumPy
    array). Nothing is returned; the three lines are written to stdout.
    """
    summary = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Report the per-fold RMSE values with their mean and spread
display_scores(scores=tree_rmse_scores)