In [19]:
import os
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
file_path = 'https://www.dropbox.com/s/dzg4j2jolmpc8tb/beer_reviews.csv?dl=1'

df = pd.read_csv(file_path)

In [3]:
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586614 entries, 0 to 1586613
Data columns (total 13 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   brewery_id          1586614 non-null  int64  
 1   brewery_name        1586599 non-null  object 
 2   review_time         1586614 non-null  int64  
 3   review_overall      1586614 non-null  float64
 4   review_aroma        1586614 non-null  float64
 5   review_appearance   1586614 non-null  float64
 6   review_profilename  1586266 non-null  object 
 7   beer_style          1586614 non-null  object 
 8   review_palate       1586614 non-null  float64
 9   review_taste        1586614 non-null  float64
 10  beer_name           1586614 non-null  object 
 11  beer_abv            1518829 non-null  float64
 12  beer_beerid         1586614 non-null  int64  
dtypes: float64(6), int64(3), object(4)
memory usage: 157.4+ MB


In [8]:
print(f'unique beer styles: {len(df.beer_style.unique())}')
print(f'unique beers: {len(df.beer_beerid.unique())}')
print(f'unique breweries: {len(df.brewery_id.unique())}')
print(f'unique users: {len(df.review_profilename.unique())}')

unique beer styles: 104
unique beers: 66055
unique breweries: 5840
unique users: 33388


In [6]:
# Let us add a user_id column that maps to every user's review_profilename

df = df.assign(user_id=df['review_profilename'].astype('category').cat.codes)
df.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,user_id
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,30566
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,30566
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,30566
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,30566
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,23008


In [13]:
train_end = int(df.shape[0] * 0.8)
train = df[:train_end]
test = df[train_end:]


1586614:1269291:317323:1586614


In [15]:
def rmse(y_pred, y_true):
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [16]:
def evaluate(estimate_f):    
    ids_to_estimate = zip(test.user_id, test.beer_beerid)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = test.review_overall
    return rmse(estimated, real)

In [21]:
# Simple Content based filtering
def content_mean(user_id, movie_id):    
    user_condition = train.user_id == user_id
    return train.loc[user_condition, 'review_overall'].mean()

print(f'RMSE for estimate: {evaluate(content_mean)}')

RMSE for estimate: 0.7269122601676697
