In [1]:
import pandas as pd

from surprise.model_selection import GridSearchCV
from surprise import Dataset, Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import cross_validate, train_test_split

In [53]:
# Reviews table: the reviews plus the uid of each beer and reviewer.
# 'Unnamed: 0' is discarded right after loading
# (presumably a saved index column from an earlier export - confirm).
review_df = pd.read_csv('final_df').drop(columns='Unnamed: 0')

# Lookup table: name and uid for each beer.
beer_df = pd.read_csv('final_beers').drop(columns='Unnamed: 0')

# Lookup table: user id and uid for each reviewer.
reviewer_df = pd.read_csv('reviewers').drop(columns='Unnamed: 0')

## Without gridsearch
Fit a single SVD model, without a grid search, to get a baseline for predicting the users' overall rating.

In [54]:
# Surprise dataset built from the overall rating given by each user.
reader = Reader(rating_scale=(1, 5))
overall_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'overall_rating']]
data = Dataset.load_from_df(overall_df, reader)

# Hold out 20% of the ratings; the fixed random_state keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [62]:
# Baseline SVD with fixed hyperparameters (no grid search).
# random_state pins SVD's random initialisation so the baseline metrics are
# repeatable from run to run; 40 matches the seed used for the train/test split.
final = SVD(n_epochs=50, n_factors=1, biased=True,
            lr_all=0.005, reg_all=0.06, random_state=40)
final.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x160cfd72448>

In [63]:
# Score the held-out ratings. Each accuracy call prints its metric, and the
# MAE value (last expression) is also shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions, verbose=True)
accuracy.mae(predictions, verbose=True)

RMSE: 0.4078
MAE:  0.2825


0.2824553516248325

## With gridsearch

Using the overall rating given by users.

In [11]:
# Hyperparameter grid searched for the overall-rating model.
overall_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [12]:
# 3-fold cross-validated grid search over overall_param_grid, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    overall_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [13]:
# Best hyperparameter combination found by the overall-rating search, keyed by measure.
params = gridmod.best_params

In [14]:
# Display the best hyperparameters for each measure.
params

{'rmse': {'n_factors': 3,
  'n_epochs': 40,
  'lr_all': 0.01,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 5,
  'n_epochs': 50,
  'lr_all': 0.01,
  'reg_all': 0.1,
  'biased': True}}

In [92]:
# RMSE-best hyperparameters from the overall-rating grid search.
overall_params = dict(
    n_factors=3,
    n_epochs=40,
    lr_all=0.01,
    reg_all=0.1,
    biased=True,
)

Using the 'look' rating given by users. 

In [19]:
# Hyperparameter grid searched for the 'look' rating model.
look_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [20]:
# Dataset for the 'look' rating; reuses the reader created for the overall rating.
look_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'look']]
data = Dataset.load_from_df(look_df, reader)

# 3-fold cross-validated grid search, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    look_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [23]:
# Best hyperparameters from the 'look' grid search, keyed by measure; shown as the cell output.
params = gridmod.best_params
params

{'rmse': {'n_factors': 1,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 1,
  'n_epochs': 30,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True}}

In [93]:
# RMSE-best hyperparameters from the 'look' grid search.
look_params = dict(
    n_factors=1,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.1,
    biased=True,
)

Using the 'smell' rating given by users

In [24]:
# Hyperparameter grid searched for the 'smell' rating model.
smell_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [25]:
# Dataset for the 'smell' rating; reuses the reader created for the overall rating.
smell_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'smell']]
data = Dataset.load_from_df(smell_df, reader)

# 3-fold cross-validated grid search, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    smell_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [28]:
# Best hyperparameters from the 'smell' grid search, keyed by measure; shown as the cell output.
params = gridmod.best_params
params

{'rmse': {'n_factors': 1,
  'n_epochs': 30,
  'lr_all': 0.01,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 5,
  'n_epochs': 30,
  'lr_all': 0.01,
  'reg_all': 0.1,
  'biased': True}}

In [94]:
# RMSE-best hyperparameters from the 'smell' grid search.
smell_params = dict(
    n_factors=1,
    n_epochs=30,
    lr_all=0.01,
    reg_all=0.1,
    biased=True,
)

Using the 'taste' rating given by users

In [29]:
# Hyperparameter grid searched for the 'taste' rating model.
taste_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [30]:
# Dataset for the 'taste' rating; reuses the reader created for the overall rating.
taste_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'taste']]
data = Dataset.load_from_df(taste_df, reader)

# 3-fold cross-validated grid search, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    taste_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [33]:
# Best hyperparameters from the 'taste' grid search, keyed by measure; shown as the cell output.
params = gridmod.best_params
params

{'rmse': {'n_factors': 1,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 5,
  'n_epochs': 30,
  'lr_all': 0.01,
  'reg_all': 0.1,
  'biased': True}}

In [95]:
# RMSE-best hyperparameters from the 'taste' grid search.
taste_params = dict(
    n_factors=1,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.1,
    biased=True,
)

Using the 'feel' rating given by users

In [34]:
# Hyperparameter grid searched for the 'feel' rating model.
feel_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [35]:
# Dataset for the 'feel' rating; reuses the reader created for the overall rating.
feel_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'feel']]
data = Dataset.load_from_df(feel_df, reader)

# 3-fold cross-validated grid search, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    feel_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [38]:
# Best hyperparameters from the 'feel' grid search, keyed by measure; shown as the cell output.
params = gridmod.best_params
params

{'rmse': {'n_factors': 1,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 1,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True}}

In [96]:
# RMSE-best hyperparameters from the 'feel' grid search.
feel_params = dict(
    n_factors=1,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.1,
    biased=True,
)

Using the 'overall' rating (the average of the physical-characteristic ratings) given by users

In [39]:
# Hyperparameter grid searched for the averaged 'overall' rating model.
all_param_grid = dict(
    n_factors=[1, 3, 5, 7],
    n_epochs=[20, 30, 40, 50],
    lr_all=[0.005, 0.01, 0.05, 0.1],
    reg_all=[0.02, 0.1],
    biased=[True],
)

In [40]:
# Dataset for the 'overall' column; reuses the reader created for the overall rating.
all_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'overall']]
data = Dataset.load_from_df(all_df, reader)

# 3-fold cross-validated grid search, scored on RMSE and MAE.
gridmod = GridSearchCV(
    SVD,
    all_param_grid,
    measures=['rmse', 'mae'],
    cv=3,
)
gridmod.fit(data)

In [43]:
# Best hyperparameters from the 'overall' (averaged) grid search, keyed by measure; shown as the cell output.
params = gridmod.best_params
params

{'rmse': {'n_factors': 1,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True},
 'mae': {'n_factors': 5,
  'n_epochs': 50,
  'lr_all': 0.005,
  'reg_all': 0.1,
  'biased': True}}

In [97]:
# RMSE-best hyperparameters from the 'overall' (averaged) grid search.
all_params = dict(
    n_factors=1,
    n_epochs=50,
    lr_all=0.005,
    reg_all=0.1,
    biased=True,
)

## Comparing the models against each other

In [114]:
# Rebuild the overall-rating dataset and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
overall_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'overall_rating']]
data = Dataset.load_from_df(overall_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [115]:
# Refit on the overall-rating split using the RMSE-best hyperparameters already
# stored in overall_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**overall_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4078
MAE:  0.2823


0.2822869832724041

In [None]:
#look

In [121]:
# Build the 'look' rating dataset and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
look_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'look']]
data = Dataset.load_from_df(look_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [124]:
# Refit on the 'look' split using the RMSE-best hyperparameters already stored
# in look_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**look_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4356
MAE:  0.3136


0.31363324991775154

In [101]:
#smell

In [102]:
# Build the 'smell' rating dataset and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
smell_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'smell']]
data = Dataset.load_from_df(smell_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [126]:
# Refit on the 'smell' split using the RMSE-best hyperparameters already stored
# in smell_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**smell_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4360
MAE:  0.3141


0.3140960429988747

In [None]:
#taste

In [104]:
# Build the 'taste' rating dataset and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
taste_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'taste']]
data = Dataset.load_from_df(taste_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [128]:
# Refit on the 'taste' split using the RMSE-best hyperparameters already stored
# in taste_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**taste_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4357
MAE:  0.3137


0.31373349490010416

In [None]:
#feel

In [None]:
# Build the 'feel' rating dataset and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
feel_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'feel']]
data = Dataset.load_from_df(feel_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [130]:
# Refit on the 'feel' split using the RMSE-best hyperparameters already stored
# in feel_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**feel_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4357
MAE:  0.3137


0.31368953308249886

In [None]:
#average

In [132]:
# Build the dataset for the 'overall' column and its seeded 80/20 train/test split.
reader = Reader(rating_scale=(1, 5))
all_df = review_df.loc[:, ['reviewer_id', 'beer_id', 'overall']]
data = Dataset.load_from_df(all_df, reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=40)

In [133]:
# Refit on the 'overall' split using the RMSE-best hyperparameters already
# stored in all_params, rather than re-typing the same values by hand.
# random_state pins SVD's random initialisation so the reported metrics are
# repeatable; 40 matches the seed used for the train/test split.
final = SVD(**all_params, random_state=40)
final.fit(trainset)

# Evaluate on the held-out split: each call prints its metric, and the MAE
# value (last expression) is shown as the cell output.
predictions = final.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 0.4664
MAE:  0.3253


0.3252896318396259

## Conclusions

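Based on the data provided (which is something I would like to improve upon in the future), the users' overall rating gave the lowest RMSE (0.4078). The individual look, smell, taste and feel ratings all came in at about 0.436, and the average of the physical-characteristic ratings gave the highest RMSE (0.4664), so, since a lower RMSE is better, the averaged rating did not outperform the other user ratings.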
Future work would include scraping more reviews and being more selective about which reviews are used: there were too many beers with only a single review and too many reviewers with only a few reviews.