In [1]:
# read .csv into python
import pandas as pd
import numpy as np
import os
# Load the raw review dump: '^'-separated fields, no header row, parsed with
# the Python engine. Rows that cannot be parsed are skipped without a warning.
csv_path = 'D:/Columbia/Fall 2018/PTA/Project/beeradvocate1.csv'
read_options = dict(sep='^', engine='python', header=None)
try:
    # pandas >= 1.3 spells "skip malformed rows silently" as on_bad_lines='skip';
    # the older error_bad_lines / warn_bad_lines keywords were removed in 2.0.
    data = pd.read_csv(csv_path, on_bad_lines='skip', **read_options)
except TypeError:
    # Older pandas rejects the unknown keyword before touching the file,
    # so fall back to the legacy spelling with the same meaning.
    data = pd.read_csv(csv_path, error_bad_lines=False, warn_bad_lines=False,
                       **read_options)

In [2]:
# The file was read without a header row, so name its 13 fields by position.
data.columns = [
    'beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style',
    'review_appearance', 'review_aroma', 'review_palate', 'review_taste',
    'review_overall', 'review_time', 'review_profileName', 'review_text',
]

m = 33382  # number of users (figure quoted by the original author)
n = 56855  # number of items (figure quoted by the original author)

# Keep four columns (beer name, user name, overall score, review time) and
# drop every row that is missing the beer name, the user name or the score.
# review_time is kept but is not part of the NA filter.
kept_columns = ['beer_name', 'review_profileName', 'review_overall', 'review_time']
required_columns = ['beer_name', 'review_profileName', 'review_overall']
data2 = data[kept_columns].dropna(subset=required_columns)
print(data2.shape)

(1585696, 4)


In [3]:
# Rank users by number of reviews and keep the first m of them.
# NOTE(review): an earlier comment here said "top 1000", but the cut-off is m
# (33382, described where it is defined as the number of users), so this
# presumably keeps every user -- confirm which was intended.
user = data2['review_profileName'].value_counts()
user_list = user.index[:m].tolist()

# Keep the 200 most-reviewed beers. The extra [:n] cap cannot shrink the list
# any further while n is at least 200.
# NOTE(review): an earlier comment here said "top 100"; the code has always
# used 200 -- confirm which was intended.
beer = data2['beer_name'].value_counts().iloc[:200]
beer_list = beer.index[:n].tolist()

# Keep only reviews whose beer AND user are both in the retained lists,
# then order them by user name, beer name and review time.
in_beer_list = data2['beer_name'].isin(beer_list)
in_user_list = data2['review_profileName'].isin(user_list)
subdata = data2[in_beer_list & in_user_list].sort_values(
    by=['review_profileName', 'beer_name', 'review_time'])

print(subdata.shape)


(308218, 4)


In [4]:
# Order reviews by review_time, then keep a single row per (beer, user) pair.
# keep='last' retains the row that sorts last, i.e. the one with the largest
# review_time for that pair. The score is cast to float afterwards.
data_clean = (
    subdata
    .sort_values(by='review_time')
    .drop_duplicates(subset=['beer_name', 'review_profileName'], keep='last')
    .assign(review_overall=lambda frame: frame['review_overall'].astype(float))
)

# 70/30 split with a fixed seed: sample 70% of the rows for training and use
# the remaining index labels for testing.
# NOTE(review): the cells visible here cross-validate on data_clean and never
# read data_train / data_test -- confirm whether they are used further down.
data_train = data_clean.sample(frac=0.7, random_state=1)
data_test = data_clean.drop(index=data_train.index)



In [5]:
# Recommender algorithms (NMF, SVD), data-loading helpers (Reader, Dataset)
# and the cross-validated grid search from the Surprise library.
# NOTE(review): GridSearch is not used by any cell shown here (GridSearchCV is
# used instead). It appears to have been deprecated in favour of GridSearchCV
# and removed in later Surprise releases, where this import line would raise
# ImportError -- confirm against the installed version before relying on it.
from surprise import NMF, GridSearch, Reader, Dataset, SVD
from surprise.model_selection import GridSearchCV

In [6]:
# Build the Surprise dataset from the de-duplicated reviews, passing the
# columns in the order user, item, rating.
# NOTE(review): rating_scale is declared as 1-5; verify that review_overall
# never falls outside that range in this data.
# NOTE: `data` held the raw DataFrame until now and is rebound here to the
# Surprise dataset.
rating_columns = ['review_profileName', 'beer_name', 'review_overall']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_clean[rating_columns], reader)

In [7]:
# NMF: 5-fold cross-validation over a single-point parameter grid, reporting
# RMSE and MAE.
# Seed NumPy's global RNG first so the fold shuffle and the factor
# initialisation are the same on every run of this cell. Without it the
# reported errors change from run to run. The seed matches the random_state
# used for the train/test sample earlier in the notebook.
np.random.seed(1)
param_grid_pu = {'n_factors': [25], 'n_epochs': [50], 'reg_pu': [0.1], 'reg_qi': [0.1]}
grid_search_pu = GridSearchCV(NMF, param_grid_pu, measures=['RMSE', 'MAE'], cv=5)
grid_search_pu.fit(data)

# One row per parameter combination (a single row here), shown as the cell output.
results_pu = pd.DataFrame.from_dict(grid_search_pu.cv_results)
results_pu



Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,split3_test_rmse,split4_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_n_epochs,param_reg_pu,param_reg_qi
0,0.609763,0.613279,0.61129,0.609375,0.61008,0.610757,0.001414,1,0.450857,0.452819,...,1,22.484699,0.330696,0.521607,0.027191,"{'n_factors': 25, 'n_epochs': 50, 'reg_pu': 0....",25,50,0.1,0.1


In [8]:
# Print the cross-validated mean RMSE and mean MAE of the NMF run.
for metric_column in ("mean_test_rmse", "mean_test_mae"):
    print(results_pu[metric_column])

0    0.610757
Name: mean_test_rmse, dtype: float64
0    0.451954
Name: mean_test_mae, dtype: float64


In [None]:
# SVD: 5-fold cross-validation over a single-point parameter grid, reporting
# RMSE and MAE.
# Seed NumPy's global RNG first so the fold shuffle and the factor
# initialisation are the same on every run of this cell. Without it the
# reported errors change from run to run. The seed matches the random_state
# used for the train/test sample earlier in the notebook.
np.random.seed(1)
param_grid_pu2 = {'n_factors': [25], 'n_epochs': [50], 'lr_all': [0.01], 'reg_all': [0.05]}
grid_search_pu2 = GridSearchCV(SVD, param_grid_pu2, measures=['RMSE', 'MAE'], cv=5)
grid_search_pu2.fit(data)

# One row per parameter combination (a single row here), shown as the cell output.
results_pu2 = pd.DataFrame.from_dict(grid_search_pu2.cv_results)
results_pu2

In [None]:
# Print the cross-validated mean RMSE and mean MAE of the SVD run.
for metric_column in ("mean_test_rmse", "mean_test_mae"):
    print(results_pu2[metric_column])