In [1]:
# read .csv into python
import pandas as pd
import numpy as np
import operator
import os
import gzip
import re

In [2]:
with gzip.open('../Beeradvocate.txt.gz', 'r') as f:
  rb_file = f.readlines()


maindata = []
row_out = []

for i in rb_file:
    row = i.decode('utf-8', errors = 'replace')
    #print(row)
    if row == '\n':
      maindata.append(row_out)
      row_out = []
      continue
    cat, field = row.split(":", 1)
    #remove leading white spaces
    field = field.rstrip()
    row_out.append(field)
maindata = pd.DataFrame(maindata)


In [3]:
maindata.columns = ['beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style', 
                'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 
                'review_overall', 'review_time', 'review_profileName', 'review_text']

# keep 3 columns: user name, beer name, overall score
data2 = maindata[['beer_name', 'review_profileName', 'review_overall', 'review_time']]

# n = 56855 # Number of items
# m = 33382

n = 56855 # Number of items
m = 33382
# remove NA
data2 = data2[pd.notnull(data2.beer_name)]
data2 = data2[pd.notnull(data2.review_profileName)]
data2 = data2[pd.notnull(data2.review_overall)]
print(data2.shape)

(1586614, 4)


In [4]:
# keep users with atleast 10 beers/reviews.
user = data2.review_profileName.value_counts()
# for i in range(1, 33382) :
#         if (user[i] == 9):
#             m = i 
#             break;
user_list = user.keys()[:m].tolist()

# keep top 100/56855 most reviewed beer
beer = data2.beer_name.value_counts()[:n]
beer_list = beer.keys()[:n].tolist()

# keep (beer&user) pair in (user_list) and (beer_list)
subdata = data2[data2.beer_name.isin(beer_list)]
subdata = subdata[subdata.review_profileName.isin(user_list)]

# sort by user names
subdata = subdata.sort_values(by=['review_profileName','beer_name','review_time'])

print(subdata.shape)


(1586606, 4)


In [5]:
#subdata.groupby(['beer_name', 'review_profileName'])['review_time'].max()

data_clean = subdata.sort_values(by='review_time')

data_clean.drop_duplicates(['beer_name', 'review_profileName'], keep = 'last',
                          inplace = True)
data_clean['review_overall'] = data_clean['review_overall'].astype(float)



In [6]:
from surprise import NMF, GridSearch, Reader, Dataset, SVD, NMF
from surprise.model_selection import GridSearchCV

In [7]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(data_clean[['review_profileName', 'beer_name', 'review_overall']], reader)

SVD

In [8]:
from collections import defaultdict
from surprise.model_selection import train_test_split
import time
trainset, testset = train_test_split(data, test_size=.25)




In [9]:
start_time = time.time()
algo = SVD(n_factors = 30, lr_all = 0.01, reg_all = 0.05)
algo.fit(trainset)
recommendation = algo.test(testset)
print("--- %s seconds ---" % (time.time() - start_time))

--- 61.29616618156433 seconds ---


In [21]:
svd_user_dict = {}
for recom in recommendation:
    if recom[0] not in svd_user_dict:
        svd_user_dict[recom[0]] = []
    svd_user_dict[recom[0]].append((recom[2],recom[3]))

In [22]:
start_time = time.time()
svd_rmse_dict = {}
for user in svd_user_dict:
    err = 0
    for t in svd_user_dict[user]:
        err = err + ((t[0] - t[1]) ** 2)
    rmse = err/ len(svd_user_dict[user])
    beers = len(data_clean[data_clean['review_profileName'] == user]) - len(svd_user_dict[user])
    if beers not in svd_rmse_dict:
         svd_rmse_dict[beers] = []
    svd_rmse_dict[beers].append(rmse)
print("--- %s seconds ---" % (time.time() - start_time))

--- 7242.851690530777 seconds ---


In [23]:
svd_rmse_final = {}
for beer_count in svd_rmse_dict:
    average = sum(svd_rmse_dict[beer_count]) / len(svd_rmse_dict[beer_count])
    svd_rmse_final[beer_count] =  average


In [36]:
svd_rmse_final

KeyError: 1001

In [26]:
import json
with open('final_svd_rmse.json', 'w') as outfile:
    json.dump(svd_rmse_final, outfile)

NMF

In [37]:

start_time = time.time()
algo = NMF(n_factors = 25, n_epochs = 50, reg_pu = 0.1, reg_qi = 0.1)
algo.fit(trainset)
nmf_recommendation = algo.test(testset)

print("--- %s seconds ---" % (time.time() - start_time))

--- 165.22505927085876 seconds ---


In [38]:
nmf_user_dict = {}
for recom in nmf_recommendation:
    if recom[0] not in nmf_user_dict:
        nmf_user_dict[recom[0]] = []
    nmf_user_dict[recom[0]].append((recom[2],recom[3]))

In [39]:

start_time = time.time()
nmf_rmse_dict = {}
for user in nmf_user_dict:
    err = 0
    for t in nmf_user_dict[user]:
        err = err + ((t[0] - t[1]) ** 2)
    rmse = err/ len(nmf_user_dict[user])
    beers = len(data_clean[data_clean['review_profileName'] == user]) - len(nmf_user_dict[user])
    if beers not in nmf_rmse_dict:
         nmf_rmse_dict[beers] = []
    nmf_rmse_dict[beers].append(rmse)
print("--- %s seconds ---" % (time.time() - start_time))

--- 5416.158062458038 seconds ---


In [40]:
nmf_rmse_final = {}
for beer_count in nmf_rmse_dict:
    average = sum(nmf_rmse_dict[beer_count]) / len(nmf_rmse_dict[beer_count])
    nmf_rmse_final[beer_count] =  average


In [41]:
nmf_rmse_final

{174: 0.3248910801455645,
 358: 0.2618650538693067,
 1887: 0.2662752002146341,
 23: 0.4509004910865521,
 269: 0.2486512006187544,
 425: 0.2948680513681398,
 900: 0.21510995169584637,
 40: 0.4187978040806068,
 181: 0.33625528129699267,
 11: 0.5363021572053084,
 111: 0.30959179205074105,
 448: 0.4095569176898576,
 1063: 0.5969040183779383,
 1698: 0.4629264273266352,
 814: 0.4221455023728704,
 129: 0.33944729096588633,
 12: 0.5313034512631359,
 110: 0.3501496380184691,
 408: 0.3055072619191628,
 183: 0.34936066347432876,
 204: 0.4031654896475592,
 0: 1.036583571279521,
 406: 0.327743845151064,
 213: 0.33058874303108854,
 17: 0.4564133341037532,
 121: 0.306730214905335,
 1083: 0.4089955587097941,
 2488: 0.3378388706074241,
 1637: 0.2225132219478381,
 167: 0.2937829001055258,
 522: 0.2399330234578879,
 285: 0.34220189573375914,
 581: 0.434833959443593,
 461: 0.47666731433760173,
 560: 0.4689040113270291,
 452: 0.31082192955646837,
 2419: 0.33313102797170924,
 55: 0.3445045110953559,
 61: 0.

In [42]:
import json
with open('final_nmf_rmse.json', 'w') as outfile:
    json.dump(nmf_rmse_final, outfile)