In [1]:
# python2
import gzip
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Parsing and cleaning

Load in the data

In [2]:
with gzip.open('Beeradvocate.txt.gz', 'r') as f:
  rb_file = f.readlines()


data = []
row_out = []

for i in rb_file:
    row = i.decode('utf-8', errors = 'ignore')
    #print(row)
    if row == '\n':
      data.append(row_out)
      row_out = []
      continue
    row = row.split(":", 1)[1].rstrip()
    row_out.append(row)


In [3]:
data = pd.DataFrame(data)

data.columns = ['beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style', 
                'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 
                'review_overall', 'review_time', 'review_profileName', 'review_text']

# keep 3 columns: user name, beer name, overall score
data2 = data[['beer_name', 'review_profileName', 'review_overall','review_time']]

# remove NA
data2 = data2[pd.notnull(data2.beer_name)]
data2 = data2[pd.notnull(data2.review_profileName)]
data2 = data2[pd.notnull(data2.review_overall)]


In [4]:
# keep top 1000/33382 frequent users
user = data2.review_profileName.value_counts()
user_list = user.keys()[:1000].tolist()

# keep top 100/56855 most reviewed beer
beer = data2.beer_name.value_counts()[:100]
beer_list = beer.keys()[:100].tolist()

# keep (beer&user) pair in (user_list) and (beer_list)
subdata = data2[data2.beer_name.isin(beer_list)]
subdata = subdata[subdata.review_profileName.isin(user_list)]

# sort by user names
subdata = subdata.sort_values(by=['review_profileName','beer_name','review_time'])

print(subdata.shape)
subdata.head(10)

(68929, 4)


Unnamed: 0,beer_name,review_profileName,review_overall,review_time
1191634,120 Minute IPA,1fastz28,3.0,1198733748
1170777,60 Minute IPA,1fastz28,4.0,1176428202
1174390,90 Minute IPA,1fastz28,4.5,1173146934
1339659,Alpha King Pale Ale,1fastz28,4.5,1275961162
868119,Arrogant Bastard Ale,1fastz28,4.0,1214535830
1032119,Ayinger Celebrator Doppelbock,1fastz28,4.0,1240105718
931404,Bell's Hopslam Ale,1fastz28,4.5,1236131835
944912,Bell's Oberon Ale,1fastz28,4.5,1240099321
353066,Blue Moon Belgian White,1fastz28,3.0,1204429269
591597,Bourbon County Brand Stout,1fastz28,4.0,1207029127


Only keeping latest reviews by user in case they rated the same beer multipe times

In [5]:
data_clean = subdata.sort_values(by='review_time')

data_clean.drop_duplicates(['beer_name', 'review_profileName'], keep = 'last',
                          inplace = True)

In [6]:
data_clean.head(10)
#data_clean.shape

Unnamed: 0,beer_name,review_profileName,review_overall,review_time
282223,Samuel Adams Boston Lager,Jason,4,1000083887
518392,Samuel Smith's Oatmeal Stout,Jason,4,1000336190
518391,Samuel Smith's Oatmeal Stout,guinness33,5,1000464943
1339361,Alpha King Pale Ale,John,5,1000733852
1541504,Golden Monkey,John,5,1000809985
1334473,Dreadnaught IPA,John,5,1000890164
1544699,HopDevil Ale,John,5,1000904483
91268,Sierra Nevada Pale Ale,John,4,1000905152
518390,Samuel Smith's Oatmeal Stout,stoutman,5,1000931285
1548897,Storm King Stout,John,5,1001336489


**Splitting the data into training and testing**

In [7]:
data_clean['review_overall'] = data_clean['review_overall'].astype(float)

data_train, data_test = train_test_split(data_clean, test_size=.3, random_state=1)

Calculate the average rating for each item

In [8]:
mean_rating = data_train.groupby(['beer_name'], as_index = False, sort = False)['review_overall'].mean().rename(columns = {'review_overall': 'item_mean'})

data_train = pd.merge(data_train, mean_rating, on = 'beer_name', how = 'left')
data_train['adjusted_rating'] = data_train['review_overall'] - data_train['item_mean']

In [20]:
data_train.head()

Unnamed: 0,beer_name,review_profileName,review_overall,review_time,item_mean,adjusted_rating
0,Sierra Nevada Pale Ale,LilBeerDoctor,4.0,1228189535,4.261006,-0.261006
1,Anchor Steam Beer,JamesS,4.0,1199459768,4.038283,-0.038283
2,Ommegang (Abbey Ale),projectflam86,4.0,1316131551,4.057951,-0.057951
3,Stone Imperial Russian Stout,Naerhu,5.0,1057695990,4.244163,0.755837
4,#9,Golden2wenty1,4.0,1152928924,3.478933,0.521067


**Build the similarity matrix**

In [21]:
def item_matrix(train_sample):
  #find distinct beers
  beers = np.unique(train_sample['beer_name'])
  beer_pairs = []

  for beer1 in beers:
    #determine users who consumed the beer
    users = train_sample.loc[train_sample['beer_name'] == beer1] 
    for index, user in users.iterrows():
      #find rating for first beer
      user_name = user['review_profileName']  
      beer1_rating = user['adjusted_rating']
      #find other beer ratings by same users, excluding beer1 rating
      user_ratings = train_sample.loc[(train_sample['review_profileName'] == user_name) & (train_sample['beer_name'] != beer1)]
      for index, beer2 in user_ratings.iterrows():
        beer2_name = beer2['beer_name']
        beer2_rating = beer2['adjusted_rating']
        #print([user_name, beer1, beer2_name, beer1_rating, beer2_rating])
        beer_pairs.append([user_name, beer1, beer2_name, beer1_rating, beer2_rating])
  
  beer_pairs = pd.DataFrame(beer_pairs, columns = ['user', 'beer1', 'beer2', 'scaled_r1', 'scaled_r2'])
  
  return beer_pairs.groupby(['beer1', 'beer2'])[['scaled_r1','scaled_r2']].corr(method = 'pearson')
      

In [22]:
beer_pairs = item_matrix(data_train)

In [23]:
print (beer_pairs.shape)
beer_corr = beer_pairs.iloc[0::2][['scaled_r2']].reset_index()[['beer1', 'beer2', 'scaled_r2']]
beer_corr = beer_corr.rename(columns = {'scaled_r2': 'Pearson_corr'})

beer_corr.to_csv('beer_corr.csv', index = False, encoding = 'utf-8')

(19800, 2)


**Load the similarity matrix**

In [9]:
beer_corr = pd.read_csv('beer_corr.csv', encoding = 'utf-8')
beer_corr = beer_corr.rename(columns = {'scaled_r2': 'Pearson_corr'})
beer_corr.head()

Unnamed: 0,beer1,beer2,Pearson_corr
0,#9,120 Minute IPA,0.00398
1,#9,60 Minute IPA,0.074866
2,#9,90 Minute IPA,0.052636
3,#9,Alpha King Pale Ale,-0.01134
4,#9,Anchor Steam Beer,0.02919


In [62]:
def predict(user, beer, n_neighbors):
    #find the beers that have been rated by the user 
    user_ratings = data_train.loc[(data_train['review_profileName'] == user) & (data_train['beer_name'] != beer)]
    user_ratings = user_ratings.rename(columns = {'beer_name': 'beer2'})
    #find list of potential neighbors
    similarities = beer_corr.loc[beer_corr['beer1'] == beer]
    #narrow down to neighbors that were rated by users
    neighbors = user_ratings.merge(similarities, how = 'inner', on='beer2')
    #select n nearest neighbors
    neighbors = neighbors.sort_values(by = ['Pearson_corr'], ascending = False)[0:n_neighbors]
    #calculate numerator
    num = sum(neighbors['Pearson_corr']*neighbors['adjusted_rating'])
    den = sum(abs(neighbors['Pearson_corr']))
    #find item mean
    #try:
    beer_mean = data_train.loc[data_train['beer_name'] == beer][0:1]['item_mean'].values[0]
    #except KeyError:
    #    return None
    return beer_mean + num/den

In [63]:
t_u = data_train['review_profileName'][0]
t_b = data_train['beer_name'][0]
predict(t_u, t_b, 5)

4.298725123527138

# Testing 

Define the evaluation metrics

In [19]:
def RMSE(y, y_hat):
    return (y - y_hat)**2

In [64]:
se = 0
count = 0
for index, review in data_train.iterrows():
    predicted_rating = predict(review['review_profileName'], review['beer_name'], 5)
    #print (predicted_rating)
    se += RMSE(review['review_overall'], predicted_rating)
    count += 1
    #else:
    #    print (review['beer_name'], review['review_profileName'])

rmse = (se/count)**0.5
        
print(rmse)

0.5438915311781987
