In [0]:
# based on Google Colab 
# python 3
!pip install scikit-surprise



In [0]:
import pandas as pd
import numpy as np
import os
import itertools as it
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from matplotlib.ticker import FormatStrFormatter
from surprise.model_selection import train_test_split
from surprise import NMF, Reader, Dataset, SVD, NMF, accuracy, KNNWithMeans

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


#### 1. Load Data

In [0]:
# load metadata from Google Drive 
# save as a list
data = []
one_complete_review = []

with open('/content/gdrive/My Drive/beeradvocate.txt', 'r') as f:
  for i in f: 
    #row = i.decode(encoding = 'UTF-8',errors = 'ignore')
    row = i.replace('beer/', 'beer_').replace('review/','review_') 
    if row == '\n':
      data.append(one_complete_review)
      one_complete_review = []
      continue
    row = row.split(": ", 1)[1].rstrip()
    one_complete_review.append(row)

In [0]:
# convert list to dataframe
column_names = ['beer_name', 'beer_beerId', 'beer_brewer', 'beer_ABV', 'beer_style', 
                'review_appearance', 'review_aroma', 'review_palate', 'review_taste', 
                'review_overall', 'review_time', 'review_profileName', 'review_text']

df = pd.DataFrame.from_records(data, columns=column_names)

#### 2. Subset three columns

In [0]:
# subset data for collabrotive filtering
df1 = df[['beer_name', 'beer_beerId', 'review_profileName', 'review_overall', 'review_time']]
print('Original data size: %s' % str(df1.shape)) 

# remove NA 
df1 = df1[pd.notnull(df1.beer_name) & pd.notnull(df1.review_profileName) & pd.notnull(df1.review_overall)]
# remove blanks
df1 = df1.loc[df1.review_profileName != '']
df1 = df1.loc[df1.beer_name != '']
df1 = df1.loc[df1.review_overall != '']
print('After removing NAs and blanks: %s' % str(df1.shape)) 

# drop duplicate (beer&user) pairs, keep the latest rating
df1['beer_user_pair'] = df1.beer_name + df1.review_profileName
df1 = df1.sort_values(by=['review_time'], ascending=False).drop_duplicates(subset=['beer_user_pair'])
print('After drop duplicate user-item pairs (only keep the latest rating), data size: %s' % str(df1.shape))

# convert review ratings to numberic
df1.review_overall = pd.to_numeric(df1.review_overall)

Original data size: (1586614, 5)
After removing NAs and blanks: (1586266, 5)
After drop duplicate user-item pairs (only keep the latest rating), data size: (1561405, 6)


In [0]:
# subset three columns
cf = df1[['review_profileName', 'beer_name', 'review_overall']]

# sort user by # of reviews
num_review_byuser = df.review_profileName.value_counts()
cnt_user = dict(num_review_byuser)
cf['user_freq'] = [cnt_user.get(x) for x in cf.review_profileName]

# sort beer by # of reviews
num_review_bybeer = df.beer_name.value_counts()
cnt_beer = dict(num_review_bybeer)
cf['beer_freq'] = [cnt_beer.get(x) for x in cf.beer_name]

# drop users with less than 10 reviews
# drop beers with less than 5 reviews
cf = cf.loc[cf.user_freq > 10]
cf = cf.loc[cf.beer_freq > 5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [0]:
print('After removing bottom users and beers,')
print('%d unique beers;' % cf.beer_name.nunique())
print('%d unique users;' % cf.review_profileName.nunique())
print('and %d reviews in total.' % cf.shape[0])

After removing bottom users and beers,
18925 unique beers;
10189 unique users;
and 1423856 reviews in total.


#### 3. Define functions

In [0]:
# define functions to generate prediction dataframe
# get_Iu and get_Ui are borrowed from Surprise library
def get_Iu(uid):
    """Return the number of items rated by given user
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """Return the number of users that have rated given item
    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0

# customized function to get predictions
def get_pred_df(pred):
  pred_df = pd.DataFrame(pred, columns=['uid', 'iid', 'rui', 'est', 'details'])    
  pred_df['Iu'] = pred_df.uid.apply(get_Iu)
  pred_df['Ui'] = pred_df.iid.apply(get_Ui)
  pred_df['err'] = abs(pred_df.est - pred_df.rui)
  
  # append review counts
  pred_df['user_freq'] = [cnt_user.get(x) for x in pred_df.uid]
  pred_df = pred_df.sort_values(by=['uid'])
  
  return pred_df

#### 4. Train Test Split

In [0]:
# train test split
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(cf[['review_profileName', 'beer_name', 'review_overall']], reader)
trainset, testset = train_test_split(data, test_size=.33)

#### 5. KNN on whole dataset (after dropped bottom users and beers)
Comments: although this algorithm run through, it takes up ~11GB RAM as noticed by Colab. This shows that KNN methods do not scale well for large dataset.

In [0]:
# model set up
sim_options = {'name': 'pearson', 'user_based': True, 'min_support':10}  
knnmean = KNNWithMeans(k=300, 
                       min_k=10, 
                       sim_options=sim_options, 
                       verbose=True)
# fit
knnmean.fit(trainset)

# test
knn_pred = knnmean.test(testset)

# predict
knn_pred_df = get_pred_df(knn_pred)
knn_pred_df.head()

Computing the pearson similarity matrix...
Done computing similarity matrix.


Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,user_freq
2731,0110x011,Good Mojo,5.0,4.220588,"{'actual_k': 8, 'was_impossible': False}",102,9,0.779412,139
238518,0110x011,Schlafly Oatmeal Stout,4.0,4.025864,"{'actual_k': 47, 'was_impossible': False}",102,126,0.025864,139
74494,0110x011,10 Commandments,3.5,4.161951,"{'actual_k': 120, 'was_impossible': False}",102,194,0.661951,139
447595,0110x011,White Zin,4.5,4.349181,"{'actual_k': 10, 'was_impossible': False}",102,16,0.150819,139
52543,0110x011,The Dissident,4.0,4.517313,"{'actual_k': 133, 'was_impossible': False}",102,226,0.517313,139


In [0]:
# predict whole dataset
trainset_s, testset_s = train_test_split(data, test_size=.8)

knn_pred_s = knnmean.test(testset_s)
knn_pred_df_s = get_pred_df(knn_pred_s)
knn_pred_df_s.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,user_freq
190004,0110x011,Bourbon County Brand Stout,4.0,4.349813,"{'actual_k': 300, 'was_impossible': False}",102,1062,0.349813,139
65736,0110x011,Trappist Westvleteren 12,4.5,4.947105,"{'actual_k': 300, 'was_impossible': False}",102,763,0.447105,139
8986,0110x011,Ten FIDY,4.0,4.464221,"{'actual_k': 300, 'was_impossible': False}",102,1087,0.464221,139
68971,0110x011,AleSmith Speedway Stout - Barrel Aged,3.5,4.514964,"{'actual_k': 87, 'was_impossible': False}",102,136,1.014964,139
387797,0110x011,Weihenstephaner Hefeweissbier,5.0,4.969172,"{'actual_k': 300, 'was_impossible': False}",102,1169,0.030828,139


#### 6. KNN on two separate sets

Comments: beforing dropping the bottom users and beers, the dataset was too large to run on Colab and led to runtime died everytime, so we need to split the original dataset.  According to exploratory analysis above, 100 reviews is a benchmark that separate users' review_group. Thus, we split the dataset into two parts: users with more than 100 reviews and users less than 100 reviews.

Below, the result shows that this separation lead to a higher accuracy compared to the previous user-based *KNN* that used the whole dataset for training and testing. 

Note that parameters used in this section follows from previous gridsearch results. We do not include gridsearch step in this notebook, because it also takes long running time and large memory, thus not necessary/efficient to run each time we compile this notebook.

##### Split into two groups 

in case kernel shut down on the whole dataset, we will use this two subsets to train KNN
- users with more than 100 reviews;
- users with less than 100 reviews.

In [0]:
# split into two groups
threshold = 100

more = cf.loc[cf.user_freq > threshold]
less = cf.loc[cf.user_freq <= threshold]

more.head()

Unnamed: 0,review_profileName,beer_name,review_overall,user_freq,beer_freq
1023623,Mark,Fiji Bitter,4.0,532,8
1077899,bcm119,Wolaver's India Pale Ale,3.5,175,257
797362,Jason,Lindemans Framboise,3.5,2350,903
808675,Jason,Chapeau Framboise Lambic,4.0,2350,44
961766,Jason,Dooryard Ale,3.0,2350,13


In [0]:
# set up model
sim_options = {'name': 'pearson', 'user_based': True, 'min_support':10}  
knnmean1 = KNNWithMeans(k=300, 
                       min_k=10, 
                       sim_options=sim_options, 
                       verbose=True)

knnmean2 = KNNWithMeans(k=300, 
                       min_k=10, 
                       sim_options=sim_options, 
                       verbose=True)

##### 1) users with more than 100 reviews

In [0]:
# split train & test 
knn_more = Dataset.load_from_df(more[['review_profileName', 'beer_name', 'review_overall']], reader)
trainset_more, testset_more = train_test_split(knn_more, test_size=.33)

# fit
knnmean1.fit(trainset_more)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f86b035cf28>

In [0]:
# test
pred_more = knnmean1.test(testset_more)

In [0]:
# prediction
pred_more_df = get_pred_df(pred_more)
pred_more_df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,user_freq
65209,0110x011,Storm King Stout,4.0,4.535019,"{'actual_k': 300, 'was_impossible': False}",102,1474,0.535019,139
261046,0110x011,Union Jack India Pale Ale,4.5,4.65256,"{'actual_k': 215, 'was_impossible': False}",102,439,0.15256,139
16960,0110x011,Temptation,5.0,4.822557,"{'actual_k': 268, 'was_impossible': False}",102,571,0.177443,139
385401,0110x011,Bottleworks 10th Anniversary Wild Ale,5.0,4.619681,"{'actual_k': 18, 'was_impossible': False}",102,25,0.380319,139
278817,0110x011,Two Hearted Ale,4.0,4.825561,"{'actual_k': 300, 'was_impossible': False}",102,1657,0.825561,139


##### 2) users with less than 100 reviews

In [0]:
# split train & test 
knn_less = Dataset.load_from_df(less[['review_profileName', 'beer_name', 'review_overall']], reader)
trainset_less, testset_less = train_test_split(knn_less, test_size=.33)

# fit
knnmean2.fit(trainset_less)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f86b035c208>

In [0]:
# test
pred_less = knnmean2.test(testset_less)

# predict
pred_less_df = get_pred_df(pred_less)
pred_less_df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,user_freq
30708,1121987,La Fin Du Monde,4.5,4.470588,"{'actual_k': 0, 'was_impossible': False}",20,1495,0.029412,29
76523,1121987,Boréale Blanche,5.0,4.470588,"{'actual_k': 0, 'was_impossible': False}",20,12,0.529412,29
55021,1121987,Faxe Fest Bock,4.0,4.470588,"{'actual_k': 0, 'was_impossible': False}",20,18,0.470588,29
12542,1121987,Revenante,4.5,3.864883,"{'was_impossible': True, 'reason': 'User and/o...",20,6,0.635117,29
21361,1121987,Hefeweizen,3.0,4.470588,"{'actual_k': 0, 'was_impossible': False}",20,421,1.470588,29


#### 7. KNN item-based
- failed twice: runtime died
- user-based collabrative filtering takes up 12GB+ RAM, reached the limits of Colab and led to kernel crach every time.

#### 8. Get Predictions and Errors

In [0]:
# define function to get group prediction error
def error_bygroup(pred_df, model_name):
  """
  input complete pred_error df
  output pred_error by group
  """
  # cut into groups
  bins = [10, 20, 30, 40, 50, 100, 200, 500, 1000, 2000, 6000]
  labels = ['11-20', '21-30', '31-40', '41-50', '51-100', '101-200', 
            '201-500', '501-1000', '1001-2000', '2001-6000']
  
  # add group label
  pred_df['review_group'] = pd.cut(pred_df['user_freq'], bins = bins, labels = labels)
  
  # calculate group mean
  pred_df = pred_df.groupby('review_group').mean()
  
  # rename the error column
  #pred_df['err'].columns = model_name
  pred_df = pred_df.rename(columns = {'err':model_name})
  return pred_df

In [0]:
# get prediction error datarfame
knn_err_separate = error_bygroup(knn_pred_df, 'KNN_separate')
knn_err = error_bygroup(knn_pred_df_s, 'KNN')

In [0]:
# KNN prediction (if it runs through)
knn_pred_df_s

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err,user_freq,review_group
190004,0110x011,Bourbon County Brand Stout,4.0,4.349813,"{'actual_k': 300, 'was_impossible': False}",102,1062,0.349813,139,101-200
65736,0110x011,Trappist Westvleteren 12,4.5,4.947105,"{'actual_k': 300, 'was_impossible': False}",102,763,0.447105,139,101-200
8986,0110x011,Ten FIDY,4.0,4.464221,"{'actual_k': 300, 'was_impossible': False}",102,1087,0.464221,139,101-200
68971,0110x011,AleSmith Speedway Stout - Barrel Aged,3.5,4.514964,"{'actual_k': 87, 'was_impossible': False}",102,136,1.014964,139,101-200
387797,0110x011,Weihenstephaner Hefeweissbier,5.0,4.969172,"{'actual_k': 300, 'was_impossible': False}",102,1169,0.030828,139,101-200
591054,0110x011,Union Jack India Pale Ale,4.5,4.599340,"{'actual_k': 239, 'was_impossible': False}",102,439,0.099340,139,101-200
906276,0110x011,Founders Breakfast Stout,4.5,4.700928,"{'actual_k': 300, 'was_impossible': False}",102,1507,0.200928,139,101-200
667330,0110x011,Phunky Duck,4.5,4.220588,"{'actual_k': 6, 'was_impossible': False}",102,10,0.279412,139,101-200
881415,0110x011,The Angel's Share - Bourbon Barrel-Aged,5.0,4.532440,"{'actual_k': 186, 'was_impossible': False}",102,325,0.467560,139,101-200
726404,0110x011,Alpha Klaus Christmas (Xmas) Porter,5.0,4.638627,"{'actual_k': 230, 'was_impossible': False}",102,434,0.361373,139,101-200


#### Save the model

In [0]:
import pickle
pickle.dump(knn_pred_df_s,open("/content/gdrive/My Drive/knn_rec.sav", 'wb'))

In [0]:
# prediction here in this variable
# knn_pred_df_s