In [16]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

In [4]:
#collaborative imports
from surprise import Dataset
from surprise import Reader
from surprise import dump
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import KFold
from surprise.model_selection.split import train_test_split

In [5]:
df = pd.read_csv('../data/top_ratings.csv') #dataframe containing reviews for top beers
df = df.dropna()
df = df[~(df.score>5)]
df['uid'] = df.groupby('username').ngroup()
# user_df = df.pivot(index='uid',columns='beer_id',values='score')
# user_df

In [18]:

class CollabRecommender():
    '''
    SVD Collaborative-Based Beer Recommender
    '''
    def __init__(self, df):
        reader = Reader(rating_scale=(1,5))
        self.data = Dataset.load_from_df(df[['uid','beer_id','score']],reader)
        self.trainset, self.testset = train_test_split(self.data,test_size=.20)
        self.model = self.load_obj('model')
        try:
            self.predictions = self.load_obj('predictions')
        except:
            print('Unable to load predictions, please use .predict() method.')
        try:
            self.model = self.load_obj('model')
        except:
            print('Unable to load model, please use the .fit() method.')

    def fit(self, model=SVD, n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02):
        self.model = model(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all, reg_all=reg_all, verbose=True)
        self.model.fit(self.trainset)
        self.save_obj(self.model, 'model')
    
    def predict(self):
        
        print('Generating predictions...')
        self.predictions = self.model.test(self.testset)
    
    def get_top_n(self, n=10):
        '''
        Return the top-N recommendation for each user from a set of predictions.
        Input:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''

        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in self.predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n
    

    #pickle helper functions
    def save_obj(self,obj, name):
        with open('../obj/'+ name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_obj(self,name):
        with open('../obj/' + name + '.pkl', 'rb') as f:
            return pickle.load(f)     


In [19]:
colab_r = CollabRecommender(df)

In [20]:
predictions = colab_r.predict()

Generating predictions...


In [21]:
top_ratings = colab_r.get_top_n()

defaultdict(list,
            {65974: [(87246, 4.560385216468914),
              (110635, 4.494985475720616),
              (102123, 4.473140302861139),
              (64545, 4.4683744020841125),
              (7971, 4.4407793889666625),
              (52928, 4.415763243610473),
              (109989, 4.404742713208501),
              (46849, 4.3884398765950925),
              (114804, 4.3561181140604),
              (47954, 4.354064854943778)],
             39309: [(16909, 4.329816504640543),
              (34483, 4.2264231803644785),
              (1558, 4.210225545225556),
              (40674, 4.183060845893827),
              (1904, 4.15538417886436),
              (3158, 4.134904662243856),
              (44409, 4.110380763582626),
              (197, 4.093111010344757),
              (25852, 4.081892898481363),
              (1769, 4.053080763327822)],
             98243: [(34483, 4.1423353815096),
              (6108, 3.985063982645619),
              (7463, 3.909604767749662),

In [32]:
top_ratings[39309]

[(16909, 4.329816504640543),
 (34483, 4.2264231803644785),
 (1558, 4.210225545225556),
 (40674, 4.183060845893827),
 (1904, 4.15538417886436),
 (3158, 4.134904662243856),
 (44409, 4.110380763582626),
 (197, 4.093111010344757),
 (25852, 4.081892898481363),
 (1769, 4.053080763327822)]

In [37]:
sim_users = [170, 203,  87, 169, 190]
beers={}
for user in sim_users:
    for item in top_ratings[user]:
        if len(item)>0:
            beers[item[0]]=item[1]

beers


{1372: 4.062112052778878, 7463: 4.1464328095279495, 22343: 3.8146801284069456}

In [41]:
beers = dict(sorted(beers.items(),key=lambda item: item[1], reverse=True))
beers

{7463: 4.1464328095279495, 1372: 4.062112052778878, 22343: 3.8146801284069456}

In [38]:
max(beers,key=beers.get)

7463

In [11]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
pred_df = pd.DataFrame(predictions, columns=['username', 'beer_id', 'rui', 'est', 'details'])
pred_df['Iu'] = pred_df.username.apply(get_Iu)
pred_df['Ui'] = pred_df.beer_id.apply(get_Ui)
pred_df['err'] = abs(pred_df.est - df.rui)
best_predictions = pred_df.sort_values(by='err')[:10]
worst_predictions = pred_df.sort_values(by='err')[-10:]

NameError: name 'predictions' is not defined

In [6]:
pu = colab_r.model.pu
qi = colab_r.model.qi
bu = colab_r.model.bu
bi = colab_r.model.bi

In [26]:
bi.shape

(1000,)

In [37]:
avg=df.score.mean()
pq = np.dot((bi[:,None]+qi),(bu[:,None]+pu).T)
prediction = (avg+pq).T
prediction.shape

(122278, 1000)

In [41]:
prediction[prediction>5]=5
prediction[prediction<0]=0

In [47]:
top_recs = prediction.argsort()
top_recs[0][:10]

array([  0, 629, 628, 626, 625, 624, 622, 616, 610, 605])

In [50]:
df.iloc[0]

beer_id      125646
username    _dirty_
text               
score           4.5
uid           68954
Name: 0, dtype: object