## Recommendation Models

We will look at two models for recommending movies to existing users. 

 - Matrix factorization based on the surprise package.
 - Matrix factorization based on Pytorch.



### Recommendation (Pytorch) Training

Assuming we have installed pytorch (in the previous section), lets install certain additional packages. In particular, the package `surprise` can be installed using the command `conda install -c conda-forge scikit-surprise` (ensure you are not in the base environment but in the datasci-dev or some other specific environment).


In [1]:
from surprise import Dataset #can be replaced by explicitly importing the movielens data, https://github.com/NicolasHug/Surprise
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.utils import shuffle

In [2]:
class Loader():
    current = 0

    def __init__(self, x, y, batchsize=1024, do_shuffle=True):
        self.shuffle = shuffle
        self.x = x
        self.y = y
        self.batchsize = batchsize
        self.batches = range(0, len(self.y), batchsize)
        if do_shuffle:
            # Every epoch re-shuffle the dataset
            self.x, self.y = shuffle(self.x, self.y)

    def __iter__(self):
        # Reset & return a new iterator
        self.x, self.y = shuffle(self.x, self.y, random_state=0)
        self.current = 0
        return self

    def __len__(self):
        # Return the number of batches
        return int(len(self.x) / self.batchsize)

    def __next__(self):
        n = self.batchsize
        if self.current + n >= len(self.y):
            raise StopIteration
        i = self.current
        xs = torch.from_numpy(self.x[i:i + n])
        ys = torch.from_numpy(self.y[i:i + n])
        self.current += n
        return (xs, ys)

In [3]:
class MF(nn.Module):

    def __init__(self, n_user, n_item, k=18, c_vector=1.0, c_bias=1.0):
        super(MF, self).__init__()
        self.k = k
        self.n_user = n_user
        self.n_item = n_item
        self.c_bias = c_bias
        self.c_vector = c_vector
        
        self.user = nn.Embedding(n_user, k)
        self.item = nn.Embedding(n_item, k)
        
        # We've added new terms here:
        self.bias_user = nn.Embedding(n_user, 1)
        self.bias_item = nn.Embedding(n_item, 1)
        self.bias = nn.Parameter(torch.ones(1))
    
    def forward(self, train_x):
        user_id = train_x[:, 0]
        item_id = train_x[:, 1]
        vector_user = self.user(user_id)
        vector_item = self.item(item_id)
        
        # Pull out biases
        bias_user = self.bias_user(user_id).squeeze()
        bias_item = self.bias_item(item_id).squeeze()
        biases = (self.bias + bias_user + bias_item)
        
        ui_interaction = torch.sum(vector_user * vector_item, dim=1)
        
        # Add bias prediction to the interaction prediction
        prediction = ui_interaction + biases
        return prediction
    
    def loss(self, prediction, target):

        def l2_regularize(array):
            loss = torch.sum(array**2)
            return loss

        loss_mse = F.mse_loss(prediction, target.squeeze())
        
        # Add new regularization to the biases
        prior_bias_user =  l2_regularize(self.bias_user.weight) * self.c_bias
        prior_bias_item = l2_regularize(self.bias_item.weight) * self.c_bias
        
        prior_user =  l2_regularize(self.user.weight) * self.c_vector
        prior_item = l2_regularize(self.item.weight) * self.c_vector
        total = loss_mse + prior_user + prior_item + prior_bias_user + prior_bias_item
        return total

In [4]:
def main():
    #Data
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    uir = np.array([x for x in trainset.all_ratings()])
    train_x = test_x = uir[:,:2].astype(np.int64) #for simplicity
    train_y = test_y = uir[:,2].astype(np.float32)

    #Parameters
    lr = 1e-1
    k = 10 #latent dimension
    c_bias = 1e-6
    c_vector = 1e-6
    batchsize = 1024

    model = MF(trainset.n_users, trainset.n_items, k=k, c_bias=c_bias, c_vector=c_vector)
    optimizer = torch.optim.Adam(model.parameters(),lr = lr)
    dataloader = Loader(train_x, train_y, batchsize=batchsize)

    itr = 0
    for batch in dataloader:
        itr += 1
        prediction = model(batch[0])
        loss = model.loss(prediction,batch[1])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"iteration: {itr}. training loss: {loss}")

    torch.save(model.state_dict(), "./recommendation_model_pytorch.pkl")

if __name__=='__main__':
    main()

iteration: 1. training loss: 20.42420768737793
iteration: 2. training loss: 17.85934066772461
iteration: 3. training loss: 15.529234886169434
iteration: 4. training loss: 14.985013961791992
iteration: 5. training loss: 13.329107284545898
iteration: 6. training loss: 11.790022850036621
iteration: 7. training loss: 11.360279083251953
iteration: 8. training loss: 10.009636878967285
iteration: 9. training loss: 8.997058868408203
iteration: 10. training loss: 8.372835159301758
iteration: 11. training loss: 7.8731465339660645
iteration: 12. training loss: 7.506683349609375
iteration: 13. training loss: 6.360579013824463
iteration: 14. training loss: 6.4721760749816895
iteration: 15. training loss: 5.841653347015381
iteration: 16. training loss: 5.418582439422607
iteration: 17. training loss: 4.895118236541748
iteration: 18. training loss: 4.843044281005859
iteration: 19. training loss: 4.418150424957275
iteration: 20. training loss: 4.507508754730225
iteration: 21. training loss: 4.566267967

### Recommendation (Pytorch) Inference

For inference, we will need most of the code from before (especially the model definition).

In [None]:
from recommend_pytorch_base import MF
from surprise import Dataset
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import pprint

def get_top_n(model,testset,trainset,uid_input,movies_df,n=10):
    
    preds = []
    try:
        uid_input = int(trainset.to_inner_uid(uid_input))
    except KeyError:
        return preds        

    # First map the predictions to each user.
    for uid, iid, _ in testset: #inefficient
        try:
            uid_internal = int(trainset.to_inner_uid(uid))
        except KeyError:
            continue
        if uid_internal==uid_input:
            try:
                iid_internal = int(trainset.to_inner_iid(iid))
                movie_name = movies_df.loc[int(iid),'name']
                preds.append((iid,movie_name,float(model(torch.tensor([[uid_input,iid_internal]])))))
            except KeyError:
                pass
    # Then sort the predictions for each user and retrieve the k highest ones
    if preds is not None:
        preds.sort(key=lambda x: x[1], reverse=True)
        if len(preds) > n:
            preds = preds[:n]
    return preds

def get_previously_seen(trainset, uid, movies_df):
    seen = []
    for (iid, _) in trainset.ur[int(uid)]:
        try:
            seen.append(movies_df.loc[int(iid),'name'])
        except KeyError:
            pass
        if len(seen) > 10:
            break
    return seen

def main():
    #Data
    movies_df = pd.read_csv('./movies.dat',sep="::",header=None,engine='python')
    movies_df.columns = ['iid','name','genre']
    movies_df.set_index('iid',inplace=True)
    data = Dataset.load_builtin('ml-1m')
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    k = 10 #latent dimension
    c_bias = 1e-6
    c_vector = 1e-6

    model = MF(trainset.n_users, trainset.n_items, k=k, c_bias=c_bias, c_vector=c_vector)
    model.load_state_dict(torch.load('./recommendation_model_pytorch.pkl'))
    model.eval()

    # Print the recommended items for sample users
    sample_users = list(set([x[0] for x in testset]))[:4]

    for uid in sample_users:
        
        print('User:',uid)
        print('\n')

        print('\tSeen:')
        seen = get_previously_seen(trainset, uid, movies_df)
        pprint.pprint(seen)
        print('\n')

        print('\tRecommendations:')
        recommended = get_top_n(model, testset, trainset, uid, movies_df, n=10)
        pprint.pprint([x[1] for x in recommended])
        print('\n')



if __name__=="__main__":
    main()


### Recommendation (SVD) Training

In [None]:
# https://github.com/NicolasHug/Surprise
from surprise import SVD, Dataset
from surprise.accuracy import rmse
from surprise.dump import dump

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-1m')
trainset = data.build_full_trainset()

# Use an example algorithm: SVD.
algo = SVD()
algo.fit(trainset)

# predict ratings for all pairs (u, i) that are in the training set.
testset = trainset.build_testset()
predictions = algo.test(testset)
rmse(predictions)                                                                              

#actual predictions as thse items have not been seen by the users. there is no ground truth. 
# We predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

dump('./recommendation_model_surprise.pkl', predictions, algo)


### Recommendation (SVD) Inference

In [None]:
# https://github.com/NicolasHug/Surprise
from surprise import SVD, Dataset
from surprise.dump import load
from collections import defaultdict
import pandas as pd
from recommend_pytorch_inf import get_previously_seen

def get_top_n_all(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


def main():

    movies_df = pd.read_csv('./movies.dat',sep="::",header=None,engine='python')
    movies_df.columns = ['iid','name','genre']
    movies_df.set_index('iid',inplace=True)
    predictions, algo = load('./recommendation_model_surprise.pkl')


    top_n = get_top_n_all(predictions, n=10)
    # Print the recommended items for some sample users

    itr = 0
    for uid, user_ratings in top_n.items():
        itr += 1
        if itr == 5:
            break

        print('User:',uid)
        print('\n')

        print('\tSeen:')
        seen = get_previously_seen(algo.trainset, uid, movies_df)

        pprint.pprint(seen)
        print('\n')

        print('\tRecommendations:')
        recommended = [movies_df.loc[int(iid),'name'] for (iid, _) in user_ratings]
        pprint.pprint(recommended)
        print('\n')

if __name__ == '__main__':
    main()