# Collaborative Filtering with Neural Networks

In this notebook we will write a matrix factorization model in PyTorch to solve a recommendation problem. Then we will write a more general neural network model for the same problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. See https://grouplens.org/datasets/movielens/ for details. To get the data:

`wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
# Directory that holds the extracted ml-latest-small files.
# NOTE: the second assignment overrides the first, so only the /data2 path is
# actually used; keep whichever line matches your machine.
PATH = Path("/Users/yinterian/teaching/deeplearning/data/ml-latest-small/")
PATH = Path("/data2/yinterian/ml-latest-small/")
# list the files found in the dataset directory
list(PATH.iterdir())

[PosixPath('/data2/yinterian/ml-latest-small/ratings.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tags.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tiny_training2.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/links.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tiny_val2.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/README.txt'),
 PosixPath('/data2/yinterian/ml-latest-small/movies.csv')]

In [4]:
! head $PATH/ratings.csv

userId,movieId,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125


In [5]:
data = pd.read_csv(PATH/"ratings.csv")

In [6]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Encoding data
We encode the data to have contiguous ids for users and movies. You can think of this as a categorical encoding of our two categorical variables, userId and movieId.

In [7]:
# split train and validation before encoding
np.random.seed(3)  # fixed seed so the random split is the same on every run
# boolean mask with one entry per rating: True with probability 0.8
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()  # rows where the mask is True
val = data[~msk].copy()  # the remaining rows

In [8]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column to contiguous integer ids.

    The id vocabulary is built from the unique values of `train_col` when it
    is given, otherwise from `col` itself. Values of `col` that are not in
    the vocabulary are encoded as -1.

    Returns a tuple `(name2idx, encoded, n_unique)`: the value -> id dict,
    a numpy array holding the id of every entry of `col`, and the number of
    distinct values in the vocabulary.
    """
    reference = col if train_col is None else train_col
    vocab = reference.unique()
    name2idx = dict(zip(vocab, range(len(vocab))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(vocab)

In [9]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    `df` must have "userId" and "movieId" columns; it is not modified, an
    encoded copy is returned. When `train` is provided, `df` is encoded with
    the ids derived from `train`, and rows whose user or movie does not
    appear in `train` are dropped.
    """
    encoded = df.copy()
    for column in ("userId", "movieId"):
        reference = train[column] if train is not None else None
        ids = proc_col(encoded[column], reference)[1]
        encoded[column] = ids
        # proc_col encodes values missing from the reference column as -1
        encoded = encoded[encoded[column] >= 0]
    return encoded

In [10]:
# to check my new implementation
# Sanity check of encode_data on two tiny csv files before using the real data.
LOCAL_PATH = Path("images/")
df_t = pd.read_csv(LOCAL_PATH/"tiny_training2.csv")
df_v = pd.read_csv(LOCAL_PATH/"tiny_val2.csv")
print(df_t)
# the training frame defines its own encoding ...
df_t_e = encode_data(df_t)
# ... and the validation frame is encoded with the training ids
df_v_e = encode_data(df_v, df_t)
# NOTE: a bare expression that is not the last line of a cell displays nothing
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [11]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embedding layer

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# an Embedding module holding 10 embeddings (e.g. one per user or item), each of size 3
# the embeddings are initialized at random
embed = nn.Embedding(10, 3)

In [14]:
# given a list of ids we can "look up" the embedding corresponding to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.1301,  0.0691, -1.1678],
         [-0.9865,  0.4514, -1.4770],
         [-1.7121,  0.0701,  0.0481],
         [ 1.4485,  0.1340,  0.0099],
         [-1.4074, -0.8650, -0.1255],
         [-0.1301,  0.0691, -1.1678]]])

## Matrix factorization model

In [15]:
class MF(nn.Module):
    """Matrix factorization: a rating is predicted as the dot product of a
    user embedding and an item embedding."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Overwrite the initial weights with small non-negative values drawn
        # uniformly between 0 and 0.05 (users first, then items).
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one prediction per (user id, item id) pair in `u`, `v`."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # row-wise dot product: element-wise product summed over dim 1
        return torch.sum(user_vecs * item_vecs, dim=1)

## Debugging MF model

In [16]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [16]:
# Sizes matching the tiny encoded training frame df_t_e shown above:
# user ids 0..6 and movie ids 0..3.
num_users = 7
num_items = 4
emb_size = 3

# stand-alone embedding tables, used to walk through what MF.forward computes
user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
# one id per rating row of the tiny training frame
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [17]:
U = user_emb(users)
V = item_emb(items)

In [18]:
U

tensor([[ 0.1547,  0.2277,  0.2442],
        [ 0.1547,  0.2277,  0.2442],
        [ 0.6601,  0.8225, -1.2139],
        [ 0.6601,  0.8225, -1.2139],
        [ 0.1672, -1.2177,  0.1403],
        [ 0.1672, -1.2177,  0.1403],
        [-1.1907, -1.2933, -0.5506],
        [-1.1907, -1.2933, -0.5506],
        [ 0.1938, -0.0683, -0.8493],
        [ 0.1938, -0.0683, -0.8493],
        [ 0.8506, -1.1564,  1.1165],
        [ 0.8639, -2.5148, -0.8391],
        [ 0.8639, -2.5148, -0.8391]])

In [19]:
# element wise multiplication
U*V 

tensor([[-0.1766,  0.2957,  0.4409],
        [ 0.1205,  0.1733,  0.1165],
        [ 0.5143,  0.6258, -0.5793],
        [-0.5603,  0.3582, -0.5370],
        [-0.1909, -1.5812,  0.2533],
        [ 0.1303, -0.9266,  0.0670],
        [ 1.3594, -1.6793, -0.9940],
        [-0.2324,  1.4822,  0.5151],
        [-0.2212, -0.0887, -1.5335],
        [ 0.0378,  0.0783,  0.7947],
        [ 0.1660,  1.3253, -1.0447],
        [ 0.6730, -1.9135, -0.4004],
        [ 0.1686,  2.8820,  0.7851]])

In [20]:
# what we want is a dot product per row
(U*V).sum(1) 

tensor([ 0.5600,  0.4103,  0.5608, -0.7391, -1.5187, -0.7294, -1.3139,
         1.7649, -1.8434,  0.9108,  0.4466, -1.6409,  3.8357])

## Training MF model

In [21]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items) 

671 8442


In [22]:
model = MF(num_users, num_items, emb_size=100) # .cuda() if you have a GPU

In [23]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` on `df_train` with full-batch Adam, then print the validation loss.

    Args:
        model: module called as `model(users, items)` that returns predicted ratings.
        epochs: number of gradient steps; each step uses the whole training set.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, reshape the ratings from (n,) to (n, 1) so they
            match a model whose output has a trailing dimension of size 1.

    A new optimizer is created on every call. The training MSE is printed
    after each step, and `test_loss` is called once at the end.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    # The whole training set is a single batch that does not change between
    # epochs, so build the tensors once instead of on every iteration.
    users = torch.LongTensor(df_train.userId.values) # .cuda()
    items = torch.LongTensor(df_train.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [24]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([79799])
torch.Size([79799, 1])


In [25]:
def test_loss(model, unsqueeze=False):
    """Print the mean squared error of `model` on the validation set `df_val`.

    Args:
        model: module called as `model(users, items)` that returns predicted ratings.
        unsqueeze: if True, reshape the ratings from (n,) to (n, 1) so they
            match a model whose output has a trailing dimension of size 1.

    The model is switched to eval mode and is left in that mode.
    """
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    # Evaluation needs no gradients; disabling autograd avoids building a
    # computation graph over the whole validation set.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [26]:
train_epocs(model, epochs=10, lr=0.1)

13.23068904876709
5.119534015655518
2.3902299404144287
3.441521406173706
0.9096018671989441
1.8109439611434937
2.749631643295288
2.278921604156494
1.1593214273452759
0.925656795501709
test loss 1.947 


In [27]:
train_epocs(model, epochs=15, lr=0.01)

1.7027523517608643
1.0512956380844116
0.7498359680175781
0.6950282454490662
0.7596880197525024
0.8397833108901978
0.8818210363388062
0.8753886818885803
0.8334189653396606
0.7767009735107422
0.7246581315994263
0.6901594400405884
0.6771144866943359
0.6810137033462524
0.69219970703125
test loss 0.894 


In [28]:
train_epocs(model, epochs=15, lr=0.01)

0.7007282376289368
0.6625022888183594
0.6684340834617615
0.6455244421958923
0.6380830407142639
0.6450700759887695
0.6408411264419556
0.6256920099258423
0.6144804358482361
0.6132143139839172
0.6140048503875732
0.6083489060401917
0.5969548225402832
0.5860226154327393
0.5791704058647156
test loss 0.822 


## MF with bias

In [29]:
class MF_bias(nn.Module):
    """Matrix factorization with a learned bias per user and per item.

    The prediction for a (user, item) pair is the dot product of their
    embeddings plus the user bias plus the item bias.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        # biases are stored as embeddings of size 1 so they are looked up by id
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # small non-negative factors, biases centred on zero
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)

    def forward(self, u, v):
        """Return one prediction per (user id, item id) pair.

        `u` and `v` are 1-D LongTensors of equal length n; the result has
        shape (n,).
        """
        U = self.user_emb(u)
        V = self.item_emb(v)
        # Drop only the trailing size-1 embedding dimension: (n, 1) -> (n,).
        # A bare squeeze() would also remove the batch dimension when n == 1.
        b_u = self.user_bias(u).squeeze(1)
        b_v = self.item_bias(v).squeeze(1)
        return (U*V).sum(1) +  b_u  + b_v

In [32]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [33]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

13.233644485473633
9.459980964660645
4.618295669555664
1.2266862392425537
2.4537320137023926
3.888521432876587
2.6157896518707275
1.1573508977890015
0.8204843997955322
1.3100122213363647
test loss 2.126 


In [34]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.9130752086639404
1.3447301387786865
0.9572998285293579
0.7714419364929199
0.752704381942749
0.8091325759887695
0.8543495535850525
0.8524782657623291
0.8114585876464844
0.7577651739120483
test loss 0.851 


In [35]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.7163214087486267
0.7023102045059204
0.6904919147491455
0.6807348728179932
0.6728458404541016
0.6666097044944763
0.6618107557296753
0.6582220792770386
0.6556380391120911
0.6538312435150146
test loss 0.805 


Note that these models are sensitive to weight initialization, the choice of optimization algorithm, and regularization.

## Neural Network Model

In [76]:
# Note that here the user and item embeddings are concatenated instead of being combined with a dot product, so they could potentially have different sizes.
# We could probably get better results by continuing to tune the regularization.
    
class CollabFNet(nn.Module):
    """Neural collaborative filtering model.

    The user and item embeddings are concatenated and passed through a
    one-hidden-layer network that outputs a single value per pair.
    """

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # the hidden layer sees both embeddings side by side
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)

    def forward(self, u, v):
        """Return predictions of shape (n, 1) for n (user id, item id) pairs."""
        pair = torch.cat([self.user_emb(u), self.item_emb(v)], dim=1)
        features = self.drop1(F.relu(pair))
        hidden = F.relu(self.lin1(features))
        return self.lin2(hidden)

In [77]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [78]:
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True) 

13.101761817932129
1.957230806350708
1.2605514526367188
1.3381402492523193
1.061022162437439
1.1385098695755005
0.9165319800376892
0.9622549414634705
0.8723138570785522
0.8084518909454346
0.8500765562057495
0.7535637617111206
0.791947603225708
0.7653028964996338
0.7301635146141052
test loss 0.869 


In [79]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

0.7691234350204468
0.9072751402854919
0.7757670879364014
0.7180655598640442
0.7918605208396912
0.7724899053573608
0.7119362950325012
0.7106000185012817
0.7403213977813721
0.7438958883285522
test loss 0.816 


In [80]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.7163267731666565
0.7032808065414429
0.695513904094696
0.6967512369155884
0.6998187303543091
0.700666606426239
0.7004959583282471
0.6982167959213257
0.6955875158309937
0.694402813911438
test loss 0.796 


In [81]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.6919353008270264
0.6934647560119629
0.6922585368156433
0.6942275762557983
0.6926798224449158
0.6916202902793884
0.6911264061927795
0.6923496127128601
0.6922929286956787
0.6904215812683105
test loss 0.795 


# References
* This notebook is based on [lesson 5 of Jeremy Howard's Deep Learning Course](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb)