# Collaborative Filtering with Neural Networks

In this notebook we will write a matrix factorization model in PyTorch to solve a recommendation problem. Then we will write a more general neural model for the same problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. https://grouplens.org/datasets/movielens/. To get the data:

`wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
PATH = Path("/data2/yinterian/ml-latest-small/")
list(PATH.iterdir())

[PosixPath('/data2/yinterian/ml-latest-small/ratings.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tags.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tiny_training2.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/links.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/tiny_val2.csv'),
 PosixPath('/data2/yinterian/ml-latest-small/README.txt'),
 PosixPath('/data2/yinterian/ml-latest-small/movies.csv')]

In [4]:
! head /data2/yinterian/ml-latest-small/ratings.csv

userId,movieId,rating,timestamp
1,31,2.5,1260759144
1,1029,3.0,1260759179
1,1061,3.0,1260759182
1,1129,2.0,1260759185
1,1172,4.0,1260759205
1,1263,2.0,1260759151
1,1287,2.0,1260759187
1,1293,2.0,1260759148
1,1339,3.5,1260759125


In [5]:
data = pd.read_csv(PATH/"ratings.csv")

In [6]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


### Encoding data
We encode the data to have contiguous ids for users and movies. You can think about this as a categorical encoding of our two categorical variables userId and movieId.

In [7]:
# split train and validation before encoding
# Fix NumPy's global seed so the random split below is identical on every run.
np.random.seed(3)
# One uniform draw in [0, 1) per rating row; rows whose draw is below 0.8
# (roughly 80% of them) go to the training set, the rest to validation.
msk = np.random.rand(len(data)) < 0.8
# Explicit copies, so later changes to train/val never touch `data`.
train = data[msk].copy()
val = data[~msk].copy()

In [8]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column to contiguous integer ids.

    The id vocabulary is taken from `train_col` when it is given, otherwise
    from `col` itself; ids are assigned in order of first appearance.
    Values of `col` that are not in the vocabulary are encoded as -1.

    Returns a tuple `(value -> id dict, numpy array of ids for col, vocabulary size)`.
    """
    source = col if train_col is None else train_col
    vocab = source.unique()
    name2idx = dict(zip(vocab, range(len(vocab))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(vocab)

In [9]:
def encode_data(df, train=None):
    """Return a copy of `df` whose userId and movieId are contiguous integer ids.

    Without `train`, the ids are derived from `df` itself. With `train`, the
    ids are derived from the matching columns of `train`, and rows of `df`
    whose user or movie does not occur in `train` are dropped.
    The input frame is not modified.
    """
    encoded = df.copy()
    for col_name in ("userId", "movieId"):
        reference = train[col_name] if train is not None else None
        ids = proc_col(encoded[col_name], reference)[1]
        encoded[col_name] = ids
        # proc_col marks values unknown to the reference column with -1
        known = encoded[col_name] >= 0
        encoded = encoded[known]
    return encoded

In [10]:
# to check my new implementation
# Load the two tiny CSV files that sit next to ratings.csv in PATH.
df_t = pd.read_csv(PATH/"tiny_training2.csv")
df_v = pd.read_csv(PATH/"tiny_val2.csv")
# The training frame defines its own ids; the validation frame reuses them.
df_t_e = encode_data(df_t)
df_v_e = encode_data(df_v, df_t)
# Only the last bare expression of a cell is displayed, so the df_v_e line
# shows nothing and the table under this cell is df_t_e.
df_v_e
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [11]:
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embedding layer

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# an Embedding module holding 10 embeddings (one per user or item), each of size 3
# the embeddings are initialized at random
embed = nn.Embedding(10, 3)

In [14]:
# given a list of ids we can "look up" the embedding corresponding to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.6739, -0.2685,  2.7923],
         [ 0.7226, -0.6228, -0.6661],
         [-0.5755, -0.5083,  0.8545],
         [ 0.7943, -0.2778, -0.6064],
         [ 0.8183,  0.4646, -0.3189],
         [-0.6739, -0.2685,  2.7923]]])

## Matrix factorization model

In [15]:
class MF(nn.Module):
    """Plain matrix factorization: the predicted rating is the dot product
    of a user embedding and an item embedding."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # replace the default initialization with small non-negative weights
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one prediction per (user id, item id) pair in `u`, `v`."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # multiply element-wise, then sum over the embedding axis: row-wise dot product
        return (user_vecs * item_vecs).sum(dim=1)

## Debugging MF model

In [16]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [17]:
# Table sizes for this toy example; every id fed to an embedding must be
# smaller than the size of its table.
num_users = 7
num_items = 4
emb_size = 3

# Stand-alone embedding tables (randomly initialized), used to walk through
# the MF forward pass step by step.
user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
# Encoded ids of the tiny training frame, as index tensors.
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [18]:
U = user_emb(users)
V = item_emb(items)

In [19]:
U

tensor([[ 1.2067, -1.7158,  0.7074],
        [ 1.2067, -1.7158,  0.7074],
        [-2.4077, -2.1725,  0.5144],
        [-2.4077, -2.1725,  0.5144],
        [-0.5675, -0.8782,  0.2159],
        [-0.5675, -0.8782,  0.2159],
        [-0.0939, -0.3438,  0.4589],
        [-0.0939, -0.3438,  0.4589],
        [-0.1218,  0.7984,  0.3366],
        [-0.1218,  0.7984,  0.3366],
        [-1.1452,  0.5116, -0.0972],
        [ 1.5442,  0.4086,  0.3488],
        [ 1.5442,  0.4086,  0.3488]])

In [20]:
# element wise multiplication
U*V 

tensor([[ 0.2930, -0.0936,  0.6396],
        [-1.0525,  4.2574,  0.2091],
        [ 2.1000,  5.3905,  0.1521],
        [-0.9552, -1.1500, -0.5660],
        [-0.1378, -0.0479,  0.1952],
        [ 0.4949,  2.1791,  0.0638],
        [-0.0228, -0.0188,  0.4149],
        [ 0.1387, -0.1703,  0.2284],
        [-0.0296,  0.0435,  0.3043],
        [ 0.1798,  0.3955,  0.1675],
        [ 1.6907,  0.2534, -0.0484],
        [-1.3469, -1.0139,  0.1031],
        [-2.2798,  0.2024,  0.1736]])

In [21]:
# what we want is a dot product per row
(U*V).sum(1) 

tensor([ 0.8390,  3.4141,  7.6427, -2.6713,  0.0096,  2.7379,  0.3733,
         0.1967,  0.3183,  0.7428,  1.8958, -2.2576, -1.9038])

## Training MF model

In [22]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items) 

671 8442


In [25]:
model = MF(num_users, num_items, emb_size=100) # .cuda()

In [27]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train `model` on the global `df_train` with full-batch Adam.

    Each epoch is a single optimizer step over the whole training set; the
    MSE loss of every epoch is printed. After the last epoch the validation
    loss is reported through `test_loss`.

    Args:
        model: module called as `model(users, items)` that returns predictions.
        epochs: number of full-batch steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: if True, ratings are reshaped from (N,) to (N, 1) to match
            models whose output has a trailing dimension of size 1.
    """
    # The batch is the entire training set, so these tensors are identical in
    # every epoch: build them once instead of once per epoch.
    users = torch.LongTensor(df_train.userId.values) # .cuda()
    items = torch.LongTensor(df_train.movieId.values) # .cuda()
    ratings = torch.FloatTensor(df_train.rating.values) # .cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    trainable = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(trainable, lr=lr, weight_decay=wd)
    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) # used to be loss.data[0]
    test_loss(model, unsqueeze)

In [28]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values).unsqueeze(1) # .cuda()
ratings.shape

torch.Size([79799, 1])

In [29]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [30]:
train_epocs(model, epochs=10, lr=0.1)

13.230864524841309
5.119508743286133
2.3753397464752197
3.4457108974456787
0.907403290271759
1.8074331283569336
2.746753215789795
2.275250196456909
1.1549110412597656
0.9233947396278381
test loss 1.946 


In [31]:
train_epocs(model, epochs=15, lr=0.01)

1.7039943933486938
1.0514448881149292
0.7490931153297424
0.6940227746963501
0.7590653300285339
0.8398492932319641
0.8824888467788696
0.876357913017273
0.8343711495399475
0.7773463129997253
0.7248387932777405
0.6898486018180847
0.6764661073684692
0.6802723407745361
0.6916050910949707
test loss 0.893 


In [32]:
train_epocs(model, epochs=15, lr=0.01)

0.700444221496582
0.6621485948562622
0.6683022975921631
0.645268440246582
0.6376075744628906
0.6445549130439758
0.6403719782829285
0.6252588033676147
0.6140212416648865
0.6126754283905029
0.6134018898010254
0.6077237129211426
0.5963393449783325
0.5854580998420715
0.5786408185958862
test loss 0.821 


## MF with bias

In [33]:
class MF_bias(nn.Module):
    """Matrix factorization with an additive bias per user and per item."""

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # factors start small and non-negative, biases start close to zero
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)
        for table in (self.user_bias, self.item_bias):
            table.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v):
        """Return dot(user, item) + user bias + item bias for each id pair."""
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(dim=1)
        # the bias tables have width 1; squeeze drops that axis
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [34]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [35]:
train_epocs(model, epochs=10, lr=0.1, wd=1e-5)

13.235189437866211
4.369823455810547
3.4950919151306152
2.4675838947296143
0.7887334823608398
1.816540241241455
2.524082660675049
2.1433157920837402
1.2760214805603027
0.9029127359390259
test loss 1.538 


In [36]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.281296968460083
0.8570265173912048
0.6940464973449707
0.6954416036605835
0.754655659198761
0.7996518611907959
0.8058428168296814
0.7789906859397888
0.7361329197883606
0.6945789456367493
test loss 0.824 


In [37]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6663876175880432
0.6583597660064697
0.6516505479812622
0.6461962461471558
0.6418693661689758
0.6384627223014832
0.63578200340271
0.6336454153060913
0.6318715214729309
0.6303484439849854
test loss 0.810 


In [38]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6289854645729065
0.6269554495811462
0.6253837943077087
0.623939573764801
0.6224943995475769
0.6210141181945801
0.6195191740989685
0.6180238127708435
0.6165342926979065
0.6150566935539246
test loss 0.811 


Note that these models are sensitive to weight initialization, the choice of optimization algorithm, and regularization.

## Neural Network Model

In [39]:
# Note that here the user and item embeddings are concatenated rather than multiplied together, so we could potentially make the embeddings of different sizes.
# We could probably get better results by experimenting further with regularization.
    
class CollabFNet(nn.Module):
    """Neural collaborative filtering: user and item embeddings are
    concatenated and passed through a one-hidden-layer network that outputs
    a single rating per pair."""

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # the first layer sees both embeddings side by side
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.0)

    def forward(self, u, v):
        """Return predictions with a trailing axis of size 1 (from `lin2`)."""
        pair = torch.cat((self.user_emb(u), self.item_emb(v)), dim=1)
        hidden = self.drop1(F.relu(pair))
        hidden = self.drop2(F.relu(self.lin1(hidden)))
        return self.lin2(hidden)

In [40]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [41]:
train_epocs(model, epochs=20, lr=0.01, wd=1e-5, unsqueeze=True) 

14.149616241455078
9.508943557739258
6.430286407470703
3.8725030422210693
2.072707414627075
1.294029951095581
1.6011323928833008
2.507868766784668
3.1368980407714844
3.1491539478302
2.708000421524048
2.103240728378296
1.5794142484664917
1.2538635730743408
1.1517829895019531
1.2164102792739868
1.3608611822128296
1.5175153017044067
1.6299303770065308
1.6761547327041626
test loss 1.660 


In [42]:
train_epocs(model, epochs=20, lr=0.01, wd=1e-6, unsqueeze=True)

1.6430702209472656
1.0609583854675293
1.3088443279266357
1.297023057937622
1.081175684928894
0.9823583364486694
1.0359172821044922
1.0936707258224487
1.0668280124664307
0.9791148900985718
0.905183732509613
0.8912572264671326
0.9251843690872192
0.9420336484909058
0.908094584941864
0.8533429503440857
0.823914110660553
0.8270622491836548
0.8434524536132812
0.8433369398117065
test loss 0.863 


In [43]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.8194280862808228
0.7982811331748962
0.7888120412826538
0.7888966798782349
0.7920635342597961
0.7930675148963928
0.7930376529693604
0.7866061925888062
0.785195529460907
0.7833060622215271
test loss 0.826 


In [44]:
train_epocs(model, epochs=20, lr=0.001, wd=1e-6, unsqueeze=True)

0.7807965874671936
0.7815154790878296
0.7778905630111694
0.7798123359680176
0.7777783870697021
0.774885892868042
0.7739108800888062
0.7738898992538452
0.7731009125709534
0.7710920572280884
0.7701359987258911
0.770715057849884
0.7674622535705566
0.7662222981452942
0.7666878700256348
0.7660290598869324
0.7637755274772644
0.7642784714698792
0.7618466019630432
0.7628806829452515
test loss 0.815 


# References
* This notebook is based on [lesson 5 of Jeremy Howard's Deep Learning Course](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb)