In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.learner import *
from fastai.column_data import *

# Moving ratings data

Data available from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [6]:
path='data/ml-latest-small/'

We're working with the movielens data, which contains one rating per row, like this:

In [7]:
ratings = pd.read_csv(path+'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Just for display purposes, let's read in the movie names too.

In [8]:
movies = pd.read_csv(path+'movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


We create a crosstab of the most popular movies and most movie-addicted users which we'll copy into Excel for creating a simple example. This isn't necessary for any of the modeling below however.

In [9]:
g=ratings.groupby('userId')['rating'].count()
topUsers=g.sort_values(ascending=False)[:15]

g=ratings.groupby('movieId')['rating'].count()
topMovies=g.sort_values(ascending=False)[:15]

top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')

pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,1,110,260,296,318,356,480,527,589,593,608,1196,1198,1270,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15,2.0,3.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0
30,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,3.0
73,5.0,4.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.0,5.0,5.0,5.0,4.5
212,3.0,5.0,4.0,4.0,4.5,4.0,3.0,5.0,3.0,4.0,,,3.0,3.0,5.0
213,3.0,2.5,5.0,,,2.0,5.0,,4.0,2.5,2.0,5.0,3.0,3.0,4.0
294,4.0,3.0,4.0,,3.0,4.0,4.0,4.0,3.0,,,4.0,4.5,4.0,4.5
311,3.0,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0,3.0,4.5,4.5,4.0
380,4.0,5.0,4.0,5.0,4.0,5.0,4.0,,4.0,5.0,4.0,4.0,,3.0,5.0
452,3.5,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,2.0
468,4.0,3.0,3.5,3.5,3.5,3.0,2.5,,,3.0,4.0,3.0,3.5,3.0,3.0


### Dot product basics

In [6]:
a = T( [[1., 2], [3, 4]] )
b = T( [[2., 2], [10, 10]] )

In [7]:
a


 1  2
 3  4
[torch.cuda.FloatTensor of size 2x2 (GPU 0)]

In [8]:
b


  2   2
 10  10
[torch.cuda.FloatTensor of size 2x2 (GPU 0)]

In [9]:
a * b


  2   4
 30  40
[torch.cuda.FloatTensor of size 2x2 (GPU 0)]

##### Dot product is ...

In [10]:
(a * b).sum(1) # sum across the columns


  6
 70
[torch.cuda.FloatTensor of size 2 (GPU 0)]

### Formally in Pytorch

PyTorch uses the concept of `module` which can be used independently, but also cna be used as a layer in a `NN`. Model can be invoked with params.

In [11]:
class DotProduct(nn.Module):
    def forward(self, user, movie):
        return (user * movie).sum(1)

In [12]:
model = DotProduct()

In [13]:
model(a, b)


  6
 70
[torch.cuda.FloatTensor of size 2 (GPU 0)]

# For real ...

Need to ensure that the numbers do not get too big for gradient descent. So, we ensure the scale of the random initialization is low. There is a formula to that - [Kaiming He method](http://www.jefkine.com/deep/2016/08/08/initialization-of-deep-networks-case-of-rectifiers/). `Note : Pytorch has this initialization built in.` However, as we are trying to learn doing this from scratch, we won't use it.

**Note:** In this case, n1 = 50 (The number of factors of the embedding matrices, which are the inputs to this layer)

In [13]:
class EmbeddingDot(nn.Module):
    def __init__(self, no_of_users, no_of_movies, no_of_factors):
        super().__init__()
        self.u = nn.Embedding(no_of_users, no_of_factors) # rows = no_of_users, columns = no_of_factors
        self.m = nn.Embedding(no_of_movies, no_of_factors)
        # self.u.weight is a PyToych variable. A Pytorch variable does similar things as tensor but it also
        # does **differentiation**.
        # self.u.weight.data is the tensor representing the enbedded matrix
        self.u.weight.data.uniform_(0, 0.05) # initialize this else we may get very large random numbers
        self.m.weight.data.uniform_(0, 0.05) # uniform_ does an inplace update with the random numbers
    
    
    def forward(self, categories, continuous):
        users = categories[:, 0] # will be a mini-batch(Thx to Pytorch) of users/movies

        movies = categories[:, 1] # will be a mini-batch(Thx to Pytorch) of users/movies

#         return dot_product(self.u(users), self.m(movies)) # user the mini-batch to lookup on the embedded matrices
        u, m = self.u(users), self.m(movies)
        return (u * m).sum(1)
    
    
#     def dot_product(mat1, mat2):
#         return (mat1 * mat2).sum(1)    

##### **NOTE** : If you `loop through a mini-batch`, GPU `accelration is not possible`

#### As user id and movie id is not contiguous...

In [14]:
user_unique = ratings.userId.unique()
user2idx = {user:index for index, user in enumerate(user_unique)}
ratings.userId = ratings.userId.apply(lambda x: user2idx[x])

In [15]:
ratings.userId[:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: userId, dtype: int64

In [16]:
movie_unique = ratings.movieId.unique()
movie2idx = {movie:index for index, movie in enumerate(movie_unique)}
ratings.movieId = ratings.movieId.apply(lambda x: movie2idx[x])

In [17]:
ratings.movieId[:10]

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
Name: movieId, dtype: int64

In [18]:
no_of_users = int(ratings.userId.nunique())
no_of_movies = int(ratings.movieId.nunique())
print("Users : {0}, movies : {1}".format(no_of_users, no_of_movies))


Users : 671, movies : 9066


#### Input data

In [19]:
x = ratings.drop(['rating', 'timestamp'], axis=1)
y = ratings['rating'].astype(np.float32)

In [20]:
x[:10]

Unnamed: 0,userId,movieId
0,0,0
1,0,1
2,0,2
3,0,3
4,0,4
5,0,5
6,0,6
7,0,7
8,0,8
9,0,9


In [21]:
y[:5]

0    2.5
1    3.0
2    3.0
3    2.0
4    4.0
Name: rating, dtype: float32

### Train

In [25]:
val_idxs = get_cv_idxs(len(ratings))
wd = 1e-5
no_of_factors = 50

In [35]:
data = ColumnarModelData.from_data_frame(path, val_idxs, x, y, ['userId', 'movieId'], bs=64)

In [25]:
model = EmbeddingDot(no_of_users, no_of_movies, no_of_factors).cuda()

In [26]:
optimizer = optim.SGD(model.parameters(), 1e-1, weight_decay=wd, momentum=0.9)

##### We are using the fast.ai `fit` method which is nothing but a training loop. We are not using the learner, so there is no SGDR, multiple LR's etc.

In [27]:
fit(model, data, 3, optimizer, F.mse_loss) 

epoch      trn_loss   val_loss                                 
    0      1.687112   1.633845  
    1      1.128573   1.297411                                 
    2      0.935976   1.222038                                  



[1.2220384]

##### Mimic different learning rates

In [29]:
set_lrs(optimizer, 0.01)

In [30]:
fit(model, data, 3, optimizer, F.mse_loss)

epoch      trn_loss   val_loss                                  
    0      0.718686   1.148802  
    1      0.683221   1.137943                                  
    2      0.675178   1.132162                                  



[1.1321619]

## Bias

In [10]:
min_rating, max_rating = ratings.rating.min(), ratings.rating.max()
min_rating, max_rating

(0.5, 5.0)

In [58]:
def get_embedding(no_of_inputs, no_of_factors):
    emb = nn.Embedding(no_of_inputs, no_of_factors)
    emb.weight.data.uniform_(0.01, 0.01)
    return emb
    
class EmbeddingDotBias(nn.Module):
    def __init__(self, no_of_users, no_of_movies, no_of_factors):
        super().__init__()
        (self.users, self.movies, self.user_bias, self.movie_bias) = [get_embedding(*values) for values in 
                                                                      [(no_of_users, no_of_factors), 
                                                                       (no_of_movies, no_of_factors), 
                                                                       (no_of_users, 1), 
                                                                       (no_of_movies, 1)]]
        
    def forward(self, categories, continuous):
        users, movies = categories[:, 0], categories[:, 1]
        dot_prod = (self.users(users) * self.movies(movies)).sum(1)
        # squeeze() adds a vector (bias) to a matrix by duplicating the bias to match the matrix dimension (called as broadcasting)
        response = dot_prod + self.user_bias(users).squeeze() + self.movie_bias(movies).squeeze()
        # Optional, but good to have
        # As ratings are between 1 to 5, we use a sigmoid function to restrict the result to 0 to 1
        # and then multipy it by 4 and add by 1 to make max as 5
        # e.g.: sigmoid(result) = .6, .6*4 = 2.4, 2.4+1 = 3.4
        return (F.sigmoid(response) * (max_rating - min_rating)) + min_rating
        #return response

In [59]:
wd = 2e-4

In [60]:
model = EmbeddingDotBias(no_of_users, no_of_movies, no_of_factors).cuda() # Put it on the **GPU**
optimizer = optim.SGD(model.parameters(), lr=1e-1, weight_decay=wd, momentum=0.9)

In [61]:
fit(model, data, 3, optimizer, F.mse_loss)

epoch      trn_loss   val_loss                                  
    0      0.831139   0.837831  
    1      0.834587   0.813115                                  
    2      0.793029   0.806108                                  



[0.806108]

In [62]:
set_lrs(optimizer, 1e-2)

In [63]:
fit(model, data, 3, optimizer, F.mse_loss)

epoch      trn_loss   val_loss                                  
    0      0.753929   0.796583  
    1      0.705659   0.795153                                  
    2      0.738125   0.793814                                  



[0.7938138]