## Movielens

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from imports import *
from torch_imports import *
from pt_models import *
from column_data import *
from fast_gen import *
from layer_optimizer import *

In [2]:
path='/data/datasets/movielens/ml-latest-small/'

In [3]:
ratings = pd.read_csv(path+'ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movie_df = pd.read_csv(path+'movies.csv')
movie_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
cf = CollabFilter(ratings.userId, ratings.movieId, ratings.rating)

In [57]:
val_idxs = get_cv_idxs(len(ratings), cv_idx=0)
data = cf.to_model_data(val_idxs, 64)

In [65]:
n_factors = 50

In [66]:
model = cf.get_model(n_factors).cuda()

In [67]:
wd=2e-4

params = []
for k, v in model.named_parameters():
    if k in ('u.weight','i.weight'): params += [{'params':v,'weight_decay':wd}]
    else: params += [{'params':v}]

opt = optim.Adam(params, 1e-2)

In [68]:
fit(model, data, 1, F.mse_loss, opt)

[ 0.801793  0.804389]


In [69]:
set_lrs(opt, 1e-3)

In [70]:
fit(model, data, 4, F.mse_loss, opt)

[ 0.608584  0.774608]


[ 0.578653  0.763029]


[ 0.532351  0.757826]


[ 0.515739  0.755723]


### Dot product example

In [3]:
a = T([[1.,2],[3,4]])
b = T([[2.,2],[10,10]])
a,b

(
  1  2
  3  4
 [torch.FloatTensor of size 2x2], 
   2   2
  10  10
 [torch.FloatTensor of size 2x2])

In [4]:
a*b


  2   4
 30  40
[torch.FloatTensor of size 2x2]

In [5]:
(a*b).sum(1)


  6
 70
[torch.FloatTensor of size 2]

### Dot product model

In [53]:
class EmbeddingDot(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()
        self.u = nn.Embedding(n_users, n_factors)
        self.m = nn.Embedding(n_movies, n_factors)
        self.u.weight.data.uniform_(0,0.05)
        self.m.weight.data.uniform_(0,0.05)
        
    def forward(self, users, movies):
        return (self.u(users) * self.m(movies)).sum(1)

In [54]:
wd=1e-4
model = EmbeddingDot(cf.n_users, cf.n_items).cuda()
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)

In [55]:
fit(model, data, 3, F.mse_loss, opt)

[ 3.01861   3.094015]


[ 1.669732  1.846676]


[ 1.285729  1.5389  ]


### Bias

In [50]:
cf.min_score, cf.max_score

(0.5, 5.0)

In [51]:
def get_emb(ni,nf):
    e = nn.Embedding(ni, nf)
    e.weight.data.uniform_(-0.01,0.01)
    return e

class EmbeddingDotBias(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()
        (self.u, self.m, self.ub, self.mb) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors), (n_users,1), (n_movies,1)
        ]]
        
    def forward(self, users, movies):
        um = self.u(users)* self.m(movies)
        res = um.sum(1) + self.ub(users).squeeze() + self.mb(movies).squeeze()
        return F.sigmoid(res) * (5-0.5) + 0.5

In [52]:
wd=2e-4
model = EmbeddingDotBias(cf.n_users, cf.n_items).cuda()

params = []
for k, v in model.named_parameters():
    if k in ('u.weight','m.weight'): params += [{'params':v,'weight_decay':wd}]
    else: params += [{'params':v}]

opt = optim.Adam(params, 1e-2)

In [53]:
fit(model, data, 1, F.mse_loss, opt)


[ 0.796047  0.809999]


In [54]:
set_lrs(opt, 1e-3)

In [56]:
fit(model, data, 4, F.mse_loss, opt)

[ 0.491238  0.766061]


[ 0.463466  0.76654 ]


[ 0.450707  0.767772]


[ 0.471789  0.768434]


In [29]:
math.sqrt(0.75)

0.8660254037844386

### Mini net

In [348]:
class EmbeddingNet(nn.Module):
    def __init__(self, n_users, n_movies):
        super().__init__()
        (self.u, self.m) = [get_emb(*o) for o in [
            (n_users, n_factors), (n_movies, n_factors)]]
        self.lin1 = nn.Linear(n_factors*2, 10)
        self.lin2 = nn.Linear(10, 1)
        
    def forward(self, users, movies):
        x = F.dropout(torch.cat([self.u(users),self.m(movies)], dim=1), 0.75)
        x = F.dropout(F.relu(self.lin1(x)), 0.75)
        return F.sigmoid(self.lin2(x)) * (max_rating-min_rating+1) + min_rating-0.5

In [349]:
wd=5e-4
model = EmbeddingNet(n_users, n_movies).cuda()
opt = optim.Adam(model.parameters(), 1e-3, weight_decay=wd)

In [350]:
fit(model, TinyData(trn_dl,val_dl), 5, F.mse_loss, opt)

[ 0.831775  0.795582]


[ 0.827107  0.782025]


[ 0.781419  0.776098]


[ 0.766487  0.768581]


[ 0.782842  0.769452]


In [351]:
set_lrs(opt, 1e-4)

In [352]:
fit(model, TinyData(trn_dl,val_dl), 4, F.mse_loss, opt)

[ 0.639966  0.764555]


[ 0.66038   0.760858]


[ 0.651362  0.760858]


[ 0.657397  0.7612  ]


### From data frame

In [None]:
movie_df = movie_df[movie_df.movieId.isin(movies)]

In [36]:
data = LoaderModelData.from_data_frame(val_idxs, ratings, ['userId', 'movieId'], 'rating', 64)