In [1]:
from fastai.collab import *
from fastai.tabular.all import *
path = untar_data(URLs.ML_100k)

In [3]:
path, path.ls()

(Path('/Users/sagartr/.fastai/data/ml-100k'),
 (#23) [Path('/Users/sagartr/.fastai/data/ml-100k/u.item'),Path('/Users/sagartr/.fastai/data/ml-100k/u3.test'),Path('/Users/sagartr/.fastai/data/ml-100k/u1.base'),Path('/Users/sagartr/.fastai/data/ml-100k/u.info'),Path('/Users/sagartr/.fastai/data/ml-100k/u2.test'),Path('/Users/sagartr/.fastai/data/ml-100k/u5.test'),Path('/Users/sagartr/.fastai/data/ml-100k/u.genre'),Path('/Users/sagartr/.fastai/data/ml-100k/ub.test'),Path('/Users/sagartr/.fastai/data/ml-100k/ua.base'),Path('/Users/sagartr/.fastai/data/ml-100k/u.data')...])

In [4]:
ratings = pd.read_csv(path/'u.data', delimiter='\t',header=None,
                      names=['user', 'movie', 'rating', 'timestamp'])
ratings.head()

Unnamed: 0,user,movie,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
movies= pd.read_csv(path/'u.item',  delimiter='|',encoding='latin-1',
                    usecols=(0,1),names=('movie', 'title'), header=None)
movies.head()

Unnamed: 0,movie,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
ratings = ratings.merge(movies)
ratings.head()

Unnamed: 0,user,movie,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [10]:
dls = CollabDataLoaders.from_df(ratings, user_name='user', item_name='title', rating_name='rating', bs=64)
dls.show_batch()

Unnamed: 0,user,title,rating
0,568,Jaws (1975),3
1,121,"Graduate, The (1967)",4
2,443,"Saint, The (1997)",4
3,67,Phenomenon (1996),4
4,625,"Princess Bride, The (1987)",3
5,871,Terminator 2: Judgment Day (1991),5
6,622,Dumb & Dumber (1994),4
7,367,Psycho (1960),5
8,145,Mr. Wrong (1996),1
9,538,Sleepless in Seattle (1993),2


In [12]:
embs = get_emb_sz(dls)
embs

[(944, 74), (1665, 102)]

In [16]:
class DotProd(Module):
    def __init__(self, n_users, n_movies, n_internal=100, y_limit=(0,5.5)):  

        self.user_embd = Embedding(*n_users)
        self.movie_embd = Embedding(*n_movies)

        self.net = nn.Sequential(
            nn.Linear(n_users[1] + n_movies[1], n_internal),
            nn.ReLU(),
            nn.Linear(n_internal, 1)
        )

        self.y_range = y_limit

    def forward(self, x):
        x = self.user_embd(x[:,0]),  self.movie_embd(x[:,1])
        x = torch.cat(x, dim=1)
        return sigmoid_range(self.net(x), *self.y_range)

In [17]:
model = DotProd(*embs)

In [19]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-3, wd=0.1)

epoch,train_loss,valid_loss,time
0,0.840819,0.911101,01:37
1,0.820994,0.935597,01:04
2,0.772349,0.904178,01:06
3,0.763198,0.874487,01:26
4,0.723924,0.866866,00:54


In [39]:
class DotProduct(Module):
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,  5.5)):
        self.user_factors =  Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

        self.movie_bias = Embedding(n_movies,1)
        self.user_bias = Embedding(n_users, 1)

        self.y_range = y_range
    
    def forward(self, x):
        users = self.user_factors(x[:,0])
        movies = self.movie_factors(x[:,1])
        
        res = (users * movies).sum(dim=1, keepdim=True)
        #print(res.shape)
        res += self.user_bias(x[:,0]) + self.movie_bias(x[:,1])
        #print(res.shape)

        return sigmoid_range(res, *self.y_range)
    
n_users = len(dls.classes['user'])
n_movies = len(dls.classes['title'])
n_users,n_movies   
model = DotProduct(n_users, n_movies, 50)


In [41]:
learn = Learner(dls, model, loss_func=MSELossFlat())
learn.fit_one_cycle(5, 5e-5)

epoch,train_loss,valid_loss,time
0,1.888585,1.847053,00:29
1,1.833589,1.79215,01:07
2,1.790039,1.75022,01:08
3,1.747786,1.729761,00:43
4,1.741416,1.726411,00:45


Does sorting the movie biases give the same result as averaging overall movie ratings by movie? Why/why not?

In [25]:
ratings.groupby('title')['rating'].mean()

title
'Til There Was You (1997)                2.333333
1-900 (1994)                             2.600000
101 Dalmatians (1996)                    2.908257
12 Angry Men (1957)                      4.344000
187 (1997)                               3.024390
                                           ...   
Young Guns II (1990)                     2.772727
Young Poisoner's Handbook, The (1995)    3.341463
Zeus and Roxanne (1997)                  2.166667
unknown                                  3.444444
Á köldum klaka (Cold Fever) (1994)       3.000000
Name: rating, Length: 1664, dtype: float64

In [29]:
mov = ratings[ratings['title'] == 'Young Guns II (1990)']['rating']
mov.mean()

2.772727272727273