In [1]:
import os

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.multiprocessing
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

torch.multiprocessing.set_sharing_strategy("file_system")

NUM_USERS = 943
NUM_MOVIES = 1682
NUM_GENRES = 19


class FactorizationMachine(pl.LightningModule):
    def __init__(self, field_dims, num_factors):
        super(FactorizationMachine, self).__init__()
        num_inputs = int(sum(field_dims))
        self.embedding = nn.Parameter(
            torch.randn(num_inputs, num_factors), requires_grad=True
        )
        self.linear_layer = nn.Linear(num_inputs, 1, bias=True)

    def forward(self, x):
        out_1 = torch.matmul(x, self.embedding).pow(2).sum(1, keepdim=True)  # S_1^2
        out_2 = torch.matmul(x.pow(2), self.embedding.pow(2)).sum(
            1, keepdim=True
        )

        out_inter = 0.5 * (out_1 - out_2)
        out_lin = self.linear_layer(x)
        out = out_inter + out_lin

        return torch.clip(out.squeeze(), min=1, max=5)

    def training_step(self, batch, batch_idx):
        inputs, rating = batch
        rating = rating.to(torch.float32)
        output = self.forward(inputs)
        loss = F.mse_loss(rating, output)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(
            self.parameters(), lr=0.5, momentum=0.5, weight_decay=1e-3
        )  # learning rate
        return optimizer


class MlDataset(Dataset):
    def __init__(self, file_path: str, field_dims=[943, 1682, 19]):
        self.df = pd.read_csv(file_path, delimiter="\t", header=None)
        self.genre_df = pd.read_csv(
            "data/ml-100k/u.item", delimiter="|", header=None, encoding="latin-1"
        )[range(5, 24)]
        self.field_dims = field_dims

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        # movie_onehot, user_onehot, genre_onehot
        user_index = self.df[0][index] - 1
        item_index = self.df[1][index] - 1
        user_index_offset = NUM_MOVIES + user_index
        genre_osset = NUM_MOVIES + NUM_USERS
        genre_indices = [
            genre_osset + i
            for i, val in enumerate(self.genre_df.iloc[item_index].tolist())
            if val == 1
        ]
        inputs = [item_index, user_index_offset] + genre_indices

        sparse = torch.zeros([NUM_MOVIES + NUM_USERS + NUM_GENRES], dtype=torch.float)
        indices = torch.LongTensor([inputs])
        values = torch.ones_like(torch.tensor((len(inputs),)), dtype=torch.float)
        sparse[indices] = values

        return sparse, self.df[2][index]


def eval_model(model, train_dataloader):
    loss = 0
    for inputs, rating in train_dataloader:
        pred = model(inputs)
        loss += F.mse_loss(pred, rating)
    avg_loss = loss / len(train_dataloader)
    print(f"avg rmse: {avg_loss}")


# if __name__ == "__main__":
#     run_pipeline()
    # run_pipeline2()


In [28]:
training_data = MlDataset("data/ml-100k/u1.base")
validation_data = MlDataset("data/ml-100k/u1.test")
batch_size = 256
train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, num_workers=10
)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False, num_workers=10
)



# https://files.grouplens.org/datasets/movielens/ml-100k-README.txt
n_factors = 5
model = FactorizationMachine(
    field_dims=[NUM_USERS, NUM_MOVIES, NUM_GENRES], num_factors=n_factors
)
trainer = pl.Trainer(gpus=1, max_epochs=80)
trainer.fit(model, train_dataloader, validation_dataloader)
print("Train loss")
eval_model(model, train_dataloader)
print("Validation loss")
eval_model(model, validation_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type   | Params
----------------------------------------
0 | linear_layer | Linear | 2.6 K 
----------------------------------------
15.9 K    Trainable params
0         Non-trainable params
15.9 K    Total params
0.063     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Train loss
avg rmse: 0.8344513177871704
Validation loss
avg rmse: 0.9350762963294983


In [29]:
import numpy as np
items = np.array(model.embedding.detach().numpy()[:NUM_MOVIES])
item_df = pd.read_csv("data/ml-100k/u.item", delimiter="|", header=None, encoding='latin-1')

In [30]:
#@title Solution
DOT = 'dot'
COSINE = 'cosine'
def compute_scores(query_embedding, item_embeddings, measure=DOT):
  """Computes the scores of the candidates given a query.
  Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding
      of item i.
    measure: a string specifying the similarity measure to be used. Can be
      either DOT or COSINE.
  Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
  """
  u = query_embedding
  V = item_embeddings
  if measure == COSINE:
    V = V / np.linalg.norm(V, axis=1, keepdims=True)
    u = u / np.linalg.norm(u)
  scores = u.dot(V.T)
  return scores

In [31]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
movie_index = 227
distances = compute_scores(items[movie_index, :], items, measure=COSINE)
indices = (-distances).argsort()[:10]
item_df[1][indices.reshape(-1)]


227         Star Trek: The Wrath of Khan (1982)
290                       Absolute Power (1997)
519                    Great Escape, The (1963)
600              For Whom the Bell Tolls (1943)
549           Die Hard: With a Vengeance (1995)
321                       Murder at 1600 (1997)
490        Adventures of Robin Hood, The (1938)
158                       Basic Instinct (1992)
529           Man Who Would Be King, The (1975)
518    Treasure of the Sierra Madre, The (1948)
Name: 1, dtype: object

379                 Star Trek: Generations (1994)
721                            Nine Months (1995)
803                        Jimmy Hollywood (1994)
1650                 Spanish Prisoner, The (1997)
1387                                Gabbeh (1996)
347                     Desperate Measures (1998)
1014                                Shiloh (1997)
452                               Jaws 3-D (1983)
1348                       Mille bolle blu (1993)
211     Unbearable Lightness of Being, The (1988)
Name: 1, dtype: object

In [49]:
item_df[1][indices.reshape(-1)]

0                  Toy Story (1995)
521              Down by Law (1986)
792                 Crooklyn (1994)
210                  M*A*S*H (1970)
611             Lost Horizon (1937)
1079     Celestial Clockwork (1994)
1456    Love Is All There Is (1996)
515               Local Hero (1983)
388             Black Beauty (1994)
788     Swimming with Sharks (1995)
Name: 1, dtype: object

In [50]:
item_map = {a: b for a, b in item_df[[0, 1]].values}

In [51]:
item_map

{1: 'Toy Story (1995)',
 2: 'GoldenEye (1995)',
 3: 'Four Rooms (1995)',
 4: 'Get Shorty (1995)',
 5: 'Copycat (1995)',
 6: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 7: 'Twelve Monkeys (1995)',
 8: 'Babe (1995)',
 9: 'Dead Man Walking (1995)',
 10: 'Richard III (1995)',
 11: 'Seven (Se7en) (1995)',
 12: 'Usual Suspects, The (1995)',
 13: 'Mighty Aphrodite (1995)',
 14: 'Postino, Il (1994)',
 15: "Mr. Holland's Opus (1995)",
 16: 'French Twist (Gazon maudit) (1995)',
 17: 'From Dusk Till Dawn (1996)',
 18: 'White Balloon, The (1995)',
 19: "Antonia's Line (1995)",
 20: 'Angels and Insects (1995)',
 21: 'Muppet Treasure Island (1996)',
 22: 'Braveheart (1995)',
 23: 'Taxi Driver (1976)',
 24: 'Rumble in the Bronx (1995)',
 25: 'Birdcage, The (1996)',
 26: 'Brothers McMullen, The (1995)',
 27: 'Bad Boys (1995)',
 28: 'Apollo 13 (1995)',
 29: 'Batman Forever (1995)',
 30: 'Belle de jour (1967)',
 31: 'Crimson Tide (1995)',
 32: 'Crumb (1994)',
 33: 'Desperado (1995)',
 34: '

In [60]:
print([(i, a) for i, a in item_map.items() if 'Star' in a])

[(50, 'Star Wars (1977)'), (62, 'Stargate (1994)'), (124, 'Lone Star (1996)'), (146, 'Unhook the Stars (1996)'), (222, 'Star Trek: First Contact (1996)'), (227, 'Star Trek VI: The Undiscovered Country (1991)'), (228, 'Star Trek: The Wrath of Khan (1982)'), (229, 'Star Trek III: The Search for Spock (1984)'), (230, 'Star Trek IV: The Voyage Home (1986)'), (271, 'Starship Troopers (1997)'), (380, 'Star Trek: Generations (1994)'), (449, 'Star Trek: The Motion Picture (1979)'), (450, 'Star Trek V: The Final Frontier (1989)'), (1061, 'Evening Star, The (1996)'), (1068, "Star Maker, The (Uomo delle stelle, L') (1995)"), (1265, 'Star Maps (1997)'), (1293, 'Star Kid (1997)'), (1464, 'Stars Fell on Henrietta, The (1995)'), (1545, 'Frankie Starlight (1995)')]


In [29]:
item_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
!ls data/ml-100k/

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [31]:
! cat data/ml-100k/u.genre

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4
Comedy|5
Crime|6
Documentary|7
Drama|8
Fantasy|9
Film-Noir|10
Horror|11
Musical|12
Mystery|13
Romance|14
Sci-Fi|15
Thriller|16
War|17
Western|18



In [32]:
genre_df = item_df[range(5, 24)]

In [33]:
sparse=torch.zeros([30000])
indices = torch.LongTensor([1,3,4,6])
values = torch.FloatTensor([1,1,1,1])
sparse[indices]=values

In [34]:
sparse

tensor([0., 1., 0.,  ..., 0., 0., 0.])

In [35]:
offset = 10
[offset + i for i, val in enumerate(genre_df.iloc[0].tolist()) if val == 1]

[13, 14, 15]

In [36]:
i = [[0, 1, 1],
         [2, 0, 2]]
v =  [3, 4, 5]
s = torch.sparse_coo_tensor(i, v, (2, 3))
s

tensor(indices=tensor([[0, 1, 1],
                       [2, 0, 2]]),
       values=tensor([3, 4, 5]),
       size=(2, 3), nnz=3, layout=torch.sparse_coo)

In [37]:
i = [[0, 1]]
v = [1, 1]
s = torch.sparse_coo_tensor(i, v, (4,))


In [80]:
s

tensor(indices=tensor([[0, 1]]),
       values=tensor([1, 1]),
       size=(4,), nnz=2, layout=torch.sparse_coo)

In [81]:
s.to_dense()

tensor([1, 1, 0, 0])

In [32]:
!ls data/ml-100k


allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [33]:
!cat data/ml-100k/u.user

1|24|M|technician|85711
2|53|F|other|94043
3|23|M|writer|32067
4|24|M|technician|43537
5|33|F|other|15213
6|42|M|executive|98101
7|57|M|administrator|91344
8|36|M|administrator|05201
9|29|M|student|01002
10|53|M|lawyer|90703
11|39|F|other|30329
12|28|F|other|06405
13|47|M|educator|29206
14|45|M|scientist|55106
15|49|F|educator|97301
16|21|M|entertainment|10309
17|30|M|programmer|06355
18|35|F|other|37212
19|40|M|librarian|02138
20|42|F|homemaker|95660
21|26|M|writer|30068
22|25|M|writer|40206
23|30|F|artist|48197
24|21|F|artist|94533
25|39|M|engineer|55107
26|49|M|engineer|21044
27|40|F|librarian|30030
28|32|M|writer|55369
29|41|M|programmer|94043
30|7|M|student|55436
31|24|M|artist|10003
32|28|F|student|78741
33|23|M|student|27510
34|38|F|administrator|42141
35|20|F|homemaker|42459
36|19|F|student|93117
37|23|M|student|55105
38|28|F|other|54467
39|41|M|entertainment|01040
40|38|M|scientist|27514
41|33|M|engineer|80525
42|30|M|administrator|1787