In [1]:
import os

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.multiprocessing
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms
from torchvision.datasets import MNIST

torch.multiprocessing.set_sharing_strategy("file_system")


class MatrixFactorization(pl.LightningModule):
    def __init__(self, n_users, n_items, n_factors=40, dropout_p=0, sparse=False):
        """
        Parameters
        ----------
        n_users : int
            Number of users
        n_items : int
            Number of items
        n_factors : int
            Number of latent factors (or embeddings or whatever you want to
            call it).
        dropout_p : float
            p in nn.Dropout module. Probability of dropout.
        sparse : bool
            Whether or not to treat embeddings as sparse. NOTE: cannot use
            weight decay on the optimizer if sparse=True. Also, can only use
            Adagrad.
        """
        super().__init__()
        self.n_users = n_users
        self.n_items = n_items
        self.n_factors = n_factors
        self.user_biases = nn.Embedding(n_users, 1, sparse=sparse)
        self.item_biases = nn.Embedding(n_items, 1, sparse=sparse)
        self.bias = nn.Parameter(torch.rand(1))
        self.user_embeddings = nn.Embedding(n_users, n_factors, sparse=sparse)
        self.item_embeddings = nn.Embedding(n_items, n_factors, sparse=sparse)

        self.dropout_p = dropout_p
        self.dropout = nn.Dropout(p=self.dropout_p)

        self.sparse = sparse

    def forward(self, users, items):
        """
        Forward pass through the model. For a single user and item, this
        looks like:
        user_bias + item_bias + user_embeddings.dot(item_embeddings)
        Parameters
        ----------
        users : np.ndarray
            Array of user indices
        items : np.ndarray
            Array of item indices
        Returns
        -------
        preds : np.ndarray
            Predicted ratings.
        """
        ues = self.user_embeddings(users)
        uis = self.item_embeddings(items)

        preds = self.user_biases(users) + self.bias
        preds += self.item_biases(items)
        preds += torch.reshape(
            torch.diag(
                torch.matmul(
                    self.dropout(ues), torch.transpose(self.dropout(uis), 0, 1)
                )
            ),
            (-1, 1),
        )

        return torch.clip(preds.squeeze(), min=1, max=5)

    def training_step(self, batch, batch_idx):
        users, items, rating = batch
        rating = rating.to(torch.float32)
        output = self.forward(users, items)
        loss = F.mse_loss(rating, output)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(
            self.parameters(), lr=0.5, momentum=0.5, weight_decay=1e-3
        )  # learning rate
        return optimizer


class MlDataset(Dataset):
    def __init__(self, file_path: str):
        self.df = pd.read_csv(file_path, delimiter="\t", header=None)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        return self.df[0][index] - 1, self.df[1][index] - 1, self.df[2][index]


def eval_model(model, train_dataloader):
    loss = 0
    for users, items, rating in train_dataloader:
        pred = model(users, items)
        loss += F.mse_loss(pred, rating) ** 0.5
    avg_loss = loss / len(train_dataloader)
    print(f"avg rmse: {avg_loss}")


def run_pipeline():
    training_data = MlDataset("data/ml-100k/u1.base")
    validation_data = MlDataset("data/ml-100k/u1.test")
    batch_size = 256
    train_dataloader = DataLoader(
        training_data, batch_size=batch_size, shuffle=True, num_workers=10
    )
    validation_dataloader = DataLoader(
        validation_data, batch_size=batch_size, shuffle=False, num_workers=10
    )
    # https://files.grouplens.org/datasets/movielens/ml-100k-README.txt
    n_users = 943
    n_movies = 1682
    n_factors = 30
    model = MatrixFactorization(n_users=n_users, n_items=n_movies, n_factors=n_factors)
    trainer = pl.Trainer(gpus=1, max_epochs=100)
    trainer.fit(model, train_dataloader, validation_dataloader)
    print("Train loss")
    eval_model(model, train_dataloader)
    print("Validation loss")
    eval_model(model, validation_dataloader)


# if __name__ == "__main__":
    run_pipeline()
    # run_pipeline2()


In [2]:
run_pipeline()

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type      | Params
----------------------------------------------
0 | user_biases     | Embedding | 943   
1 | item_biases     | Embedding | 1.7 K 
2 | user_embeddings | Embedding | 28.3 K
3 | item_embeddings | Embedding | 50.5 K
4 | dropout         | Dropout   | 0     
----------------------------------------------
81.4 K    Trainable params
0         Non-trainable params
81.4 K    Total params
0.326     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Train loss




avg rmse: 0.9438539743423462
Validation loss
avg rmse: 0.9740405678749084


In [3]:
training_data = MlDataset("data/ml-100k/u1.base")
validation_data = MlDataset("data/ml-100k/u1.test")
batch_size = 256
train_dataloader = DataLoader(
    training_data, batch_size=batch_size, shuffle=True, num_workers=10
)
validation_dataloader = DataLoader(
    validation_data, batch_size=batch_size, shuffle=False, num_workers=10
)
# https://files.grouplens.org/datasets/movielens/ml-100k-README.txt
n_users = 943
n_movies = 1682
n_factors = 30
model = MatrixFactorization(n_users=n_users, n_items=n_movies, n_factors=n_factors)
trainer = pl.Trainer(gpus=1, max_epochs=10)
trainer.fit(model, train_dataloader, validation_dataloader)
print("Train loss")
eval_model(model, train_dataloader)
print("Validation loss")
eval_model(model, validation_dataloader)


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type      | Params
----------------------------------------------
0 | user_biases     | Embedding | 943   
1 | item_biases     | Embedding | 1.7 K 
2 | user_embeddings | Embedding | 28.3 K
3 | item_embeddings | Embedding | 50.5 K
4 | dropout         | Dropout   | 0     
----------------------------------------------
81.4 K    Trainable params
0         Non-trainable params
81.4 K    Total params
0.326     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Train loss
avg rmse: 0.9415215849876404
Validation loss
avg rmse: 0.9791469573974609


In [4]:
model

MatrixFactorization(
  (user_biases): Embedding(943, 1)
  (item_biases): Embedding(1682, 1)
  (user_embeddings): Embedding(943, 30)
  (item_embeddings): Embedding(1682, 30)
  (dropout): Dropout(p=0, inplace=False)
)

In [6]:
model.item_embeddings

Embedding(1682, 30)

In [9]:
model.item_embeddings(torch.tensor(0))

tensor([ 0.0280, -0.0813,  0.0552,  0.1535, -0.0859, -0.0521, -0.0089, -0.0522,
         0.0072, -0.0166, -0.0261,  0.0441, -0.0774, -0.0250,  0.0073, -0.1266,
        -0.0863,  0.0011,  0.0418, -0.0568,  0.0407,  0.0201,  0.0815, -0.0211,
         0.0609,  0.0303, -0.0053,  0.0176,  0.0038,  0.0171],
       grad_fn=<EmbeddingBackward>)

In [15]:
import numpy as np
items = np.array(model.item_embeddings.weight.data)

In [53]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
nbrs = NearestNeighbors(n_neighbors=10, algorithm='ball_tree').fit(items)
distances, indices = nbrs.kneighbors(items[0].reshape(1, -1))

In [54]:
indices

array([[   0,  164, 1680, 1374,  318, 1593,  403, 1676,  512, 1145]])

In [55]:
item_df = pd.read_csv("data/ml-100k/u.item", delimiter="|", header=None, encoding='latin-1')

In [56]:
item_df[1][indices.reshape(-1)]

0                      Toy Story (1995)
164             Jean de Florette (1986)
1680                You So Crazy (1994)
1374          Cement Garden, The (1993)
318     Everyone Says I Love You (1996)
1593                     Everest (1998)
403                    Pinocchio (1940)
1676               Sweet Nothing (1995)
512               Third Man, The (1949)
1145               Calendar Girl (1993)
Name: 1, dtype: object

In [37]:
item_df[1][indices.reshape(-1)]

227        Star Trek: The Wrath of Khan (1982)
98      Snow White and the Seven Dwarfs (1937)
1173                             Caught (1996)
1410                         Barbarella (1968)
754                             Jumanji (1995)
1329            An Unforgettable Summer (1994)
697               Browning Version, The (1994)
488                           Notorious (1946)
425        Transformers: The Movie, The (1986)
538                          Mouse Hunt (1997)
Name: 1, dtype: object

In [30]:
item_map = {a: b for a, b in item_df[[0, 1]].values}

In [32]:
item_map

{1: 'Toy Story (1995)',
 2: 'GoldenEye (1995)',
 3: 'Four Rooms (1995)',
 4: 'Get Shorty (1995)',
 5: 'Copycat (1995)',
 6: 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
 7: 'Twelve Monkeys (1995)',
 8: 'Babe (1995)',
 9: 'Dead Man Walking (1995)',
 10: 'Richard III (1995)',
 11: 'Seven (Se7en) (1995)',
 12: 'Usual Suspects, The (1995)',
 13: 'Mighty Aphrodite (1995)',
 14: 'Postino, Il (1994)',
 15: "Mr. Holland's Opus (1995)",
 16: 'French Twist (Gazon maudit) (1995)',
 17: 'From Dusk Till Dawn (1996)',
 18: 'White Balloon, The (1995)',
 19: "Antonia's Line (1995)",
 20: 'Angels and Insects (1995)',
 21: 'Muppet Treasure Island (1996)',
 22: 'Braveheart (1995)',
 23: 'Taxi Driver (1976)',
 24: 'Rumble in the Bronx (1995)',
 25: 'Birdcage, The (1996)',
 26: 'Brothers McMullen, The (1995)',
 27: 'Bad Boys (1995)',
 28: 'Apollo 13 (1995)',
 29: 'Batman Forever (1995)',
 30: 'Belle de jour (1967)',
 31: 'Crimson Tide (1995)',
 32: 'Crumb (1994)',
 33: 'Desperado (1995)',
 34: '

In [44]:
print([a for _, a in item_map.items() if 'Lord' in a])

['Lord of Illusions (1995)', 'Little Lord Fauntleroy (1936)']


In [57]:
item_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
!ls data/ml-100k/

allbut.pl  u1.base  u2.test  u4.base  u5.test  ub.base	u.genre  u.occupation
mku.sh	   u1.test  u3.base  u4.test  ua.base  ub.test	u.info	 u.user
README	   u2.base  u3.test  u5.base  ua.test  u.data	u.item


In [59]:
! cat data/ml-100k/u.genre

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4
Comedy|5
Crime|6
Documentary|7
Drama|8
Fantasy|9
Film-Noir|10
Horror|11
Musical|12
Mystery|13
Romance|14
Sci-Fi|15
Thriller|16
War|17
Western|18



In [65]:
genre_df = item_df[range(5, 24)]

In [63]:
sparse=torch.zeros([30000])
indices = torch.LongTensor([1,3,4,6])
values = torch.FloatTensor([1,1,1,1])
sparse[indices]=values

In [64]:
sparse

tensor([0., 1., 0.,  ..., 0., 0., 0.])

In [74]:
offset = 10
[offset + i for i, val in enumerate(genre_df.iloc[0].tolist()) if val == 1]

[13, 14, 15]

In [75]:
i = [[0, 1, 1],
         [2, 0, 2]]
v =  [3, 4, 5]
s = torch.sparse_coo_tensor(i, v, (2, 3))
s

tensor(indices=tensor([[0, 1, 1],
                       [2, 0, 2]]),
       values=tensor([3, 4, 5]),
       size=(2, 3), nnz=3, layout=torch.sparse_coo)

In [79]:
i = [[0, 1]]
v = [1, 1]
s = torch.sparse_coo_tensor(i, v, (4,))


In [80]:
s

tensor(indices=tensor([[0, 1]]),
       values=tensor([1, 1]),
       size=(4,), nnz=2, layout=torch.sparse_coo)

In [81]:
s.to_dense()

tensor([1, 1, 0, 0])