In [1]:
# https://github.com/facebookresearch/dlrm
# https://github.com/pytorch/torchrec/blob/main/torchrec/models/dlrm.py
# https://github.com/PaddlePaddle/PaddleRec/tree/master/models/rank/dlrm
# https://github.com/alibaba/easyrec
# https://nvidia-merlin.github.io/NVTabular/stable/examples/01-Getting-started.html#movielens25m
# https://github.com/NVIDIA-Merlin/models/blob/eb1e54196a64a70950b2a7e7744d2150e052d53e/examples/01-Getting-started.ipynb#L52
# https://github.com/NVIDIA-Merlin/models/blob/eb1e54196a64a70950b2a7e7744d2150e052d53e/merlin/models/torch/models/ranking.py
# https://github.com/NVIDIA-Merlin/Merlin

# datasets
#https://github.com/caserec/Datasets-for-Recommender-Systems

# datasets -- movielens
# https://www.kaggle.com/code/shihabshahriar/pytorch-movielens
# https://github.com/pytorch/torchrec/blob/main/torchrec/datasets/movielens.py
# https://pureai.substack.com/p/recommender-systems-with-pytorch
# https://nvidia-merlin.github.io/Merlin/stable/examples/getting-started-movielens/03-Training-with-PyTorch.html
# https://nvidia-merlin.github.io/NVTabular/v0.7.0/examples/getting-started-movielens/03-Training-with-PyTorch.html

# https://github.com/ycjuan/kaggle-2014-criteo
# https://www.kaggle.com/c/criteo-display-ad-challenge/data
# https://github.com/pytorch/torchrec/blob/main/torchrec/datasets/criteo.py

# datasets smaller
# https://www.kaggle.com/datasets/leonerd/criteo-small


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

In [3]:
class MLP(nn.Module):
    def __init__(self, num_inp, num_outp):
        super().__init__()

        self.linear = nn.Linear(num_inp, num_inp*3)
        self.act = nn.GELU()
        self.linear_final = nn.Linear(num_inp*3, num_outp)
        

    def forward(self, inp):
        inp = self.act(self.linear(inp))
        inp = self.act(self.linear_final(inp))
        
        return inp

class MyDLRM(nn.Module):
    def __init__(self, emb_dim, num_dense_features, num_sparse_features, sparse_features_categories, num_output_categories):
        super().__init__()
        self.emb_dim = emb_dim # embedding dim
        self.num_sparse_features = num_sparse_features
        self.num_dense_features = num_dense_features
        if num_dense_features == 0:
            self.has_dense_feature = 0
        else:
            self.has_dense_feature = 0
            
        # dense feature MLP
        if num_dense_features:
            self.dense_mlp = MLP(num_dense_features, emb_dim)

        # sparse feature embedding
        self.sparse_emb_list = nn.ModuleList()
        for i in range(num_sparse_features):
            self.sparse_emb_list.append(nn.Embedding(sparse_features_categories[i], emb_dim))

        # top MLP's input == sparse interaction feature + 1 of dense feature
        self.top_mlp = MLP((num_sparse_features+self.has_dense_feature)*(num_sparse_features+self.has_dense_feature-1)//2 + num_dense_features * emb_dim, num_output_categories)

        # trilu_indices
        self.register_buffer(
            "triu_indices",
            torch.triu_indices(self.num_sparse_features + self.has_dense_feature, self.num_sparse_features + self.has_dense_feature, offset=1),
            persistent=False,
        )

    def forward(self, inp_dense, inp_sparse):
        # dense feature go through MLP
        if self.num_dense_features:
            dense_emb = self.dense_mlp(inp_dense)[:,None,...]

        # sparse feature go through embedding
        sparse_emb_list = []
        for i in range(self.num_sparse_features):
            sparse_emb_list.append(self.sparse_emb_list[i](inp_sparse[:,i][:,None,...]))

        sparse_emb = torch.cat(sparse_emb_list, dim = 1)
        
        # concat dense_emb and sparse_emb
        if self.num_dense_features:
            output = torch.cat([dense_emb, sparse_emb], dim = 1)
        else:
            output = sparse_emb

        # concated embeddings go through interaction of matrix multiplication (batched)
        output = output @ output.transpose(-2,-1)

        # take the interaction between any two features by the upper tril
        # option 1: use torch.trilu_indices (https://github.com/pytorch/torchrec/blob/main/torchrec/models/dlrm.py)
        output = output[:, self.triu_indices[0], self.triu_indices[1]]
        # option 2: use calculated indices (https://github.com/facebookresearch/dlrm/blob/main/dlrm_s_pytorch.py)
        #li = torch.tensor([i for i in range(ni) for j in range(i + offset)])
        #lj = torch.tensor([j for i in range(nj) for j in range(i + offset)])

        # cat the interaction output with dense embedding again
        if self.num_dense_features:
            output = torch.cat([dense_emb[:,-1]] + [output], dim = 1)
        
        
        # go through final top_mlp
        output = self.top_mlp(output)

        # For CTR, clamp between 0~1 with Sigmoid, then BCDLoss for loss function
        # For regression task like rating prediction, it's not needed
        #output = nn.Sigmoid()(output)

        return output

In [4]:
#!wget https://files.grouplens.org/datasets/movielens/ml-20m.zip
#!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
#!unzip ml-20m.zip
#!unzip ml-latest-small.zip

In [5]:
df_rating = pd.read_csv("./ml-latest-small/ratings.csv")
#df_rating = pd.read_csv("./ml-20m/ratings.csv")

In [6]:
df_rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [7]:
df_rating.rating.unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [8]:
rating_range = df_rating.rating.max() - df_rating.rating.min()

In [9]:
df_movies = pd.read_csv("./ml-latest-small/movies.csv")
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [10]:
df = df_rating.merge(df_movies, on = "movieId")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [11]:
df["movieCat"] = df.movieId.astype("category").cat.codes
df["userCat"] = df.userId.astype("category").cat.codes
df["genresCat"] = df.genres.astype("category").cat.codes
#df["movieCat"] = df.movieId
#df["userCat"] = df.movieId

df["timestamp_norm"] = (df.timestamp - df.timestamp.mean())/df.timestamp.std()
#df["rating_norm"] = (df.rating - df.rating.mean())/df.rating.std()
#df["rating_norm"] = (df.rating - (rating_range/2))/rating_range
df["rating_norm"] = df.rating
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,movieCat,userCat,genresCat,timestamp_norm,rating_norm
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,351,-1.114225,4.0
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,4,351,-1.65777,4.0
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,6,351,-0.459214,4.5
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,14,351,1.40863,2.5
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,16,351,0.46125,4.5


In [12]:
from torch.utils.data import Dataset, DataLoader
class MovieLensDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        timestamp_data = float(self.df["timestamp_norm"].iloc[idx])  # float is expected by linear layer. avoid "RuntimeError: mat1 and mat2 must have the same dtype, but got Double and Float"
        movie_data = int(self.df["movieCat"].iloc[idx])
        user_data = int(self.df["userCat"].iloc[idx])
        genres_data = int(self.df["genresCat"].iloc[idx])
        rating_data = float(self.df["rating_norm"].iloc[idx])

        dense_data = torch.tensor([timestamp_data])
        sparse_data = torch.tensor([user_data, movie_data, genres_data])
        target_data = torch.tensor(rating_data)

        return dense_data, sparse_data, target_data


In [13]:
train_count = len(df)*9//10
len(df.iloc[:train_count])

90752

In [14]:
# fixed split
#movielensDataset_train = MovieLensDataset(df.iloc[:train_count])
#movielensDataset_test = MovieLensDataset(df.iloc[train_count:-1])

# random split
df_train=df.sample(frac=0.8,random_state=200)
df_test=df.drop(df_train.index)
movielensDataset_train = MovieLensDataset(df_train)
movielensDataset_test = MovieLensDataset(df_test)

movielensDataset_test[0]

(tensor([1.4086]), tensor([ 14,   0, 351]), tensor(2.5000))

In [15]:
movielensDataloader_train = DataLoader(movielensDataset_train, batch_size=512, shuffle=True)
movielensDataloader_test = DataLoader(movielensDataset_test, batch_size=512, shuffle=True)

dense_data, sparse_data, target_data = next(iter(movielensDataloader_test))
dense_data.shape, sparse_data.shape, target_data.shape

(torch.Size([512, 1]), torch.Size([512, 3]), torch.Size([512]))

In [16]:
max(df["movieId"])

193609

In [17]:
len(df["movieId"].unique())

9724

In [18]:
max(df["userId"])

610

In [19]:
len(df["userId"].unique())

610

In [20]:
len(df)

100836

In [21]:
len(df["movieCat"].unique())

9724

In [22]:
device = "cuda"
# only train with sparse features (user and movie)
#model = MyDLRM( emb_dim=64, num_dense_features=0, num_sparse_features=3, sparse_features_categories=[len(df["userCat"].unique()),len(df["movieCat"].unique()), len(df["genresCat"].unique())], num_output_categories=10).to(device)
# train with sparse features (user and movie) and dense feature (timestamp)
model = MyDLRM( emb_dim=64, num_dense_features=1, num_sparse_features=3, sparse_features_categories=[len(df["userCat"].unique()),len(df["movieCat"].unique()), len(df["genresCat"].unique())], num_output_categories=10).to(device)

In [23]:
lossFunc = F.mse_loss
#lossFunc = nn.L1Loss()
#optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_loop():
    model.train()
    
    step = 0
    losses = 0
    for dense_data, sparse_data, target_data in movielensDataloader_train:
        outp = model(dense_data.to(device), sparse_data.to(device))
        #use the mean of the output
        outp = outp.mean(dim = 1, keepdim=True)
        loss = lossFunc(outp, target_data[...,None].to(device))
        loss.backward()
    
        with torch.no_grad():
            optimizer.step()
            optimizer.zero_grad()
            
        step+=1
        losses += loss.item()
        #if step % 100 == 0:
            #print(loss)
    print(f"train epoch loss: {losses/step}")

@torch.no_grad()
def eval_loop():
    model.eval()
    step = 0
    losses = 0
    for dense_data, sparse_data, target_data in movielensDataloader_test:
        outp = model(dense_data.to(device), sparse_data.to(device))
        #use the mean of the output
        outp = outp.mean(dim = 1, keepdim=True)
        loss = lossFunc(outp, target_data[...,None].to(device))
            
        step+=1
        losses += loss.item()
        if step % 100 == 0:
            print(loss)
    print(f"eval epoch loss: {losses/step}")
    
for i in range(10):
    print(f'epoch #{i}')
    train_loop()
    eval_loop()

epoch #0
train epoch loss: 2.836675643166409
eval epoch loss: 1.098043218255043
epoch #1
train epoch loss: 1.0884144257141064
eval epoch loss: 1.0873429104685783
epoch #2
train epoch loss: 1.0845518078230605
eval epoch loss: 1.083783385157585
epoch #3
train epoch loss: 1.0750988110711304
eval epoch loss: 1.067153199017048
epoch #4
train epoch loss: 1.0491753529898729
eval epoch loss: 1.0342456743121147
epoch #5
train epoch loss: 1.0130420942095262
eval epoch loss: 1.0071798026561738
epoch #6
train epoch loss: 0.9777060034154337
eval epoch loss: 0.9849074944853783
epoch #7
train epoch loss: 0.9374102078661134
eval epoch loss: 0.9483043819665908
epoch #8
train epoch loss: 0.8972101867953434
eval epoch loss: 0.9208995923399925
epoch #9
train epoch loss: 0.8592976520333109
eval epoch loss: 0.9041521430015564


In [33]:
movielensDataloader_test = DataLoader(movielensDataset_test, batch_size=64, shuffle=True)

dense_data, sparse_data, target_data = next(iter(movielensDataloader_test))

outp=model(dense_data.to(device), sparse_data.to(device))
outp = outp.mean(dim = 1, keepdim=True)

#print(outp.squeeze())
#print(target_data)
loss = lossFunc(outp, target_data[...,None].to(device))
print(loss)

[(outp[i].item(), target_data[i].item()) for i in range(outp.shape[0])]

tensor(0.9154, device='cuda:0', grad_fn=<MseLossBackward0>)


[(3.533228635787964, 3.5),
 (3.1699705123901367, 4.0),
 (3.2973134517669678, 2.0),
 (3.192746162414551, 3.0),
 (3.3784687519073486, 1.0),
 (4.250201225280762, 4.0),
 (3.8495118618011475, 2.5),
 (3.2597832679748535, 3.0),
 (3.7079074382781982, 1.0),
 (4.722701549530029, 4.5),
 (4.178431034088135, 4.0),
 (3.866569995880127, 3.0),
 (3.2814903259277344, 4.0),
 (3.9912049770355225, 4.0),
 (3.644556760787964, 4.0),
 (3.689218282699585, 5.0),
 (3.3449249267578125, 4.0),
 (3.73606538772583, 3.5),
 (2.7740910053253174, 4.0),
 (2.129265308380127, 1.5),
 (4.149108409881592, 4.0),
 (4.075967311859131, 4.0),
 (3.655681610107422, 2.5),
 (4.211141109466553, 5.0),
 (3.8193652629852295, 4.0),
 (2.665369749069214, 3.5),
 (4.260011196136475, 4.5),
 (3.2092621326446533, 3.0),
 (3.694141387939453, 3.0),
 (2.4285762310028076, 3.0),
 (3.756079912185669, 5.0),
 (3.7829196453094482, 4.5),
 (3.7952568531036377, 4.0),
 (3.997725009918213, 4.0),
 (3.264937162399292, 3.5),
 (3.8302018642425537, 3.0),
 (2.662503004