## Steps
1. Reproduce the Kaggle notebook: https://www.kaggle.com/code/jamesloy/deep-learning-based-recommender-systems
2. Recreate all models in neural collaborative filtering

In [1]:
import os

# Switch to the repository root so the relative "data/..." path read later resolves.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/Users/yenchenchou/Documents/GitHub/mle-career-path")

In [2]:
import random
import polars as pl
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

In [3]:
def available_device() -> torch.device:
    """Pick the compute device, preferring MPS, then CUDA, then CPU.

    The chosen device is printed as ``Device: <name>``.

    Returns:
        The selected device, always as a ``torch.device`` (the CPU fallback
        included) so callers get one consistent type.
    """
    if torch.backends.mps.is_available():
        name = "mps"
    elif torch.cuda.is_available():
        name = "cuda"
    else:
        name = "cpu"
    device = torch.device(name)
    print(f"Device: {device}")
    return device


def fix_seed(seed: int) -> None:
    """Seed the torch, ``random`` and NumPy generators for reproducible runs.

    Args:
        seed: Value used to seed every generator.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    if torch.cuda.is_available():
        # Both CUDA seeding functions require the seed argument; calling them
        # without it raises TypeError.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

In [4]:
# Choose the compute device and seed the torch/random/NumPy generators up front.
device = available_device()
fix_seed(seed=5432)

Device: mps


In [79]:
class MovieLensPrep:
    """Load MovieLens ratings, split them per user and build training samples.

    Calling an instance runs the whole pipeline: load, sort, leave-one-out
    split, then negative sampling on the training split.
    """

    def __init__(self):
        # Set by load_df(); every other method expects it to be populated.
        self.df = None

    def load_df(self, **plkwargs) -> None:
        """Read ``data/ml-1m/ratings.dat`` into ``self.df`` as a polars frame.

        Args:
            **plkwargs: Extra keyword arguments forwarded to
                ``pl.from_pandas``.
        """
        self.df = pl.from_pandas(
            pd.read_table(
                "data/ml-1m/ratings.dat",
                header=None,
                sep="::",
                # Multi-character separators need the python engine; naming it
                # explicitly avoids pandas' ParserWarning about the fallback.
                engine="python",
                names=["user_id", "movie_id", "rating", "timestamp"],
                # NOTE(review): only the first 100 rows are read — presumably
                # a debugging limit; confirm before training on the full file.
                nrows=100,
            ),
            **plkwargs,
        )

    def clean_df(self) -> None:
        """Sort ``self.df`` by user and then by timestamp (oldest first)."""
        self.df = self.df.sort(["user_id", "timestamp"])

    def split_df(
        self,
        method: str = "leave-one-out",
        n: int = 1,
    ) -> tuple[pl.DataFrame, pl.DataFrame, pl.DataFrame]:
        """Split ``self.df`` into train / eval / test frames per user.

        With ``"leave-one-out"`` the last row of each user becomes the test
        row, the second-to-last the eval row, and the rest is training data.
        "Last" follows the current row order, so call ``clean_df()`` first to
        make it mean "most recent".

        Args:
            method: Split strategy; only ``"leave-one-out"`` is supported.
            n: Rows held out per user and split. Forced to 1 for
                ``"leave-one-out"``.

        Returns:
            ``(df_train, df_eval, df_test)``.

        Raises:
            ValueError: If ``method`` is not a supported strategy.
        """
        if method != "leave-one-out":
            raise ValueError(f"Unsupported split method: {method!r}")

        # Leave-one-out holds out exactly one row per user for each split.
        n = 1
        # The last 2n rows of every user feed the eval and test splits.
        # NOTE: a user with a single row ends up in both eval and test.
        df_eval_test = self.df.group_by(["user_id"], maintain_order=True).tail(
            n=2 * n
        )
        # Everything that was not held out is training data.
        df_train = self.df.join(df_eval_test, on=self.df.columns, how="anti")
        df_test = df_eval_test.group_by(["user_id"], maintain_order=True).tail(n=n)
        df_eval = df_eval_test.group_by(["user_id"], maintain_order=True).head(n=n)
        return df_train, df_eval, df_test

    def neg_sampling(
        self, df_train: pl.DataFrame, ratio: int = 4
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Build labelled training samples with random negatives.

        Each unique ``(user, movie)`` pair in ``df_train`` is emitted with
        label 1, followed by ``ratio`` pairs of the same user with a randomly
        drawn movie that the user has no training interaction with (label 0).

        Args:
            df_train: Training interactions with ``user_id`` and ``movie_id``
                columns.
            ratio: Number of negative samples drawn per positive sample.

        Returns:
            ``(users, items, labels)`` tensors of equal length.

        Raises:
            ValueError: If some user has interacted with every known movie,
                so no negative can be drawn for them.
        """
        # 1. Candidate items: every movie in the full dataset. Converted once
        #    so the sampling loop does not redo the conversion on each draw.
        uniq_items = np.asarray(self.df["movie_id"].unique())
        # 2. Unique positive (user, item) pairs, plus a per-user lookup.
        user_item_pairs = set(zip(df_train["user_id"], df_train["movie_id"]))
        seen_by_user = {}
        for user, item in user_item_pairs:
            seen_by_user.setdefault(user, set()).add(item)
        # Fail fast instead of looping forever when nothing can be sampled.
        all_items = set(uniq_items.tolist())
        if ratio > 0:
            for user, seen in seen_by_user.items():
                if all_items <= seen:
                    raise ValueError(
                        f"user {user} has interacted with every item; "
                        "cannot draw negative samples"
                    )
        # 3. Emit each positive followed by `ratio` rejection-sampled negatives.
        users, items, labels = [], [], []
        for user, item in user_item_pairs:
            users.append(user)
            items.append(item)
            labels.append(1)
            seen = seen_by_user[user]
            for _ in range(ratio):
                neg_item = int(np.random.choice(uniq_items))
                while neg_item in seen:
                    neg_item = int(np.random.choice(uniq_items))
                users.append(user)
                items.append(neg_item)
                labels.append(0)
        return torch.tensor(users), torch.tensor(items), torch.tensor(labels)

    def __call__(self):
        """Run the full pipeline.

        Returns:
            ``(train, df_eval, df_test)`` where ``train`` is the
            ``(users, items, labels)`` tensor tuple from ``neg_sampling`` and
            the other two are polars DataFrames.
        """
        self.load_df()
        self.clean_df()
        df_train, df_eval, df_test = self.split_df(method="leave-one-out")
        df_train = self.neg_sampling(df_train)
        return df_train, df_eval, df_test


class MovieLensDataset(Dataset):
    """Map-style dataset over pre-built samples with optional labels.

    Args:
        data: Indexable collection of samples; must support ``len()``.
        label: Optional indexable collection aligned with ``data``. When
            omitted, items are returned without a label.
    """

    def __init__(self, data, label=None):
        self.data = data
        self.label = label

    def __len__(self) -> int:
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return ``data[idx]``, paired with ``label[idx]`` when labels exist."""
        if self.label is None:
            return self.data[idx]
        return self.data[idx], self.label[idx]

In [80]:
from torch.utils.data import TensorDataset

# Mini-batch size shared by both loaders.
# NOTE(review): placeholder value — no visible cell defines batch_size; tune it.
batch_size = 256

data_prep = MovieLensPrep()
# After negative sampling df_train is a (users, items, labels) tuple of tensors;
# df_eval and df_test are still polars DataFrames.
df_train, df_eval, df_test = data_prep()
# Wrap the tensors so each batch is a (users, items, labels) triple rather than
# the loader treating the 3-tuple itself as a dataset of three samples.
train_loader = DataLoader(TensorDataset(*df_train), batch_size=batch_size)
# DataLoader cannot collate polars rows, so expose the held-out (user, movie)
# pairs as tensors.
eval_loader = DataLoader(
    TensorDataset(
        torch.tensor(list(df_eval["user_id"])),
        torch.tensor(list(df_eval["movie_id"])),
    ),
    batch_size=batch_size,
)

  pd.read_table(


In [88]:
# Scratch check: rerun the pipeline and peek at the first element of the first
# tensor in the training output.
data_prep()[0][0][0]

  pd.read_table(


tensor(1)

In [84]:
# Scratch check: build a tensor from a plain Python list.
torch.tensor([1, 2])

tensor([1, 2])

In [57]:
# Preview the first two rows of df_train.
df_train.head(2)

user_id,movie_id,rating,timestamp
i64,i64,i64,i64
1,3186,4,978300019
1,1270,5,978300055


In [90]:
# Load the same 100-row sample with pandas and list each user's movies.
ratings = pd.read_table(
    "data/ml-1m/ratings.dat",
    header=None,
    sep="::",
    # Multi-character separators need the python engine; naming it explicitly
    # avoids pandas' ParserWarning about the automatic fallback.
    engine="python",
    names=["user_id", "movie_id", "rating", "timestamp"],
    nrows=100,
)
# Map user_id -> list of movie_ids, in file order.
ratings.groupby("user_id")["movie_id"].apply(list).to_dict()

  ratings = pd.read_table(


{1: [1193,
  661,
  914,
  3408,
  2355,
  1197,
  1287,
  2804,
  594,
  919,
  595,
  938,
  2398,
  2918,
  1035,
  2791,
  2687,
  2018,
  3105,
  2797,
  2321,
  720,
  1270,
  527,
  2340,
  48,
  1097,
  1721,
  1545,
  745,
  2294,
  3186,
  1566,
  588,
  1907,
  783,
  1836,
  1022,
  2762,
  150,
  1,
  1961,
  1962,
  2692,
  260,
  1028,
  1029,
  1207,
  2028,
  531,
  3114,
  608,
  1246],
 2: [1357,
  3068,
  1537,
  647,
  2194,
  648,
  2268,
  2628,
  1103,
  2916,
  3468,
  1210,
  1792,
  1687,
  1213,
  3578,
  2881,
  3030,
  1217,
  3105,
  434,
  2126,
  3107,
  3108,
  3035,
  1253,
  1610,
  292,
  2236,
  3071,
  902,
  368,
  1259,
  3147,
  1544,
  1293,
  1188,
  3255,
  3256,
  3257,
  110,
  2278,
  2490,
  1834,
  3471,
  589,
  1690]}

In [71]:
# Scratch check: draw one random value from the unique user ids in df_train.
np.random.choice(df_train["user_id"].unique(), replace=True)

2

In [61]:
# Select the user/movie columns with bracket indexing.
df_train[["user_id", "movie_id"]]

user_id,movie_id
i64,i64
1,3186
1,1270
1,1721
1,1022
1,2340
1,1836
1,3408
1,2804
1,1207
1,1193


In [59]:
# Select the user/movie columns with DataFrame.select().
df_train.select(["user_id", "movie_id"])

user_id,movie_id
i64,i64
1,3186
1,1270
1,1721
1,1022
1,2340
1,1836
1,3408
1,2804
1,1207
1,1193
