In [1]:
import pandas as pd
import numpy as np

from preprocess_data import load_preprocessed_data

train_df, test_df = load_preprocessed_data()

Loading preprocessed Parquet files...... Done!


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class UserProductDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['UserId'].astype('category').cat.codes.values)  # Encode UserId as category
        self.products = torch.tensor(df['ProductId'].astype('category').cat.codes.values)  # Encode ProductId
        self.scores = torch.tensor(df['Score'].values, dtype=torch.float32)

        # Store number of users and products for later use in embedding layers
        self.n_users = len(df['UserId'].unique())
        self.n_products = len(df['ProductId'].unique())

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return self.users[idx], self.products[idx], self.scores[idx]

# Split data into train and validation
train_size = int(0.8 * len(train_df))
val_size = len(train_df) - train_size
train_data, val_data = torch.utils.data.random_split(train_df, [train_size, val_size])

# Create PyTorch datasets
train_dataset = UserProductDataset(train_data.dataset.iloc[train_data.indices])
val_dataset = UserProductDataset(val_data.dataset.iloc[val_data.indices])

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1024)


In [3]:
torch.tensor([1,2,3]).to("cuda")  # NOTE this is an important catch for weird CUDA errors

tensor([1, 2, 3], device='cuda:0')

In [4]:
import torch.nn as nn
import torch.optim as optim

class MatrixFactorizationModel(nn.Module):
    def __init__(self, n_users, n_products, n_factors=50):
        super(MatrixFactorizationModel, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)  # User embedding
        self.product_factors = nn.Embedding(n_products, n_factors)  # Product embedding

    def forward(self, user, product):
        user_embedding = self.user_factors(user)  # Shape: (batch_size, n_factors)
        product_embedding = self.product_factors(product)  # Shape: (batch_size, n_factors)
        
        prediction = (user_embedding * product_embedding).sum(1)  # Dot product
        return prediction

print(f"User tensor sample: {train_dataset[0][0]}")  # Should print a user ID as an integer
print(f"Product tensor sample: {train_dataset[0][1]}")  # Should print a product ID as an integer


# Initialize the model
n_factors = 50  # Number of latent factors
model = MatrixFactorizationModel(train_dataset.n_users, train_dataset.n_products, n_factors)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


User tensor sample: 70806
Product tensor sample: 16931


MatrixFactorizationModel(
  (user_factors): Embedding(123865, 50)
  (product_factors): Embedding(50035, 50)
)