In [1]:
import pandas as pd
import numpy as np

from preprocess_data import load_preprocessed_data

train_df, test_df = load_preprocessed_data()

Loading preprocessed Parquet files...... Done!


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd

class UserProductDataset(Dataset):
    def __init__(self, df):
        # Encode UserId and ProductId as categorical codes
        self.users = torch.tensor(df['UserId'].astype('category').cat.codes.values, dtype=torch.long)  # Long for embedding layers
        self.products = torch.tensor(df['ProductId'].astype('category').cat.codes.values, dtype=torch.long)
        self.scores = torch.tensor(df['Score'].values, dtype=torch.float32)

        # Store the number of users and products for embedding layer dimensions
        self.n_users = len(df['UserId'].unique())
        self.n_products = len(df['ProductId'].unique())

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        return self.users[idx], self.products[idx], self.scores[idx]

# Split data into train and validation sets
train_size = int(0.8 * len(train_df))
val_size = len(train_df) - train_size
train_data, val_data = torch.utils.data.random_split(train_df, [train_size, val_size])

# Create PyTorch datasets
train_dataset = UserProductDataset(train_data.dataset.iloc[train_data.indices])
val_dataset = UserProductDataset(val_data.dataset.iloc[val_data.indices])

# Create DataLoaders with multi-threading for faster data loading, larger batch size, and pinning memory for GPU
train_loader = DataLoader(train_dataset, batch_size=4096, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=4096, shuffle=False, num_workers=4, pin_memory=True)


In [3]:
torch.tensor([1,2,3]).to("cuda")  # NOTE this is an important catch for weird CUDA errors

tensor([1, 2, 3], device='cuda:0')

In [4]:
import torch.nn as nn
import torch.optim as optim

class MatrixFactorizationModel(nn.Module):
    def __init__(self, n_users, n_products, n_factors=50):
        super(MatrixFactorizationModel, self).__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)  # User embedding
        self.product_factors = nn.Embedding(n_products, n_factors)  # Product embedding

    def forward(self, user, product):
        user_embedding = self.user_factors(user)  # Shape: (batch_size, n_factors)
        product_embedding = self.product_factors(product)  # Shape: (batch_size, n_factors)
        
        prediction = (user_embedding * product_embedding).sum(1)  # Dot product
        return prediction

print(f"User tensor sample: {train_dataset[0][0]}")  # Should print a user ID as an integer
print(f"Product tensor sample: {train_dataset[0][1]}")  # Should print a product ID as an integer


# Initialize the model
n_factors = 50  # Number of latent factors
model = MatrixFactorizationModel(train_dataset.n_users, train_dataset.n_products, n_factors)

# Move model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


User tensor sample: 44444
Product tensor sample: 42347


MatrixFactorizationModel(
  (user_factors): Embedding(123826, 50)
  (product_factors): Embedding(50034, 50)
)

In [5]:
loss_fn = nn.L1Loss()

In [6]:
for param in model.parameters():
    print(param.requires_grad)

True
True


In [7]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train(model, train_loader, optimizer, loss_fn, device):
    model.train()
    total_loss = 0
    for user, product, score in train_loader:
        # Move data to GPU
        user, product, score = user.to(device, non_blocking=True), product.to(device, non_blocking=True), score.to(device, non_blocking=True)

        optimizer.zero_grad()
        predictions = model(user, product)
        loss = loss_fn(predictions, score)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(train_loader)

def validate(model, val_loader, loss_fn, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for user, product, score in val_loader:
            # Move data to GPU
            user, product, score = user.to(device, non_blocking=True), product.to(device, non_blocking=True), score.to(device, non_blocking=True)
            predictions = model(user, product)
            loss = loss_fn(predictions, score)
            total_loss += loss.item()

    return total_loss / len(val_loader)

# Training loop
n_epochs = 10

for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, loss_fn, device)
    val_loss = validate(model, val_loader, loss_fn, device)
    
    print(f"Epoch {epoch + 1}/{n_epochs}, Train Loss (L1): {train_loss:.4f}, Val Loss (L1): {val_loss:.4f}")

Epoch 1/10, Train Loss (L1): 6.5725, Val Loss (L1): 6.5906
Epoch 2/10, Train Loss (L1): 6.1603, Val Loss (L1): 6.5493
Epoch 3/10, Train Loss (L1): 5.7664, Val Loss (L1): 6.5125
Epoch 4/10, Train Loss (L1): 5.4033, Val Loss (L1): 6.4798
Epoch 5/10, Train Loss (L1): 5.0670, Val Loss (L1): 6.4510
Epoch 6/10, Train Loss (L1): 4.7569, Val Loss (L1): 6.4261
Epoch 7/10, Train Loss (L1): 4.4726, Val Loss (L1): 6.4040
Epoch 8/10, Train Loss (L1): 4.2085, Val Loss (L1): 6.3851
Epoch 9/10, Train Loss (L1): 3.9663, Val Loss (L1): 6.3685
Epoch 10/10, Train Loss (L1): 3.7422, Val Loss (L1): 6.3544


In [8]:
for epoch in range(n_epochs):
    train_loss = train(model, train_loader, optimizer, loss_fn, device)
    val_loss = validate(model, val_loader, loss_fn, device)
    
    print(f"Epoch {epoch + 1}/{n_epochs}, Train Loss (L1): {train_loss:.4f}, Val Loss (L1): {val_loss:.4f}")

Epoch 1/10, Train Loss (L1): 3.5350, Val Loss (L1): 6.3421
Epoch 2/10, Train Loss (L1): 3.3426, Val Loss (L1): 6.3318
Epoch 3/10, Train Loss (L1): 3.1627, Val Loss (L1): 6.3229
Epoch 4/10, Train Loss (L1): 2.9971, Val Loss (L1): 6.3153
Epoch 5/10, Train Loss (L1): 2.8416, Val Loss (L1): 6.3090
Epoch 6/10, Train Loss (L1): 2.6980, Val Loss (L1): 6.3039
Epoch 7/10, Train Loss (L1): 2.5652, Val Loss (L1): 6.2992
Epoch 8/10, Train Loss (L1): 2.4408, Val Loss (L1): 6.2966
Epoch 9/10, Train Loss (L1): 2.3245, Val Loss (L1): 6.2933
Epoch 10/10, Train Loss (L1): 2.2163, Val Loss (L1): 6.2915


In [10]:
def evaluate_accuracy(model, data_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for user, product, score in data_loader:
            user, product, score = user.to(device), product.to(device), score.to(device)
            predictions = torch.round(model(user, product))  # Round predictions to the nearest integer
            correct += (predictions == score).sum().item()
            total += score.size(0)
            print(predictions, score)

    return correct / total

# Evaluate accuracy on the validation set
accuracy = evaluate_accuracy(model, val_loader)
print(f"Validation Accuracy: {accuracy:.4f}")

tensor([-8.,  6.,  4.,  ..., -5., -5.,  3.], device='cuda:0') tensor([5., 4., 3.,  ..., 5., 5., 5.], device='cuda:0')
tensor([4., 5., 1.,  ..., 2., 3., 3.], device='cuda:0') tensor([3., 1., 4.,  ..., 5., 4., 4.], device='cuda:0')
tensor([-9.,  8., -8.,  ...,  0., -2.,  2.], device='cuda:0') tensor([5., 1., 4.,  ..., 3., 1., 5.], device='cuda:0')
tensor([15., -3.,  7.,  ...,  3.,  5.,  3.], device='cuda:0') tensor([5., 5., 4.,  ..., 4., 5., 4.], device='cuda:0')
tensor([ -1.,  -4.,   3.,  ...,   2., -12.,  -2.], device='cuda:0') tensor([5., 5., 5.,  ..., 4., 5., 4.], device='cuda:0')
tensor([ -7.,  -3.,  15.,  ..., -11.,   6.,  -0.], device='cuda:0') tensor([5., 3., 5.,  ..., 5., 5., 5.], device='cuda:0')
tensor([  2.,  17.,   3.,  ...,  16., -13.,  -6.], device='cuda:0') tensor([4., 5., 1.,  ..., 5., 3., 5.], device='cuda:0')
tensor([-3.,  5.,  2.,  ...,  1.,  5.,  7.], device='cuda:0') tensor([5., 4., 5.,  ..., 5., 4., 5.], device='cuda:0')
tensor([ -4.,  -7.,   1.,  ...,  -8.,   4., 