In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from scipy.sparse import coo_matrix

# Load MovieLens dataset
def load_movielens_data(filepath="ml-latest-small/ratings.csv"):
    """Read a ratings CSV file into a DataFrame.

    Args:
        filepath: Path of the CSV file to read. Defaults to
            ``"ml-latest-small/ratings.csv"``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv(filepath)``.
    """
    return pd.read_csv(filepath)

# Preprocess: Convert ratings to binary interactions
def preprocess_movielens(data, threshold=4.0):
    """Turn explicit ratings into binary interactions with contiguous ids.

    The input frame is NOT modified; a new frame is returned. This keeps the
    raw ratings available to the caller and makes the call safe to repeat
    (e.g. when a notebook cell is re-run).

    Args:
        data: DataFrame with ``userId``, ``movieId`` and ``rating`` columns.
        threshold: Ratings greater than or equal to this value become
            interaction ``1``; all others become ``0``.

    Returns:
        A tuple ``(processed, num_users, num_items)`` where ``processed`` is a
        copy of ``data`` whose ``userId`` / ``movieId`` columns are replaced by
        0-based indices (numbered in order of first appearance) and which has
        an extra integer ``interaction`` column; ``num_users`` and
        ``num_items`` are the numbers of distinct users and items.

    Raises:
        KeyError: If one of the required columns is missing from ``data``.
    """
    # Index ids in order of first appearance so they can address embedding rows.
    user_map = {user: idx for idx, user in enumerate(data['userId'].unique())}
    item_map = {item: idx for idx, item in enumerate(data['movieId'].unique())}

    # `assign` returns a new frame, leaving the caller's DataFrame untouched.
    processed = data.assign(
        interaction=(data['rating'] >= threshold).astype(int),
        userId=data['userId'].map(user_map),
        movieId=data['movieId'].map(item_map),
    )
    return processed, len(user_map), len(item_map)

# Matrix Factorization Model
class MatrixFactorization(nn.Module):
    """Matrix-factorization scorer with a sigmoid output.

    Each user and each item owns a ``latent_dim``-sized embedding vector; the
    score of a (user, item) pair is the sigmoid of the dot product of the two
    vectors.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Size of each embedding vector.
    """

    def __init__(self, num_users, num_items, latent_dim):
        super().__init__()
        self.user_embeddings = nn.Embedding(num_users, latent_dim)
        self.item_embeddings = nn.Embedding(num_items, latent_dim)

    def forward(self, user_ids, item_ids):
        """Return the interaction probability for each (user, item) pair.

        Args:
            user_ids: Integer tensor of user indices, any shape.
            item_ids: Integer tensor of item indices, same shape as
                ``user_ids`` (or broadcastable to it).

        Returns:
            Tensor of probabilities in ``(0, 1)`` with the shape of the
            (broadcast) id tensors.
        """
        user_factors = self.user_embeddings(user_ids)
        item_factors = self.item_embeddings(item_ids)
        # The embedding lookup appends the latent axis last, so reduce over
        # dim=-1. For 1-D id batches this equals the previous dim=1, and it
        # also handles scalar ids and multi-dimensional id grids correctly.
        scores = (user_factors * item_factors).sum(dim=-1)  # Dot product
        return torch.sigmoid(scores)  # Convert to probabilities

# Training function
def train_model(data, num_users, num_items, latent_dim=10, epochs=10, lr=0.01, batch_size=1):
    """Fit a ``MatrixFactorization`` model on binary interactions.

    Rows are visited in the order they appear in ``data`` (no shuffling), in
    consecutive chunks of ``batch_size`` rows, with one Adam step per chunk.
    With the default ``batch_size=1`` this is one optimizer step per row.

    Args:
        data: DataFrame with integer ``userId``, ``movieId`` and
            ``interaction`` columns (ids must be valid embedding indices).
        num_users: Number of distinct users (user embedding table size).
        num_items: Number of distinct items (item embedding table size).
        latent_dim: Embedding size.
        epochs: Number of full passes over ``data``.
        lr: Adam learning rate.
        batch_size: Number of rows per optimizer step. Larger values cut the
            number of Python-level iterations substantially.

    Returns:
        The trained ``MatrixFactorization`` model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    model = MatrixFactorization(num_users, num_items, latent_dim)
    criterion = nn.BCELoss()  # Binary Cross-Entropy Loss (mean over the batch)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    interactions = torch.tensor(
        data[['userId', 'movieId', 'interaction']].values, dtype=torch.long
    )
    # Split the columns once up front; batches below are cheap slices of these
    # tensors instead of three freshly constructed tensors per row.
    user_ids = interactions[:, 0]
    item_ids = interactions[:, 1]
    labels = interactions[:, 2].to(torch.float32)
    num_rows = interactions.shape[0]

    for epoch in range(epochs):
        total_loss = 0.0
        for start in range(0, num_rows, batch_size):
            stop = start + batch_size
            batch_labels = labels[start:stop]

            # Forward pass
            prediction = model(user_ids[start:stop], item_ids[start:stop])

            # Compute loss; BCELoss averages over the batch, so scale by the
            # batch length to keep the reported value a sum of per-row losses.
            loss = criterion(prediction, batch_labels)
            total_loss += loss.item() * batch_labels.numel()

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

    return model

# Evaluation: Generate predictions for all user-item pairs
def evaluate_model(model, num_users, num_items):
    """Score every (user, item) combination with ``model``.

    Args:
        model: Callable taking two 1-D index tensors (user ids, item ids) of
            equal length and returning one score per pair.
        num_users: Number of users; user ids ``0 .. num_users - 1`` are scored.
        num_items: Number of items; item ids ``0 .. num_items - 1`` are scored.

    Returns:
        Tensor of shape ``(num_users, num_items)`` whose entry ``[u, i]`` is
        the model output for user ``u`` and item ``i``. Computed with gradient
        tracking disabled.
    """
    grid_shape = (num_users, num_items)
    # Row-major pairing: every user index is matched with all items in turn.
    all_users = torch.arange(num_users).unsqueeze(1).expand(*grid_shape).reshape(-1)
    all_items = torch.arange(num_items).unsqueeze(0).expand(*grid_shape).reshape(-1)

    with torch.no_grad():
        flat_scores = model(all_users, all_items)

    score_matrix = flat_scores.reshape(*grid_shape)
    return score_matrix


: 

In [3]:

# NOTE(review): `movielens` is imported here instead of alongside the imports in
# the first cell; it is presumably a project-local loader module — confirm it is
# importable from the notebook's working directory.
import movielens
# Load the "100K" ratings and convert them to binary interactions.
# `preprocess_movielens` is defined in the first cell, so that cell must be run
# before this one; the recorded output of this cell shows a NameError, which is
# what happens when it has not been run.
# NOTE(review): `preprocess_movielens` reads the 'userId', 'movieId' and 'rating'
# columns — confirm `load_pandas_df` returns a frame with exactly those names.
movielens_data = movielens.load_pandas_df("100K")
processed_data, num_users, num_items = preprocess_movielens(movielens_data)



100%|██████████| 4.81k/4.81k [00:29<00:00, 162KB/s] 

ZIPPATH=  /home/vinmike/Documents/GitHub/LLM4Rec-Dataloader/data/data/ml-100K.zip
1





NameError: name 'preprocess_movielens' is not defined

In [None]:

# Train the model
# Requires `processed_data`, `num_users` and `num_items` from the preprocessing
# cell. `train_model` performs one optimizer step per interaction row in each
# epoch, so 10 epochs over the full ratings table can take a long time.
model = train_model(processed_data, num_users, num_items, latent_dim=10, epochs=10, lr=0.01)

# Evaluate the model
# `evaluate_model` scores every user-item pair and returns a
# (num_users, num_items) tensor of predicted interaction probabilities.
predictions = evaluate_model(model, num_users, num_items)
print(predictions)