In [1]:
# --- Configuration Cell ---
import torch
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
import math
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Pick the compute device once; later cells move the model and batches onto it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The label now matches what is printed (the selected device, not a boolean).
print(f"Using device: {device}")
print("CUDA Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU detected")

# --- Paths and Constants ---
RATINGS_CSV_PATH = 'cleaned_merged.csv'  # Input ratings CSV; only the three columns below are read
USER_COL = 'UserId'       # Column holding raw user identifiers
ITEM_COL = 'ProductId'    # Column holding raw item identifiers
RATING_COL = 'Score'      # Column holding the rating used as the regression target
OUTPUT_MODEL_PATH = 'cf_model.pth'   # Destination for the trained model's state_dict
CF_MAPPINGS_PATH = 'cf_mappings.pkl' # File to save the new mappings

# --- Model and Training Hyperparameters ---
EMBEDDING_DIM = 512       # Width of each user/item embedding vector
BATCH_SIZE = 256
EPOCHS = 100
TEST_SPLIT = 0.2          # Proportion of original data held out for the test set
VALIDATION_SPLIT = 0.2    # Proportion of original data for validation
RANDOM_STATE = 42         # Seed passed to the train/validation/test splits
LEARNING_RATE = 0.001     # Learning rate handed to the Adam optimizer

Is CUDA available:cpu
CUDA Device Name: No GPU detected


In [2]:
# --- Dataset Class ---
class RatingsDataset(Dataset):
    """Tensor-backed dataset yielding (user index, item index, rating) triples.

    Raw ids found in `dataframe` are translated through the supplied
    `user_map` / `item_map`; an id that is not a key of its map raises
    KeyError, so callers must pass maps that cover every row.
    """

    def __init__(self, dataframe, user_map, item_map, user_col, item_col, rating_col):
        # Look every raw id up in its mapping, then freeze the result as a long tensor.
        self.users = torch.tensor(
            list(map(user_map.__getitem__, dataframe[user_col])), dtype=torch.long
        )
        self.items = torch.tensor(
            list(map(item_map.__getitem__, dataframe[item_col])), dtype=torch.long
        )
        # Ratings are stored as floats so they can be used directly as a regression target.
        rating_values = dataframe[rating_col].values
        self.ratings = torch.tensor(rating_values, dtype=torch.float)

    def __len__(self):
        # One sample per stored rating.
        return len(self.ratings)

    def __getitem__(self, idx):
        user = self.users[idx]
        item = self.items[idx]
        rating = self.ratings[idx]
        return user, item, rating

In [None]:
# def sentiment_label(score):
#     if score < 3:
#         return 0  # Negative
#     elif score == 3:
#         return 1  # Neutral
#     else:
#         return 2  # Positive
# # Assuming df_loaded['mean_score'] is the correct column
# df_loaded['label'] = df_loaded['mean_score'].apply(sentiment_label)

In [4]:
# --- Load Data, Create Mappings, and Split ---

def _build_loader(split_df, shuffle):
    """Wrap one data split in a RatingsDataset and a DataLoader.

    Reads the module-level id mappings and column/batch constants at call
    time, so it must be called after the mappings have been built.

    Returns:
        (dataset, loader) for `split_df`.
    """
    dataset = RatingsDataset(split_df, user_id_to_idx, item_id_to_idx, USER_COL, ITEM_COL, RATING_COL)
    # num_workers=0 for Windows compatibility often
    loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=shuffle, num_workers=0)
    return dataset, loader


# Defaults so later cells can detect that loading did not produce usable data.
user_id_to_idx = {}
item_id_to_idx = {}
num_users = 0
num_items = 0
train_loader, val_loader, test_loader = None, None, None

print(f"Loading ratings data from {RATINGS_CSV_PATH}...")
try:
    # Load only necessary columns
    ratings_df = pd.read_csv(RATINGS_CSV_PATH, usecols=[USER_COL, ITEM_COL, RATING_COL])
    # Rows missing a user, item or rating cannot be used; drop them.
    ratings_df = ratings_df.dropna(subset=[USER_COL, ITEM_COL, RATING_COL])
    print(f"Loaded {len(ratings_df)} valid ratings.")

    if len(ratings_df) > 0:
        # --- Create Mappings Dynamically ---
        print("Creating user and item mappings...")
        # Sorting makes the id -> index assignment independent of row order in the CSV.
        unique_users = sorted(ratings_df[USER_COL].unique().tolist())
        unique_items = sorted(ratings_df[ITEM_COL].unique().tolist())

        user_id_to_idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
        item_id_to_idx = {item_id: idx for idx, item_id in enumerate(unique_items)}

        num_users = len(user_id_to_idx)
        num_items = len(item_id_to_idx)
        print(f"Found {num_users} unique users and {num_items} unique items in the dataset.")

        # --- Split Data ---
        print(f"Splitting data into train, validation ({VALIDATION_SPLIT:.0%}), and test ({TEST_SPLIT:.0%})...")

        # Fail early with a clear message: train_test_split only accepts float
        # test_size values strictly between 0 and 1, so clamping to 1.0 cannot help.
        if not (0.0 < TEST_SPLIT < 1.0 and VALIDATION_SPLIT > 0.0 and TEST_SPLIT + VALIDATION_SPLIT < 1.0):
            raise ValueError(
                f"TEST_SPLIT ({TEST_SPLIT}) and VALIDATION_SPLIT ({VALIDATION_SPLIT}) must each be "
                "positive fractions whose sum is below 1.0."
            )

        # First split: Separate test set
        train_val_df, test_df = train_test_split(
            ratings_df,
            test_size=TEST_SPLIT,
            random_state=RANDOM_STATE,
            # Stratify might be useful if ratings are imbalanced, but requires ratings column
            # stratify=ratings_df[RATING_COL] if RATING_COL in ratings_df else None
        )

        # Check if train_val_df is empty before the second split
        if len(train_val_df) > 0:
            # VALIDATION_SPLIT is a fraction of the original data, so rescale it
            # to a fraction of what is left after removing the test set.
            val_split_adjusted = VALIDATION_SPLIT / (1.0 - TEST_SPLIT)

            train_df, val_df = train_test_split(
                train_val_df,
                test_size=val_split_adjusted,
                random_state=RANDOM_STATE, # Use same random state for consistency
                # stratify=train_val_df[RATING_COL] if RATING_COL in train_val_df else None
            )

            print(f"Training set size: {len(train_df)}")
            print(f"Validation set size: {len(val_df)}")
            print(f"Test set size: {len(test_df)}")

            # --- Create Datasets and DataLoaders ---
            # Only the training loader shuffles; evaluation order is kept fixed.
            if len(train_df) > 0:
                train_dataset, train_loader = _build_loader(train_df, shuffle=True)
            else:
                print("Warning: Training set is empty after split.")

            if len(val_df) > 0:
                val_dataset, val_loader = _build_loader(val_df, shuffle=False)
            else:
                print("Warning: Validation set is empty after split.")

            if len(test_df) > 0:
                test_dataset, test_loader = _build_loader(test_df, shuffle=False)
            else:
                print("Warning: Test set is empty after split.")

        else:
            print("Warning: No data left for training/validation after initial test split.")
            # Handle test set if it's the only one with data
            if len(test_df) > 0:
                print(f"Test set size: {len(test_df)}")
                test_dataset, test_loader = _build_loader(test_df, shuffle=False)
            else:
                print("Warning: All sets are empty after splits.")
    else:
        print("Error: No valid ratings data loaded. Cannot proceed.")

# Each handler prints a hint and re-raises so the cell still fails visibly.
except FileNotFoundError:
    print(f"Error: Ratings file not found at {RATINGS_CSV_PATH}")
    raise
except KeyError as e:
    print(f"Error: Column {e} not found in {RATINGS_CSV_PATH}. Adjust column names in config.")
    raise
except Exception as e:
    print(f"An error occurred during data loading, mapping, or splitting: {e}")
    raise


Loading ratings data from cleaned_merged.csv...
Loaded 393560 valid ratings.
Creating user and item mappings...
Found 256027 unique users and 67553 unique items in the dataset.
Splitting data into train, validation (20%), and test (20%)...
Training set size: 236136
Validation set size: 78712
Test set size: 78712


In [5]:
# --- Define CF Model (Matrix Factorization) ---
# !! Make sure to run this cell BEFORE the model initialization cell !!
import torch.nn as nn # Ensure nn is imported

class CFModel(nn.Module):
    """Matrix-factorization model with per-user and per-item bias terms.

    The predicted score for a (user, item) pair is the dot product of the two
    embedding vectors plus the user bias plus the item bias.

    Args:
        n_users: Number of distinct user indices.
        n_items: Number of distinct item indices.
        embedding_dim: Width of each user/item embedding vector.
    """

    def __init__(self, n_users, n_items, embedding_dim):
        super().__init__()
        # Use at least one row per table so construction still succeeds when
        # data loading failed and reported zero users/items.
        self.user_embeddings = nn.Embedding(max(n_users, 1), embedding_dim)
        self.item_embeddings = nn.Embedding(max(n_items, 1), embedding_dim)
        self.user_bias = nn.Embedding(max(n_users, 1), 1)
        self.item_bias = nn.Embedding(max(n_items, 1), 1)

        # Initialize weights: small random embeddings, zero biases.
        nn.init.normal_(self.user_embeddings.weight, std=0.01)
        nn.init.normal_(self.item_embeddings.weight, std=0.01)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, user_indices, item_indices):
        """Score each (user, item) pair.

        Args:
            user_indices: Long tensor of user indices.
            item_indices: Long tensor of item indices, same shape as `user_indices`.

        Returns:
            Tensor of raw (unclamped) scores with the same shape as the index tensors.
        """
        user_embedding = self.user_embeddings(user_indices)
        item_embedding = self.item_embeddings(item_indices)

        # Reduce over the embedding axis only (always the last one), so the
        # output keeps the shape of the index tensors for any batch layout.
        dot_product = (user_embedding * item_embedding).sum(dim=-1)

        # Drop only the trailing size-1 bias axis. A bare squeeze() would also
        # remove a batch axis of size 1 and needed special-case repair code.
        user_b = self.user_bias(user_indices).squeeze(-1)
        item_b = self.item_bias(item_indices).squeeze(-1)

        # Clamp output to reasonable rating range (e.g., 1-5) if applicable
        # return torch.clamp(dot_product + user_b + item_b, 1.0, 5.0)
        # Or return raw scores if clamping is not desired
        return dot_product + user_b + item_b


In [6]:
# --- Training and Evaluation Functions ---
# !! Make sure to run this cell BEFORE the model initialization/training cell !!
import torch
import math

def train_one_epoch(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Each batch is a (users, items, ratings) triple. Returns the per-sample
    average of the batch losses, or 0 when the loader yields no samples.
    """
    model.train()  # training mode for the whole pass
    loss_sum = 0.0
    seen = 0

    for user_idx, item_idx, target in dataloader:
        user_idx = user_idx.to(device)
        item_idx = item_idx.to(device)
        target = target.to(device)
        batch_size = user_idx.size(0)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        batch_loss = criterion(model(user_idx, item_idx), target)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a short final batch is not over-counted.
        loss_sum += batch_loss.item() * batch_size
        seen += batch_size

    if seen > 0:
        return loss_sum / seen
    return 0

def evaluate_model(model, dataloader, criterion, device):
    """Score `model` on `dataloader` without updating it.

    Returns (average loss, RMSE). The average is per sample and is 0 when the
    loader yields nothing. RMSE is the square root of the average loss, or
    NaN when the average is not a non-negative number.
    """
    model.eval()  # evaluation mode
    loss_sum = 0.0
    seen = 0

    # No gradients are needed while scoring.
    with torch.no_grad():
        for user_idx, item_idx, target in dataloader:
            user_idx = user_idx.to(device)
            item_idx = item_idx.to(device)
            target = target.to(device)
            batch_size = user_idx.size(0)

            batch_loss = criterion(model(user_idx, item_idx), target)

            loss_sum += batch_loss.item() * batch_size
            seen += batch_size

    if seen > 0:
        mean_loss = loss_sum / seen
    else:
        mean_loss = 0

    # RMSE derived from the mean squared error reported by the criterion.
    if mean_loss >= 0:
        rmse = math.sqrt(mean_loss)
    else:
        rmse = float('nan')
    return mean_loss, rmse

def train_and_validate(model, train_loader, val_loader, criterion, optimizer, epochs, device,
                       patience=None, restore_best=True):
    """Train for up to `epochs` epochs, validating after each one.

    The epoch with the lowest validation RMSE is tracked. With `restore_best`
    the weights from that epoch are kept in memory and loaded back into
    `model` before returning, so saving the model afterwards stores the best
    weights rather than whatever the last epoch produced.

    Args:
        model: Module to train; updated in place.
        train_loader: Training batches, or a falsy value to skip training.
        val_loader: Validation batches, or a falsy value to skip validation.
        criterion: Loss callable passed through to the epoch helpers.
        optimizer: Optimizer passed through to `train_one_epoch`.
        epochs: Maximum number of epochs to run.
        device: Device the batches are moved to.
        patience: If set, stop after this many consecutive epochs without a
            new best validation RMSE. None (default) always runs all epochs.
        restore_best: If True (default), keep a CPU copy of the best weights
            (one extra copy of the parameters in memory) and reload it at the
            end. Optimizer state is not rewound.

    Returns:
        dict with `best_val_rmse`, `best_epoch` (1-based, -1 if no epoch was
        ever validated as best) and `history`, a list of per-epoch metric dicts.
    """
    best_val_rmse = float('inf')
    best_epoch = -1
    best_state = None
    epochs_without_improvement = 0
    history = []

    for epoch in range(epochs):
        # --- Training ---
        avg_train_loss = 0
        if train_loader:
            avg_train_loss = train_one_epoch(model, train_loader, criterion, optimizer, device)
            train_rmse = math.sqrt(avg_train_loss) if avg_train_loss >= 0 else float('nan')
        else:
            print(f"Epoch {epoch+1}/{epochs} - Skipping training: train_loader not available.")
            train_rmse = float('nan')

        # --- Validation ---
        avg_val_loss = float('inf')
        val_rmse = float('inf')
        if val_loader:
            avg_val_loss, val_rmse = evaluate_model(model, val_loader, criterion, device)
        else:
            print(f"Epoch {epoch+1}/{epochs} - Skipping validation: val_loader not available.")

        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Train RMSE: {train_rmse:.4f}, Val Loss: {avg_val_loss:.4f}, Val RMSE: {val_rmse:.4f}')
        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'train_rmse': train_rmse,
            'val_loss': avg_val_loss,
            'val_rmse': val_rmse,
        })

        # --- Checkpoint Best Model (based on validation RMSE) ---
        if val_loader and val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_epoch = epoch + 1
            epochs_without_improvement = 0
            if restore_best:
                # state_dict() tensors alias the live parameters, so take a real
                # copy (on CPU) or later optimizer steps would overwrite it.
                best_state = {
                    name: tensor.detach().to('cpu', copy=True)
                    for name, tensor in model.state_dict().items()
                }
        elif val_loader:
            # A NaN validation RMSE also lands here: it never counts as an improvement.
            epochs_without_improvement += 1
            if patience is not None and epochs_without_improvement >= patience:
                print(f'Early stopping at epoch {epoch+1}: no validation improvement for {patience} epoch(s).')
                break

    if best_state is not None:
        model.load_state_dict(best_state)
        print(f'Restored model weights from best epoch {best_epoch}.')

    print(f'\nFinished Training. Best Validation RMSE: {best_val_rmse:.4f} at epoch {best_epoch}')
    return {
        'best_val_rmse': best_val_rmse,
        'best_epoch': best_epoch,
        'history': history,
    }


In [7]:
# --- Initialize Model, Optimizer, Criterion ---

# Ensure num_users and num_items were correctly determined in the data loading cell
if num_users > 0 and num_items > 0:
    # RANDOM_STATE only seeds the sklearn splits. Seed torch as well so the
    # embedding initialisation and training shuffle order repeat across runs.
    torch.manual_seed(RANDOM_STATE)

    cf_model = CFModel(num_users, num_items, EMBEDDING_DIM).to(device)
    criterion = nn.MSELoss() # Mean Squared Error Loss for ratings
    optimizer = optim.Adam(cf_model.parameters(), lr=LEARNING_RATE)

    print("CF model, criterion, and optimizer initialized.")

    # --- Start Training & Validation ---
    if train_loader or val_loader: # Proceed only if there's data to train or validate
        print("Starting training and validation...")
        train_and_validate(cf_model, train_loader, val_loader, criterion, optimizer, EPOCHS, device)
        print("Training and validation loop finished.")

        # --- Save the Trained Model ---
        # Saving is best-effort: a failure is reported but does not stop the
        # test-set evaluation below.
        try:
            torch.save(cf_model.state_dict(), OUTPUT_MODEL_PATH)
            print(f"Trained CF model saved to {OUTPUT_MODEL_PATH}")
        except Exception as e:
            print(f"Error saving trained model: {e}")

        # --- Save the Mappings Used During Training ---
        # The saved weights are indexed by these mappings, so they are stored
        # alongside the model.
        try:
            mappings_to_save = {
                'user_id_to_idx': user_id_to_idx,
                'item_id_to_idx': item_id_to_idx
            }
            with open(CF_MAPPINGS_PATH, 'wb') as f:
                pickle.dump(mappings_to_save, f)
            print(f"CF mappings saved to {CF_MAPPINGS_PATH}")
        except Exception as e:
            print(f"Error saving CF mappings: {e}")

        # --- Optional: Evaluate on Test Set ---
        if test_loader:
            print("--- Evaluating on Test Set ---")
            test_loss, test_rmse = evaluate_model(cf_model, test_loader, criterion, device)
            print(f'--- Test Set Evaluation Finished, Average Test Loss: {test_loss:.4f}, Test RMSE: {test_rmse:.4f} ---')
        else:
            print("Skipping test set evaluation: test_loader not available.")

    else:
        print("Skipping training as no training or validation data is available.")
else:
    print("Error: Cannot initialize model. num_users or num_items is zero. Check data loading.")


CF model, criterion, and optimizer initialized.
Starting training and validation...
Epoch 1/100, Train Loss: 18.4959, Train RMSE: 4.3007, Val Loss: 17.5757, Val RMSE: 4.1923
Epoch 2/100, Train Loss: 13.2468, Train RMSE: 3.6396, Val Loss: 16.3076, Val RMSE: 4.0383
Epoch 3/100, Train Loss: 4.6678, Train RMSE: 2.1605, Val Loss: 15.9258, Val RMSE: 3.9907
Epoch 4/100, Train Loss: 1.1691, Train RMSE: 1.0813, Val Loss: 15.8917, Val RMSE: 3.9864
Epoch 5/100, Train Loss: 0.3644, Train RMSE: 0.6037, Val Loss: 15.8405, Val RMSE: 3.9800
Epoch 6/100, Train Loss: 0.3176, Train RMSE: 0.5635, Val Loss: 15.7657, Val RMSE: 3.9706
Epoch 7/100, Train Loss: 0.5395, Train RMSE: 0.7345, Val Loss: 15.6808, Val RMSE: 3.9599
Epoch 8/100, Train Loss: 0.7393, Train RMSE: 0.8598, Val Loss: 15.5495, Val RMSE: 3.9433
Epoch 9/100, Train Loss: 0.6887, Train RMSE: 0.8299, Val Loss: 15.4472, Val RMSE: 3.9303
Epoch 10/100, Train Loss: 0.5283, Train RMSE: 0.7269, Val Loss: 15.3369, Val RMSE: 3.9162
Epoch 11/100, Train Los