In [2]:
# -*- coding: utf-8 -*-
"""
PyTorch Product Category Recommender based on Co-Purchase Patterns

This script trains a model to recommend product categories based on co-purchase
patterns identified within user site sessions. It uses purchase data from the
transactions file, identifying co-purchases by grouping transactions that share
the same machine_id and site_session_id.
"""

# --- Configuration: dataset sources ---
# Co-purchase pairs are built from the transactions CSV alone, so that file
# MUST provide the machine_id, site_session_id and prod_category_id columns.
# A sessions file ('sessions.csv') would only add context; this model does not
# read it, so no SESSIONS_FILE constant is defined.

# MODIFY THIS: path to your transactions CSV file.
TRANSACTIONS_FILE = 'transactions.csv'
# MODIFY THIS: path to your product categories CSV file (optional, only used for names).
CATEGORIES_FILE = 'product_categories.csv'

# --- Imports ---
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import permutations
import os
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings # To manage warnings more gracefully

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# --- Global settings & hyperparameters ---
# NOTE: the next notebook cell assigns every one of these names again (there
# RECOMMENDATION_COUNT is 3), so on a top-to-bottom run those later values win.

# Split / output settings
TEST_SET_SIZE = 0.2         # fraction of sessions held out for testing (20%)
RANDOM_STATE = 42           # seed for reproducible splits
RECOMMENDATION_COUNT = 2    # number of categories to recommend

# Model hyperparameters
EMBEDDING_DIM = 64                  # size of the category embedding vectors
LEARNING_RATE = 0.001
BATCH_SIZE = 1024                   # adjust based on memory
NUM_EPOCHS = 5                      # adjust based on convergence and dataset size
NEGATIVE_SAMPLES_PER_POSITIVE = 5   # negatives sampled for each positive

In [None]:
# -*- coding: utf-8 -*-
"""
PyTorch Product Category Recommender based on Co-Purchase Patterns
and Machine ID Context.

This script trains a model to recommend product categories based on co-purchase
patterns identified within user site sessions, incorporating the machine ID
as an additional context feature. It uses purchase data from the transactions
file, identifying co-purchases by grouping transactions that share the same
machine_id and site_session_id. The machine ID is used to potentially learn
machine-specific purchase behaviors.
"""

# --- Configuration: dataset sources ---
# Co-purchase pairs are built from the transactions CSV alone, so that file
# MUST provide the machine_id, site_session_id and prod_category_id columns.
# A sessions file ('sessions.csv') is not read by this script's core logic, so
# no SESSIONS_FILE constant is defined.

# MODIFY THIS: path to your transactions CSV file.
TRANSACTIONS_FILE = 'transactions.csv'
# MODIFY THIS: path to your product categories CSV file (optional, only used for names).
CATEGORIES_FILE = 'product_categories.csv'

# --- Imports ---
import pandas as pd
import numpy as np
from collections import defaultdict
from itertools import permutations
import os
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings # To manage warnings more gracefully
import random # For picking a sample machine ID

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# --- Global settings & hyperparameters ---

# Split / output settings
TEST_SET_SIZE = 0.2         # fraction of sessions held out for testing (20%)
RANDOM_STATE = 42           # seed for reproducible splits
RECOMMENDATION_COUNT = 3    # number of categories to recommend

# Model hyperparameters
EMBEDDING_DIM = 64                  # size of the category and machine embedding vectors
LEARNING_RATE = 0.001
BATCH_SIZE = 1024                   # adjust based on memory
NUM_EPOCHS = 5                      # adjust based on convergence and dataset size
NEGATIVE_SAMPLES_PER_POSITIVE = 5   # negatives sampled for each positive

# --- Helper Functions ---

def load_data(transactions_file, categories_file):
    """
    Loads purchase transaction data, category names, and creates mappings
    for categories and machines.

    Args:
        transactions_file (str): Path to the transactions CSV. It must contain
            the columns machine_id, site_session_id and prod_category_id; any
            other columns are ignored. Rows missing one of the three IDs are
            dropped.
        categories_file (str or None): Path to the categories CSV. Optional:
            when it is falsy, missing on disk, or cannot be processed, a
            message is printed and ``category_lookup`` is returned as None.
            The file is expected to hold 'Product Category ID' plus any of
            'Report Category', 'Item Category', 'Item Subcategory'; name
            columns that are absent are skipped with a warning.

    Returns:
        tuple: ``(df_trans, category_map, reverse_category_map,
        category_lookup, num_categories, machine_map, reverse_machine_map,
        num_machines)`` where the maps translate original IDs to contiguous
        indices (sorted by ID) and back, and ``category_lookup`` maps an
        original category ID to a ' | '-joined display name.

    Raises:
        FileNotFoundError: If ``transactions_file`` does not exist.
        KeyError: If reading the transactions raises a KeyError.
        RuntimeError: For any other failure while reading the transactions
            (including a file with no valid rows).
        ValueError: If no category or machine IDs remain after loading.
    """
    print("Loading data...")
    print(f"  Transactions file: {transactions_file}")
    print(f"  Categories file: {categories_file}")
    start_time = time.time()

    if not os.path.exists(transactions_file):
        raise FileNotFoundError(f"Error: Transactions file not found at {transactions_file}")

    # --- Load Transactions ---
    required_cols = ['machine_id', 'site_session_id', 'prod_category_id']
    try:
        # Read in chunks so very large files do not have to be parsed in one go.
        # The nullable 'Int64' dtype lets rows with missing IDs be parsed and
        # then dropped, instead of the whole column degrading to float.
        chunk_iter = pd.read_csv(
            transactions_file,
            usecols=required_cols,
            dtype={col: 'Int64' for col in required_cols},
            chunksize=1000000  # Process in chunks of 1 million rows
        )

        df_trans_list = []
        print("  Processing transaction file in chunks...")
        for i, chunk in enumerate(chunk_iter):
            # Drop incomplete rows first, then convert to plain numpy int64
            # (safe only once no NA values are left).
            chunk = chunk.dropna(subset=required_cols).astype(
                {col: np.int64 for col in required_cols}
            )
            if not chunk.empty:
                df_trans_list.append(chunk)
            print(f"    Processed chunk {i+1}...")

        if not df_trans_list:
            raise ValueError(f"No valid data found in {transactions_file} after checking required columns.")

        df_trans = pd.concat(df_trans_list, ignore_index=True)
        print(f"Loaded {len(df_trans)} transaction items from {len(df_trans_list)} chunks.")
        del df_trans_list  # Free memory

    except KeyError as e:
        # str(KeyError) already carries its own quotes, so do not add another pair.
        raise KeyError(f"Error loading transactions: Column {e} not found. Required columns are {required_cols}. Please check {transactions_file}.") from e
    except Exception as e:
        raise RuntimeError(f"Error loading transactions file {transactions_file}: {e}") from e

    # --- Category Mapping ---
    print("  Creating category mappings...")
    unique_categories = sorted(df_trans['prod_category_id'].unique())
    if not unique_categories:
        raise ValueError("No valid product categories found in the transactions data.")

    num_categories = len(unique_categories)
    category_map = {cat_id: idx for idx, cat_id in enumerate(unique_categories)}
    reverse_category_map = {idx: cat_id for cat_id, idx in category_map.items()}
    print(f"Found {num_categories} unique product categories.")

    # --- Machine Mapping ---
    print("  Creating machine mappings...")
    unique_machines = sorted(df_trans['machine_id'].unique())
    if not unique_machines:
        raise ValueError("No valid machine IDs found in the transactions data.")

    num_machines = len(unique_machines)
    machine_map = {mach_id: idx for idx, mach_id in enumerate(unique_machines)}
    reverse_machine_map = {idx: mach_id for mach_id, idx in machine_map.items()}
    print(f"Found {num_machines} unique machine IDs.")

    # --- Load Category Names (Optional) ---
    category_lookup = None
    if categories_file and os.path.exists(categories_file):
        print(f"  Loading category names from {categories_file}...")
        id_col = 'Product Category ID'
        name_cols = ['Report Category', 'Item Category', 'Item Subcategory']
        wanted_cols = {id_col, *name_cols}
        try:
            # A callable `usecols` keeps whichever wanted columns exist. A
            # fixed list would make read_csv fail as soon as one name column
            # is absent, which would throw away all category names.
            df_categories = pd.read_csv(
                categories_file,
                usecols=lambda col: col in wanted_cols
            )
            if id_col not in df_categories.columns:
                raise KeyError(id_col)
            df_categories = df_categories.rename(columns={id_col: 'prod_category_id'})

            name_parts = []
            for col in name_cols:
                if col in df_categories.columns:
                    name_parts.append(df_categories[col].fillna('').astype(str).str.strip())
                else:
                    print(f"    Warning: Column '{col}' not found in categories file. It will be skipped for naming.")
                    name_parts.append(pd.Series('', index=df_categories.index))

            # Join only the non-empty parts so no dangling ' | ' separators
            # are left behind when a component is missing.
            df_categories['category_name'] = [
                ' | '.join(part for part in parts if part)
                for parts in zip(*name_parts)
            ]
            # Create lookup dictionary
            category_lookup = df_categories.set_index('prod_category_id')['category_name'].to_dict()
            print(f"    Loaded {len(df_categories)} category definitions.")
        except KeyError as e:
            print(f"    Warning: Column {e} not found in categories file {categories_file}. Check column names. Category names might be incomplete.")
            category_lookup = None  # Reset lookup if error occurs
        except Exception as e:
            # Best effort: category names are optional, so report and carry on.
            print(f"    Warning: Error loading or processing categories file {categories_file}: {e}")
            category_lookup = None
    elif categories_file:
        print(f"  Warning: Categories file specified but not found at {categories_file}. Recommendations will use IDs only.")
    else:
        print("  Info: Categories file not specified. Recommendations will use IDs only.")

    print(f"Data loading finished in {time.time() - start_time:.2f} seconds.")
    return (df_trans,
            category_map, reverse_category_map, category_lookup, num_categories,
            machine_map, reverse_machine_map, num_machines)

def _generate_session_tuples(sessions_df, all_indices, k, rng):
    """Builds labelled (machine_idx, anchor_idx, context_idx, label) tuples for each session row."""
    data = []
    total_count = len(sessions_df)
    report_every = max(1, total_count // 20)  # Print progress ~20 times
    start_pair_gen = time.time()
    print(f"    Generating tuples for {total_count} sessions...")

    # Iterating the two columns directly avoids the per-row Series that
    # DataFrame.iterrows() would build.
    session_rows = zip(sessions_df['machine_idx'], sessions_df['categories'])
    for processed_count, (machine_idx, categories_in_session) in enumerate(session_rows, start=1):
        machine_idx = int(machine_idx)
        session_indices = [int(idx) for idx in categories_in_session if pd.notna(idx)]

        if len(session_indices) >= 2:
            # Ordered pairs: every category acts as anchor for every other one.
            positive_pairs = list(permutations(session_indices, 2))

            # Negatives are drawn straight from the categories that were NOT
            # bought in this session, so each positive gets exactly k of them
            # whenever at least one such category exists.
            candidates = all_indices[~np.isin(all_indices, session_indices)]
            if k > 0 and candidates.size > 0:
                negatives = rng.choice(candidates, size=(len(positive_pairs), k), replace=True)
            else:
                negatives = None

            for pair_no, (anchor_idx, context_idx) in enumerate(positive_pairs):
                data.append((machine_idx, anchor_idx, context_idx, 1))  # Label 1 for positive
                if negatives is not None:
                    data.extend(
                        (machine_idx, anchor_idx, int(negative_idx), 0)  # Label 0 for negative
                        for negative_idx in negatives[pair_no]
                    )

        if processed_count % report_every == 0 or processed_count == total_count:
            elapsed = time.time() - start_pair_gen
            rate = processed_count / elapsed if elapsed > 0 else 0
            eta = (total_count - processed_count) / rate if rate > 0 else 0
            print(f"\r      Processed {processed_count}/{total_count} sessions... ({rate:.1f} sess/sec, ETA: {eta:.1f}s)", end="")

    print()  # Newline after progress bar
    return data


def prepare_training_data(df_trans, category_map, machine_map, test_size=0.2, neg_samples=5, random_state=42):
    """
    Groups transactions by session (machine_id, site_session_id), splits sessions
    into train/test sets, and generates positive/negative (machine, category, category)
    tuples for training.

    Args:
        df_trans (pd.DataFrame): Transactions with machine_id, site_session_id
            and prod_category_id columns. NOTE: it is modified in place
            ('category_idx' and 'machine_idx' columns are added and rows whose
            IDs are not in the maps are dropped).
        category_map (dict): Original category ID -> category index.
        machine_map (dict): Original machine ID -> machine index.
        test_size (float): Passed to ``train_test_split`` as the test share.
        neg_samples (int): Negative examples generated per positive pair.
        random_state: Seed for the session split AND for negative sampling, so
            the same inputs and seed give the same tuples. An int, None, or an
            existing numpy ``RandomState``/``Generator``.

    Returns:
        tuple: ``(train_data, test_data)``, two lists of
        ``(machine_idx, anchor_idx, context_idx, label)`` tuples where label is
        1 for a co-purchased pair and 0 for a sampled negative.

    Raises:
        ValueError: If a map is empty, no rows survive the index mapping, or
            no session contains more than one distinct category.
        RuntimeError: If the train/test split fails (including too few sessions).
    """
    print("\nPreparing training data...")
    start_time = time.time()

    if not category_map:
        raise ValueError("Category map is empty or None. Cannot proceed.")
    if not machine_map:
        raise ValueError("Machine map is empty or None. Cannot proceed.")

    print("Step 1: Mapping category and machine IDs to indices...")
    df_trans['category_idx'] = df_trans['prod_category_id'].map(category_map)
    df_trans['machine_idx'] = df_trans['machine_id'].map(machine_map)

    rows_before_drop = len(df_trans)
    # Drop rows where mapping failed for either category or machine
    df_trans.dropna(subset=['category_idx', 'machine_idx'], inplace=True)
    rows_after_drop = len(df_trans)
    if rows_before_drop > rows_after_drop:
        print(f"  Warning: Dropped {rows_before_drop - rows_after_drop} transaction rows due to missing category or machine index mapping.")

    if df_trans.empty:
        raise ValueError("No transactions remaining after category and machine index mapping.")

    df_trans['category_idx'] = df_trans['category_idx'].astype(int)
    df_trans['machine_idx'] = df_trans['machine_idx'].astype(int)

    # 2. Group by session and collect the distinct category indices per session.
    print("Step 2: Grouping transactions by session identifier (machine_id, site_session_id)...")
    # machine_id is part of the group key, so machine_idx is constant within a
    # group and taking the first value is enough.
    session_data = df_trans.groupby(['machine_id', 'site_session_id'], observed=True, sort=False).agg(
        categories=('category_idx', lambda x: list(x.unique())),
        machine_idx=('machine_idx', 'first')
    )

    # A session with a single distinct category yields no co-purchase pair.
    session_data = session_data[session_data['categories'].map(len) > 1]
    num_multi_item_sessions = len(session_data)
    print(f"  Found {num_multi_item_sessions} sessions with purchases of multiple distinct categories.")

    if session_data.empty:
        raise ValueError("No sessions found with multiple distinct categories purchased. Cannot generate training pairs.")

    # 3. Split Sessions into Train/Test
    print(f"Step 3: Splitting {num_multi_item_sessions} sessions into train/test ({1-test_size:.0%}/{test_size:.0%})...")
    session_ids = session_data.index.tolist()  # List of (machine_id, site_session_id) tuples
    try:
        min_sessions_for_split = 2  # Need at least one for train and one for test
        if len(session_ids) < min_sessions_for_split:
            raise ValueError(f"Too few sessions ({len(session_ids)}) with multiple items to perform train/test split.")

        train_session_ids, test_session_ids = train_test_split(
            session_ids,
            test_size=test_size,
            random_state=random_state,
            stratify=None
        )
    except Exception as e:
        raise RuntimeError(f"Error during train/test split: {e}. Check if there are enough sessions ({len(session_ids)} found).") from e

    train_sessions_data = session_data.loc[train_session_ids]
    test_sessions_data = session_data.loc[test_session_ids]
    print(f"  Train sessions: {len(train_sessions_data)}, Test sessions: {len(test_sessions_data)}")

    # 4. Generate Positive and Negative Pairs (including machine index)
    print(f"Step 4: Generating positive and negative tuples (k={neg_samples})...")
    all_category_indices = np.unique(np.fromiter(category_map.values(), dtype=np.int64))

    # Use a dedicated generator instead of the global np.random state, so that
    # `random_state` also makes the negative samples reproducible.
    if isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    print("  Generating train tuples...")
    train_data = _generate_session_tuples(train_sessions_data, all_category_indices, neg_samples, rng)
    print("  Generating test tuples...")
    test_data = _generate_session_tuples(test_sessions_data, all_category_indices, neg_samples, rng)

    print(f"Data preparation finished in {time.time() - start_time:.2f} seconds.")
    print(f"Generated {len(train_data)} training examples and {len(test_data)} testing examples.")

    return train_data, test_data


def get_category_name(cat_id, lookup):
    """
    Returns the category name from the lookup, or an ID placeholder if not found.

    The lookup may be keyed by integers or by strings, so the ID is tried as an
    integer first (when it converts cleanly) and then as a string.

    Args:
        cat_id: Original category ID (int-like or string).
        lookup (dict or None): Mapping from category ID to category name.

    Returns:
        str: The category name, ``"ID: <cat_id>"`` when no lookup is available,
        or ``"ID: <cat_id> (Name not found)"`` when the ID is not in the lookup.
    """
    if lookup is None:
        return f"ID: {cat_id}"

    candidate_keys = []
    try:
        candidate_keys.append(int(cat_id))
    except (ValueError, TypeError):
        pass  # Not int-like: only the string form can match.
    candidate_keys.append(str(cat_id))

    # Fall through to the string key on a plain miss as well, not only when
    # int() fails; otherwise string-keyed lookups never match int-like IDs.
    for key in candidate_keys:
        if key in lookup:
            return lookup[key]
    return f"ID: {cat_id} (Name not found)"


# --- PyTorch Dataset ---
class MachineCoOccurrenceDataset(Dataset):
    """Tensor-backed dataset of (machine_idx, anchor_idx, context_idx, label) samples."""

    def __init__(self, data):
        """
        Splits a list of (machine_idx, anchor_idx, context_idx, label) tuples
        into four parallel tensors. The three index tensors are stored as
        torch.long and the labels as torch.float. Empty input emits a
        RuntimeWarning and produces zero-length tensors of the same dtypes.
        """
        if data:
            # Pull each tuple position out into its own column before
            # building the tensors.
            columns = [[sample[position] for sample in data] for position in range(4)]
            self.machines = torch.tensor(columns[0], dtype=torch.long)
            self.anchors = torch.tensor(columns[1], dtype=torch.long)
            self.contexts = torch.tensor(columns[2], dtype=torch.long)
            self.labels = torch.tensor(columns[3], dtype=torch.float)
            del columns  # Release the intermediate Python lists
        else:
            warnings.warn("MachineCoOccurrenceDataset initialized with empty data.", RuntimeWarning)
            self.machines = torch.empty(0, dtype=torch.long)
            self.anchors = torch.empty(0, dtype=torch.long)
            self.contexts = torch.empty(0, dtype=torch.long)
            self.labels = torch.empty(0, dtype=torch.float)

    def __len__(self):
        """Number of samples, taken from the first dimension of the label tensor."""
        return self.labels.size(0)

    def __getitem__(self, idx):
        """Returns (machine, anchor, context, label) for the given index."""
        return (self.machines[idx], self.anchors[idx],
                self.contexts[idx], self.labels[idx])


# --- PyTorch Model ---
class MachineCategoryBundleModel(nn.Module):
    """
    Embedding model that scores category co-occurrence with machine context.

    The score for (machine, anchor, context) is the dot product between the
    context-category embedding and the sum of the anchor-category and machine
    embeddings.
    """

    def __init__(self, num_categories, num_machines, embedding_dim):
        super().__init__()
        for count, message in (
            (num_categories, "Number of categories must be positive."),
            (num_machines, "Number of machines must be positive."),
        ):
            if count <= 0:
                raise ValueError(message)

        self.num_categories = num_categories
        self.num_machines = num_machines
        self.embedding_dim = embedding_dim

        # Separate tables for a category in its anchor role and in its context
        # role, plus one table for machines. All share embedding_dim so the
        # machine vector can be added to the anchor vector.
        self.anchor_embedding = nn.Embedding(num_categories, embedding_dim)
        self.context_embedding = nn.Embedding(num_categories, embedding_dim)
        self.machine_embedding = nn.Embedding(num_machines, embedding_dim)

        self.init_weights()

    def init_weights(self):
        """Re-initialises all three embedding tables uniformly in ±(0.1 / embedding_dim)."""
        bound = 0.1 / self.embedding_dim
        for table in (self.anchor_embedding, self.context_embedding, self.machine_embedding):
            nn.init.uniform_(table.weight, -bound, bound)

    def forward(self, machine_idx, anchor_cat_idx, context_cat_idx):
        """
        Scores each (machine, anchor, context) triple in the batch.

        Args:
            machine_idx (Tensor): Machine indices, shape (batch_size).
            anchor_cat_idx (Tensor): Anchor category indices, shape (batch_size).
            context_cat_idx (Tensor): Context category indices, shape (batch_size).
        Returns:
            Tensor: Raw co-occurrence scores (logits, no sigmoid applied),
            shape (batch_size).
        """
        # Machine context is folded in by simple addition to the anchor vector.
        anchor_with_machine = self.anchor_embedding(anchor_cat_idx) + self.machine_embedding(machine_idx)
        context_vectors = self.context_embedding(context_cat_idx)

        # Row-wise dot product: multiply element-wise, then sum over the embedding axis.
        return (anchor_with_machine * context_vectors).sum(dim=1)

# --- Training Function ---
def _batch_to_device(batch, device):
    """Moves every tensor of a (machine, anchor, context, labels) batch to `device`."""
    return tuple(tensor.to(device, non_blocking=True) for tensor in batch)


def train_model(model, train_loader, test_loader, criterion, optimizer, num_epochs, device):
    """
    Trains the model and evaluates it on the test set after each epoch.

    Args:
        model (nn.Module): Called as ``model(machine_idx, anchor_idx, context_idx)``
            and expected to return logits.
        train_loader: Sized iterable of (machine_idx, anchor_idx, context_idx,
            labels) tensor batches used for optimisation.
        test_loader: Sized iterable of batches of the same layout used for
            evaluation; may be empty.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer over the model's parameters.
        num_epochs (int): Number of passes over ``train_loader``.
        device (torch.device): Device the model and batches are moved to.

    Returns:
        tuple: ``(train_losses, test_losses, test_accuracies)`` with one entry
        per epoch. Losses are averaged over the batches that produced a finite
        loss (NaN if there were none); accuracy is a percentage using a 0.5
        threshold on the sigmoid of the logits. Three empty lists are returned
        when ``train_loader`` is empty.
    """
    print("\n--- Starting Training ---")
    model.to(device)  # Move model to the specified device

    train_losses = []
    test_losses = []
    test_accuracies = []

    total_batches_train = len(train_loader)
    total_batches_test = len(test_loader)

    if total_batches_train == 0:
        print("Warning: Train loader is empty. Skipping training loop.")
        return [], [], []

    print(f"Training for {num_epochs} epochs with {total_batches_train} train batches and {total_batches_test} test batches per epoch.")
    progress_every = max(1, total_batches_train // 10)  # Print ~10 times per epoch

    for epoch in range(num_epochs):
        # --- Training Phase ---
        model.train()
        running_loss = 0.0
        finite_train_batches = 0
        start_time_epoch = time.time()

        for i, batch in enumerate(train_loader):
            machine_idx, anchor_idx, context_idx, labels = _batch_to_device(batch, device)

            optimizer.zero_grad()
            outputs = model(machine_idx, anchor_idx, context_idx)
            loss = criterion(outputs, labels)

            # A NaN/Inf loss would poison the weights, so skip the update.
            if not torch.isfinite(loss):
                warnings.warn(f"Warning: Non-finite loss detected at epoch {epoch+1}, batch {i+1}. Skipping batch update.", RuntimeWarning)
                continue

            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            finite_train_batches += 1

            if (i + 1) % progress_every == 0:
                print(f'\rEpoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_batches_train}], Batch Loss: {loss.item():.4f}', end="")

        # Average only over batches that contributed to running_loss; dividing
        # by the total batch count would under-report the loss whenever a
        # batch was skipped.
        epoch_loss = running_loss / finite_train_batches if finite_train_batches > 0 else float('nan')
        train_losses.append(epoch_loss)
        epoch_time = time.time() - start_time_epoch
        print()  # Newline after epoch progress

        # --- Evaluation Phase ---
        model.eval()
        current_test_loss = 0.0
        finite_test_batches = 0
        correct_predictions = 0
        total_predictions = 0

        if total_batches_test == 0:
            print("  Warning: Test loader is empty. Skipping evaluation for this epoch.")
            avg_test_loss = float('nan')  # Indicate missing evaluation
            accuracy = float('nan')
        else:
            with torch.no_grad():  # No gradients needed for evaluation
                for batch in test_loader:
                    machine_idx, anchor_idx, context_idx, labels = _batch_to_device(batch, device)

                    outputs = model(machine_idx, anchor_idx, context_idx)
                    loss = criterion(outputs, labels)

                    if torch.isfinite(loss):
                        current_test_loss += loss.item()
                        finite_test_batches += 1
                    else:
                        warnings.warn(f"Warning: Non-finite loss encountered during evaluation epoch {epoch+1}.", RuntimeWarning)

                    # Threshold the sigmoid probability at 0.5 to get a class.
                    predicted = torch.sigmoid(outputs) >= 0.5
                    total_predictions += labels.size(0)
                    correct_predictions += (predicted.byte() == labels.byte()).sum().item()

            avg_test_loss = current_test_loss / finite_test_batches if finite_test_batches > 0 else float('nan')
            accuracy = (correct_predictions / total_predictions) * 100 if total_predictions > 0 else 0.0

        test_losses.append(avg_test_loss)
        test_accuracies.append(accuracy)

        # Print epoch summary
        print(f'Epoch [{epoch+1}/{num_epochs}] completed in {epoch_time:.2f}s.')
        print(f'  Avg Train Loss: {epoch_loss:.4f}')
        print(f'  Avg Test Loss: {avg_test_loss:.4f}, Test Accuracy: {accuracy:.2f}%')

    print("--- Training Finished ---")
    return train_losses, test_losses, test_accuracies

# --- Recommendation Function ---
def recommend_bundles_pytorch(input_machine_id, input_category_id, model,
                              machine_map, category_map, reverse_category_map,
                              num_categories, num_recommendations, device, category_lookup=None):
    """
    Recommends product categories to bundle with the input category for a specific machine,
    using the trained model.

    The model is called as ``model(machine_idx, anchor_idx, context_idx)`` with three 1-D
    long tensors of equal length, once per batch of candidate context categories. The
    values it returns are used as ranking scores exactly as produced (no sigmoid is
    applied here), so the printed "Score" is whatever scale the model outputs.

    Side effects: switches ``model`` to eval mode, moves it to ``device``, and prints
    progress plus the final recommendations.

    Args:
        input_machine_id (int): The original ID of the machine to get recommendations for.
        input_category_id (int): The original ID of the category to get recommendations for.
        model (nn.Module): The trained model, callable as described above.
        machine_map (dict): Mapping from original machine ID to model index.
        category_map (dict): Mapping from original category ID to model index.
        reverse_category_map (dict): Mapping from model index to original category ID.
            Indices missing from this mapping are skipped.
        num_categories (int): Number of category indices (0 .. num_categories-1) to score.
        num_recommendations (int): The number of recommendations to return. Values <= 0
            yield an empty list.
        device (torch.device): The device (CPU or CUDA) to run inference on.
        category_lookup (dict, optional): Mapping from original category ID to category name,
            forwarded to ``get_category_name`` for display only. Defaults to None.

    Returns:
        list: Recommended original category IDs, best score first. Empty when either
        input ID is unknown or no candidate remains.

    Raises:
        RuntimeError: If the model does not return exactly one score per context index.
    """
    model.eval() # Ensure model is in evaluation mode
    model.to(device) # Move model to the correct device

    # Validate inputs
    if input_machine_id not in machine_map:
        print(f"\nError: Input machine ID {input_machine_id} was not found in the model's machine map (likely not seen in training data).")
        print(f"Available machine IDs in map: {list(machine_map.keys())[:10]}...") # Show a few examples
        return []
    if input_category_id not in category_map:
        print(f"\nError: Input category ID {input_category_id} was not found in the model's category map (likely not seen in training data).")
        print(f"Available category IDs in map: {list(category_map.keys())[:10]}...") # Show a few examples
        return []

    input_machine_idx = machine_map[input_machine_id]
    input_category_idx = category_map[input_category_id]

    print(f"\nGenerating recommendations for Machine ID: {input_machine_id} (Index: {input_machine_idx})")
    print(f"  and Category: {get_category_name(input_category_id, category_lookup)} (ID: {input_category_id}, Index: {input_category_idx})")

    # Build the index tensors directly on the target device (avoids a CPU allocation + copy)
    machine_idx_tensor = torch.tensor([input_machine_idx], dtype=torch.long, device=device)
    anchor_idx_tensor = torch.tensor([input_category_idx], dtype=torch.long, device=device)

    # Every category index is a candidate context category
    all_context_indices = torch.arange(num_categories, dtype=torch.long, device=device)

    # Predict scores in batches to manage memory if num_categories is large
    all_scores = []
    eval_batch_size = 8192 # Adjust based on GPU memory and num_categories
    print(f"  Predicting scores against {num_categories} potential categories...")
    with torch.no_grad(): # Disable gradient calculation for inference
        for start in range(0, num_categories, eval_batch_size):
            batch_context = all_context_indices[start : start + eval_batch_size]
            batch_size = batch_context.size(0)

            # Repeat the single machine / anchor index so all three inputs have equal length
            batch_machine = machine_idx_tensor.repeat(batch_size)
            batch_anchor = anchor_idx_tensor.repeat(batch_size)

            batch_scores = model(batch_machine, batch_anchor, batch_context)
            # Flatten before collecting: a model that squeezes its output returns a 0-dim
            # tensor for a batch of one (which torch.cat rejects), and a (batch, 1) output
            # would otherwise yield per-category arrays instead of scalars.
            all_scores.append(batch_scores.reshape(-1).cpu()) # Move scores to CPU immediately

    # Plain Python floats keep the sorting and formatting below independent of tensor dtype
    scores = torch.cat(all_scores).tolist() if all_scores else []
    print("  Finished predicting scores.")

    # scores[i] must line up with context index i, otherwise the ranking is meaningless
    if len(scores) != num_categories:
        raise RuntimeError(
            f"Model returned {len(scores)} scores for {num_categories} context categories; "
            "expected exactly one score per category."
        )

    # Create pairs of (original_category_id, score)
    category_scores = []
    for idx, score in enumerate(scores):
        # Exclude the input category itself from recommendations
        if idx == input_category_idx:
            continue
        original_cat_id = reverse_category_map.get(idx)
        if original_cat_id is not None:
            category_scores.append((original_cat_id, score))

    # Sort categories by predicted score in descending order (stable: ties keep index order)
    category_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp the count: a negative value would otherwise slice from the end of the list
    top_n = None if num_recommendations is None else max(num_recommendations, 0)
    top_recs = category_scores[:top_n]

    print(f"\nTop {len(top_recs)} recommended categories to bundle:")
    recommendations = []
    if not top_recs:
        print("  No recommendations found (or all other categories had low predicted scores).")
    else:
        for cat_id, score in top_recs:
            recommendations.append(cat_id)
            print(f"  - {get_category_name(cat_id, category_lookup)} (ID: {cat_id}) (Score: {score:.4f})")

    return recommendations



In [None]:

# ===================
# --- Main Script ---
# ===================

import gc  # Used below to reclaim memory after the large intermediates are deleted

print("Starting Product Bundle Recommender Script (with Machine Context)")
print("="*60)
overall_start_time = time.time()

# 0. Setup Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"  CUDA Device Name: {torch.cuda.get_device_name(0)}")

# 1. Load Data and Mappings
# This step reads transactions.csv and prepares category AND machine mappings
(df_transactions,
 cat_map, rev_cat_map, cat_lookup, n_categories,
 mach_map, rev_mach_map, n_machines) = load_data(
    TRANSACTIONS_FILE, CATEGORIES_FILE
)

# 2. Prepare Data for Training
# This step groups transactions by session, splits sessions, and generates (machine, anchor, context, label) tuples
train_tuples, test_tuples = prepare_training_data(
    df_trans=df_transactions, # Pass the loaded dataframe
    category_map=cat_map,
    machine_map=mach_map, # Pass the machine map
    test_size=TEST_SET_SIZE,
    neg_samples=NEGATIVE_SAMPLES_PER_POSITIVE,
    random_state=RANDOM_STATE
)
# Clear the large transactions dataframe now that tuples are generated
del df_transactions
gc.collect()


# Check if data generation was successful
if not train_tuples:
    print("\nError: No training data tuples were generated. This might happen if there are no sessions with multiple category purchases.")
    # Raise SystemExit instead of calling exit(): `exit` is the interactive-shell helper
    # installed by `site` (and replaced by IPython in notebooks), so it is not a reliable
    # way to stop execution here. As a plain script this still terminates with status 1.
    raise SystemExit(1)

# 3. Create Datasets and DataLoaders
print("\nCreating Datasets and DataLoaders...")
start_dataloader_time = time.time()
train_dataset = MachineCoOccurrenceDataset(train_tuples) # Use the new dataset class
test_dataset = MachineCoOccurrenceDataset(test_tuples)
# Clear tuple lists to save memory
del train_tuples, test_tuples
gc.collect()


# Determine num_workers based on platform/device
num_workers = 0 # Often safer default, especially with GPU and Windows
# Adjust based on your system's capabilities and testing
# if device.type == 'cpu' and os.name != 'nt':
#       num_workers = max(1, os.cpu_count() // 2) # Example heuristic for CPU on non-Windows

print(f"  Using {num_workers} workers for DataLoaders.")
# Pinned host memory only helps when batches are copied to a CUDA device
pin_memory_flag = device.type == 'cuda'

# persistent_workers is only valid when worker processes are actually used
persist_workers_flag = num_workers > 0

# Options shared by both loaders; only `shuffle` differs between train and test
loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
    pin_memory=pin_memory_flag,
    persistent_workers=persist_workers_flag,
)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)
print(f"  DataLoaders created in {time.time() - start_dataloader_time:.2f} seconds.")



Starting Product Bundle Recommender Script (with Machine Context)
Using device: cpu
Loading data...
  Transactions file: transactions.csv
  Categories file: product_categories.csv
  Processing transaction file in chunks...
    Processed chunk 1...
    Processed chunk 2...
    Processed chunk 3...
Loaded 2308972 transaction items from 3 chunks.
  Creating category mappings...
Found 222 unique product categories.
  Creating machine mappings...
Found 103347 unique machine IDs.
  Loading category names from product_categories.csv...
Data loading finished in 5.78 seconds.

Preparing training data...
Step 1: Mapping category and machine IDs to indices...
Step 2: Grouping transactions by session identifier (machine_id, site_session_id)...
  Found 205802 sessions with purchases of multiple distinct categories.
Step 3: Splitting 205802 sessions into train/test (80%/20%)...
  Train sessions: 164641, Test sessions: 41161
Step 4: Generating positive and negative tuples (k=5)...
  Generating train 

In [None]:

import random  # Needed for the sample pick below; imported here so this cell is self-contained

# NOTE: overrides the NUM_EPOCHS value set in the configuration section for this run
NUM_EPOCHS = 3

# 4. Initialize Model, Loss, Optimizer
print("\nInitializing Model, Loss Function, and Optimizer...")
model = MachineCategoryBundleModel( # Use the new model class
    num_categories=n_categories,
    num_machines=n_machines, # Pass the number of machines
    embedding_dim=EMBEDDING_DIM
)
criterion = nn.BCEWithLogitsLoss() # Numerically stable loss for binary classification with logits
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
print(f"  Model: {model.__class__.__name__} with {EMBEDDING_DIM}-dim embeddings for {n_categories} categories and {n_machines} machines.")
print(f"  Optimizer: Adam (LR={LEARNING_RATE})")
print("  Loss Function: BCEWithLogitsLoss")


# 5. Train the Model and get loss history
train_loss_history, test_loss_history, test_accuracy_history = train_model(
    model, train_loader, test_loader, criterion, optimizer, NUM_EPOCHS, device
)

# --- Plotting ---
if train_loss_history or test_loss_history: # Only plot if training actually ran
    print("\n--- Plotting Training and Test Curves ---")
    # Each series gets its own x-range so a history of a different length
    # (e.g. an empty test history) cannot make plt.plot raise on mismatched sizes.
    epochs = range(1, len(train_loss_history) + 1)
    test_epochs = range(1, len(test_loss_history) + 1)
    plt.figure(figsize=(12, 5.5)) # Slightly adjusted figure size

    # Plot Loss
    plt.subplot(1, 2, 1)
    plt.plot(epochs, train_loss_history, 'bo-', label='Training Loss')
    plt.plot(test_epochs, test_loss_history, 'ro-', label='Test Loss')
    plt.title('Training and Test Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss (BCEWithLogits)')
    plt.legend()
    plt.grid(True)

    # Plot Accuracy
    if test_accuracy_history: # Only plot if accuracy was calculated
        accuracy_epochs = range(1, len(test_accuracy_history) + 1)
        plt.subplot(1, 2, 2)
        plt.plot(accuracy_epochs, test_accuracy_history, 'go-', label='Test Accuracy')
        plt.title('Test Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy (%)')
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.savefig("training_curves_machine_model.png") # Save the plot
    print("Saved training curves plot to 'training_curves_machine_model.png'")
    # plt.show() # Optionally display plot


# --- Example Recommendation ---
print("\n--- Generating Example Recommendation ---")
if not mach_map:
    print("Machine map is empty, cannot generate recommendations.")
elif not cat_map:
    print("Category map is empty, cannot generate recommendations.")
else:
    # Pick a sample machine ID and category ID from the training data maps for demonstration.
    # A dedicated, seeded generator keeps the example reproducible across runs without
    # touching the global `random` state.
    sample_rng = random.Random(RANDOM_STATE)
    try:
        # Pick a machine ID that was actually used in training/mapping
        available_machine_ids = list(mach_map.keys())
        if not available_machine_ids:
            raise ValueError("Machine map keys are empty.")
        target_machine_id = sample_rng.choice(available_machine_ids)

        # Pick a category ID that was actually used in training/mapping
        available_category_ids = list(cat_map.keys())
        if not available_category_ids:
            raise ValueError("Category map keys are empty.")
        target_category_id = sample_rng.choice(available_category_ids)

        print(f"Selected sample machine ID: {target_machine_id}")
        print(f"Selected sample category ID: {target_category_id}")

        # Generate recommendations
        recommended_ids = recommend_bundles_pytorch(
            input_machine_id=target_machine_id,
            input_category_id=target_category_id,
            model=model,
            machine_map=mach_map,
            category_map=cat_map,
            reverse_category_map=rev_cat_map,
            num_categories=n_categories,
            num_recommendations=RECOMMENDATION_COUNT,
            device=device,
            category_lookup=cat_lookup
        )

    except IndexError:
        print("Could not select a random machine/category ID (maps might be empty after all).")
    except ValueError as e:
        print(f"Error selecting sample IDs: {e}")
    except Exception as e:
        # Best-effort demo: report the failure and still reach the timing summary below
        print(f"An unexpected error occurred during recommendation generation: {e}")


# --- Optional: Save Model ---
# print("\n--- Saving Model ---")
# model_save_path = "machine_category_recommender_model.pth"
# try:
#     torch.save(model.state_dict(), model_save_path)
#     print(f"Model state dictionary saved to {model_save_path}")
# except Exception as e:
#     print(f"Error saving model: {e}")


overall_end_time = time.time()
print("\n" + "="*60)
print(f"Script finished in {overall_end_time - overall_start_time:.2f} seconds.")


Initializing Model, Loss Function, and Optimizer...


NameError: name 'MachineCategoryBundleModel' is not defined

In [None]:
# Re-runs the example recommendation. This cell relies on objects created by earlier
# cells (the trained `model`, `device`, and the sampled target IDs), so check for them
# up front and fail before the expensive data reload rather than after it.
_required_names = ("model", "device", "target_machine_id", "target_category_id")
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Run the model training / example recommendation cell first; undefined name(s): "
        + ", ".join(_missing_names)
    )

# Reload the data and rebuild the ID <-> index mappings.
# NOTE(review): the rebuilt maps must assign the same indices the model was trained
# with — presumably load_data is deterministic for an unchanged file; confirm.
# NOTE(review): this also re-creates the large df_transactions frame that an earlier
# cell deleted to save memory.
(df_transactions,
 cat_map, rev_cat_map, cat_lookup, n_categories,
 mach_map, rev_mach_map, n_machines) = load_data(
    TRANSACTIONS_FILE, CATEGORIES_FILE
)


recommended_ids = recommend_bundles_pytorch(
    input_machine_id=target_machine_id,
    input_category_id=target_category_id,
    model=model,
    machine_map=mach_map,
    category_map=cat_map,
    reverse_category_map=rev_cat_map,
    num_categories=n_categories,
    num_recommendations=RECOMMENDATION_COUNT,
    device=device,
    category_lookup=cat_lookup
)