<a href="https://colab.research.google.com/github/vaghemanth/RecSys_Projects/blob/main/Multi_faceted_recomender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Get the required Libraries

In [22]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
from itertools import permutations
import re
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from warnings import filterwarnings
filterwarnings('ignore')

# Added for plotting the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

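<b>Model:</b><br>
This is a hybrid (cascaded) model: the scores of several recommenders are combined in a weighted sum, and popularity-based fallbacks fill the list whenever fewer than five candidates are found.

The components are:
User-Based Filtering (time-weighted purchase history), Item-to-Item Co-occurrence, Content-Based Filtering, Matrix Factorization (SVD), and Fallback Models (category popularity, then global popularity).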

In [23]:
def build_models(df, decay_rate=0.01, n_components=20, n_iter=5,
                 frequency_power=1.0):
    """
    Builds all necessary models, with tunable internal parameters.

    This function trains several recommendation models including:
    - Time-Weighted User History
    - Item-to-Item Co-occurrence
    - Content-Based Filtering
    - SVD Matrix Factorization
    - Fallback Popularity Models

    The input frame is copied before any column is added or converted, so
    the caller's DataFrame is left untouched.

    Args:
        df (pd.DataFrame): The training dataframe with all historical orders.
            Expected columns: 'Member', 'Order', 'SKU', 'Delivery Date', 'Name'.
            'Delivery Date' is parsed with the format '%d/%m/%y'.
        decay_rate (float): The rate for the time-decay function. Higher values
                            mean more recent purchases have a stronger influence.
        n_components (int): The number of latent factors for SVD. Represents the
                            dimensionality of the reduced space for users and items.
                            Clamped to the range [1, number of distinct SKUs].
        n_iter (int): The number of iterations for the SVD solver. More iterations
                      can lead to better convergence but take longer.
        frequency_power (float): A power to apply to the time-weighted scores
                                 to adjust for purchase frequency. Values > 1.0
                                 will amplify the scores of frequently purchased
                                 items; values < 1.0 will dampen them.

    Returns:
        dict: A dictionary containing all the trained models, with keys:
              - 'user_history': {(member, sku): time-weighted score}
              - 'co_occurrence': {sku: Counter({other_sku: count / max count})}
              - 'content': {sku: category name, or 'other'}
              - 'popularity': list of SKUs, most purchased first
              - 'category_popularity': {category: SKUs, most purchased first}
              - 'svd': dict with 'predictions' (row-normalized user x SKU
                score matrix), 'user_map', 'sku_map' and 'reverse_sku_map'.
    """
    print(
        f"Building models with decay={decay_rate}, n_comp={n_components}, "
        f"n_iter={n_iter}, freq_power={frequency_power}..."
    )
    # Work on a private copy: the helper columns added below must not leak
    # into the caller's DataFrame.
    df = df.copy()
    # Ensure 'Delivery Date' is in datetime format
    df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], format='%d/%m/%y')

    # Popular items - for default fall back
    # SKU purchase counts, most frequent first; reused for category ranking
    sku_counts = df['SKU'].value_counts()
    global_popular_skus = sku_counts.index.tolist()

    # Time-Weighted User History
    # Age of every purchase relative to the newest delivery in the dataset
    latest_date = df['Delivery Date'].max()
    df['days_since_purchase'] = (latest_date - df['Delivery Date']).dt.days
    # Exponential decay: a purchase made today scores 1, older ones less
    df['time_score'] = np.exp(-decay_rate * df['days_since_purchase'])
    # Aggregate time scores for each user-SKU pair
    user_sku_scores = df.groupby(['Member', 'SKU'])['time_score'].sum().reset_index()

    # Apply the frequency_power hyperparameter to adjust scores
    user_sku_scores['time_score'] = np.power(
        user_sku_scores['time_score'], frequency_power
    )
    # Convert user-SKU scores to a dictionary for quick lookup
    user_history_scores = user_sku_scores.set_index(
        ['Member', 'SKU']
    )['time_score'].to_dict()

    # Item-to-Item Co-occurrence
    # Count, for every ordered pair of items, how often they share an order
    co_occurrence_model = defaultdict(Counter)
    order_items = df.groupby('Order')['SKU'].apply(list)
    for items in order_items:
        for item1, item2 in permutations(items, 2):
            co_occurrence_model[item1][item2] += 1

    # Normalize co-occurrence counts by the global maximum so they lie in (0, 1]
    max_count = max(
        (c.most_common(1)[0][1] for c in co_occurrence_model.values() if c),
        default=0
    )
    normalized_co_occurrence_model = defaultdict(Counter)
    if max_count > 0:
        for item1, counters in co_occurrence_model.items():
            for item2, count in counters.items():
                normalized_co_occurrence_model[item1][item2] = count / max_count

    # Content-Based Filtering
    # Define keyword mappings for categories
    CATEGORY_KEYWORDS = {
        'vegetables': ['vegetables', 'brinjals', 'gourd', 'cucumber', 'f&v'],
        'dals & pulses': ['dal', 'dals', 'beans', 'pulses', 'peanuts'],
        'flour & grains': ['flour', 'flours', 'sooji', 'rava', 'maida',
                           'basmati', 'avalakki', 'poha', 'rice', 'besan',
                           'grains', 'oats'],
        'spices & sweeteners': ['sugar', 'jaggery', 'spices', 'masalas',
                                'honey', 'salt'],
        'oils & ghee': ['oils', 'ghee', 'oil'],
        'dairy & eggs': ['yogurt', 'lassi', 'curd', 'buttermilk', 'milk',
                         'butter', 'cream', 'eggs'],
        'bakery & cakes': ['bread', 'buns', 'pavs', 'cakes'],
        'snacks & biscuits': ['chips', 'cookies', 'biscuits', 'namkeen',
                              'wafers', 'snacks', 'snacky'],
        'nuts & dry fruits': ['cashews', 'almonds', 'raisins', 'dry fruits'],
        'instant food & noodles': ['vermicelli', 'noodles', 'pasta', 'pastas',
                                   'ready mix', 'baking mixes'],
        'sauces & spreads': ['sauces', 'paste', 'ketchup', 'spreads'],
        'beverages': ['juices', 'coffee', 'drinks', 'water'],
        'frozen & desserts': ['ice creams', 'desserts', 'frozen'],
        'personal care': ['bandages', 'soaps', 'wash', 'hair oil', 'shaving',
                          'razors', 'pads', 'polish', 'toothpaste', 'dyes',
                          'body wash'],
        'household & cleaning': ['repellent', 'foil', 'wrap', 'pooja',
                                 'cleaners', 'phenyles', 'acids',
                                 'agarbatti', 'detergent', 'brooms'],
        'sweets & candy': ['sweets', 'rasagulla', 'gulab jamun', 'toffee',
                           'candy'],
        'health': ['health drinks', 'supplements'],
        'baby care': ['diapers', 'wipes']
    }
    # One pre-compiled whole-word pattern per category. Keywords are escaped
    # so that punctuation inside them (e.g. 'f&v') is always matched literally.
    category_patterns = {
        category: re.compile(
            r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')\b'
        )
        for category, keywords in CATEGORY_KEYWORDS.items()
    }
    # Create a mapping from SKU to item name
    sku_to_name = df[['SKU', 'Name']].drop_duplicates().set_index(
        'SKU'
    )['Name'].to_dict()
    # Assign each SKU the first category (in dictionary order) whose keywords
    # appear in its name; anything unmatched stays in 'other'.
    sku_to_category, category_to_skus = {}, defaultdict(list)
    for sku, name in sku_to_name.items():
        # A missing name (NaN) is not a string; treat it as uncategorized
        name_lower = name.lower() if isinstance(name, str) else ''
        assigned_category = 'other'  # Default category
        for category, pattern in category_patterns.items():
            if pattern.search(name_lower):
                assigned_category = category
                break
        sku_to_category[sku] = assigned_category
        if assigned_category != 'other':
            category_to_skus[assigned_category].append(sku)

    # Rank the SKUs of every category by overall purchase count
    item_popularity = sku_counts.to_dict()
    category_popular_items = {
        cat: sorted(
            skus, key=lambda sku: item_popularity.get(sku, 0), reverse=True
        ) for cat, skus in category_to_skus.items()
    }

    # SVD Matrix Factorization
    # Create mappings for users and SKUs to integer IDs for matrix factorization
    user_ids = {
        user: i for i, user in enumerate(user_sku_scores['Member'].unique())
    }
    sku_ids = {
        sku: i for i, sku in enumerate(user_sku_scores['SKU'].unique())
    }
    # Create a sparse (users x SKUs) matrix from user-SKU scores
    sparse_matrix = csr_matrix((
        user_sku_scores['time_score'],
        (
            user_sku_scores['Member'].map(user_ids),
            user_sku_scores['SKU'].map(sku_ids)
        )
    ))

    # TruncatedSVD rejects more components than there are features (SKUs),
    # so clamp the request instead of failing on small datasets.
    n_features = sparse_matrix.shape[1]
    effective_components = max(1, min(int(n_components), n_features))
    if effective_components != n_components:
        print(
            f"n_components={n_components} adjusted to {effective_components} "
            f"for a matrix with {n_features} SKUs."
        )

    # Apply Truncated SVD to decompose the sparse matrix into user and item factors
    svd = TruncatedSVD(
        n_components=effective_components, n_iter=n_iter, random_state=42
    )
    user_factors = svd.fit_transform(sparse_matrix)
    item_factors = svd.components_.T
    # Reconstruct a dense user x SKU score matrix from the factors
    predicted_scores = user_factors.dot(item_factors.T)
    # Rescale every user's row of scores (Normalizer default settings)
    normalizer = Normalizer()
    predicted_scores = normalizer.fit_transform(predicted_scores)

    # Store SVD results in a dictionary
    svd_model = {
        "predictions": predicted_scores,
        "user_map": user_ids,
        "sku_map": sku_ids,
        "reverse_sku_map": {i: sku for sku, i in sku_ids.items()}
    }

    # Combine all trained models into a single dictionary
    models = {
        "user_history": user_history_scores,
        "co_occurrence": normalized_co_occurrence_model,
        "content": sku_to_category,
        "popularity": global_popular_skus,
        "category_popularity": category_popular_items,
        "svd": svd_model
    }
    print("Models built successfully.")
    return models

In [24]:
def get_all_recommendations(orders_to_predict, order_to_member_map,
                            items_in_orders, models, co_occurrence_weight,
                            content_weight, svd_weight, top_n=5):
    """
    Generates recommendations using a four-part hybrid scoring model.

    This function combines scores from User-Based History, Item-to-Item Co-occurrence,
    Content-Based Filtering, and SVD Matrix Factorization to generate a ranked
    list of recommended SKUs for each order. It also includes fallback
    mechanisms based on category and global popularity.

    Args:
        orders_to_predict (list): A list of order IDs for which to generate recommendations.
        order_to_member_map (dict): A dictionary mapping order IDs to member IDs.
            Every order in `orders_to_predict` must be present (KeyError otherwise).
        items_in_orders (dict): A dictionary mapping order IDs to a set of SKUs
                                present in that order (used as the current cart).
                                Orders missing from it are treated as empty carts.
        models (dict): A dictionary containing all the trained models, as returned
                       by the `build_models` function.
        co_occurrence_weight (float): The weight to apply to scores from the
                                      Item-to-Item Co-occurrence model.
        content_weight (float): The weight to apply to scores from the
                                Content-Based Filtering model.
        svd_weight (float): The weight to apply to scores from the SVD model.
        top_n (int): Maximum number of SKUs recommended per order. Defaults to 5.

    Returns:
        list: A list of dictionaries, where each dictionary represents a
              recommendation and contains 'Member', 'Order', and 'SKU'.
              Recommendations for an order appear in rank order.
    """
    # Unpack the models from the input dictionary
    user_history = models['user_history']
    co_occurrence = models['co_occurrence']
    sku_to_category = models['content']
    global_popular = models['popularity']
    cat_popular = models['category_popularity']
    svd_model = models['svd']
    svd_user_map = svd_model['user_map']
    svd_predictions = svd_model['predictions']
    reverse_sku_map = svd_model['reverse_sku_map']

    # Index the (member, sku) -> score history by member once, instead of
    # scanning the whole dictionary again for every order.
    history_by_member = defaultdict(list)
    for (member, sku), score in user_history.items():
        history_by_member[member].append((sku, score))

    final_recommendations = []
    # Iterate through each order for which recommendations are needed
    for order_id in orders_to_predict:
        member_id = order_to_member_map[order_id]
        items_in_cart = items_in_orders.get(order_id, set())
        candidate_scores = Counter()  # Accumulated score per candidate SKU

        # User Based History Scoring
        # Every SKU the member bought before starts with its time-weighted score
        member_history = history_by_member.get(member_id, ())
        for sku, score in member_history:
            candidate_scores[sku] += score

        # Item-to-Item Co-occurrence Scoring
        # Items frequently bought together with something already in the cart
        for item_in_cart in items_in_cart:
            if item_in_cart in co_occurrence:
                for related_item, norm_count in \
                        co_occurrence[item_in_cart].items():
                    candidate_scores[related_item] += \
                        co_occurrence_weight * norm_count

        # Content-Based Scoring
        # Identify categories present in the current order (cart)
        session_cats = Counter(
            sku_to_category.get(sku, 'other') for sku in items_in_cart
        )
        if session_cats:
            # Get the count of the most frequent category in the cart
            max_cat_count = session_cats.most_common(1)[0][1]
            # Boost previously purchased SKUs whose category is in the cart,
            # in proportion to how dominant that category is there.
            for cand_sku, _ in member_history:
                cand_cat = sku_to_category.get(cand_sku, 'other')
                if cand_cat in session_cats and cand_cat != 'other':
                    candidate_scores[cand_sku] += \
                        content_weight * (session_cats[cand_cat] / max_cat_count)

        # SVD Model Scoring
        user_idx = svd_user_map.get(member_id)
        if user_idx is not None:
            for sku_idx, score in enumerate(svd_predictions[user_idx]):
                sku = reverse_sku_map.get(sku_idx)
                # Compare against None: a legitimate SKU may be falsy (e.g. 0)
                if sku is not None:
                    candidate_scores[sku] += svd_weight * score

        # Ranking and Filtering
        # Never recommend something that is already in the cart
        for item in items_in_cart:
            candidate_scores.pop(item, None)

        # Highest combined scores first
        recs = [sku for sku, _ in candidate_scores.most_common(top_n)]

        # Default Fallback Items
        # Keep track of all SKUs considered (recommended or in cart)
        seen_skus = set(recs).union(items_in_cart)
        # First fallback: popular items from the cart's categories,
        # most frequent category first
        if len(recs) < top_n:
            for cat, _ in session_cats.most_common():
                if len(recs) >= top_n:
                    break
                for sku in cat_popular.get(cat, ()):
                    if len(recs) >= top_n:
                        break
                    if sku not in seen_skus:
                        recs.append(sku)
                        seen_skus.add(sku)
        # Second fallback: globally popular items
        if len(recs) < top_n:
            for sku in global_popular:
                if len(recs) >= top_n:
                    break
                if sku not in seen_skus:
                    recs.append(sku)
                    seen_skus.add(sku)

        # Format the final recommendations for the submission file
        for sku in recs:
            final_recommendations.append(
                {'Member': member_id, 'Order': order_id, 'SKU': sku}
            )

    return final_recommendations

In [25]:
def test_recall_at_5(train_df, test_orders_df, forgotten_item_fraction=0.3,
                     decay_rate=0.01, co_occurrence_weight=0.5,
                     content_weight=0.2, svd_weight=0.5, n_components=20,
                     n_iter=5, frequency_power=1.0, random_state=None):
    """
    Tests the recommendation model's recall@5 and calculates confusion matrix values.

    This function simulates a "Did You Forget" scenario by hiding a fraction
    of items from test orders and checking if the model recommends them.
    It calculates Recall@5 and confusion matrix components (TP, FP, FN, TN).

    Args:
        train_df (pd.DataFrame): DataFrame containing historical orders for training.
        test_orders_df (pd.DataFrame): DataFrame containing the orders to be used for testing.
            Must have 'Order', 'Member' and 'SKU' columns.
        forgotten_item_fraction (float): The fraction of items to hide from each
                                         test order to simulate forgotten items.
        decay_rate (float): Hyperparameter for the time-weighted user history model.
        co_occurrence_weight (float): Hyperparameter for the item-to-item co-occurrence model.
        content_weight (float): Hyperparameter for the content-based filtering model.
        svd_weight (float): Hyperparameter for the SVD model.
        n_components (int): Hyperparameter for the number of SVD components.
        n_iter (int): Hyperparameter for the number of SVD iterations.
        frequency_power (float): Hyperparameter for adjusting time-weighted scores
                                 by purchase frequency.
        random_state (int | None): Seed for the shuffle that picks the hidden
                                   items. None (default) uses NumPy's global
                                   random state, as before.

    Returns:
        tuple: A tuple containing:
               - recall (float): Mean per-order recall over the orders that had
                 at least one hidden item (0 if there were none).
               - total_tp (int): Total True Positives across all test orders.
               - total_fp (int): Total False Positives across all test orders.
               - total_fn (int): Total False Negatives across all test orders.
               - total_tn (int): Total True Negatives across all test orders.
    """
    print(
        f"\n--- Testing with decay={decay_rate}, co_occur={co_occurrence_weight}, "
        f"content={content_weight}, svd={svd_weight}, n_comp={n_components}, "
        f"n_iter={n_iter}, freq_pow={frequency_power}, "
        f"forgotten={forgotten_item_fraction} ---"
    )

    # Build the recommendation models using the training data and specified hyperparameters
    models = build_models(
        train_df.copy(), decay_rate=decay_rate, n_components=n_components,
        n_iter=n_iter, frequency_power=frequency_power
    )
    # Get unique order IDs from the test set
    test_orders_to_predict = test_orders_df['Order'].unique()
    # Create a mapping from order ID to member ID for the test set
    order_to_member_map = test_orders_df.set_index('Order')['Member'].to_dict()
    # Items kept in the cart and hidden items (ground truth), keyed by order
    items_in_test_orders_subset, forgotten_items_ground_truth = {}, {}

    # Create a lookup for all items ever purchased by each member in the training data
    all_member_items = train_df.groupby('Member')['SKU'].apply(set).to_dict()

    # Group the test rows once rather than filtering the frame per order
    items_by_order = test_orders_df.groupby('Order')['SKU'].apply(list).to_dict()

    # Seeded generator when requested; otherwise the global NumPy state
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Simulate forgotten items for each test order
    for order_id in test_orders_to_predict:
        # De-duplicate (keeping first-seen order): a SKU listed twice must not
        # end up both hidden and in the cart, where it could never be recalled.
        all_items_in_order = list(dict.fromkeys(items_by_order.get(order_id, [])))
        rng.shuffle(all_items_in_order)  # Randomly shuffle items
        # Determine the number of items to hide based on the fraction
        num_to_hide = int(len(all_items_in_order) * forgotten_item_fraction)
        # Ensure at least one item is hidden if the order has more than one item
        if num_to_hide == 0 and len(all_items_in_order) > 1:
            num_to_hide = 1
        # Split items into forgotten (hidden) and items kept in the cart
        items_in_test_orders_subset[order_id] = set(all_items_in_order[num_to_hide:])
        forgotten_items_ground_truth[order_id] = set(all_items_in_order[:num_to_hide])

    # Generate recommendations using the trained models and the items kept in the subset orders
    recommendations_list = get_all_recommendations(
        test_orders_to_predict, order_to_member_map,
        items_in_test_orders_subset, models,
        co_occurrence_weight=co_occurrence_weight,
        content_weight=content_weight, svd_weight=svd_weight
    )
    # Group generated recommendations by order ID. An empty list yields a
    # frame without an 'Order' column, so it is handled separately.
    if recommendations_list:
        recs_by_order = pd.DataFrame(recommendations_list).groupby(
            'Order'
        )['SKU'].apply(set).to_dict()
    else:
        recs_by_order = {}

    total_recall_score, orders_with_forgotten_items = 0, 0
    # Initialize confusion matrix counters
    total_tp, total_fp, total_fn, total_tn = 0, 0, 0, 0

    # Evaluate recommendations against the forgotten items (ground truth)
    for order_id in test_orders_to_predict:
        forgotten_set = forgotten_items_ground_truth.get(order_id, set())
        # Skip orders with no forgotten items (e.g., single-item orders where 0 items were hidden)
        if not forgotten_set:
            continue

        member_id = order_to_member_map[order_id]
        orders_with_forgotten_items += 1
        # Get the set of recommended items for the current order
        predicted_set = recs_by_order.get(order_id, set())

        # Per-order recall: share of the hidden items that were recommended
        correctly_recalled_items = len(forgotten_set.intersection(predicted_set))
        total_recall_score += correctly_recalled_items / len(forgotten_set)

        # Confusion matrix values for the current order.
        # Universe for TN: all items the member bought in the training data.
        universe = all_member_items.get(member_id, set())
        total_tp += correctly_recalled_items  # hidden and recommended
        total_fn += len(forgotten_set.difference(predicted_set))  # hidden, not recommended
        total_fp += len(predicted_set.difference(forgotten_set))  # recommended, not hidden
        # TN: items in the member's history, neither hidden nor recommended
        total_tn += len(universe - forgotten_set - predicted_set)

    # Calculate the overall Recall@5
    recall = total_recall_score / orders_with_forgotten_items if orders_with_forgotten_items > 0 else 0

    # Return recall and aggregated confusion matrix values
    return recall, total_tp, total_fp, total_fn, total_tn

In [26]:
def tune_hyperparameters(all_orders_path,
                         last_orders_path='last_orders_subset.csv',
                         output_path='Recsys_5_sets.csv'):
    """
    Runs the recall test across a grid of hyperparameters, summarizes the
    results, and plots a confusion matrix for the best model.

    This function orchestrates the hyperparameter tuning process by iterating
    through predefined grids of hyperparameters, evaluating the model's
    performance using the `test_recall_at_5` function, and reporting the
    best performing set of parameters based on Recall@5. It also visualizes
    the confusion matrix for the best model and then writes the submission
    file with `generate_recommendations` using those parameters.

    Each member's most recent order (by 'Delivery Date') is held out as the
    test set; all remaining orders form the training set.

    Args:
        all_orders_path (str): The file path to the CSV file containing all
                               historical order data.
        last_orders_path (str): CSV with the orders to predict for the final
                                submission. Defaults to 'last_orders_subset.csv'.
        output_path (str): Where the submission CSV is written. Defaults to
                           'Recsys_5_sets.csv'.

    Returns:
        None. Prints a message and returns early if `all_orders_path` is missing.
    """
    # Local import: the rest of the file only imports `permutations`
    from itertools import product

    # Grid of hyperparameters to search over. Key order matters: it is both
    # the loop nesting order and the column order of the summary table.
    param_grid = {
        'decay_rate': [0.0003],
        'co_occurrence_weight': [0.5],
        'content_weight': [0.5],
        'svd_weight': [0.5],
        'n_components': [70],
        'n_iter': [5],
        'frequency_power': [1.3],
        'forgotten_fraction': [0.25],  # Fraction of items to "forget" in test orders
    }

    results = []  # One record per hyperparameter combination
    print(f"Loading data from '{all_orders_path}' for tuning")
    try:
        df = pd.read_csv(all_orders_path)
    except FileNotFoundError as e:
        print(f"Error: {e}.")
        return

    # Prepare data for training and testing
    df['Delivery Date'] = pd.to_datetime(df['Delivery Date'], format='%d/%m/%y')
    # Identify the last order for each member to use as the test set
    last_order_info = df.loc[df.groupby('Member')['Delivery Date'].idxmax()]
    is_test_row = df['Order'].isin(last_order_info['Order'])
    test_orders_df = df[is_test_row]
    # Use all orders except the last one for each member as the training set
    train_df = df[~is_test_row]

    # Evaluate every combination of the grid
    for combo in product(*param_grid.values()):
        params = dict(zip(param_grid.keys(), combo))
        # Run the test and get recall and confusion matrix values
        recall, tp, fp, fn, tn = test_recall_at_5(
            train_df.copy(), test_orders_df.copy(),
            forgotten_item_fraction=params['forgotten_fraction'],
            decay_rate=params['decay_rate'],
            co_occurrence_weight=params['co_occurrence_weight'],
            content_weight=params['content_weight'],
            svd_weight=params['svd_weight'],
            n_components=params['n_components'],
            n_iter=params['n_iter'],
            frequency_power=params['frequency_power']
        )
        # Store the parameters together with the metrics
        results.append({
            **params,
            'recall_at_5': recall,
            'tp': tp, 'fp': fp, 'fn': fn, 'tn': tn
        })

    # Analyze and display the results, best Recall@5 first
    results_df = pd.DataFrame(results).sort_values(
        by='recall_at_5', ascending=False
    )
    print("\nHyperparameter Tuning Summary")
    print(results_df.to_string(index=False))  # Print the summary table

    # Identify and display the best performing set of hyperparameters
    best_result = results_df.iloc[0]
    print("\nBest Overall Result")
    print(best_result)

    # Plot Confusion Matrix for the Best Result
    print("\n Confusion Matrix: Best Model")
    # Rows are the actual class, columns the predicted class
    cm_data = np.array([
        [best_result['tn'], best_result['fp']],
        [best_result['fn'], best_result['tp']]
    ])

    # Plot the confusion matrix using seaborn
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm_data, annot=True, fmt='.0f', cmap='Blues',
                xticklabels=['Not Recommended', 'Recommended'],
                yticklabels=['Not Forgotten', 'Forgotten'])
    plt.title('Confusion Matrix for Best Performing Model')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Generate the final submission file using the best hyperparameters
    print("\nGenerating final submission file with best hyperparameters...")
    generate_recommendations(
        all_orders_path=all_orders_path,
        last_orders_path=last_orders_path,
        output_path=output_path,
        decay_rate=best_result['decay_rate'],
        co_occurrence_weight=best_result['co_occurrence_weight'],
        content_weight=best_result['content_weight'],
        svd_weight=best_result['svd_weight'],
        # Selecting a row turns the integer columns into floats; cast back
        n_components=int(best_result['n_components']),
        n_iter=int(best_result['n_iter']),
        frequency_power=best_result['frequency_power']
    )

In [27]:
def generate_recommendations(all_orders_path, last_orders_path, output_path,
                             group_number=1, decay_rate=0.01,
                             co_occurrence_weight=0.5, content_weight=0.2,
                             svd_weight=0.5, n_components=20, n_iter=5,
                             frequency_power=1.0):
    """
    Generates the final recommendation file for submission.

    This function loads the historical and last order data, builds the
    recommendation models using specified (or tuned) hyperparameters,
    generates recommendations for the last orders, and saves the results
    as a CSV with the columns 'ID', 'Order', 'SKU', 'Member'.

    Args:
        all_orders_path (str): The file path to the CSV file containing all
                               historical order data (training data).
        last_orders_path (str): The file path to the CSV file containing the
                                last orders for which recommendations are needed.
                                Must have 'Order', 'Member' and 'SKU' columns.
        output_path (str): The file path where the generated submission CSV
                           file will be saved.
        group_number (int): Currently unused; kept so existing callers that
                            pass it keep working.
        decay_rate (float): Hyperparameter for the time-weighted user history model.
        co_occurrence_weight (float): Hyperparameter for the item-to-item co-occurrence model.
        content_weight (float): Hyperparameter for the content-based filtering model.
        svd_weight (float): Hyperparameter for the SVD model.
        n_components (int): Hyperparameter for the number of SVD components.
        n_iter (int): Hyperparameter for the number of SVD iterations.
        frequency_power (float): Hyperparameter for adjusting time-weighted scores
                                 by purchase frequency.

    Returns:
        None. If either input file is missing, a message is printed and
        nothing is written.
    """

    # Load the datasets
    try:
        all_orders_df = pd.read_csv(all_orders_path)
        last_orders_df = pd.read_csv(last_orders_path)
    except FileNotFoundError as e:
        print(
            f"Files Not found: {e}"
        )
        return

    # Build Recommendation Models using historical data
    models = build_models(
        all_orders_df, decay_rate=decay_rate, n_components=n_components,
        n_iter=n_iter, frequency_power=frequency_power
    )

    # Prepare Last Order Data for prediction
    # Create a mapping from order ID to member ID for the last orders
    order_to_member_map = last_orders_df.set_index('Order')['Member'].to_dict()
    # Group items by order ID to get the items in each last order (the "cart" for prediction)
    items_in_last_orders = last_orders_df.groupby('Order')['SKU'].apply(set).to_dict()
    # Get the unique order IDs for which to generate recommendations
    orders_to_predict = last_orders_df['Order'].unique()
    print(
        f"Found {len(orders_to_predict)} unique orders to predict for submission."
    )

    # Generate Recommendations for Each Order
    final_recommendations = get_all_recommendations(
        orders_to_predict, order_to_member_map, items_in_last_orders,
        models, co_occurrence_weight=co_occurrence_weight,
        content_weight=content_weight, svd_weight=svd_weight
    )

    # Create and Save the Submission File
    print("Saving submission file")
    # Naming the columns up front keeps the frame well-formed (header only)
    # even when there is nothing to recommend.
    submission_df = pd.DataFrame(
        final_recommendations, columns=['Member', 'Order', 'SKU']
    )
    # Add a 1-based running 'ID' column
    submission_df.insert(0, 'ID', range(1, len(submission_df) + 1))

    # Reorder columns to match the required submission format
    submission_df = submission_df[['ID', 'Order', 'SKU', 'Member']]
    # Save the DataFrame to a CSV file without the index
    submission_df.to_csv(output_path, index=False)

In [21]:
# Mode switch for this notebook:
#   True  -> run the hyperparameter search (prints the summary and plots the
#            confusion matrix for the best model)
#   False -> skip the search and write the submission file with the fixed
#            hyperparameters listed below
RUN_TUNING = True

# --- Configuration ---
# Input data and output submission file
ALL_ORDERS_FILE = 'all_except_last_orders.csv'  # training data
LAST_ORDERS_FILE = 'last_orders_subset.csv'  # orders to predict
OUTPUT_FILE_FORMAT = 'Recsys_5_sets.csv'  # submission file

if not RUN_TUNING:
    # Fixed hyperparameters; they match the single point of the tuning grid
    best_decay_rate = 0.0003
    best_co_occurrence_weight = 0.5
    best_content_weight = 0.5
    best_svd_weight = 0.5
    best_n_components = 70
    best_n_iter = 5
    best_frequency_power = 1.3

    # Write the submission file directly with those parameters
    generate_recommendations(
        all_orders_path=ALL_ORDERS_FILE,
        last_orders_path=LAST_ORDERS_FILE,
        output_path=OUTPUT_FILE_FORMAT,
        decay_rate=best_decay_rate,
        co_occurrence_weight=best_co_occurrence_weight,
        content_weight=best_content_weight,
        svd_weight=best_svd_weight,
        n_components=best_n_components,
        n_iter=best_n_iter,
        frequency_power=best_frequency_power
    )
else:
    # Search the grid, report the best parameters and plot the confusion
    # matrix for the best model.
    tune_hyperparameters(all_orders_path=ALL_ORDERS_FILE)

Building models with decay=0.0003, n_comp=70, n_iter=5, freq_power=1.3...
Models built successfully.
Found 638 unique orders to predict for submission.
Saving submission file
