In [1]:
## Testing functions
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
# extract_product_names is imported from utils above.

# Individual test cases as separate variables.
# Each one is a sample model response with a different list layout, so the
# extractor can be checked against every formatting variant.

# "1." numbered items.
test_case_1 = """Here is the list of categories we recommend:
1. Organic Skincare Essentials - Products focusing on natural, plant-based ingredients.
2. High-Quality Hair Treatments - Salon-grade treatments and masks for hair health.
3. Eco-Friendly Household Goods - Sustainable, biodegradable, and reusable household items."""

# "1)" numbered items.
test_case_2 = """Generated categories for you:
1) Personalized Fitness Gear - Tailored fitness accessories like custom-fit shoes.
2) Luxury Fragrances - High-end, exclusive fragrances for special occasions.
3) Vegan Food Subscriptions - Monthly delivery of vegan-friendly snacks and meals."""

# "-" bullets.
test_case_3 = """Suggested items based on your preferences:
- Meditation Aids - Guided meditation tools for relaxation.
- Natural Supplements - Supplements with organic, non-GMO ingredients.
- Specialty Coffee Beans - Ethically sourced, high-quality coffee varieties."""

# "*" bullets.
test_case_4 = """Please check out these top-rated items:
* Wireless Earbuds - Premium sound quality with noise-canceling features.
* Smart Home Devices - Technology to enhance home automation and security.
* Minimalist Office Supplies - Sleek, modern supplies for an organized workspace."""

# Mixed markers in one list: "1.", "*", "2)", "+" and "-".
test_case_5 = """We've compiled these categories just for you:
1. Sustainable Fashion - Eco-friendly, ethically made clothing items.
* Organic Tea Selection - Handpicked, loose-leaf teas with health benefits.
2) Customizable Furniture - Furniture designed to fit user specifications.
+ Handcrafted Jewelry - Unique, artisan-made jewelry pieces.
- Reusable Water Bottles - BPA-free, eco-conscious water bottles."""

# No list markers at all, preceded by a "Favorite Products:" header line.
test_case_6 = """Based on our analysis, here are your favorite categories:
Favorite Products:
Eco-Friendly Beauty - Natural and sustainable beauty products.
Advanced Health Gadgets - Innovative gadgets promoting health.
Minimalist Decor - Simple and stylish decor items for the home."""

# "1-" style numbering.
test_case_7 = """Here’s a quick list of products to explore:
1- Durable Outdoor Gear - Equipment for camping, hiking, and outdoor sports.
2- High-Performance Kitchen Appliances - Efficient and multifunctional appliances.
3- Educational Toys - Toys that encourage learning and creativity in children."""

# Mixed markers ("+", ">", "1)") between a header line and a trailing note line.
test_case_8 = """Top categories for you:
Top Picks:
+ Eco-Conscious Products - Environmentally friendly items.
> Luxury Pet Accessories - High-end products for pets.
1) Travel Essentials - Compact, convenient items for travel.
Important Note: These are recommended based on your recent purchases."""

# Numbered items with indented "a)" / "b)" sub-items.
test_case_9 = """Hello! Here are some curated product categories:
1. Home Essentials - Core products for every household.
   a) Kitchenware - Pots, pans, and utensils.
   b) Bathroom - Towels, mats, and organizers.
2. Office Supplies - Must-have items for a productive workspace.
   a) Stationery - Pens, notebooks, and organizers.
   b) Electronics - Printers, monitors, and chargers."""

# Numbered items followed by a closing sentence that is not a list entry.
test_case_10 = """Thank you for your interest! Here’s a selection for you:
1. Organic Pet Care - Natural and gentle products for pets.
2. Premium Wines - A selection of award-winning wines.
3. Artisan Baked Goods - Freshly baked goods from local artisans.
Have a wonderful day!"""

# List of all test cases for easy iteration: (display label, input text) pairs.
test_cases = [
    ("Test Case 1", test_case_1),
    ("Test Case 2", test_case_2),
    ("Test Case 3", test_case_3),
    ("Test Case 4", test_case_4),
    ("Test Case 5", test_case_5),
    ("Test Case 6", test_case_6),
    ("Test Case 7", test_case_7),
    ("Test Case 8", test_case_8),
    ("Test Case 9", test_case_9),
    ("Test Case 10", test_case_10)
]

# Feed every sample through extract_product_names and show what it returns,
# with a dashed divider after each case.
divider = "\n" + "-" * 40 + "\n"
for label, sample_text in test_cases:
    print(f"{label} Output:")
    print(extract_product_names(sample_text))
    print(divider)


Test Case 1 Output:
['Organic Skincare Essentials - Products focusing on natural, plant-based ingredients.', 'High-Quality Hair Treatments - Salon-grade treatments and masks for hair health.', 'Eco-Friendly Household Goods - Sustainable, biodegradable, and reusable household items.']

----------------------------------------

Test Case 2 Output:
['Personalized Fitness Gear - Tailored fitness accessories like custom-fit shoes.', 'Luxury Fragrances - High-end, exclusive fragrances for special occasions.', 'Vegan Food Subscriptions - Monthly delivery of vegan-friendly snacks and meals.']

----------------------------------------

Test Case 3 Output:
['Meditation Aids - Guided meditation tools for relaxation.', 'Natural Supplements - Supplements with organic, non-GMO ingredients.', 'Specialty Coffee Beans - Ethically sourced, high-quality coffee varieties.']

----------------------------------------

Test Case 4 Output:
['Wireless Earbuds - Premium sound quality with noise-canceling featur

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products, extract_product_names_adapter
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one recommendation experiment over all users and write a report file.

    For each user in the training set: build a profile from the latest
    reviews, ask the model for product-name recommendations, retrieve
    candidate products from the ChromaDB collection, and compare them with
    the user's test product to obtain Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Run index, embedded in the result file name so that repeated
            runs write to different files.

    Returns:
        None. Per-user details and the overall mean metrics are written to
        ``results_product_name_base_{sample_size}_{current_date}_{num_run}_samples.txt``.
    """
    torch.cuda.empty_cache()

    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is not read anywhere below; the load is kept so that a
    # missing metadata file still fails up front exactly as before.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 2         # attempts for the whole per-user pipeline
    MAX_RETRIES_ITEMS = 10  # attempts for generating/extracting product names

    all_similarity_scores, all_matches, skipped_users = [], [], []
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}

    # Open a result file specific to this sample size
    result_file_path = f'results_product_name_base_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1  # 1-based number used in every message
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying until
                    # product names can be extracted from the model output.
                    for attempt in range(1, MAX_RETRIES_ITEMS + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names_adapter(preliminary_rec)
                            if not product_names:
                                # Logged per failed attempt; the user is only
                                # counted as skipped once the attempts run out.
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES_ITEMS})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES_ITEMS} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the per-user retry loop. A `continue` here would
                        # restart that loop without consuming a retry and could
                        # spin forever on a user whose items never parse.
                        break

                    # Extract user history so already-reviewed products can be
                    # passed to the retrieval step.
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # Output: ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        # Logged per failed attempt; skip bookkeeping happens
                        # in the except clause once retries are exhausted.
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes input_set_test is index-aligned
                    # with input_set -- confirm against the data loader.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        # The 'metadata' entry is used as the product ASIN;
                        # fall back to the document text when it is empty.
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin or doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Compute Recall@K and NDCG@K up front so a failure here
                    # cannot leave half of this user's numbers in the totals.
                    user_recalls = {k: recall_at_k(matches, k) for k in K_VALUES}
                    user_ndcgs = {k: ndcg_at_k(matches, k) for k in K_VALUES}

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Commit this user's numbers to the aggregates only after
                    # everything above succeeded, so retries never double-count.
                    all_similarity_scores.extend(similarity_scores)
                    all_matches.extend(matches)
                    for k in K_VALUES:
                        recalls_by_k[k].append(user_recalls[k])
                        ndcgs_by_k[k].append(user_ndcgs[k])

                    success = True
                except Exception as e:
                    retries += 1
                    # Surface the failure instead of swallowing it silently.
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e!r}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment three times; num_run is passed through so each run gets
# its own index.
# NOTE(review): sample_size is never passed to run_experiment, so every run
# uses the default sample_size=None. The second loop only prints its message
# after all runs have finished -- confirm whether
# run_experiment(sample_size, num_run=i) inside that loop was intended.
for i in range(3):
    run_experiment(num_run= i)
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one recommendation experiment over all users and write a report file.

    For each user in the training set: build a profile from the latest
    reviews, ask the model for preliminary recommendations, retrieve
    candidate products from the ChromaDB collection, and compare them with
    the user's held-out product (loaded here from the validation file) to
    obtain Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Run index, embedded in the result file name so that repeated
            runs write to different files.

    Returns:
        None. Per-user details and the overall mean metrics are written to
        ``results_product_name_with_description_{sample_size}_{current_date}_{num_run}_samples.txt``.
    """
    torch.cuda.empty_cache()

    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is not read anywhere below; the load is kept so that a
    # missing metadata file still fails up front exactly as before.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/val_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3  # used for both the per-user pipeline and item generation

    all_similarity_scores, all_matches, skipped_users = [], [], []
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}

    # Open a result file specific to this sample size
    result_file_path = f'results_product_name_with_description_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1  # 1-based number used in every message
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying until
                    # product names can be extracted from the model output.
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # Logged per failed attempt; the user is only
                                # counted as skipped once the attempts run out.
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the per-user retry loop. A `continue` here would
                        # restart that loop without consuming a retry and could
                        # spin forever on a user whose items never parse.
                        break

                    # Extract user history so already-reviewed products can be
                    # passed to the retrieval step.
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # Output: ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        # Logged per failed attempt; skip bookkeeping happens
                        # in the except clause once retries are exhausted.
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes input_set_test is index-aligned
                    # with input_set -- confirm against the data loader.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        # The 'metadata' entry is used as the product ASIN;
                        # fall back to the document text when it is empty.
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin or doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Compute Recall@K and NDCG@K up front so a failure here
                    # cannot leave half of this user's numbers in the totals.
                    user_recalls = {k: recall_at_k(matches, k) for k in K_VALUES}
                    user_ndcgs = {k: ndcg_at_k(matches, k) for k in K_VALUES}

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Commit this user's numbers to the aggregates only after
                    # everything above succeeded, so retries never double-count.
                    all_similarity_scores.extend(similarity_scores)
                    all_matches.extend(matches)
                    for k in K_VALUES:
                        recalls_by_k[k].append(user_recalls[k])
                        ndcgs_by_k[k].append(user_ndcgs[k])

                    success = True
                except Exception as e:
                    retries += 1
                    # Surface the failure instead of swallowing it silently.
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e!r}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment three times; num_run is passed through so each run gets
# its own index.
# NOTE(review): sample_size is never passed to run_experiment, so every run
# uses the default sample_size=None. The second loop only prints its message
# after all runs have finished -- confirm whether
# run_experiment(sample_size, num_run=i) inside that loop was intended.
for i in range(3):
    run_experiment(num_run= i)
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


### Combined train and validation sets

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one recommendation experiment over all users and write a report file.

    For each user in the combined train+validation set: build a profile from
    the latest reviews, ask the model for product-name recommendations,
    retrieve candidate products from the ChromaDB collection, and compare
    them with the user's test product to obtain Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Run index, embedded in the result file name so that repeated
            runs write to different files.

    Returns:
        None. Per-user details and the overall mean metrics are written to
        ``results_product_name_train_val_set_{sample_size}_{current_date}_{num_run}_samples.txt``.
    """
    torch.cuda.empty_cache()

    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is not read anywhere below; the load is kept so that a
    # missing metadata file still fails up front exactly as before.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3  # used for both the per-user pipeline and item generation

    all_similarity_scores, all_matches, skipped_users = [], [], []
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}

    # Open a result file specific to this sample size
    result_file_path = f'results_product_name_train_val_set_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1  # 1-based number used in every message
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying until
                    # product names can be extracted from the model output.
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # Logged per failed attempt; the user is only
                                # counted as skipped once the attempts run out.
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the per-user retry loop. A `continue` here would
                        # restart that loop without consuming a retry and could
                        # spin forever on a user whose items never parse.
                        break

                    # Extract user history so already-reviewed products can be
                    # passed to the retrieval step.
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # Output: ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        # Logged per failed attempt; skip bookkeeping happens
                        # in the except clause once retries are exhausted.
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes input_set_test is index-aligned
                    # with input_set -- confirm against the data loader.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        # The 'metadata' entry is used as the product ASIN;
                        # fall back to the document text when it is empty.
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin or doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Compute Recall@K and NDCG@K up front so a failure here
                    # cannot leave half of this user's numbers in the totals.
                    user_recalls = {k: recall_at_k(matches, k) for k in K_VALUES}
                    user_ndcgs = {k: ndcg_at_k(matches, k) for k in K_VALUES}

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Commit this user's numbers to the aggregates only after
                    # everything above succeeded, so retries never double-count.
                    all_similarity_scores.extend(similarity_scores)
                    all_matches.extend(matches)
                    for k in K_VALUES:
                        recalls_by_k[k].append(user_recalls[k])
                        ndcgs_by_k[k].append(user_ndcgs[k])

                    success = True
                except Exception as e:
                    retries += 1
                    # Surface the failure instead of swallowing it silently.
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e!r}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment three times; num_run is passed through so each run gets
# its own index.
# NOTE(review): sample_size is never passed to run_experiment, so every run
# uses the default sample_size=None. The second loop only prints its message
# after all runs have finished -- confirm whether
# run_experiment(sample_size, num_run=i) inside that loop was intended.
for i in range(3):
    run_experiment(num_run= i)
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one evaluation pass over every user and write the results to a text file.

    For each user in the train/val set this builds a profile from the user's
    latest reviews, asks the model for preliminary recommendations, retrieves
    candidate products from the ChromaDB collection and scores them against
    the user's test product with Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Index of this repetition; only used in the result file name.

    Returns:
        None. Per-user details, the mean metrics for every K and the list of
        skipped users are written to the result file.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is loaded but not read anywhere below (recommendations
    # are reported by ASIN) - confirm whether a title lookup was intended.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this sample size and run index
    result_file_path = f'results_product_name_with_description_train_val_set_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        # Nothing to build a profile from: retrying cannot help.
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on failure
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # The user is only recorded as skipped once all
                                # attempts are exhausted (see the else below).
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the outer retry loop. A `continue` here would
                        # restart it without counting the attempt and could
                        # loop forever on a user that never yields items.
                        break

                    # Extract user history
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes the test set lists users in the same
                    # order as the train/val set - confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Compute every metric before recording any of them, so a
                    # failure for one K cannot leave partial entries behind.
                    user_metrics = [
                        (k, recall_at_k(matches, k), ndcg_at_k(matches, k))
                        for k in K_VALUES
                    ]
                    for k, recall, ndcg in user_metrics:
                        recalls_by_k[k].append(recall)
                        ndcgs_by_k[k].append(ndcg)

                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K (successful users only)
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Repeat the experiment three times; num_run is the only argument that varies
# between the calls, so run_experiment uses its default sample_size (None).
for i in range(3):
    run_experiment(num_run= i)
# NOTE(review): this loop only prints a "starting" message, after the runs
# above have already finished, and never calls run_experiment itself. It looks
# like a leftover from a per-sample-size driver; confirm before relying on it.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one evaluation pass over every user and write the results to a text file.

    For each user in the train/val set this builds a profile from the user's
    3 latest reviews, asks the model for product-name-only preliminary
    recommendations, retrieves candidate products from the ChromaDB collection
    and scores them against the user's test product with Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Index of this repetition; only used in the result file name.

    Returns:
        None. Per-user details, the mean metrics for every K and the list of
        skipped users are written to the result file.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is loaded but not read anywhere below (recommendations
    # are reported by ASIN) - confirm whether a title lookup was intended.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 3
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this review count, sample size and run index
    result_file_path = f'results_product_name_train_val_set_numOfRev={n_latest_reviews}_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        # Nothing to build a profile from: retrying cannot help.
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on failure
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # The user is only recorded as skipped once all
                                # attempts are exhausted (see the else below).
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the outer retry loop. A `continue` here would
                        # restart it without counting the attempt and could
                        # loop forever on a user that never yields items.
                        break

                    # Extract user history
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes the test set lists users in the same
                    # order as the train/val set - confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Compute every metric before recording any of them, so a
                    # failure for one K cannot leave partial entries behind.
                    user_metrics = [
                        (k, recall_at_k(matches, k), ndcg_at_k(matches, k))
                        for k in K_VALUES
                    ]
                    for k, recall, ndcg in user_metrics:
                        recalls_by_k[k].append(recall)
                        ndcgs_by_k[k].append(ndcg)

                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K (successful users only)
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Repeat the experiment three times; num_run is the only argument that varies
# between the calls, so run_experiment uses its default sample_size (None).
for i in range(3):
    run_experiment(num_run= i)
# NOTE(review): this loop only prints a "starting" message, after the runs
# above have already finished, and never calls run_experiment itself. It looks
# like a leftover from a per-sample-size driver; confirm before relying on it.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size=None, num_run=0):
    """Run one evaluation pass over every user and write the results to a text file.

    For each user in the train/val set this builds a profile from the user's
    7 latest reviews, asks the model for product-name-only preliminary
    recommendations, retrieves candidate products from the ChromaDB collection
    and scores them against the user's test product with Recall@K and NDCG@K.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Index of this repetition; only used in the result file name.

    Returns:
        None. Per-user details, the mean metrics for every K and the list of
        skipped users are written to the result file.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is loaded but not read anywhere below (recommendations
    # are reported by ASIN) - confirm whether a title lookup was intended.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 7
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this review count, sample size and run index
    result_file_path = f'results_product_name_train_val_set_numOfRev={n_latest_reviews}_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_number = userIndex + 1
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        # Nothing to build a profile from: retrying cannot help.
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {user_number} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on failure
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_number}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # The user is only recorded as skipped once all
                                # attempts are exhausted (see the else below).
                                result_file.write(f"User {user_number} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_number} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_number} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_number}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        # Leave the outer retry loop. A `continue` here would
                        # restart it without counting the attempt and could
                        # loop forever on a user that never yields items.
                        break

                    # Extract user history
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")
                    final_results = collect_results_per_product(product_names, collection, user_history, max_products=20)
                    if final_results == -1:
                        result_file.write(f"User {user_number} skipped due to no recommendations.\n\n")
                        raise ValueError("retrieval returned no recommendations")

                    # NOTE(review): assumes the test set lists users in the same
                    # order as the train/val set - confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Compute every metric before recording any of them, so a
                    # failure for one K cannot leave partial entries behind.
                    user_metrics = [
                        (k, recall_at_k(matches, k), ndcg_at_k(matches, k))
                        for k in K_VALUES
                    ]
                    for k, recall, ndcg in user_metrics:
                        recalls_by_k[k].append(recall)
                        ndcgs_by_k[k].append(ndcg)

                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_number} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)

        # Calculate and write overall metrics for each K (successful users only)
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Repeat the experiment three times; num_run is the only argument that varies
# between the calls, so run_experiment uses its default sample_size (None).
for i in range(3):
    run_experiment(num_run= i)
# NOTE(review): this loop only prints a "starting" message, after the runs
# above have already finished, and never calls run_experiment itself. It looks
# like a leftover from a per-sample-size driver; confirm before relying on it.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date
import random
current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None, num_run = 0):
    """Run one evaluation pass with shuffled review order and write a report.

    For every user in ``new_data/new_train_val_output.json`` the latest
    ``n_latest_reviews`` reviews are shuffled and joined into a prompt, a
    profile and product-name recommendations are generated with
    ``RecommenderModel``, candidates are retrieved from the ChromaDB
    collection, and the retrieved ASINs are compared against the product of
    the same-index user in ``new_data/test_output.json``. Per-user details
    and the overall mean Recall@K / NDCG@K for each K are written to a text
    file in the working directory.

    A user is recorded in the skipped list exactly once: when there are no
    reviews, when item generation fails ``MAX_RETRIES`` times, or when the
    whole per-user pipeline fails ``MAX_RETRIES`` times.

    Args:
        sample_size: Passed to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Repetition index; only used in the result file name.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)
    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load user reviews
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this sample size and run
    result_file_path = f'results_product_name_train_val_set__randomized_reviews_numOfRev={n_latest_reviews}_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            print(f"\nProcessing user {userIndex + 1}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    # Randomize the order of the latest reviews
                    random.shuffle(latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        result_file.write(f"User {userIndex + 1} skipped due to no latest reviews.\n\n")
                        skipped_users.append(userIndex + 1)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {userIndex + 1} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying when the
                    # model call fails or no product names can be extracted.
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {userIndex + 1}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # Not recorded as skipped yet: a later attempt may succeed.
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {userIndex + 1} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {userIndex + 1} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {userIndex + 1}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries in generating recommendations.\n\n")
                        skipped_users.append(userIndex + 1)
                        # Leave the outer retry loop; `continue` here would
                        # restart it without counting a retry and never end.
                        break

                    # Extract user history so already-reviewed items can be passed to retrieval
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # e.g. ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection,user_history, max_products=20)
                    if final_results == -1:
                        # -1 is treated as "no recommendations"; the outer
                        # handler retries and records the skip only once.
                        raise ValueError("no recommendations retrieved")

                    # NOTE(review): assumes the test set is index-aligned with
                    # the train/val set -- confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # document text only when the ASIN is empty.
                    recommended_products = []
                    for doc, distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Compute every metric before storing any of them, so a
                    # failure here cannot leave partial entries for this user.
                    user_recalls = [recall_at_k(matches, k) for k in K_VALUES]
                    user_ndcgs = [ndcg_at_k(matches, k) for k in K_VALUES]

                    # Write results for each user
                    result_file.write(f"User {userIndex + 1}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for i, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches)):
                        result_file.write(f"  {i+1}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    for k, recall, ndcg in zip(K_VALUES, user_recalls, user_ndcgs):
                        recalls_by_k[k].append(recall)
                        ndcgs_by_k[k].append(ndcg)

                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {userIndex + 1} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(userIndex + 1)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    # Report only after the file has been closed.
    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment three times; only num_run (0, 1, 2) is passed, so
# sample_size keeps its default of None.
for i in range(3):
    run_experiment(num_run= i)
# NOTE(review): this loop only prints a banner, after the runs above have
# already finished; it never calls run_experiment with sample_size. Confirm
# whether it is a leftover.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


TODO: run the experiment with timestamps added to the review prompt.
Still need to run this.

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None, num_run = 0):
    """Run one evaluation pass with timestamped reviews and write a report.

    For every user in ``new_data/new_train_val_output.json`` the latest
    ``n_latest_reviews`` reviews (product name, timestamp and text) are joined
    into a prompt, a profile and product-name recommendations are generated
    with ``RecommenderModel``, candidates are retrieved from the ChromaDB
    collection, and the retrieved ASINs are compared against the product of
    the same-index user in ``new_data/test_output.json``. Per-user details
    and the overall mean Recall@K / NDCG@K for each K are written to a text
    file in the working directory.

    A user is recorded in the skipped list exactly once: when there are no
    reviews, when item generation fails ``MAX_RETRIES`` times, or when the
    whole per-user pipeline fails ``MAX_RETRIES`` times.

    Args:
        sample_size: Passed to ``RecommenderModel`` and embedded in the
            result file name.
        num_run: Repetition index; only used in the result file name.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)
    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load user reviews
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this sample size and run
    result_file_path = f'results_product_name_train_val_set_with_timestamp_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            print(f"\nProcessing user {userIndex + 1}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        result_file.write(f"User {userIndex + 1} skipped due to no latest reviews.\n\n")
                        skipped_users.append(userIndex + 1)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nTimestamp: {rev['timestamp']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {userIndex + 1} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying when the
                    # model call fails or no product names can be extracted.
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {userIndex + 1}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # Not recorded as skipped yet: a later attempt may succeed.
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {userIndex + 1} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {userIndex + 1} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {userIndex + 1}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries in generating recommendations.\n\n")
                        skipped_users.append(userIndex + 1)
                        # Leave the outer retry loop; `continue` here would
                        # restart it without counting a retry and never end.
                        break

                    # Extract user history so already-reviewed items can be passed to retrieval
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # e.g. ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection,user_history, max_products=20)
                    if final_results == -1:
                        # -1 is treated as "no recommendations"; the outer
                        # handler retries and records the skip only once.
                        raise ValueError("no recommendations retrieved")

                    # NOTE(review): assumes the test set is index-aligned with
                    # the train/val set -- confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # document text only when the ASIN is empty.
                    recommended_products = []
                    for doc, distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Compute every metric before storing any of them, so a
                    # failure here cannot leave partial entries for this user.
                    user_recalls = [recall_at_k(matches, k) for k in K_VALUES]
                    user_ndcgs = [ndcg_at_k(matches, k) for k in K_VALUES]

                    # Write results for each user
                    result_file.write(f"User {userIndex + 1}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for i, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches)):
                        result_file.write(f"  {i+1}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    for k, recall, ndcg in zip(K_VALUES, user_recalls, user_ndcgs):
                        recalls_by_k[k].append(recall)
                        ndcgs_by_k[k].append(ndcg)

                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {userIndex + 1} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(userIndex + 1)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    # Report only after the file has been closed.
    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment three times; only num_run (0, 1, 2) is passed, so
# sample_size keeps its default of None.
for i in range(3):
    run_experiment(num_run= i)
# NOTE(review): this loop only prints a banner, after the runs above have
# already finished; it never calls run_experiment with sample_size. Confirm
# whether it is a leftover.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")

### USE User profile and Candidate Items in one prompt

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None):
    """Run one evaluation pass over the training users and write a report.

    For every user in ``new_data/new_train_output.json`` the latest
    ``n_latest_reviews`` reviews are joined into a prompt, a profile and
    product-name recommendations are generated with ``RecommenderModel``,
    candidates are retrieved from the ChromaDB collection, and the retrieved
    ASINs are compared against the product of the same-index user in
    ``data/test_user_reviews.json``. Per-user details and the overall mean
    Recall@K / NDCG@K (K = 10) are written to a text file in the working
    directory.

    A user is recorded in the skipped list exactly once: when there are no
    reviews, when item generation fails ``MAX_RETRIES`` times, or when the
    whole per-user pipeline fails ``MAX_RETRIES`` times.

    Args:
        sample_size: Passed to ``RecommenderModel`` and embedded in the
            result file name.
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)
    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load user reviews
    train_file = 'new_data/new_train_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'data/test_user_reviews.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K = 10
    MAX_RETRIES = 3
    all_recalls, all_ndcgs, skipped_users = [], [], []

    # Open a result file specific to this sample size
    result_file_path = f'results_{sample_size}_{current_date}_2_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            print(f"\nProcessing user {userIndex + 1}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {userIndex + 1} skipped due to no latest reviews.\n\n")
                        skipped_users.append(userIndex + 1)
                        break
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")
                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")

                    # Write profile to result file
                    result_file.write(f"User {userIndex + 1} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying when the
                    # model call fails or no product names can be extracted.
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {userIndex + 1}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                # Not recorded as skipped yet: a later attempt may succeed.
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {userIndex + 1} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {userIndex + 1} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            print(f"Error generating items for user {userIndex + 1}: {e}")
                            print(f"Retrying ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries in generating recommendations.\n\n")
                        skipped_users.append(userIndex + 1)
                        # Leave the outer retry loop; `continue` here would
                        # restart it without counting a retry and never end.
                        break

                    # Extract user history so already-reviewed items can be passed to retrieval
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")  # e.g. ['ASIN123', 'ASIN456']
                    final_results = collect_results_per_product(product_names, collection,user_history, max_products=20)
                    if final_results == -1:
                        # -1 is treated as "no recommendations"; the outer
                        # handler retries and records the skip only once.
                        raise ValueError("no recommendations retrieved")

                    # NOTE(review): assumes the test set is index-aligned with
                    # the training set -- confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # document text only when the ASIN is empty.
                    recommended_products = []
                    for doc, distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Compute both metrics before storing either, so a failure
                    # here cannot leave a partial entry for this user.
                    recall = recall_at_k(matches, K)
                    ndcg = ndcg_at_k(matches, K)

                    # Write results for each user
                    result_file.write(f"User {userIndex + 1}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for i, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches)):
                        result_file.write(f"  {i+1}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    all_recalls.append(recall)
                    all_ndcgs.append(ndcg)
                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {userIndex + 1} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(userIndex + 1)

        # Calculate and write overall metrics
        mean_recall = np.mean(all_recalls) if all_recalls else 0.0
        mean_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        result_file.write(f"Overall Mean Recall@{K}: {mean_recall}\nOverall Mean NDCG@{K}: {mean_ndcg}\n")
        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run the experiment once with the default sample_size (None).
run_experiment()
# NOTE(review): this loop only prints a banner, after the run above has
# already finished; it never calls run_experiment with sample_size. Confirm
# whether it is a leftover.
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None):
    """Run the profile -> items -> retrieval -> evaluation loop for every user.

    For each entry of the training review file, the latest reviews are turned
    into a user profile by ``RecommenderModel``, product names are generated
    from that profile (product-name-only prompt), items are retrieved from the
    ChromaDB collection, and the retrieved ASINs are compared against the
    ``parent_asin`` of the same-index entry of the test review file. Recall@K
    and NDCG@K are computed per user and their means are appended to the
    result file.

    Failure handling:
        * Item generation is attempted up to ``MAX_RETRIES`` times per profile.
          When every attempt fails the user is abandoned (the original
          ``continue`` restarted the outer retry loop without consuming a
          retry, which could spin forever).
        * Any other error restarts the whole user up to ``MAX_RETRIES`` times.
        * A user is added to the skipped list exactly once, and only when it
          is really given up on; failure reasons are printed to stdout.

    Args:
        sample_size: Passed unchanged to ``RecommenderModel`` and embedded in
            the result file name.

    Returns:
        None. Output goes to ``results_<sample_size>_<date>_3_samples.txt`` in
        the working directory (``current_date`` is read from module scope).
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db = initialize_chromadb("./chroma_db_mpnet")
    collection = db.get_collection(name='product_embeddings_filtered')

    # Load user reviews (training histories and test entries)
    input_set = load_user_reviews('new_data/new_train_output.json')
    input_set_test = load_user_reviews('data/test_user_reviews.json')

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K = 10
    MAX_RETRIES = 3
    all_recalls, all_ndcgs, skipped_users = [], [], []

    # Open a result file specific to this sample size
    result_file_path = f'results_{sample_size}_{current_date}_3_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_no = userIndex + 1
            print(f"\nProcessing user {user_no}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    latest_reviews = extract_latest_n_reviews([input_set[userIndex]], n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {user_no} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_no)
                        break  # Nothing to retry: move on to the next user
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")

                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")
                    result_file.write(f"User {user_no} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on bad output
                    for item_attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_no}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_product_name_only(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_no} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_no} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit item retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_no}: {e}")
                            print(f"Retrying ({item_attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_no)
                        # `break` leaves the outer retry loop, i.e. goes to the
                        # next user; `continue` here would loop forever because
                        # `retries` is never incremented on this path.
                        break

                    final_results = collect_results_per_product(product_names, collection, max_products=20)
                    if final_results == -1:
                        raise RuntimeError("retrieval returned no recommendations")

                    test_review = extract_latest_n_reviews([input_set_test[userIndex]], 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if asin else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_no}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write("Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Calculate Recall@K and NDCG@K for this user
                    all_recalls.append(recall_at_k(matches, K))
                    all_ndcgs.append(ndcg_at_k(matches, K))
                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_no} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_no)

        # Calculate and write overall metrics
        mean_recall = np.mean(all_recalls) if all_recalls else 0.0
        mean_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        result_file.write(f"Overall Mean Recall@{K}: {mean_recall}\nOverall Mean NDCG@{K}: {mean_ndcg}\n")
        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments for each sample size
# NOTE(review): the experiment is launched once, before the loop, with the
# default sample_size (None). The loop below only prints a banner and never
# calls run_experiment(sample_size) -- confirm whether the call was meant to
# live inside the loop.
run_experiment()
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None):
    """Run the profile -> items -> retrieval -> evaluation loop for every user.

    For each entry of the training review file, the latest reviews are turned
    into a user profile by ``RecommenderModel``, preliminary recommendations
    are generated from that profile and parsed into product names, items are
    retrieved from the ChromaDB collection, and the retrieved ASINs are
    compared against the ``parent_asin`` of the same-index entry of the test
    review file. Recall@K and NDCG@K are computed per user and their means are
    appended to the result file.

    Failure handling:
        * Item generation is attempted up to ``MAX_RETRIES`` times per profile.
          When every attempt fails the user is abandoned (the original
          ``continue`` restarted the outer retry loop without consuming a
          retry, which could spin forever).
        * Any other error restarts the whole user up to ``MAX_RETRIES`` times.
        * A user is added to the skipped list exactly once, and only when it
          is really given up on; failure reasons are printed to stdout.

    Args:
        sample_size: Passed unchanged to ``RecommenderModel`` and embedded in
            the result file name.

    Returns:
        None. Output goes to
        ``results_<sample_size>_<date>_with_description_2_samples.txt`` in the
        working directory (``current_date`` is read from module scope).
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db = initialize_chromadb("./chroma_db_mpnet")
    collection = db.get_collection(name='product_embeddings_filtered')

    # Load user reviews (training histories and test entries)
    input_set = load_user_reviews('new_data/new_train_output.json')
    input_set_test = load_user_reviews('data/test_user_reviews.json')

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K = 10
    MAX_RETRIES = 3
    all_recalls, all_ndcgs, skipped_users = [], [], []

    # Open a result file specific to this sample size
    result_file_path = f'results_{sample_size}_{current_date}_with_description_2_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_no = userIndex + 1
            print(f"\nProcessing user {user_no}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    latest_reviews = extract_latest_n_reviews([input_set[userIndex]], n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {user_no} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_no)
                        break  # Nothing to retry: move on to the next user
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")

                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")
                    result_file.write(f"User {user_no} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on bad output
                    for item_attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_no}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_no} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_no} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit item retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_no}: {e}")
                            print(f"Retrying ({item_attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_no)
                        # `break` leaves the outer retry loop, i.e. goes to the
                        # next user; `continue` here would loop forever because
                        # `retries` is never incremented on this path.
                        break

                    final_results = collect_results_per_product(product_names, collection, max_products=20)
                    if final_results == -1:
                        raise RuntimeError("retrieval returned no recommendations")

                    test_review = extract_latest_n_reviews([input_set_test[userIndex]], 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if asin else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_no}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write("Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Calculate Recall@K and NDCG@K for this user
                    all_recalls.append(recall_at_k(matches, K))
                    all_ndcgs.append(ndcg_at_k(matches, K))
                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_no} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_no)

        # Calculate and write overall metrics
        mean_recall = np.mean(all_recalls) if all_recalls else 0.0
        mean_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        result_file.write(f"Overall Mean Recall@{K}: {mean_recall}\nOverall Mean NDCG@{K}: {mean_ndcg}\n")
        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments for each sample size
# NOTE(review): the experiment is launched once, before the loop, with the
# default sample_size (None). The loop below only prints a banner and never
# calls run_experiment(sample_size) -- confirm whether the call was meant to
# live inside the loop.
run_experiment()
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date

current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None):
    """Run the profile -> items -> retrieval -> evaluation loop for every user.

    For each entry of the training review file, the latest reviews are turned
    into a user profile by ``RecommenderModel``, preliminary recommendations
    are generated from that profile and parsed into product names, items are
    retrieved from the ChromaDB collection, and the retrieved ASINs are
    compared against the ``parent_asin`` of the same-index entry of the test
    review file. Recall@K and NDCG@K are computed per user and their means are
    appended to the result file.

    Failure handling:
        * Item generation is attempted up to ``MAX_RETRIES`` times per profile.
          When every attempt fails the user is abandoned (the original
          ``continue`` restarted the outer retry loop without consuming a
          retry, which could spin forever).
        * Any other error restarts the whole user up to ``MAX_RETRIES`` times.
        * A user is added to the skipped list exactly once, and only when it
          is really given up on; failure reasons are printed to stdout.

    Args:
        sample_size: Passed unchanged to ``RecommenderModel`` and embedded in
            the result file name.

    Returns:
        None. Output goes to
        ``results_<sample_size>_<date>_with_description_3_samples.txt`` in the
        working directory (``current_date`` is read from module scope).
    """
    torch.cuda.empty_cache()

    # Initialize the RecommenderModel
    recommender_model = RecommenderModel(sample_size)

    # Initialize ChromaDB
    db = initialize_chromadb("./chroma_db_mpnet")
    collection = db.get_collection(name='product_embeddings_filtered')

    # Load user reviews (training histories and test entries)
    input_set = load_user_reviews('new_data/new_train_output.json')
    input_set_test = load_user_reviews('data/test_user_reviews.json')

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K = 10
    MAX_RETRIES = 3
    all_recalls, all_ndcgs, skipped_users = [], [], []

    # Open a result file specific to this sample size
    result_file_path = f'results_{sample_size}_{current_date}_with_description_3_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            user_no = userIndex + 1
            print(f"\nProcessing user {user_no}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    latest_reviews = extract_latest_n_reviews([input_set[userIndex]], n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {user_no} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_no)
                        break  # Nothing to retry: move on to the next user
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"generating profile for {review_text}")

                    # Generate user profile
                    profile = recommender_model.create_user_profile(review_text)
                    print(f"profile {profile}")
                    result_file.write(f"User {user_no} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations, retrying on bad output
                    for item_attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Generating items for user {user_no}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations(profile)
                            print(f"preliminary_rec {preliminary_rec}")
                            product_names = extract_product_names(preliminary_rec)
                            if not product_names:
                                raise ValueError("no product names could be extracted")
                            result_file.write(f"User {user_no} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {user_no} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit item retry loop
                        except Exception as e:
                            print(f"Error generating items for user {user_no}: {e}")
                            print(f"Retrying ({item_attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_no)
                        # `break` leaves the outer retry loop, i.e. goes to the
                        # next user; `continue` here would loop forever because
                        # `retries` is never incremented on this path.
                        break

                    final_results = collect_results_per_product(product_names, collection, max_products=20)
                    if final_results == -1:
                        raise RuntimeError("retrieval returned no recommendations")

                    test_review = extract_latest_n_reviews([input_set_test[userIndex]], 1)
                    test_product = test_review[0]['parent_asin']

                    # Recommendations are identified by ASIN; fall back to the
                    # retrieved document text when the ASIN is empty.
                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if asin else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {user_no}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write("Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Calculate Recall@K and NDCG@K for this user
                    all_recalls.append(recall_at_k(matches, K))
                    all_ndcgs.append(ndcg_at_k(matches, K))
                    success = True
                except Exception as e:
                    retries += 1
                    print(f"Error processing user {user_no} (attempt {retries}/{MAX_RETRIES}): {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_no} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_no)

        # Calculate and write overall metrics
        mean_recall = np.mean(all_recalls) if all_recalls else 0.0
        mean_ndcg = np.mean(all_ndcgs) if all_ndcgs else 0.0
        result_file.write(f"Overall Mean Recall@{K}: {mean_recall}\nOverall Mean NDCG@{K}: {mean_ndcg}\n")
        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments for each sample size
# NOTE(review): the experiment is launched once, before the loop, with the
# default sample_size (None). The loop below only prints a banner and never
# calls run_experiment(sample_size) -- confirm whether the call was meant to
# live inside the loop.
run_experiment()
for sample_size in [64]:
    print(f"\nStarting experiment for sample size {sample_size}...")
    


### Alpaca UP and CI in one

In [5]:
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products

# Sample combined model output: a user profile followed by five numbered,
# bold-titled candidate categories. The text is cut off mid-sentence at the end.
test = """
Profile: **Short-Term Interest:** Based on the latest reviewed products, we observe a pattern of interest in personal care and hygiene-related items such as oral health, skincare, and body cleansing products.

**Long-Term Preference:** By analyzing the user's entire reviewing history, we identify consistent themes and patterns:

* They tend to prefer natural and organic products, evident through repeated purchases and positive reviews of products containing essential oils and botanicals.
* Their preferences lean towards moisturizing and soothing products, often choosing options with hydrating properties, anti-aging benefits, and calming scents.
* They demonstrate awareness about environmental concerns, opting for eco-friendly packaging and biodegradable alternatives whenever possible.
* Although some negative experiences exist, the majority of reviews indicate a willingness to experiment with new products and brands, suggesting a desire for discovery and exploration within their preferred niches.

**User Profile Summary:** Our analysis reveals a user who prioritizes wellness, self-care, and sustainability. With a focus on natural ingredients, hydration, and relaxation, this individual seeks effective yet gentle products for various aspects of life. They appreciate thoughtful design, eco-conscious practices, and innovative approaches to common problems.

Five Candidate Item Categories that Align with the User's Preferences and Interests:

1. **Natural Skincare Sets**: Curated collections featuring luxurious, cruelty-free formulas enriched with botanical extracts, shea butter, and argan oil, designed for normal-to-dry skin types.
2. **Eco-Friendly Hair Care**: Products made from sustainable sources, biodegradable materials, and free from harsh chemicals; offering nourishment, volume, and texture enhancements without compromising nature.
3. **Hydration Essentials**: Expert-formulated serums, creams, and lotions infused with hyaluronic acid, green tea extract, and glycerin, specifically crafted to quench thirsty skin, soothe dryness, and promote radiance.
4. **Organic Wellness Supplements**: Capsules, powders, and elixirs derived from pure, natural sources, focusing on probiotics, antioxidants, and adaptogenic blends to support immune function, energy levels, and mental clarity.
5. **Water-Saving Bathroom Essentials**: Innovative, low-waste bathroom solutions incorporating refillable containers, reusable wipes, and intelligent dispensers, reducing household waste and promoting efficient resource management.

By recommending these categories, we aim to cater to the"""

# Smoke-check the parser on the sample above. The recorded cell output
# (truncated in this export) shows the numbered category lines returned with
# their bold markers and descriptions still attached.
extract_product_names(test)


['**Natural Skincare Sets**: Curated collections featuring luxurious, cruelty-free formulas enriched with botanical extracts, shea butter, and argan oil, designed for normal-to-dry skin types.',
 '**Eco-Friendly Hair Care**: Products made from sustainable sources, biodegradable materials, and free from harsh chemicals; offering nourishment, volume, and texture enhancements without compromising nature.',
 '**Hydration Essentials**: Expert-formulated serums, creams, and lotions infused with hyaluronic acid, green tea extract, and glycerin, specifically crafted to quench thirsty skin, soothe dryness, and promote radiance.',
 '**Organic Wellness Supplements**: Capsules, powders, and elixirs derived from pure, natural sources, focusing on probiotics, antioxidants, and adaptogenic blends to support immune function, energy levels, and mental clarity.',
 '**Water-Saving Bathroom Essentials**: Innovative, low-waste bathroom solutions incorporating refillable containers, reusable wipes, and inte

### Combine UP and CI in one prompt

Alpaca

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date


# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size and run number
def run_experiment(sample_size=None, num_run=0):
    """Run one recommendation experiment over all users and write a results file.

    For every user in the train/validation set this builds a prompt from the
    user's latest reviews, asks the recommender model for a profile plus
    candidate items, extracts product names from that answer, retrieves
    products for those names from the ChromaDB collection, and scores the
    retrieved ASINs against the user's held-out test product. Per-user details
    and the overall mean Recall@K / NDCG@K are written to a text file in the
    current working directory; progress is printed to stdout.

    Users are skipped (and listed at the end of the result file) when they
    have no latest reviews, no usable test product, or when generation /
    retrieval / processing keeps failing after ``MAX_RETRIES`` attempts.

    Args:
        sample_size: Passed through to ``RecommenderModel`` and embedded in
            the result file name. NOTE(review): presumably selects the
            fine-tuned checkpoint by training-sample count -- confirm in
            model_pipeline.
        num_run: Run index; only used to make the result file name unique.
    """
    current_date = date.today()

    torch.cuda.empty_cache()
    # Deliberately not named `type`, which would shadow the builtin.
    model_type = "both"
    recommender_model = RecommenderModel(sample_size, model_type)

    # Open the ChromaDB collection holding the product embeddings
    db = initialize_chromadb("./chroma_db_mpnet")
    collection = db.get_collection(name='product_embeddings_filtered')

    # Load user reviews: one set for building prompts, one for held-out targets.
    # NOTE(review): both sets are indexed with the same user index below, so
    # they are assumed to be aligned user-by-user -- confirm in data_loader.
    input_set = load_user_reviews('new_data/new_train_val_output.json')
    input_set_test = load_user_reviews('new_data/test_output.json')

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 20
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this sample size and run number
    result_file_path = f'results_finetuning_{sample_size}_{model_type}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for user_index in range(num_users):
            user_number = user_index + 1  # 1-based id used in all output
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    # Step 1: Extract latest reviews and build the prompt text
                    latest_reviews = extract_latest_n_reviews([input_set[user_index]], n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"Latest Reviews for User {user_number}:\n{review_text}\n")

                    # Resolve the held-out target before any model call. A
                    # missing test review or ASIN will not change on retry,
                    # so skip the user now instead of regenerating profiles
                    # MAX_RETRIES times for a user that can never be scored.
                    try:
                        test_review = extract_latest_n_reviews([input_set_test[user_index]], 1)
                        test_product = test_review[0]['parent_asin']
                    except (IndexError, KeyError, TypeError) as e:
                        result_file.write(f"User {user_number} skipped due to missing test product ({e!r}).\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 2: Generate user profile and candidate items
                    print(f"Generating profile and items for user {user_number}")
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            items = recommender_model.create_user_profile_and_candidate_items_alpaca(review_text)
                            print(f"Profile and Candidate Items for User {user_number}:\n{items}\n")
                            result_file.write(f"User {user_number} Profile and Candidate Items:\n{items}\n\n")
                            product_names = extract_product_names(items)
                            print(f"Extracted Product Names for User {user_number}: {product_names}\n")
                            if not product_names:
                                raise ValueError("Empty product names extracted.")
                            result_file.write(f"User {user_number} Extracted Product Names:\n{product_names}\n\n")
                            break  # Success, exit profile retry loop
                        except Exception as e:
                            # Best effort: any generation/parsing failure triggers a new attempt
                            print(f"Error generating profile for user {user_number}: {e}")
                            print(f"Retrying profile generation ({attempt}/{MAX_RETRIES})...")
                    else:
                        # Every attempt failed (the for loop never hit `break`)
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 3: Collect results from the vector store
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"User {user_number} Purchase History ASINs: {user_history}\n")
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Collecting final results for user {user_number}")
                            final_results = collect_results_per_product(product_names, collection, user_history=user_history, max_products=20)
                            # -1 is the value this code treats as "nothing found"
                            if final_results == -1:
                                raise LookupError("No recommendations found.")
                            break  # Success, exit collect retry loop
                        except Exception as e:
                            print(f"Error collecting results for user {user_number}: {e}")
                            print(f"Retrying collect ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in collecting results.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 4: Evaluate recommendations
                    print(f"Evaluating recommendations for user {user_number}")
                    print(f"Test Product ASIN for User {user_number}: {test_product}\n")

                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(f"Recommended ASIN: {asin}")
                        # Fall back to the document text when no ASIN is stored
                        recommended_products.append(asin if asin else doc)
                    print(f"Recommended Products for User {user_number}: {recommended_products}\n")

                    # NOTE(review): target and recommendations are ASINs, yet
                    # they are compared with a similarity threshold rather
                    # than equality; depending on compute_similarity, nearly
                    # identical ASINs may count as matches -- confirm intent.
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Compute this user's metrics up front but only commit
                    # them once the whole attempt has succeeded, so a late
                    # failure followed by a retry cannot double-count a user.
                    user_recalls = {k: recall_at_k(matches, k) for k in K_VALUES}
                    user_ndcgs = {k: ndcg_at_k(matches, k) for k in K_VALUES}

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. ASIN: {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for this user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product ASIN: {test_product}\n")
                    result_file.write("Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. ASIN: {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    for k in K_VALUES:
                        recalls_by_k[k].append(user_recalls[k])
                        ndcgs_by_k[k].append(user_ndcgs[k])

                    success = True  # Set success flag to exit retry loop
                except Exception as e:
                    # Best effort: any other failure restarts this user's pipeline
                    retries += 1
                    print(f"Error processing user {user_number}: {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)
                    else:
                        print(f"Retrying user processing ({retries}/{MAX_RETRIES})...")

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\n")
            result_file.write(f"Overall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments multiple times
# Sweep configuration: every sample size is evaluated RUNS_PER_SIZE times
SAMPLE_SIZES = (16, 32, 64)
RUNS_PER_SIZE = 3

for sample_size in SAMPLE_SIZES:
    for run_number in range(RUNS_PER_SIZE):
        print(f"\nStarting experiment run number {run_number} for sample size {sample_size}...")
        # Each call writes its own result file keyed by sample size and run number
        run_experiment(sample_size=sample_size, num_run=run_number)




In [3]:
from utils import *
# Sanity check for extract_product_names: the response lists candidate items
# as bracketed, comma-separated lines under a "Candidate_Items:" header.
response_text = """
Candidate_Items:
[Revitalizing Face Masks],
[Organic Hair Care Products],
[Zero-Waste Bath Accessories],
[Plant-Based Beauty Tools],
[Environmentally Friendly Packaging]
"""

# Extract the names and print them for visual inspection
product_names = extract_product_names(response_text)
print(product_names)


['Revitalizing Face Masks', 'Organic Hair Care Products', 'Zero-Waste Bath Accessories', 'Plant-Based Beauty Tools', 'Environmentally Friendly Packaging']


Non-fine-tuned

In [None]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date


# Import modules
from data_loader import load_user_reviews
from utils import extract_latest_n_reviews, extract_product_names, extract_ranked_products
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size and run number
def run_experiment(sample_size=None, num_run=0):
    """Run one recommendation experiment over all users and write a results file.

    For every user in the train/validation set this builds a prompt from the
    user's latest reviews, asks the recommender model for a profile plus
    candidate items, extracts product names from that answer, retrieves
    products for those names from the ChromaDB collection, and scores the
    retrieved ASINs against the user's held-out test product. Per-user details
    and the overall mean Recall@K / NDCG@K are written to a text file in the
    current working directory; progress is printed to stdout.

    Users are skipped (and listed at the end of the result file) when they
    have no latest reviews, no usable test product, or when generation /
    retrieval / processing keeps failing after ``MAX_RETRIES`` attempts.

    Args:
        sample_size: Passed through to ``RecommenderModel`` and embedded in
            the result file name. NOTE(review): this cell calls the function
            with the default ``None``, presumably selecting the
            non-fine-tuned model -- confirm in model_pipeline.
        num_run: Run index; only used to make the result file name unique.
    """
    current_date = date.today()

    torch.cuda.empty_cache()
    # Deliberately not named `type`, which would shadow the builtin.
    model_type = "both"
    recommender_model = RecommenderModel(sample_size, model_type)

    # Open the ChromaDB collection holding the product embeddings
    db = initialize_chromadb("./chroma_db_mpnet")
    collection = db.get_collection(name='product_embeddings_filtered')

    # Load user reviews: one set for building prompts, one for held-out targets.
    # NOTE(review): both sets are indexed with the same user index below, so
    # they are assumed to be aligned user-by-user -- confirm in data_loader.
    input_set = load_user_reviews('new_data/new_train_val_output.json')
    input_set_test = load_user_reviews('new_data/test_output.json')

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 20
    recalls_by_k = {k: [] for k in K_VALUES}
    ndcgs_by_k = {k: [] for k in K_VALUES}
    skipped_users = []

    # Open a result file specific to this sample size and run number
    result_file_path = f'results_{sample_size}_{model_type}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for user_index in range(num_users):
            user_number = user_index + 1  # 1-based id used in all output
            print(f"\nProcessing user {user_number}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    # Step 1: Extract latest reviews and build the prompt text
                    latest_reviews = extract_latest_n_reviews([input_set[user_index]], n_latest_reviews)
                    if not latest_reviews:
                        result_file.write(f"User {user_number} skipped due to no latest reviews.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop
                    review_text = "\n".join(
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    )
                    print(f"Latest Reviews for User {user_number}:\n{review_text}\n")

                    # Resolve the held-out target before any model call. A
                    # missing test review or ASIN will not change on retry,
                    # so skip the user now instead of regenerating profiles
                    # MAX_RETRIES times for a user that can never be scored.
                    try:
                        test_review = extract_latest_n_reviews([input_set_test[user_index]], 1)
                        test_product = test_review[0]['parent_asin']
                    except (IndexError, KeyError, TypeError) as e:
                        result_file.write(f"User {user_number} skipped due to missing test product ({e!r}).\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 2: Generate user profile and candidate items
                    print(f"Generating profile and items for user {user_number}")
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            items = recommender_model.create_user_profile_and_candidate_items_alpaca(review_text)
                            print(f"Profile and Candidate Items for User {user_number}:\n{items}\n")
                            result_file.write(f"User {user_number} Profile and Candidate Items:\n{items}\n\n")
                            product_names = extract_product_names(items)
                            print(f"Extracted Product Names for User {user_number}: {product_names}\n")
                            if not product_names:
                                raise ValueError("Empty product names extracted.")
                            result_file.write(f"User {user_number} Extracted Product Names:\n{product_names}\n\n")
                            break  # Success, exit profile retry loop
                        except Exception as e:
                            # Best effort: any generation/parsing failure triggers a new attempt
                            print(f"Error generating profile for user {user_number}: {e}")
                            print(f"Retrying profile generation ({attempt}/{MAX_RETRIES})...")
                    else:
                        # Every attempt failed (the for loop never hit `break`)
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 3: Collect results from the vector store
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"User {user_number} Purchase History ASINs: {user_history}\n")
                    for attempt in range(1, MAX_RETRIES + 1):
                        try:
                            print(f"Collecting final results for user {user_number}")
                            final_results = collect_results_per_product(product_names, collection, user_history=user_history, max_products=20)
                            # -1 is the value this code treats as "nothing found"
                            if final_results == -1:
                                raise LookupError("No recommendations found.")
                            break  # Success, exit collect retry loop
                        except Exception as e:
                            print(f"Error collecting results for user {user_number}: {e}")
                            print(f"Retrying collect ({attempt}/{MAX_RETRIES})...")
                    else:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries in collecting results.\n\n")
                        skipped_users.append(user_number)
                        break  # Exit retry loop

                    # Step 4: Evaluate recommendations
                    print(f"Evaluating recommendations for user {user_number}")
                    print(f"Test Product ASIN for User {user_number}: {test_product}\n")

                    recommended_products = []
                    for doc, _distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(f"Recommended ASIN: {asin}")
                        # Fall back to the document text when no ASIN is stored
                        recommended_products.append(asin if asin else doc)
                    print(f"Recommended Products for User {user_number}: {recommended_products}\n")

                    # NOTE(review): target and recommendations are ASINs, yet
                    # they are compared with a similarity threshold rather
                    # than equality; depending on compute_similarity, nearly
                    # identical ASINs may count as matches -- confirm intent.
                    normalized_test_product = normalize(test_product)
                    similarity_scores = [
                        compute_similarity(normalize(name), normalized_test_product)
                        for name in recommended_products
                    ]
                    matches = [score >= SIMILARITY_THRESHOLD for score in similarity_scores]

                    # Compute this user's metrics up front but only commit
                    # them once the whole attempt has succeeded, so a late
                    # failure followed by a retry cannot double-count a user.
                    user_recalls = {k: recall_at_k(matches, k) for k in K_VALUES}
                    user_ndcgs = {k: ndcg_at_k(matches, k) for k in K_VALUES}

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. ASIN: {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for this user
                    result_file.write(f"User {user_number}:\n")
                    result_file.write(f"Test Product ASIN: {test_product}\n")
                    result_file.write("Recommended Products:\n")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        result_file.write(f"  {idx}. ASIN: {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    for k in K_VALUES:
                        recalls_by_k[k].append(user_recalls[k])
                        ndcgs_by_k[k].append(user_ndcgs[k])

                    success = True  # Set success flag to exit retry loop
                except Exception as e:
                    # Best effort: any other failure restarts this user's pipeline
                    retries += 1
                    print(f"Error processing user {user_number}: {e}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {user_number} skipped after {MAX_RETRIES} retries.\n\n")
                        skipped_users.append(user_number)
                    else:
                        print(f"Retrying user processing ({retries}/{MAX_RETRIES})...")

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            mean_recall = np.mean(recalls_by_k[k]) if recalls_by_k[k] else 0.0
            mean_ndcg = np.mean(ndcgs_by_k[k]) if ndcgs_by_k[k] else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\n")
            result_file.write(f"Overall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

    print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments multiple times

# Repeat the experiment NUM_RUNS times with the default sample_size
NUM_RUNS = 3

for run_number in range(NUM_RUNS):
    print(f"\nStarting experiment run number {run_number}...")
    # The run number only distinguishes the result files of the repeats
    run_experiment(num_run=run_number)




  from .autonotebook import tqdm as notebook_tqdm



Starting experiment run number 0...


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.



Processing user 1/253
Latest Reviews for User 1:
Product: Manicure and Pedicure Nail Clipper from POWERGROOMING - Powerful Trimmer for Thick and Thin Finger Nails and Toe Nails - Included Nail File and"Catcher" for Easy Cleanup (1 Pack)
Review: This a really cute kit which would make for a great gift for someone. It is in a little leather like pouch and has everything you need to give yourself a quality manicure. The nail clipper is a perfect size and works just as well on a women or man's nails. The file is nice as well (although I still prefer to use emory boards on mine). I actually bought another one of these to give to my son as a stocking stuffer this last Christmas for him to use at college. Just a nice, quality made kit at a reasonable price.
Product: Iryasa Night Indulge Cream - Natural Face Cream for Dry Skin - Vegan Anti Aging Night Cream for Women - Firming Cream for Face and Neck - Organic Vitamin C Moisturizer for Face - 1.7oz
Review: To be honest, I rarely have used an 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Profile and Candidate Items for User 4:
Based on the provided user reviews, we can extract key information to build the user profile.

**Short-Term Interests**: 
From the latest reviews, we notice that the user tends towards:

* Pastel color schemes (pink, purple, and yellow)
* Comfortable and breathable products (e.g., shower caps, scrunchies)
* Affordable prices ($10-$20)

These characteristics suggest that the user values aesthetics, comfort, and practicality.

**Long-term Preferences**: 
Analyzing the user's complete review history reveals underlying patterns and preferences:

* The user gravitates toward gentle, non-irritating, and hypoallergenic products.
* She favors products with

Extracted Product Names for User 4: []

Error generating profile for user 4: Empty product names extracted.
Retrying profile generation (6/20)...
Profile and Candidate Items for User 4:
**Short-Term Interest Analysis**: From the user's recent purchases, we observe that she tends to favor products with

In [1]:
from utils import *
# Sanity check for extract_product_names on a free-form model answer: a prose
# profile summary followed by a numbered Markdown list whose product names are
# wrapped in ** and followed by ": description".
response_text = """
**User Profile Summary**: Our synthesized summary captures the essence of the user's preferences:

"Welcome to [Username]'s world, where self-care meets sustainability!

As an advocate for nature-based solutions, [Username] craves authentic, chemical-free products. With a penchant for luxurious textures and aromas, they seek indulgent experiences that cater to their senses. Beyond pampering themselves, [Username] cares deeply about reducing waste, conserving resources, and supporting environmentally responsible brands."

Now, let's craft some candidate item recommendations tailored to meet their evolving needs:

**Candidate Item Recommendations**

1. **L'Occitane en Provence Extraordinary Concentrate Facial Serum**: As a natural, non-comedogenic serum infused with antioxidants, this product aligns perfectly with [username]'s affinity for organic and cruelty-free practices.
2. **Drunk Elephant Protini Polypeptide Cream**: Comprising potent peptides, nourishing oils, and soothing extracts, this cream addresses concerns related to aging, hydration, and skin elasticity – resonating closely with [username]'s pursuit of radiant, healthy-looking skin.
3. **Acure Organics Hydrating Facial Moisturizer**: By leveraging the power of argan oil, green tea extract, and burdock root, Acure creates a moisturizer that harmonizes with [username]'s preference for eco-conscious ingredients and commitment to sustainability.
4. **Kosas Evangelist Lip Treatment**: Kosas' lip treatment embodies the qualities [username] adores: rich, hydrating, and free-from harsh chemicals. This product promises plumper, healthier lips via its blend of squalane, rosehip, and chamomile.
5. **Milk + Honey Ursa Major Luxurious Bath Salts**: Merging calming botanicals like lavender, calendula, and ylang-ylang, Milk + Honey crafts bath salts that would soothe and rejuvenate [username], respecting her values of relaxation, tranquility, and sensory delight.

By recommending these candidate items, we aim to satisfy [username]'s evolving desires, nurturing their relationships with sustainable brands offering exceptional quality, efficacy, and performance. Enjoy exploring these curated suggestions!

"""

# Extract the names and print them for visual inspection
product_names = extract_product_names(response_text)
print(product_names)


["L'Occitane en Provence Extraordinary Concentrate Facial Serum", 'Drunk Elephant Protini Polypeptide Cream', 'Acure Organics Hydrating Facial Moisturizer', 'Kosas Evangelist Lip Treatment', 'Milk + Honey Ursa Major Luxurious Bath Salts']
