In [1]:
# main.ipynb

# Import necessary packages
import sys
import os
import numpy as np
import pandas as pd
sys.path.append('./')  # Adjust the path if necessary
from datetime import date
import random
current_date = date.today()
# Import modules
from data_loader import load_user_reviews
from utils import *
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k
from model_pipeline import RecommenderModel  # Import the RecommenderModel class
from config import PIPELINE_PARAMS, USER_PROFILE_PROMPT, MODEL_PATH, TOKENIZER_PATH
# Additional imports
import torch

# Function to run the experiment for a given sample size
def run_experiment(sample_size = None, num_run = 0, adapter = False):
    """Run one full recommend-and-evaluate pass over the train/val users and write a report.

    For each user in the train/val set the function:
      1. takes the latest ``n_latest_reviews`` reviews and asks the model for a user profile,
      2. asks the model for preliminary recommendations and extracts product names from them,
      3. retrieves candidate products from the ChromaDB collection,
      4. compares each retrieved item against the ``parent_asin`` of the user's test review
         and accumulates Recall@K / NDCG@K for every K in ``K_VALUES``.

    Per-user details, the mean metrics and the list of skipped users are written to a
    text file in the current working directory.

    Args:
        sample_size: Forwarded to ``RecommenderModel`` and embedded in the result file name.
        num_run: Run index; only used in the result file name.
        adapter: Forwarded to ``RecommenderModel(adapter=...)`` and embedded in the file name.

    Returns:
        None. Results are written to the result file and progress is printed.

    Notes:
        - A user is retried up to ``MAX_RETRIES`` times; every failed attempt writes a
          "skipped" line to the result file, but the user is added to the final
          "Skipped Users" list at most once, and only if no attempt succeeded.
        - Mean metrics are computed over users that completed evaluation only.
        - Relies on the module-level ``current_date`` for the result file name.
    """
    torch.cuda.empty_cache()

    recommender_model = RecommenderModel(sample_size, adapter=adapter)

    # Initialize ChromaDB
    db_path = "./chroma_db_mpnet"
    db = initialize_chromadb(db_path)
    collection_name = 'product_embeddings_filtered'
    collection = db.get_collection(name=collection_name)

    # Load product data and user reviews
    # NOTE(review): df is not consulted below (recommended items are reported by ASIN,
    # no title lookup is done); the read is kept so a missing file still fails up front.
    df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")
    train_file = 'new_data/new_train_val_output.json'
    input_set = load_user_reviews(train_file)
    test_file = 'new_data/test_output.json'
    input_set_test = load_user_reviews(test_file)

    # Experiment parameters
    n_latest_reviews = 10
    SIMILARITY_THRESHOLD = 90.0
    K_VALUES = [1, 5, 10, 20]
    MAX_RETRIES = 3
    all_similarity_scores, all_matches, all_recalls, all_ndcgs, skipped_users = [], [], [], [], []

    # Open a result file specific to this sample size
    result_file_path = f'results_product_name_train_val_set_adapter_{adapter}_numOfRev={n_latest_reviews}_{sample_size}_{current_date}_{num_run}_samples.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        num_users = len(input_set)
        for userIndex in range(num_users):
            print(f"\nProcessing user {userIndex + 1}/{num_users}")
            retries, success = 0, False
            while not success and retries < MAX_RETRIES:
                try:
                    example_user = [input_set[userIndex]]
                    latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
                    print(f"number of reviews {len(latest_reviews)}")
                    if not latest_reviews:
                        # Nothing to build a profile from; retrying cannot help.
                        result_file.write(f"User {userIndex + 1} skipped due to no latest reviews.\n\n")
                        break
                    review_text = "\n".join([
                        f"Product: {rev['product_name']}\nReview: {rev['text']}"
                        for rev in latest_reviews
                    ])
                    print(f"generating profile for {review_text}")

                    # Generate user profile
                    profile = recommender_model.create_user_profile_alpaca_adapter(review_text)
                    print(f"profile {profile}")
                    if not profile or not any(keyword in profile.lower() for keyword in ["short-term", "long-term", "user profile"]):
                        result_file.write(f"User {userIndex + 1} skipped due to empty or invalid profile.\n\n")
                        retries += 1  # counts against the per-user retry budget
                        print(f"Profile invalid for user {userIndex + 1}. Retrying ({retries}/{MAX_RETRIES})...")
                        continue

                    # Write profile to result file
                    result_file.write(f"User {userIndex + 1} Profile:\n{profile}\n\n")

                    # Generate preliminary recommendations; this step has its own retry budget.
                    retries_item = 0
                    while retries_item < MAX_RETRIES:
                        try:
                            print(f"Generating items for user {userIndex + 1}")
                            preliminary_rec = recommender_model.create_preliminary_recommendations_alpaca_adapter(profile)
                            print(f"preliminary_rec {preliminary_rec}")

                            product_names = extract_product_names_adapter(preliminary_rec)
                            if not product_names:
                                result_file.write(f"User {userIndex + 1} skipped due to empty product names.\n\n")
                                raise ValueError("no product names could be extracted from the recommendations")
                            result_file.write(f"User {userIndex + 1} Preliminary Recommendations:\n{preliminary_rec}\n\n")
                            result_file.write(f"User {userIndex + 1} Extracted products :\n{product_names}\n\n")
                            print(f"extracted product names {product_names}")
                            break  # Success, exit retry loop
                        except Exception as e:
                            retries_item += 1
                            print(f"Error generating items for user {userIndex + 1}: {e}")
                            print(f"Retrying ({retries_item}/{MAX_RETRIES})...")
                    else:
                        # Item generation failed MAX_RETRIES times. Leave the per-user retry
                        # loop so the outer for-loop moves on to the next user. (A `continue`
                        # here would restart the per-user loop without consuming a retry and
                        # could repeat indefinitely.)
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries in generating profile.\n\n")
                        break

                    # Products the user already reviewed, passed to retrieval alongside the names.
                    user_history = [rev['parent_asin'] for rev in latest_reviews if 'parent_asin' in rev]
                    print(f"user history id: {user_history}")
                    final_results = collect_results_per_product(product_names, collection,user_history, max_products=20)
                    if final_results == -1:
                        result_file.write(f"User {userIndex + 1} skipped due to no recommendations.\n\n")
                        raise RuntimeError("retrieval returned no recommendations")

                    # NOTE(review): assumes the test set is index-aligned with the train/val
                    # set (same user at the same position) — confirm against the data files.
                    example_user_test = [input_set_test[userIndex]]
                    test_review = extract_latest_n_reviews(example_user_test, 1)
                    test_product = test_review[0]['parent_asin']

                    # Report each retrieved item by its ASIN; fall back to the document text
                    # when the ASIN is empty.
                    recommended_products = []
                    for doc, distance, metadata in final_results:
                        asin = metadata['metadata']
                        print(asin)
                        recommended_products.append(asin if len(asin) > 0 else doc)

                    # Evaluate recommendations
                    normalized_test_product = normalize(test_product)
                    normalized_ranked_products = [normalize(name) for name in recommended_products]
                    similarity_scores = []
                    matches = []
                    for rec_product in normalized_ranked_products:
                        sim_score = compute_similarity(rec_product, normalized_test_product)
                        similarity_scores.append(sim_score)
                        matches.append(sim_score >= SIMILARITY_THRESHOLD)

                    # Display similarity scores and matches
                    print("\nSimilarity Scores and Matches:")
                    for idx, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
                        print(f"{idx}. {asin}")
                        print(f"   Similarity Score: {score:.2f}%")
                        print(f"   Match: {'Yes' if match else 'No'}")

                    # Write results for each user
                    result_file.write(f"User {userIndex + 1}:\n")
                    result_file.write(f"Test Product: {test_product}\n")
                    result_file.write(f"Recommended Products:\n")
                    for i, (asin, score, match) in enumerate(zip(recommended_products, similarity_scores, matches)):
                        result_file.write(f"  {i+1}. {asin} - Similarity: {score:.2f}% - {'Match' if match else 'No Match'}\n")
                    result_file.write("\n")

                    # Collect similarity scores and matches
                    all_similarity_scores.extend(similarity_scores)
                    all_matches.extend(matches)

                    # Calculate Recall@K and NDCG@K for this user for all K values
                    for k in K_VALUES:
                        recall = recall_at_k(matches, k)
                        ndcg = ndcg_at_k(matches, k)
                        all_recalls.append((k, recall))
                        all_ndcgs.append((k, ndcg))

                    success = True
                except Exception as e:
                    retries += 1
                    # Surface the failure instead of swallowing it silently.
                    print(f"Error processing user {userIndex + 1} (attempt {retries}/{MAX_RETRIES}): {e!r}")
                    if retries >= MAX_RETRIES:
                        result_file.write(f"User {userIndex + 1} skipped after {MAX_RETRIES} retries.\n\n")

            # Record the user exactly once, and only if no attempt succeeded.
            if not success:
                skipped_users.append(userIndex + 1)

        # Calculate and write overall metrics for each K
        for k in K_VALUES:
            recalls_at_k = [rec for k_val, rec in all_recalls if k_val == k]
            ndcgs_at_k = [ndcg for k_val, ndcg in all_ndcgs if k_val == k]
            mean_recall = np.mean(recalls_at_k) if recalls_at_k else 0.0
            mean_ndcg = np.mean(ndcgs_at_k) if ndcgs_at_k else 0.0
            result_file.write(f"Overall Mean Recall@{k}: {mean_recall}\nOverall Mean NDCG@{k}: {mean_ndcg}\n")

        if skipped_users:
            result_file.write("Skipped Users:\n" + ", ".join(map(str, skipped_users)) + "\n")

        print(f"Experiment completed for sample size {sample_size}. Results saved to {result_file_path}.")

# Run experiments for each sample size

# Each sample size is run four times with adapters enabled; the run index ends up in
# the result file name so repeated runs do not overwrite each other.
for sample_size in [32]:
    for i in range(4):
        # Announce the run before it starts so the log lines precede the run's own output.
        print(f"\nStarting experiment for sample size {sample_size} run number {i} and adapters {True}...")
        run_experiment(sample_size= sample_size,num_run= i, adapter=True)
    


  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [3]:
from utils import *

# Sample model output used as a fixture: a free-form preamble followed by a numbered
# list of product categories.
test = """Here's your output:

The recommended product category set aligns closely with the user's expressed long-term preferences and short-term interests. These categories synergize to create an overarching theme emphasizing sustainability, eco-friendliness, natural ingredients, gentle formulations, and targeted benefits for mature skin. The collective assortment aims to satisfy both immediate needs (hydrated and nourished complexion) and long-standing aspirations (environmentally responsible living). Specifically, I recommend considering products from the categories listed below:

1. Eco-Friendly Skincare Sets
2. Natural Moisturizing Creams/Lotions
3. Gentle Exfoliants/Peels
4. Vegan/Cruelty-Free Facial Care Kits"""

# Manual sanity check: print what extract_product_names_adapter (presumably provided by
# the wildcard import from utils) extracts from the sample text.
print(extract_product_names_adapter(test))

['Eco-Friendly Skincare Sets', 'Natural Moisturizing Creams/Lotions', 'Gentle Exfoliants/Peels', 'Vegan/Cruelty-Free Facial Care Kits']
