In [1]:
import sys
sys.path.append('./')  # Adjust the path if necessary

from data_loader import load_user_reviews
from model_pipeline import RecommenderModel
from utils import extract_latest_n_reviews, extract_product_names
from retrieval import initialize_chromadb, collect_results_per_product
from evaluation import normalize, compute_similarity, recall_at_k, ndcg_at_k

# Additional imports
import numpy as np
import torch
import pandas as pd

# --- Locations and identifiers shared by the rest of the notebook ---
db_path = "./chroma_db_mpnet"
collection_name = 'product_embeddings_filtered'  # Use the same collection name
train_file = 'data/train_val_user_reviews.json'
test_file = 'data/test_user_reviews.json'

# --- Recommender model (used later for profile and recommendation generation) ---
recommender_model = RecommenderModel()

# --- Vector store: open the ChromaDB at db_path and grab the named collection ---
db = initialize_chromadb(db_path)
collection = db.get_collection(name=collection_name)

# --- Product table read from CSV ---
df = pd.read_csv("data/meta_all_beauty_filtered_simple.csv")

# --- User review sets: train/validation first, then test ---
input_set = load_user_reviews(train_file)
input_set_test = load_user_reviews(test_file)

# --- Sanity check: report how many embeddings the collection currently holds ---
count = collection.count()
print(f"Total embeddings in collection: {count}")

  from .autonotebook import tqdm as notebook_tqdm


max length is 2048


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


models/hf-frompretrained-download/meta-llama/Meta-Llama-3-8B-Instruct


Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.
Loading checkpoint shards: 100%|██████████| 2/2 [00:23<00:00, 11.52s/it]
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Total embeddings in collection: 356


In [2]:
# Prompt templates are imported at the top of the cell so its dependencies are
# visible in one place instead of being scattered between statements.
from config import PRELIMINARY_RECOMMENDATIONS_PROMPT, USER_PROFILE_PROMPT

# Parameters
n_latest_reviews = 3           # How many of the user's reviews feed the profile
SIMILARITY_THRESHOLD = 90.0    # Minimum similarity score that counts as a match
SHOW_PROMPTS = False           # Set to True to print the fully formatted LLM prompts

# Select a specific user index
userIndex = 0  # Change this index to select a different user

# Start processing for a single user
print(f"Processing user {userIndex + 1}")

# Extract latest reviews for training.
# extract_latest_n_reviews is handed a one-element list holding the selected user.
example_user = [input_set[userIndex]]
latest_reviews = extract_latest_n_reviews(example_user, n_latest_reviews)
review_text = "\n".join(
    f"Product: {rev['product_name']}\nReview: {rev['text']}" for rev in latest_reviews
)

# Display the user's latest reviews
print("\nUser's Latest Reviews:")
for idx, rev in enumerate(latest_reviews, 1):
    print(f"{idx}. Product: {rev['product_name']}")
    print(f"   Review: {rev['text']}\n")

# Format the user-profile prompt for inspection. The header is printed only
# together with the prompt itself, so the output never shows a dangling,
# empty "Prompt:" section.
user_profile_prompt = USER_PROFILE_PROMPT.format(reviews=review_text)
if SHOW_PROMPTS:
    print("\nUser Profile Generation Prompt:")
    print(user_profile_prompt)

# Create user profile
profile = recommender_model.create_user_profile(review_text)
print("\nGenerated User Profile:")
print(profile)

# Format the preliminary-recommendations prompt for inspection (same rule as above).
prelim_rec_prompt = PRELIMINARY_RECOMMENDATIONS_PROMPT.format(user_profile=profile)
if SHOW_PROMPTS:
    print("\nPreliminary Recommendations Generation Prompt:")
    print(prelim_rec_prompt)

# Generate preliminary recommendations
preliminary_rec = recommender_model.create_preliminary_recommendations(profile)
print("\nPreliminary Recommendations:")
print(preliminary_rec)

# Extract product names from the generated recommendation text
product_names = extract_product_names(preliminary_rec)
print("\nExtracted Product Names:")
for idx, name in enumerate(product_names, 1):
    print(f"{idx}. {name}")

# Retrieve candidates from ChromaDB for the extracted product names and score
# them against the product the same user reviewed in the test set.
K = 10  # Cut-off rank for Recall@K and NDCG@K

# Check if product_names is empty
if not product_names:
    print("No product names extracted. Exiting.")
else:
    # Query ChromaDB and collect results
    final_results = collect_results_per_product(product_names, collection)
    print(f"Final results{final_results} ")
    # -1 is treated as the "nothing found" sentinel of collect_results_per_product;
    # an empty result is handled the same way because there is nothing to evaluate.
    if final_results == -1 or not final_results:
        print("No recommendations found. Exiting.")
    else:
        # Proceed with evaluation if we have recommendations
        # Get the actual product from the test set
        # NOTE(review): assumes input_set and input_set_test list users in the same
        # order, so userIndex selects the same user in both — confirm.
        example_user_test = [input_set_test[userIndex]]
        test_review = extract_latest_n_reviews(example_user_test, 1)
        test_product = test_review[0]['parent_asin']
        print("\nTest Product:")
        print(test_product)

        # Collect one identifier per retrieved (document, metadata) pair.
        # The identifier is read from the 'metadata' key of each result's metadata
        # (presumably the product's parent ASIN — verify against the collection schema).
        # It is kept as-is rather than mapped to a title, because it is compared
        # against the test product's 'parent_asin' below.
        recommended_products = []
        for doc, metadata in final_results:
            asin = metadata['metadata']
            print(asin)
            # Fall back to the document text when the entry carries no identifier
            # (empty or None), instead of failing on len(None).
            recommended_products.append(asin if asin else doc)

        # Display recommended products with metadata
        print("\nRecommended Products:")
        for idx, (prod, (doc, metadata)) in enumerate(zip(recommended_products, final_results), 1):
            print(f"{idx}. {prod}")
            print(f"   Document: {doc}")
            print(f"   Metadata: {metadata}")

        # Evaluate recommendations.
        # No normalization is applied here: the identifiers are compared unchanged.
        normalized_test_product = test_product
        normalized_recommended_products = recommended_products
        print(f"normalized test {normalized_test_product}")

        # Compute similarity scores and matches
        similarity_scores = []
        matches = []
        for rec_product in normalized_recommended_products:
            sim_score = compute_similarity(rec_product, normalized_test_product)
            print(f"normalized rec {rec_product}")

            similarity_scores.append(sim_score)
            matches.append(sim_score >= SIMILARITY_THRESHOLD)

        # Display similarity scores and matches
        print("\nSimilarity Scores and Matches:")
        for idx, (prod, score, match) in enumerate(zip(recommended_products, similarity_scores, matches), 1):
            print(f"{idx}. {prod}")
            print(f"   Similarity Score: {score:.2f}%")
            print(f"   Match: {'Yes' if match else 'No'}")

        # Calculate Recall@K and NDCG@K (both functions are imported in the first cell)
        recall = recall_at_k(matches, K)
        ndcg = ndcg_at_k(matches, K)

        # Display evaluation metrics
        print("\nEvaluation Metrics:")
        print(f"Recall@{K}: {recall}")
        print(f"NDCG@{K}: {ndcg}")

Processing user 1

User's Latest Reviews:
1. Product: Manicure and Pedicure Nail Clipper from POWERGROOMING - Powerful Trimmer for Thick and Thin Finger Nails and Toe Nails - Included Nail File and"Catcher" for Easy Cleanup (1 Pack)
   Review: This a really cute kit which would make for a great gift for someone. It is in a little leather like pouch and has everything you need to give yourself a quality manicure. The nail clipper is a perfect size and works just as well on a women or man's nails. The file is nice as well (although I still prefer to use emory boards on mine). I actually bought another one of these to give to my son as a stocking stuffer this last Christmas for him to use at college. Just a nice, quality made kit at a reasonable price.

2. Product: Iryasa Night Indulge Cream - Natural Face Cream for Dry Skin - Vegan Anti Aging Night Cream for Women - Firming Cream for Face and Neck - Organic Vitamin C Moisturizer for Face - 1.7oz
   Review: To be honest, I rarely have use

In [3]:
# Re-fetch the collection by name and print how many embeddings it holds now.
# NOTE(review): the saved output of this cell (112590) differs from the count
# printed by the first cell (356), although no visible cell in between modifies
# the collection — possibly stale output or out-of-order execution; confirm
# with Restart Kernel -> Run All.
collection = db.get_collection(name=collection_name)
count = collection.count()
print(f"Total embeddings in collection: {count}")

Total embeddings in collection: 112590
