In [8]:
import json
import torch
import logging
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import chromadb
import uuid
import os

import json
import os

def combine_datasets(dataset_name, data_dir="data"):
    """
    Combines train, test, and validation datasets into a single JSON file.

    Each split is read from ``{data_dir}/{dataset_name}_{split}_output.json``
    and must contain a JSON array; the arrays are concatenated in the order
    train, test, val. A split file that does not exist is skipped with a
    warning. The combined file is always written, even if it ends up empty.

    Args:
        dataset_name (str): Name of the dataset.
        data_dir (str): Directory holding the split files and receiving the
            combined file. Defaults to "data".

    Returns:
        str: Path to the combined dataset file.
    """
    split_files = [
        f"{data_dir}/{dataset_name}_{split}_output.json"
        for split in ("train", "test", "val")
    ]
    combined_file = f"{data_dir}/{dataset_name}_combined.json"

    # Load and merge datasets
    combined_data = []
    for split_file in split_files:
        # Try to open directly instead of checking existence first, so there
        # is no window between the check and the read.
        try:
            with open(split_file, 'r', encoding='utf-8') as f:
                combined_data.extend(json.load(f))
        except FileNotFoundError:
            print(f"Warning: {split_file} not found. Skipping.")

    # Make sure the target directory exists before writing the result.
    os.makedirs(os.path.dirname(combined_file) or ".", exist_ok=True)

    # Save the combined dataset
    with open(combined_file, 'w', encoding='utf-8') as f:
        json.dump(combined_data, f, ensure_ascii=False, indent=4)

    print(f"Combined dataset saved to {combined_file}")
    return combined_file

# Example usage
# Dataset whose train/test/val splits are merged and then embedded.
dataset_name = "Beauty"


# ---------------------- Configuration ----------------------
REDUCED_FILE = combine_datasets(dataset_name)
print(f"Using combined dataset: {REDUCED_FILE}")
# Metadata file (JSON lines) for the selected dataset.
META_FILE = (
    "raw_data/meta_Beauty.jsonl"
    if dataset_name == "Beauty"
    else f"raw_data/meta_{dataset_name}.jsonl"
)
# On-disk location of the ChromaDB store and the collection used inside it.
CHROMA_DB_PATH = "./chroma_db_blair"
CHROMA_COLLECTION_NAME = f"{dataset_name}_product_embeddings_filtered"
# Number of texts embedded per forward pass.
BATCH_SIZE = 32
# Hugging Face identifier of the embedding model.
MODEL_NAME = "hyp1231/blair-roberta-large"

# ---------------------- Logging ----------------------
# DEBUG level so every step below is traced in detail.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# ---------------------- Device Setup ----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.debug(f"Using device: {device}")

# Report whether the ChromaDB directory is already present.
if not os.path.exists(CHROMA_DB_PATH):
    logger.debug(f"ChromaDB directory '{CHROMA_DB_PATH}' does not exist. It will be created.")
else:
    logger.debug(f"ChromaDB directory '{CHROMA_DB_PATH}' already exists. "
                 f"Make sure this directory does not contain embeddings from another model.")

# ---------------------- Load Reduced Data ----------------------
logger.debug(f"Loading reduced file: {REDUCED_FILE}")
with open(REDUCED_FILE, 'r', encoding='utf-8') as f:
    reduced_data = json.load(f)

# Every distinct parent_asin that appears in any user's reviews.
unique_items = {
    review["parent_asin"]
    for user_entry in reduced_data
    for review in user_entry["reviews"]
}
logger.debug(f"Number of unique items in reduced data: {len(unique_items)}")

# ---------------------- Initialize Model ----------------------
logger.debug(f"Loading model and tokenizer from {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
# Evaluation mode: the model is only used for inference here.
model.eval()
logger.debug("Model loaded and set to evaluation mode.")

def embed_texts(texts):
    """Embed a list of texts using the model and return the embeddings.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``logger``.

    Args:
        texts: List of strings to embed. Each text is truncated to at most
            512 tokens and the batch is padded to a common length.

    Returns:
        A list with one embedding (a list of floats) per input text. Each
        embedding is the hidden state at position 0 of the last layer,
        scaled to unit L2 norm. An empty input yields an empty list.
    """
    # Nothing to tokenize for an empty batch; return early rather than
    # handing the tokenizer an empty list.
    if not texts:
        return []

    logger.debug(f"Embedding {len(texts)} texts...")
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    logger.debug("Tokenization complete.")
    for k, v in inputs.items():
        logger.debug(f"Input {k}: shape {v.shape}, dtype {v.dtype}")

    # Tensors must live on the same device as the model.
    inputs = {key: value.to(device) for key, value in inputs.items()}
    logger.debug("Inputs moved to device.")

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs, return_dict=True)
        logger.debug("Model forward pass complete.")
        embeddings = outputs.last_hidden_state[:, 0]  # CLS token
        logger.debug(f"Raw embeddings shape: {embeddings.shape}")
        embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # Normalize
        logger.debug("Embeddings normalized.")

    embeddings_np = embeddings.cpu().numpy()
    logger.debug(f"Embeddings moved to CPU. Final embedding shape: {embeddings_np.shape}")
    return embeddings_np.tolist()

# ---------------------- Read Meta File and Select Items ----------------------
logger.debug(f"Reading meta file: {META_FILE}")
selected_asins = []
selected_texts = []
line_count = 0      # number of lines seen in the meta file
matched_count = 0   # number of meta records whose parent_asin is in unique_items

with open(META_FILE, 'r', encoding='utf-8') as f:
    for line_count, raw_line in enumerate(f, start=1):
        record_text = raw_line.strip()
        if not record_text:
            continue
        try:
            data = json.loads(record_text)
        except json.JSONDecodeError:
            logger.debug(f"Skipping line {line_count} due to JSON decode error.")
            continue

        # Only items referenced by the reduced review data are embedded.
        parent_asin = data.get("parent_asin", "")
        if parent_asin not in unique_items:
            continue
        matched_count += 1

        # "description" may be a list of fragments or a single value.
        description_field = data.get("description", [])
        description = (
            " ".join(description_field)
            if isinstance(description_field, list)
            else str(description_field)
        )

        # "details" may be a mapping; flatten it into "key: value" pairs.
        details_field = data.get("details", {})
        details_str = (
            " ".join(f"{k}: {v}" for k, v in details_field.items())
            if isinstance(details_field, dict)
            else str(details_field)
        )

        # Join the non-empty parts into the document that gets embedded.
        text_parts = [data.get("title", ""), description, details_str]
        combined_text = ". ".join(part for part in text_parts if part)
        selected_asins.append(parent_asin)
        selected_texts.append(combined_text)

logger.debug(f"Number of lines read from meta file: {line_count}")
logger.debug(f"Number of matched items in meta: {matched_count}")
logger.debug(f"selected_asins length: {len(selected_asins)}, selected_texts length: {len(selected_texts)}")

# ---------------------- Initialize ChromaDB ----------------------
logger.debug("Initializing ChromaDB client.")
chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
existing_collections = chroma_client.list_collections()
logger.debug(f"Existing collections in ChromaDB: {existing_collections}")

# Fetch the collection if it exists, otherwise create it, in one call.
# NOTE(review): list_collections() has returned Collection objects in some
# Chroma releases and plain names in others, so testing
# `CHROMA_COLLECTION_NAME in existing_collections` is not a reliable
# existence check — confirm against the installed version.
collection_chroma = chroma_client.get_or_create_collection(name=CHROMA_COLLECTION_NAME)

existing_item_count = collection_chroma.count()
if existing_item_count:
    logger.debug(f"Collection '{CHROMA_COLLECTION_NAME}' already holds {existing_item_count} items. "
                 f"This could cause dimension conflicts if previously inserted embeddings had different dimensions.")
else:
    logger.debug(f"Collection '{CHROMA_COLLECTION_NAME}' is empty and ready for new embeddings.")
logger.debug("ChromaDB collection ready.")

# ---------------------- Embedding and Storing Directly in ChromaDB ----------------------
if not selected_texts:
    logger.warning("No texts to embed. Exiting.")
else:
    total_texts = len(selected_texts)
    logger.debug(f"Embedding {total_texts} texts in batches of {BATCH_SIZE} and adding directly to ChromaDB...")

    for i in range(0, total_texts, BATCH_SIZE):
        batch_number = i // BATCH_SIZE + 1
        batch_texts = selected_texts[i:i+BATCH_SIZE]
        batch_asins = selected_asins[i:i+BATCH_SIZE]
        logger.debug(f"Processing batch {batch_number} with {len(batch_texts)} texts.")

        embeddings_list = embed_texts(batch_texts)
        logger.debug(f"Embedding list length: {len(embeddings_list)}. Example embedding length: {len(embeddings_list[0]) if embeddings_list else 'N/A'}")

        # Prepare documents for ChromaDB.
        # Ids are random UUIDs, so running this cell again adds a second copy
        # of every item rather than replacing the first.
        ids = [str(uuid.uuid4()) for _ in batch_texts]
        metadatas = [{"parent_asin": asin} for asin in batch_asins]

        # Log some examples to ensure data integrity
        if batch_texts:
            logger.debug(f"Example document: {batch_texts[0][:100]}... (truncated)")
            logger.debug(f"Example metadata: {metadatas[0]}")

        # Add to ChromaDB
        try:
            logger.debug("Attempting to add embeddings to ChromaDB collection...")
            collection_chroma.add(
                ids=ids,
                documents=batch_texts,
                embeddings=embeddings_list,
                metadatas=metadatas
            )
            logger.info(f"Inserted batch {batch_number} into ChromaDB.")
        except Exception as e:
            logger.error(f"Error inserting documents into ChromaDB: {e}")
            logger.debug("Check if the embedding dimensions match the collection. If not, consider deleting the existing collection or using a fresh directory.")
            break  # If there's an error, stop processing further
    else:
        # Runs only when the loop was not interrupted by the break above, so
        # completion is never reported after a failed insert.
        logger.info("Completed embedding and storing items directly into ChromaDB.")


INFO:__main__:Inserted batch 1 into ChromaDB.
INFO:__main__:Inserted batch 2 into ChromaDB.
INFO:__main__:Inserted batch 3 into ChromaDB.
INFO:__main__:Inserted batch 4 into ChromaDB.
INFO:__main__:Inserted batch 5 into ChromaDB.
INFO:__main__:Inserted batch 6 into ChromaDB.
INFO:__main__:Inserted batch 7 into ChromaDB.
INFO:__main__:Inserted batch 8 into ChromaDB.
INFO:__main__:Inserted batch 9 into ChromaDB.
INFO:__main__:Inserted batch 10 into ChromaDB.
INFO:__main__:Inserted batch 11 into ChromaDB.
INFO:__main__:Inserted batch 12 into ChromaDB.
INFO:__main__:Inserted batch 13 into ChromaDB.
INFO:__main__:Inserted batch 14 into ChromaDB.
INFO:__main__:Inserted batch 15 into ChromaDB.
INFO:__main__:Inserted batch 16 into ChromaDB.
INFO:__main__:Inserted batch 17 into ChromaDB.
INFO:__main__:Inserted batch 18 into ChromaDB.
INFO:__main__:Inserted batch 19 into ChromaDB.
INFO:__main__:Inserted batch 20 into ChromaDB.
INFO:__main__:Inserted batch 21 into ChromaDB.
INFO:__main__:Inserted

## Combine test and val set with train set

In [5]:
import json

def combine_data(train_val_data, test_data):
    """Merge per-user review lists from two splits into one list of users.

    Users are matched on ``user_id``. The inputs are left untouched: every
    returned entry owns a fresh ``reviews`` list.

    Args:
        train_val_data: List of user dicts with a "user_id" key and usually
            a "reviews" list. If a user_id occurs more than once, the
            reviews of all occurrences are merged and the other keys are
            taken from the first occurrence.
        test_data: List of user dicts with "user_id" and "reviews" keys.

    Returns:
        list: One dict per user, in order of first appearance. Users that
        come from train_val_data keep their extra keys; users seen only in
        test_data carry just "user_id" and "reviews".

    Raises:
        KeyError: If an entry lacks "user_id", or a test entry lacks "reviews".
    """
    merged_by_user = {}

    # Seed with train/val users. Copy the entry and its review list so later
    # extends never write into the caller's data.
    for user in train_val_data:
        user_id = user["user_id"]
        reviews = user.get("reviews", [])
        if user_id in merged_by_user:
            merged_by_user[user_id]["reviews"].extend(reviews)
        else:
            merged_by_user[user_id] = {**user, "reviews": list(reviews)}

    # Fold in the test reviews, adding users that were not seen before.
    for test_user in test_data:
        user_id = test_user["user_id"]
        test_reviews = test_user["reviews"]
        if user_id in merged_by_user:
            merged_by_user[user_id]["reviews"].extend(test_reviews)
        else:
            merged_by_user[user_id] = {
                "user_id": user_id,
                "reviews": list(test_reviews)
            }

    # Convert dictionary back to list
    return list(merged_by_user.values())


# Example usage (assuming you've read the JSON files into variables):
# Raw string literals: the "\M" after the drive letter is not a valid escape
# sequence, so a plain literal only keeps the backslash by accident and
# triggers a warning on newer Python versions. The path values are unchanged.
# Files are opened as UTF-8 explicitly, matching how the combined file is
# read elsewhere in this notebook.
with open(r"D:\Master_Thesis/final_pipeline/new_data/new_train_val_output.json", "r", encoding="utf-8") as f:
    train_val_data = json.load(f)
with open(r"D:\Master_Thesis/final_pipeline/new_data/test_output.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# Merge the test reviews into the train/val users.
combined = combine_data(train_val_data, test_data)

# Write combined data to a new file
with open(r"D:\Master_Thesis/final_pipeline/new_data/beauty_combined_output.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=4)


In [7]:
import json
import torch
import logging
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import chromadb
import uuid

# ---------------------- Configuration ----------------------

#REDUCED_FILE = "data/Video_Games.reduced_300_users.json"   # The reduced file with 300 users
#META_FILE = "data/meta_Video_Games.jsonl"                  # The meta file for Video Games
#CHROMA_DB_PATH = "./chroma_db_video_games"                       # Path for ChromaDB
#CHROMA_COLLECTION_NAME = "video_games_product_embeddings_filtered"      # ChromaDB collection name
# Raw strings: "\M" is not a valid escape sequence, so a plain literal keeps
# the backslash only by accident and warns on newer Python versions.
REDUCED_FILE = r"D:\Master_Thesis/final_pipeline/new_data/beauty_combined_output.json"   # Combined Beauty user/review file
META_FILE = r"D:\Master_Thesis/final_pipeline/new_data/meta_All_Beauty.jsonl"            # The meta file for Beauty
CHROMA_DB_PATH = "./chroma_db_beauty"                       # Path for ChromaDB
CHROMA_COLLECTION_NAME = "beauty_product_embeddings_filtered"      # ChromaDB collection name
BATCH_SIZE = 32                                            # Batch size for embedding
MODEL_NAME = "hyp1231/blair-roberta-large"                 # Model name for embeddings

# ---------------------- Logging ----------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------------------- Device Setup ----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# ---------------------- Load Reduced Data ----------------------
try:
    logger.info(f"Loading reduced file: {REDUCED_FILE}")
    with open(REDUCED_FILE, 'r', encoding='utf-8') as f:
        reduced_data = json.load(f)
except FileNotFoundError as e:
    logger.error(f"File not found: {REDUCED_FILE}. Error: {e}")
    raise
except json.JSONDecodeError as e:
    logger.error(f"Error decoding JSON in file: {REDUCED_FILE}. Error: {e}")
    raise

# Extract all unique parent_asin from reduced_data
unique_items = set()
try:
    for user_entry in reduced_data:
        for review in user_entry.get("reviews", []):
            asin = review.get("parent_asin")
            # A review without parent_asin would otherwise put None into the
            # set and inflate the count logged below.
            if asin is not None:
                unique_items.add(asin)
    logger.info(f"Number of unique items in reduced data: {len(unique_items)}")
except (AttributeError, TypeError) as e:
    # .get() never raises KeyError; entries that are not dicts, or a
    # "reviews" value that is not iterable, fail with these instead.
    logger.error(f"Unexpected data format in reduced_data: {e}")
    raise

# ---------------------- Initialize Model ----------------------
try:
    logger.info(f"Loading model and tokenizer from {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(device)
    model.eval()  # inference only
    logger.info("Model loaded and set to evaluation mode.")
except Exception as e:
    logger.error(f"Error loading model: {MODEL_NAME}. Error: {e}")
    raise

def embed_texts(texts):
    """Embed a list of texts using the model and return the embeddings.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``logger``.

    Args:
        texts: List of strings to embed. Each text is truncated to at most
            512 tokens and the batch is padded to a common length.

    Returns:
        A list with one embedding (a list of floats) per input text, taken
        from position 0 of the last hidden state and scaled to unit L2 norm.
        An empty input yields an empty list.

    Raises:
        Exception: Any error from tokenization or the forward pass is logged
            and re-raised unchanged.
    """
    # Nothing to tokenize for an empty batch; return early rather than
    # handing the tokenizer an empty list.
    if not texts:
        return []
    try:
        inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Tensors must live on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, return_dict=True)
            embeddings = outputs.last_hidden_state[:, 0]  # CLS token
            embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # Normalize
        return embeddings.cpu().numpy().tolist()
    except Exception as e:
        logger.error(f"Error during text embedding: {e}")
        raise

# ---------------------- Read Meta File and Select Items ----------------------
try:
    logger.info(f"Reading meta file: {META_FILE}")
    selected_asins, selected_texts = [], []
    with open(META_FILE, 'r', encoding='utf-8') as f:
        for raw_line in tqdm(f, desc="Reading meta file"):
            line = raw_line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                logger.warning(f"Invalid JSON line skipped: {line}")
                continue

            # Keep only items referenced by the reduced review data.
            parent_asin = data.get("parent_asin", "")
            if parent_asin not in unique_items:
                continue

            # "description" may be a list of fragments or a single value.
            description_field = data.get("description", [])
            if isinstance(description_field, list):
                description = " ".join(description_field)
            else:
                description = str(description_field)

            # "details" may be a mapping; flatten it into "key: value" pairs.
            details_field = data.get("details", {})
            if isinstance(details_field, dict):
                details_str = " ".join(f"{k}: {v}" for k, v in details_field.items())
            else:
                details_str = str(details_field)

            # Join the non-empty parts into the document that gets embedded.
            text_parts = [data.get("title", ""), description, details_str]
            selected_asins.append(parent_asin)
            selected_texts.append(". ".join(part for part in text_parts if part))

    logger.info(f"Number of matched items in meta: {len(selected_asins)}")
except FileNotFoundError as e:
    logger.error(f"File not found: {META_FILE}. Error: {e}")
    raise
except Exception as e:
    logger.error(f"Unexpected error reading meta file: {META_FILE}. Error: {e}")
    raise

# ---------------------- Initialize ChromaDB ----------------------
try:
    logger.info("Initializing ChromaDB client.")
    chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    # get_or_create=True hands back the existing collection instead of
    # failing when the name is already taken.
    collection_chroma = chroma_client.create_collection(name=CHROMA_COLLECTION_NAME, get_or_create=True)
except Exception as err:
    logger.error(f"Error initializing ChromaDB collection: {err}")
    raise
else:
    logger.info(f"ChromaDB collection '{CHROMA_COLLECTION_NAME}' initialized.")


# ---------------------- Embedding and Storing Directly in ChromaDB ----------------------
if not selected_texts:
    logger.warning("No texts to embed. Exiting.")
else:
    total_texts = len(selected_texts)
    total_batches = (total_texts + BATCH_SIZE - 1) // BATCH_SIZE
    failed_batches = []   # batch numbers whose insert raised
    aborted = False       # True when embedding itself failed and the loop stopped
    logger.info(f"Embedding {total_texts} texts in batches of {BATCH_SIZE} and adding directly to ChromaDB...")

    try:
        for batch_number, start in enumerate(range(0, total_texts, BATCH_SIZE), start=1):
            batch_texts = selected_texts[start:start + BATCH_SIZE]
            batch_asins = selected_asins[start:start + BATCH_SIZE]

            embeddings_list = embed_texts(batch_texts)

            # Ids are random UUIDs, so running this cell again adds a second
            # copy of every item rather than replacing the first.
            ids = [str(uuid.uuid4()) for _ in batch_texts]
            metadatas = [{"parent_asin": asin} for asin in batch_asins]

            try:
                collection_chroma.add(
                    ids=ids,
                    documents=batch_texts,
                    embeddings=embeddings_list,
                    metadatas=metadatas
                )
            except Exception as e:
                # Best effort: remember the failure and carry on with the next batch.
                logger.error(f"Error inserting documents into ChromaDB: {e}")
                failed_batches.append(batch_number)
            else:
                logger.info(f"Inserted batch {batch_number} into ChromaDB.")
    except Exception as e:
        logger.error(f"Error during embedding or storing process: {e}")
        aborted = True

    # Report completion only when every batch was actually stored.
    if aborted or failed_batches:
        logger.warning(
            f"Embedding run finished with problems: aborted={aborted}, "
            f"failed batches={failed_batches} (of {total_batches})."
        )
    else:
        logger.info("Completed embedding and storing items directly into ChromaDB.")


INFO:__main__:Using device: cuda
INFO:__main__:Loading reduced file: D:\Master_Thesis/final_pipeline/new_data/beauty_combined_output.json
INFO:__main__:Number of unique items in reduced data: 356
INFO:__main__:Loading model and tokenizer from hyp1231/blair-roberta-large
INFO:__main__:Model loaded and set to evaluation mode.
INFO:__main__:Reading meta file: D:\Master_Thesis/final_pipeline/new_data/meta_All_Beauty.jsonl
Reading meta file: 112590it [00:01, 110164.02it/s]
INFO:__main__:Number of matched items in meta: 356
INFO:__main__:Initializing ChromaDB client.
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:__main__:ChromaDB collection 'beauty_product_embeddings_filtered' initialized.
INFO:__main__:Embedding 356 texts in batches of 32 and adding directly to ChromaDB...
INFO:__main__:Inserted batch 1 into ChromaDB.
INFO:__main__:Inserted batch 2 into ChromaDB.
INFO:__main__:Ins

In [1]:
# ChromaDB_Test.ipynb

# Import dependencies
import logging
from retrieval import initialize_chromadb, collect_results_per_product  # From your updated retrieval.py
from config import DATABASES  # Import the configurations

# Initialize logging
# Logging setup for this notebook cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Inputs for the retrieval smoke test.
test_product_names = ["The Last of Us", "Call of Duty", "Minecraft"]  # sample video game product names
user_history = ["existing_asin_1", "existing_asin_2"]  # sample user history passed to the retrieval call
max_products = 20  # upper bound on returned results

# Step 1: Test Video Games Collection
print("=== Testing Video Games Collection ===")

try:
    # Open the video games collection, then run retrieval against it.
    video_games_collection = initialize_chromadb("video_games")
    video_games_results = collect_results_per_product(
        product_names=test_product_names,
        collection=video_games_collection,
        user_history=user_history,
        max_products=max_products,
    )

    print("\nVideo Games Results:")
    # The code below treats -1 as the "nothing found" return value.
    if video_games_results != -1:
        for doc_text, doc_distance, doc_metadata in video_games_results:
            print(f"Document: {doc_text}")
            print(f"Distance: {doc_distance}")
            print(f"Metadata: {doc_metadata}\n")
    else:
        print("No results found for Video Games collection.")

except Exception as err:
    print(f"Error during Video Games collection test: {err}")



  from .autonotebook import tqdm as notebook_tqdm


=== Testing Video Games Collection ===


INFO:retrieval:Initialized model for database type: blair
INFO:retrieval:Initializing ChromaDB client for collection: video_games
INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO:retrieval:Using existing ChromaDB collection: video_games_product_embeddings_filtered


Cleaned product names: ['The Last of Us', 'Call of Duty', 'Minecraft']
Processing product name: 'The Last of Us'
Computed embedding for 'The Last of Us'
Query results for 'The Last of Us': 20 items retrieved.
Stored 20 results for 'The Last of Us'
Processing product name: 'Call of Duty'
Computed embedding for 'Call of Duty'
Query results for 'Call of Duty': 20 items retrieved.
Stored 20 results for 'Call of Duty'
Processing product name: 'Minecraft'
Computed embedding for 'Minecraft'
Query results for 'Minecraft': 20 items retrieved.
Stored 20 results for 'Minecraft'
Collecting the best item from each product name.
Added best document 'BEAVIIOO Wireless Gaming Headset with Microphone for PC PS4 PS5 Playstation 4 5, 2.4G Wireless Bluetooth USB Gamer Headphones with Mic for Laptop Computer. Average Battery Life (in hours): 5 	years Brand: BEAVIIOO Series: 2.4G Wireless Gaming Headset Item model number: HW01 Hardware Platform: PC, Gaming Console, Super Nintendo, Mac Item Weight: 15.5 ounc

# MPNET

In [5]:
import json
import torch
import logging
from tqdm import tqdm
from transformers import AutoModel, AutoTokenizer
import chromadb
import uuid
import sys

# Add the parent directory to Python path
sys.path.append("D:/Master_Thesis/final_pipeline")
from config import DATASET_CONFIGS_MPNET

# ---------------------- Configuration ----------------------

# Dataset selection
dataset = "video_games"  # One of the DATA_PATHS keys: "beauty" or "video_games"

# Dataset-specific file paths
DATA_PATHS = {
    'beauty': {
        'reduced_file': "new_data/beauty_combined_output.json",
        'meta_file': "new_data/meta_All_Beauty.jsonl",
    },
    'video_games': {
        'reduced_file': "new_data/Video_Games.reduced_300_users.json",
        'meta_file': "new_data/meta_Video_Games.jsonl",
    }
}

# Get configuration from config file
db_config = DATASET_CONFIGS_MPNET[dataset]
REDUCED_FILE = DATA_PATHS[dataset]['reduced_file']
META_FILE = DATA_PATHS[dataset]['meta_file']
CHROMA_DB_PATH = db_config['db_path']
CHROMA_COLLECTION_NAME = db_config['collection_name']

# Model configuration
BATCH_SIZE = 32
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2"

# ---------------------- Logging ----------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------------------- Device Setup ----------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")
logger.info(f"Processing dataset: {dataset}")

# ---------------------- Load Reduced Data ----------------------
try:
    logger.info(f"Loading reduced file: {REDUCED_FILE}")
    with open(REDUCED_FILE, 'r', encoding='utf-8') as f:
        reduced_data = json.load(f)
except FileNotFoundError as e:
    logger.error(f"File not found: {REDUCED_FILE}. Error: {e}")
    raise
except json.JSONDecodeError as e:
    logger.error(f"Error decoding JSON in file: {REDUCED_FILE}. Error: {e}")
    raise

# Extract all unique parent_asin from reduced_data
unique_items = set()
try:
    for user_entry in reduced_data:
        for review in user_entry.get("reviews", []):
            asin = review.get("parent_asin")
            # A review without parent_asin would otherwise put None into the
            # set and inflate the count logged below.
            if asin is not None:
                unique_items.add(asin)
    logger.info(f"Number of unique items in reduced data: {len(unique_items)}")
except (AttributeError, TypeError) as e:
    # .get() never raises KeyError; entries that are not dicts, or a
    # "reviews" value that is not iterable, fail with these instead.
    logger.error(f"Unexpected data format in reduced_data: {e}")
    raise

# ---------------------- Initialize Model ----------------------
try:
    logger.info(f"Loading model and tokenizer from {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModel.from_pretrained(MODEL_NAME).to(device)
    model.eval()  # inference only
    logger.info("Model loaded and set to evaluation mode.")
except Exception as e:
    logger.error(f"Error loading model: {MODEL_NAME}. Error: {e}")
    raise

def embed_texts(texts):
    """Embed a list of texts using the model and return the embeddings.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``logger``.

    Args:
        texts: List of strings to embed. Each text is truncated to at most
            512 tokens and the batch is padded to a common length.

    Returns:
        A list with one embedding (a list of floats) per input text, taken
        from position 0 of the last hidden state and scaled to unit L2 norm.
        An empty input yields an empty list.

    Raises:
        Exception: Any error from tokenization or the forward pass is logged
            and re-raised unchanged.
    """
    # Nothing to tokenize for an empty batch; return early rather than
    # handing the tokenizer an empty list.
    if not texts:
        return []
    try:
        inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # Tensors must live on the same device as the model.
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs, return_dict=True)
            # NOTE(review): in this cell MODEL_NAME is
            # sentence-transformers/all-mpnet-base-v2, whose model card shows
            # attention-mask-weighted mean pooling; position-0 pooling may
            # give weaker vectors. Any change here needs the query-side
            # embedding to change in lockstep and the collection to be
            # rebuilt — confirm before switching.
            embeddings = outputs.last_hidden_state[:, 0]  # CLS token
            embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)  # Normalize
        return embeddings.cpu().numpy().tolist()
    except Exception as e:
        logger.error(f"Error during text embedding: {e}")
        raise

# ---------------------- Read Meta File and Select Items ----------------------
try:
    logger.info(f"Reading meta file: {META_FILE}")
    selected_asins, selected_texts = [], []
    with open(META_FILE, 'r', encoding='utf-8') as f:
        for raw_line in tqdm(f, desc="Reading meta file"):
            line = raw_line.strip()
            if not line:
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                logger.warning(f"Invalid JSON line skipped: {line}")
                continue

            # Keep only items referenced by the reduced review data.
            parent_asin = data.get("parent_asin", "")
            if parent_asin not in unique_items:
                continue

            # "description" may be a list of fragments or a single value.
            description_field = data.get("description", [])
            if isinstance(description_field, list):
                description = " ".join(description_field)
            else:
                description = str(description_field)

            # "details" may be a mapping; flatten it into "key: value" pairs.
            details_field = data.get("details", {})
            if isinstance(details_field, dict):
                details_str = " ".join(f"{k}: {v}" for k, v in details_field.items())
            else:
                details_str = str(details_field)

            # Join the non-empty parts into the document that gets embedded.
            text_parts = [data.get("title", ""), description, details_str]
            selected_asins.append(parent_asin)
            selected_texts.append(". ".join(part for part in text_parts if part))

    logger.info(f"Number of matched items in meta: {len(selected_asins)}")
except FileNotFoundError as e:
    logger.error(f"File not found: {META_FILE}. Error: {e}")
    raise
except Exception as e:
    logger.error(f"Unexpected error reading meta file: {META_FILE}. Error: {e}")
    raise

# ---------------------- Initialize ChromaDB ----------------------
try:
    logger.info("Initializing ChromaDB client.")
    chroma_client = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    # get_or_create=True is passed so an existing collection with this name
    # is reused by the call below.
    collection_chroma = chroma_client.create_collection(name=CHROMA_COLLECTION_NAME, get_or_create=True)
except Exception as err:
    logger.error(f"Error initializing ChromaDB collection: {err}")
    raise
else:
    logger.info(f"ChromaDB collection '{CHROMA_COLLECTION_NAME}' initialized.")

# ---------------------- Embedding and Storing Directly in ChromaDB ----------------------
if not selected_texts:
    logger.warning("No texts to embed. Exiting.")
else:
    total_texts = len(selected_texts)
    total_batches = (total_texts + BATCH_SIZE - 1) // BATCH_SIZE
    failed_batches = []   # batch numbers whose insert raised
    aborted = False       # True when embedding itself failed and the loop stopped
    logger.info(f"Embedding {total_texts} texts in batches of {BATCH_SIZE} and adding directly to ChromaDB...")

    try:
        for batch_number, start in enumerate(range(0, total_texts, BATCH_SIZE), start=1):
            batch_texts = selected_texts[start:start + BATCH_SIZE]
            batch_asins = selected_asins[start:start + BATCH_SIZE]

            embeddings_list = embed_texts(batch_texts)

            # Ids are random UUIDs, so running this cell again adds a second
            # copy of every item rather than replacing the first.
            ids = [str(uuid.uuid4()) for _ in batch_texts]
            metadatas = [{"parent_asin": asin} for asin in batch_asins]

            try:
                collection_chroma.add(
                    ids=ids,
                    documents=batch_texts,
                    embeddings=embeddings_list,
                    metadatas=metadatas
                )
            except Exception as e:
                # Best effort: remember the failure and carry on with the next batch.
                logger.error(f"Error inserting documents into ChromaDB: {e}")
                failed_batches.append(batch_number)
            else:
                logger.info(f"Inserted batch {batch_number} into ChromaDB.")
    except Exception as e:
        logger.error(f"Error during embedding or storing process: {e}")
        aborted = True

    # Report completion only when every batch was actually stored.
    if aborted or failed_batches:
        logger.warning(
            f"Embedding run finished with problems: aborted={aborted}, "
            f"failed batches={failed_batches} (of {total_batches})."
        )
    else:
        logger.info("Completed embedding and storing items directly into ChromaDB.")

INFO:__main__:Using device: cuda
INFO:__main__:Processing dataset: video_games
INFO:__main__:Loading reduced file: new_data/Video_Games.reduced_300_users.json
INFO:__main__:Number of unique items in reduced data: 2516
INFO:__main__:Loading model and tokenizer from sentence-transformers/all-mpnet-base-v2
INFO:__main__:Model loaded and set to evaluation mode.
INFO:__main__:Reading meta file: new_data/meta_Video_Games.jsonl
Reading meta file: 137269it [00:01, 70181.43it/s]
INFO:__main__:Number of matched items in meta: 2516
INFO:__main__:Initializing ChromaDB client.
INFO:__main__:ChromaDB collection 'video_games_product_embeddings_mpnet' initialized.
INFO:__main__:Embedding 2516 texts in batches of 32 and adding directly to ChromaDB...
INFO:__main__:Inserted batch 1 into ChromaDB.
INFO:__main__:Inserted batch 2 into ChromaDB.
INFO:__main__:Inserted batch 3 into ChromaDB.
INFO:__main__:Inserted batch 4 into ChromaDB.
INFO:__main__:Inserted batch 5 into ChromaDB.
INFO:__main__:Inserted bat

In [6]:
import logging
import chromadb
from config import DATASET_CONFIGS_MPNET
from transformers import AutoModel, AutoTokenizer
import torch

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def initialize_model(model_name="sentence-transformers/all-mpnet-base-v2"):
    """
    Initialize the model and tokenizer used for generating query embeddings.

    Query embeddings must come from the same model that produced the stored
    embeddings. The default is the MPNet checkpoint; pass another identifier
    to query a collection built with a different model.

    Args:
        model_name (str): Model identifier handed to ``from_pretrained`` for
            both the tokenizer and the model.

    Returns:
        tuple: ``(model, tokenizer, device)``. The model is moved to
        ``device`` (CUDA when available, otherwise CPU) and put in
        evaluation mode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()  # inference only

    return model, tokenizer, device

def generate_embedding(text, model, tokenizer, device):
    """
    Embed one query text and return the vector as a list of floats.

    The text is tokenized (truncated to 512 tokens), run through ``model``
    on ``device`` without gradients, and the hidden state at position 0 of
    the last layer is scaled to unit L2 norm.

    NOTE(review): stored embeddings must have been produced with the same
    pooling and normalization for distances to be meaningful — confirm
    against the indexing code.
    """
    encoded = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
    on_device = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        model_output = model(**on_device, return_dict=True)
        first_token = model_output.last_hidden_state[:, 0]  # CLS token
        unit_vectors = first_token / first_token.norm(dim=1, keepdim=True)  # Normalize

    # A single text yields a batch of one; return that one vector.
    as_lists = unit_vectors.cpu().numpy().tolist()
    return as_lists[0]

def test_chromadb_collections():
    """
    Test function to check the contents and query functionality of ChromaDB collections.

    For every dataset in ``DATASET_CONFIGS_MPNET`` this opens the configured
    collection, prints its item count and up to two sample items, then runs
    the predefined test queries for that dataset and prints the top three
    matches. A dataset with no predefined queries is still inspected; only
    the query step is skipped.

    Raises:
        Exception: Any error while testing a collection is logged and
            re-raised, which stops the remaining datasets from being tested.
    """
    # Initialize the model for querying
    model, tokenizer, device = initialize_model()

    # Test queries for each dataset
    test_queries = {
        "beauty": [
            "moisturizing face cream for dry skin",
            "natural organic shampoo",
            "anti-aging serum with vitamin C"
        ],
        "video_games": [
            "action adventure game with great story",
            "multiplayer strategy game",
            "role playing game with open world"
        ]
    }

    for dataset, config in DATASET_CONFIGS_MPNET.items():
        print(f"\n{'='*20} Testing {dataset.upper()} Collection {'='*20}")

        try:
            # Initialize ChromaDB client
            client = chromadb.PersistentClient(path=config['db_path'])
            collection = client.get_collection(name=config['collection_name'])

            # Print collection info
            collection_count = collection.count()
            print(f"\nTotal items in collection: {collection_count}")

            # Show sample items
            if collection_count > 0:
                print("\nSample of stored items:")
                sample = collection.peek(limit=2)
                # item_id rather than id, so the builtin is not shadowed.
                for item_number, (item_id, metadata, document) in enumerate(zip(
                    sample['ids'],
                    sample['metadatas'],
                    sample['documents']
                ), start=1):
                    print(f"\nItem {item_number}:")
                    print(f"ID: {item_id}")
                    print(f"Metadata: {metadata}")
                    print(f"Document preview: {document[:150]}...")

                # Perform test queries
                print(f"\nPerforming test queries for {dataset}:")
                # .get(): a configured dataset without predefined queries
                # should not abort the run with a KeyError.
                dataset_queries = test_queries.get(dataset, [])
                if not dataset_queries:
                    print("No test queries defined for this dataset; skipping queries.")
                for query in dataset_queries:
                    print(f"\nQuery: '{query}'")

                    # Generate embedding for the query
                    query_embedding = generate_embedding(query, model, tokenizer, device)

                    # Query the collection
                    results = collection.query(
                        query_embeddings=[query_embedding],
                        n_results=3,
                        include=["documents", "metadatas", "distances"]
                    )

                    # Display results; index [0] selects the results of the
                    # single query embedding sent above.
                    print("\nTop 3 matches:")
                    for i in range(len(results['distances'][0])):
                        print(f"\nMatch {i+1}:")
                        print(f"Distance: {results['distances'][0][i]:.4f}")
                        print(f"Metadata: {results['metadatas'][0][i]}")
                        print(f"Document preview: {results['documents'][0][i][:150]}...")

        except Exception as e:
            logger.error(f"Error while testing {dataset} collection: {str(e)}")
            raise

if __name__ == "__main__":
    print("Testing ChromaDB Collections with MPNet Embeddings and Queries")
    test_chromadb_collections()

Testing ChromaDB Collections with MPNet Embeddings and Queries


Total items in collection: 2516

Sample of stored items:

Item 1:
ID: f30dc03b-e8cd-4a07-adc2-5a1e058d6be5
Metadata: {'parent_asin': 'B00Z9TLVK0'}
Document preview: NBA 2K17 - Early Tip Off Edition - PlayStation 4. Following the record-breaking launch of NBA 2K16, the NBA 2K franchise continues to stake its claim ...

Item 2:
ID: d8745da1-6b86-40ed-85cc-043cfb6f4df5
Metadata: {'parent_asin': 'B00BJH85SW'}
Document preview: Turbo: Super Stunt Squad - Nintendo 3DS. Product Description Turbo: Super Stunt Squad is a high-velocity action racing game featuring the super-charge...

Performing test queries for video_games:

Query: 'action adventure game with great story'

Top 3 matches:

Match 1:
Distance: 1.1474
Metadata: {'parent_asin': 'B0812PG2ZH'}
Document preview: New Super Lucky's Tale - Nintendo Switch. Jump, burrow, and tail swipe your way to victory in this love letter to classic 3D platformers!Join Lucky on...

Match 2