In [18]:
import chromadb

# Initialize a Chroma client
client = chromadb.Client()  # Or, for a persistent client: chromadb.PersistentClient(path="./chroma_db")

# Specify the name of the collection you want to delete
collection_name = "peer_group_embeddings"  # Replace with the actual name

try:
    # Delete the collection
    client.delete_collection(name=collection_name)
    print(f"Collection '{collection_name}' deleted successfully.")
except ValueError as e:
    print(f"Error: Collection '{collection_name}' not found or could not be deleted.  Error Details: {e}")
except Exception as e:
    print(f"An unexpected error occurred while deleting collection '{collection_name}': {e}")



Collection 'peer_group_embeddings' deleted successfully.


In [19]:
import chromadb
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

# 1.  Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")
if not os.environ["OPENAI_API_KEY"]:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

# 2. Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings()

# 3. Chroma Setup
# --------------
chroma_client = chromadb.Client()
collection_name = "peer_group_embeddings"  # Choose a consistent collection name

if collection_name in chroma_client.list_collections():
    chroma_client.delete_collection(name=collection_name)
collection = chroma_client.create_collection(name=collection_name)

def create_chroma_collection(peer_group_definitions: pd.DataFrame, batch_size=2000):
    """
    Creates a Chroma collection and adds peer group descriptions in batches using OpenAI embeddings.

    Args:
        peer_group_definitions (pd.DataFrame): DataFrame containing peer group definitions.
        batch_size (int): The maximum number of items to add in each batch. Adjust as needed.
    """
    documents = []
    metadatas = []
    ids = []
    for _, row in peer_group_definitions.iterrows():
        documents.append(row['description'])
        metadatas.append({'peer_group_id': row['peer_group_id']})
        ids.append(row['peer_group_id'])  # Use peer_group_id as the ID

    # Process data in batches
    for i in range(0, len(documents), batch_size):
        batch_documents = documents[i:i + batch_size]
        batch_metadatas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]

        try:
            collection.add(
                documents=batch_documents,
                embeddings=embeddings.embed_documents(batch_documents),  # Embed each batch
                metadatas=batch_metadatas,
                ids=batch_ids
            )
            print(f"Added batch {i // batch_size + 1} to Chroma.")
        except Exception as e:
            print(f"Error adding batch {i // batch_size + 1} to Chroma: {e}")

# Load your peer group definitions
peer_group_definitions = pd.read_csv('/Users/vivekaradshan/Documents/AI peer benchmarking/ai-peer-benchmarching/peer_group_definitions.csv')

# Create the Chroma collection with batch processing
chroma_collection = create_chroma_collection(peer_group_definitions, batch_size=2000)  # Adjust batch_size if needed
print(f"Chroma collection '{collection_name}' created with embedding dimension: {embeddings.embedding_ctx_length}")

Error adding batch 1 to Chroma: Connection error.
Error adding batch 2 to Chroma: Connection error.
Error adding batch 3 to Chroma: Connection error.
Error adding batch 4 to Chroma: Connection error.
Chroma collection 'peer_group_embeddings' created with embedding dimension: 8191


In [57]:
pip install langchain_openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain_openai
  Downloading langchain_openai-0.3.17-py3-none-any.whl.metadata (2.3 kB)
Downloading langchain_openai-0.3.17-py3-none-any.whl (62 kB)
Installing collected packages: langchain_openai
Successfully installed langchain_openai-0.3.17
Note: you may need to restart the kernel to use updated packages.


In [70]:
import pandas as pd
from langchain_neo4j import Neo4jGraph
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
import os
# Import ChatOpenAI instead of OpenAI
from langchain_openai import ChatOpenAI # Changed import for the LLM itself
from dotenv import load_dotenv
import chromadb

load_dotenv()

# --- Configuration ---
# Assuming you have client_data_with_peers and peer_group_definitions DataFrames

# 1. Connect to Neo4j
# ----------------------
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USER = os.environ["NEO4J_USER"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# 2. Connect to Chroma
# ----------------------
embeddings = OpenAIEmbeddings()
chroma_client = chromadb.Client()
collection = chroma_client.get_collection("peer_group_embeddings")  # Get the existing collection

# 3. LLM Setup (OpenAI GPT-4)
# -----------------------------
# * You **MUST** set the OPENAI_API_KEY environment variable.
openai_api_key = os.environ.get("OPENAI_API_KEY")  # Get API key from environment
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set. You need to set your OpenAI API key.")

# Use ChatOpenAI for models like gpt-4
llm = ChatOpenAI(model_name="gpt-4", openai_api_key=openai_api_key, temperature=0.7) # Added temperature for creativity
# For older LLMChain compatibility, pass as a BaseLanguageModel
# or directly use newer LangChain Runnable interfaces if comfortable.

# 4. Prompt Template
# ------------------
prompt_template = """
You are a personal finance advisor. Use the information below to compare the client's financial situation to their peer group.
Query: {query}
Client Data: {client_data}
Peer Group Data: {peer_group_data}
Knowledge Graph Data: {kg_data}
Response:
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["query", "client_data", "peer_group_data", "kg_data"])
llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=True)


def get_peer_group_comparison_rag(query: str, client_id: str, client_data_df: pd.DataFrame):
    """
    Retrieves peer group data and generates a comparison using Graph RAG and OpenAI GPT-4.

    Args:
        query: The user's query.
        client_id: The ID of the client to compare.
        client_data_df (pd.DataFrame): The client data.

    Returns:
        str: The LLM-generated comparison text.
    """

    # 1. Retrieve client data from client_data_df
    client_row = client_data_df[client_data_df['client_id'] == client_id].iloc[0].to_dict()

    client_data = (
        f"Age: {client_row['age']},"
        f" Monthly Income: {client_row['monthly_income']},"
        f" Location: {client_row['location']},"
        f" Profession: {client_row['profession']},"
        f" Financial Goal: {client_row['financial_goal']}"
    )
    print(f"Client Data: {client_data}")

    # 2. Vector Store Query (Chroma) - Incorporating Client Context
    # ------------------------------------------------------------
    # Embed the user's query *along with* relevant client information
    contextual_query = (
        f"{query} For a client with age {client_row['age']},"
        f" monthly income {client_row['monthly_income']},"
        f" living in {client_row['location']}"
        f" and belongs to peer group {client_row['peer_group_id']}."
    )
    query_embedding = embeddings.embed_query(contextual_query)
    results = collection.query(query_embeddings=[query_embedding], n_results=3)

    if not results['ids'] or not results['ids'][0]:
        return "I'm sorry, I couldn't find any relevant peer groups."

    relevant_peer_group_ids = results['ids'][0]
    print(f"Found relevant peer group IDs: {relevant_peer_group_ids}")

    peer_group_data_list = []
    for pg_id in relevant_peer_group_ids:
        # Debugging: Print the peer_group_id being queried in Neo4j
        print(f"Attempting to query Neo4j for peer_group_id: '{pg_id}'")

        # Fetch detailed information about each relevant peer group from the KG.
        cypher_query = f"""
            MATCH (pg:PeerGroup {{peer_group_id: '{pg_id}'}})
            OPTIONAL MATCH (c:Client)-[:BELONGS_TO]->(pg)
            RETURN pg.description AS peer_group_description,
                   avg(c.age) AS avg_age,
                   avg(COALESCE(c.income, 0)) AS avg_monthly_income,
                   count(c) AS member_count
        """
        try:
            kg_results = graph.query(cypher_query)
            if kg_results and kg_results[0]['peer_group_description'] is not None:
                # Handle None values for averages/counts if no clients are matched
                print(f"Average age: {kg_results[0].get('avg_age')}, "
                      f"Average monthly income: {kg_results[0].get('avg_monthly_income')}, "
                      f"Member count: {kg_results[0].get('member_count')}")
                result_row = kg_results[0]
                result_row['avg_age'] = result_row.get('avg_age', 0) if result_row.get('avg_age') is not None else 0
                result_row['avg_monthly_income'] = result_row.get('avg_monthly_income', 0) if result_row.get('avg_monthly_income') is not None else 0
                result_row['member_count'] = result_row.get('member_count', 0) if result_row.get('member_count') is not None else 0

                peer_group_data_list.append(result_row)
                print(f"Successfully retrieved data for '{pg_id}'.")
            else:
                print(f"Neo4j query returned no data for peer_group_id '{pg_id}' or peer group description is None.")
        except Exception as e:
            print(f"Error executing Cypher query for peer_group_id '{pg_id}': {e}")


    if not peer_group_data_list:
        print("After KG queries, peer_group_data_list is empty.")
        return "I found similar peer groups, but I couldn't retrieve detailed information from our knowledge graph, potentially because those groups have no associated clients or the data structure is different."

    peer_group_data_str = "\n".join([
        f"Peer Group: {pg['peer_group_description']}, Avg Age: {pg['avg_age']:.0f}, Avg Monthly Income: ${pg['avg_monthly_income']:.2f}, Member Count: {int(pg['member_count'])}"
        for pg in peer_group_data_list
    ])
    kg_data = f"Relevant Peer Group Information:\n{peer_group_data_str}"

    # 4. Generate the LLM Response
    response = llm_chain.run(query=query, client_data=client_data, peer_group_data=peer_group_data_str, kg_data=kg_data)
    return response


# --- Example Usage ---
# Make sure client_data_with_peers.csv exists and is correctly populated
sample_client_id = "client_0369" # Or pick one that exists and has a peer group
sample_query = "How do my finances compare to others with similar income and age? Can you provide how much higher or lower my income is compared to my peers?"
try:
    client_data_with_peers = pd.read_csv('/Users/vivekaradshan/Documents/AI peer benchmarking/ai-peer-benchmarching/client_data_with_peers.csv')
except FileNotFoundError:
    print("Error: client_data_with_peers.csv not found. Please ensure the path is correct.")
    # Exit or handle gracefully if data isn't found
    exit()

# Ensure the client_data_with_peers DataFrame has 'monthly_income' column and
# the Neo4j client nodes have a 'monthly_income' property if you're querying for it.
if 'monthly_income' not in client_data_with_peers.columns:
    raise ValueError("DataFrame 'client_data_with_peers' must contain 'monthly_income' column.")


comparison_text = get_peer_group_comparison_rag(sample_query, sample_client_id, client_data_with_peers)
print("\n--- Comparison Text (GPT-4) ---")
print(comparison_text)

Client Data: Age: 41, Monthly Income: 95200, Location: Garrettland, WA, Profession: Lawyer, Financial Goal: Retirement
Found relevant peer group IDs: ['PG_1220', 'PG_630', 'PG_622']
Attempting to query Neo4j for peer_group_id: 'PG_1220'
Average age: None, Average monthly income: 0.0, Member count: 0
Successfully retrieved data for 'PG_1220'.
Attempting to query Neo4j for peer_group_id: 'PG_630'
Average age: None, Average monthly income: 0.0, Member count: 0
Successfully retrieved data for 'PG_630'.
Attempting to query Neo4j for peer_group_id: 'PG_622'
Average age: 42.0909090909091, Average monthly income: 71863.63636363635, Member count: 11
Successfully retrieved data for 'PG_622'.


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are a personal finance advisor. Use the information below to compare the client's financial situation to their peer group.
Query: How do my finances compare to others with similar income and age? Can you provide how much hi

In [None]:
import pandas as pd
from langchain_neo4j import Neo4jGraph
import os
from dotenv import load_dotenv

load_dotenv()

# --- Neo4j Connection ---
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USER = os.environ["NEO4J_USER"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

def debug_neo4j_peer_group_query(peer_group_id: str):
    """
    Executes the Neo4j query for a given peer group ID and prints the raw results
    to help debug why averages might be zero/None.

    Args:
        peer_group_id (str): The peer_group_id to query in Neo4j.
    """
    print(f"\n--- DEBUGGING NEO4J QUERY FOR PEER_GROUP_ID: '{peer_group_id}' ---")

    # Step 1: Verify the PeerGroup node exists and its description
    query_peer_group_existence = f"""
        MATCH (pg:PeerGroup {{peer_group_id: '{peer_group_id}'}})
        RETURN pg.description AS description, pg.peer_group_id AS id
    """
    print("\n[Step 1] Checking PeerGroup node existence:")
    try:
        results_pg = graph.query(query_peer_group_existence)
        if results_pg:
            print(f"  PeerGroup found: {results_pg[0]}")
        else:
            print(f"  ERROR: PeerGroup node with peer_group_id='{peer_group_id}' NOT FOUND in Neo4j.")
            print("  This means the ID from Chroma doesn't match an actual PeerGroup node in your KG.")
            return # Stop debugging for this ID if the node isn't found
    except Exception as e:
        print(f"  ERROR during PeerGroup existence check: {e}")
        return

    # Step 2: Query for Clients belonging to this PeerGroup and their monthly_income
    # Corrected: Removed type((c)-[:BELONGS_TO]->(pg)) directly from RETURN
    query_clients_and_income = f"""
        MATCH (pg:PeerGroup {{peer_group_id: '{peer_group_id}'}})
        OPTIONAL MATCH (c:Client)-[rel:BELONGS_TO]->(pg) // Named the relationship 'rel'
        RETURN
            pg.description AS peer_group_description,
            c.client_id AS client_id,
            c.age AS client_age,
            c.monthly_income AS client_monthly_income,
            c.income AS client_raw_income,
            type(rel) as relationship_type // Now it should work on the specific 'rel'
    """
    print("\n[Step 2] Querying Clients linked to this PeerGroup:")
    try:
        results_clients = graph.query(query_clients_and_income)
        if results_clients:
            print(f"  Found {len(results_clients)} rows (clients or peer_group_only if no client):")
            found_actual_client = False
            for row in results_clients:
                # Check if client_id is not None, indicating an actual client was matched
                if row['client_id'] is not None:
                    found_actual_client = True
                    print(f"    Client ID: {row['client_id']}, Monthly Income: {row['client_monthly_income']}, type of income {type(row['client_monthly_income'])} Raw Income: {row['client_raw_income']}, Rel Type: {row['relationship_type']}")
                else:
                    print(f"    PeerGroup '{peer_group_id}' exists, but no client found for this row (might be a null row from OPTIONAL MATCH).")

            if not found_actual_client:
                 print("  WARNING: PeerGroup exists, but no Clients were found connected with a BELONGS_TO relationship, or the relationship is missing/broken.")
                 print("  This would cause AVG() to return NULL/0 for this peer group.")
            else:
                print("  Clients successfully retrieved. Check their monthly_income values above (look for None or 0).")

        else:
            print(f"  No client relationship data returned at all for peer_group_id='{peer_group_id}'.")
            print("  This likely means the peer group exists but has no clients connected.")
    except Exception as e:
        print(f"  ERROR during client query: {e}")
        # Print the full query to debug it directly in Neo4j Browser if needed
        print(f"  Cypher query that caused error:\n{query_clients_and_income}")
        return


    # Step 3: Run the original aggregation query for direct inspection
    original_cypher_query = f"""
        MATCH (pg:PeerGroup {{peer_group_id: '{peer_group_id}'}})
        OPTIONAL MATCH (c:Client)-[:BELONGS_TO]->(pg)
        RETURN pg.description AS peer_group_description,
               avg(c.age) AS avg_age,
               avg(COALESCE(c.monthly_income, 0)) AS avg_monthly_income,
               count(c) AS member_count
    """
    print("\n[Step 3] Running the original aggregation query:")
    try:
        final_results = graph.query(original_cypher_query)
        if final_results:
            print(f"  Aggregation results: {final_results[0]}")
            # Ensure proper handling of 'None' for formatting
            avg_age = final_results[0].get('avg_age')
            avg_monthly_income = final_results[0].get('avg_monthly_income')
            member_count = final_results[0].get('member_count')

            print(f"  Description: {final_results[0].get('peer_group_description')}")
            print(f"  Avg Age: {avg_age if avg_age is not None else 'N/A'}")
            print(f"  Avg Monthly Income (COALESCED): {avg_monthly_income if avg_monthly_income is not None else 'N/A'}")
            print(f"  Member Count: {member_count if member_count is not None else 'N/A'}")


            if final_results[0].get('avg_monthly_income') == 0:
                print("  avg_monthly_income is 0, confirming the issue. This means all matched clients had 0 or NULL monthly_income OR no clients were linked.")
        else:
            print("  Aggregation query returned no results.")
    except Exception as e:
        print(f"  ERROR during aggregation query: {e}")
        print(f"  Cypher query that caused error:\n{original_cypher_query}")


# --- Example Usage to Debug ---
# You need a peer_group_id that Chroma is returning.
# From your last output, examples were: 'PG_1190', 'PG_1182', 'PG_1312'
# Let's try one of them:
sample_peer_group_id_to_debug = "PG_622" # Replace with an actual ID from your Chroma results that causes the issue

# Run the debugger
debug_neo4j_peer_group_query(sample_peer_group_id_to_debug)

# You can run it for other IDs too:
# debug_neo4j_peer_group_query("PG_1182")




--- DEBUGGING NEO4J QUERY FOR PEER_GROUP_ID: 'PG_622' ---

[Step 1] Checking PeerGroup node existence:
  PeerGroup found: {'description': 'Income: $8,759 - $144,500, Age: 33 - 48, State: WA, Goal: Retirement', 'id': 'PG_622'}

[Step 2] Querying Clients linked to this PeerGroup:




  Found 11 rows (clients or peer_group_only if no client):
    Client ID: client_0369, Monthly Income: None, Raw Income: 95200, Rel Type: BELONGS_TO
    Client ID: client_0475, Monthly Income: None, Raw Income: 65200, Rel Type: BELONGS_TO
    Client ID: client_0985, Monthly Income: None, Raw Income: 95500, Rel Type: BELONGS_TO
    Client ID: client_1268, Monthly Income: None, Raw Income: 46700, Rel Type: BELONGS_TO
    Client ID: client_1390, Monthly Income: None, Raw Income: 107300, Rel Type: BELONGS_TO
    Client ID: client_1511, Monthly Income: None, Raw Income: 49900, Rel Type: BELONGS_TO
    Client ID: client_1682, Monthly Income: None, Raw Income: 21700, Rel Type: BELONGS_TO
    Client ID: client_1931, Monthly Income: None, Raw Income: 50900, Rel Type: BELONGS_TO
    Client ID: client_2011, Monthly Income: None, Raw Income: 114600, Rel Type: BELONGS_TO
    Client ID: client_2130, Monthly Income: None, Raw Income: 26500, Rel Type: BELONGS_TO
    Client ID: client_3962, Monthly Inc

In [48]:
client_data_df = pd.read_csv('/Users/vivekaradshan/Documents/AI peer benchmarking/ai-peer-benchmarching/client_data_with_peers.csv')
client_row = client_data_df[client_data_df['client_id'] == "client_0001"].iloc[0].to_dict()
client_data = f"Age: {client_row['age']}, Income: {client_row['monthly_income']}, Location: {client_row['location']}, Profession: {client_row['profession']}, Financial Goal: {client_row['financial_goal']}"
print(f"Client Data: {client_data}")

Client Data: Age: 50, Income: 40400, Location: Bellburgh, HI, Profession: Data Scientist, Financial Goal: Debt Repayment


In [26]:
client_data_df

Unnamed: 0,client_id,age,gender,location,income,income_frequency,monthly_income,profession,financial_goal,loan_type,...,Savings,Housing,Food,Transportation,Utilities,Healthcare,Debt Payments,Entertainment,Miscellaneous,peer_group_id
0,client_0001,50,Male,"Bellburgh, HI",40400,Monthly,40400,Data Scientist,Debt Repayment,,...,10139.02,11259.92,4039.07,4663.94,1445.54,1771.45,6799.47,281.58,0.00,PG_1181
1,client_0002,40,Female,"Smithport, CO",167700,Annually,14000,Customer Service Representative,Retirement,,...,2565.23,4430.57,2046.58,1388.33,864.55,1363.58,1341.17,0.00,0.00,PG_2372
2,client_0003,46,Male,"East Robert, DC",81700,Monthly,81700,Electrician,Retirement,,...,14468.19,21234.73,9077.07,8373.36,4547.36,6153.57,5483.86,4138.90,8222.97,PG_612
3,client_0004,47,Male,"Port Jesse, WA",52700,Monthly,52700,Plumber,Emergency Fund Building,Mortgage,...,4876.84,15141.36,10001.11,3068.39,4064.09,2517.98,5795.14,2770.24,4464.86,PG_623
4,client_0005,26,Female,"Greenbury, FM",135400,Monthly,135400,Teacher,Travel,,...,42531.74,52304.83,23611.04,11216.83,5735.56,0.00,0.00,0.00,0.00,PG_044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,client_4996,48,Male,"Greggton, NE",62700,Monthly,62700,Plumber,Travel,Auto Loan,...,3785.08,23553.23,8912.82,5199.54,2453.26,4195.30,10206.89,4393.88,0.00,PG_844
4996,client_4997,21,Male,"Wardmouth, AK",69400,Monthly,69400,Software Engineer,Emergency Fund Building,Personal Loan,...,17078.11,19154.15,9935.35,8374.03,3950.62,3739.73,7168.00,0.00,0.00,PG_523
4997,client_4998,21,Female,"Jenniferhaven, UT",67900,Monthly,67900,Marketing Specialist,Debt Repayment,Auto Loan,...,9014.57,22607.10,7504.92,8084.39,5507.81,5256.53,9132.92,791.76,0.00,PG_111
4998,client_4999,43,Female,"Lake Jonathon, MO",89600,Monthly,89600,Business Analyst,Debt Repayment,Auto Loan,...,8034.28,24394.55,12877.93,12141.23,7583.48,3190.73,10554.37,1910.74,8912.69,PG_961


In [60]:
# print(client_data_df['peer_group_id'].value_counts().head())
client_data_df[client_data_df['peer_group_id'] == 'PG_622']

Unnamed: 0,client_id,age,gender,location,income,income_frequency,monthly_income,profession,financial_goal,loan_type,...,Savings,Housing,Food,Transportation,Utilities,Healthcare,Debt Payments,Entertainment,Miscellaneous,peer_group_id
368,client_0369,41,Male,"Garrettland, WA",95200,Monthly,95200,Lawyer,Retirement,,...,10091.45,34394.64,11776.28,6880.62,4930.88,7001.82,11266.85,3588.76,5268.69,PG_622
474,client_0475,36,Other,"Nathanberg, WA",65200,Monthly,65200,Teacher,Retirement,Credit Card Debt,...,12959.4,20574.41,10428.99,6355.32,4364.3,4653.41,4789.4,1074.77,0.0,PG_622
984,client_0985,39,Male,"Port Rachelville, WA",95500,Annually,8000,Customer Service Representative,Retirement,,...,1226.06,2466.58,1020.37,983.9,418.33,463.75,1359.76,61.25,0.0,PG_622
1267,client_1268,37,Female,"Smithville, WA",46700,Monthly,46700,Financial Analyst,Retirement,Credit Card Debt,...,2475.5,15535.51,7777.24,6551.39,2664.32,4115.7,7580.34,0.0,0.0,PG_622
1389,client_1390,45,Female,"Justintown, WA",107300,Monthly,107300,Doctor,Retirement,,...,19384.06,28019.25,18854.78,11434.66,7058.67,2450.19,20098.39,0.0,0.0,PG_622
1510,client_1511,46,Male,"South Daniel, WA",49900,Monthly,49900,Data Scientist,Retirement,Mortgage,...,4406.49,12783.51,5689.99,5414.87,3435.09,1397.5,3959.03,4633.11,8180.4,PG_622
1681,client_1682,47,Female,"Roseton, WA",21700,Annually,1800,Teacher,Retirement,,...,119.06,720.11,294.26,135.42,104.93,135.37,208.27,82.58,0.0,PG_622
1930,client_1931,47,Male,"East Sheila, WA",50900,Monthly,50900,Sales Manager,Retirement,,...,7546.25,14808.53,8611.36,3823.21,2938.67,4395.41,6549.1,2227.46,0.0,PG_622
2010,client_2011,38,Male,"Hamptonchester, WA",114600,Annually,9600,Registered Nurse,Retirement,,...,2152.93,3050.49,1439.76,1118.71,497.46,868.1,472.56,0.0,0.0,PG_622
2129,client_2130,44,Male,"New Nicholasfurt, WA",26500,Monthly,26500,Retail Manager,Retirement,Mortgage,...,4562.37,8085.44,3466.52,2520.69,2038.66,1663.99,4162.33,0.0,0.0,PG_622


In [51]:
embeddings = OpenAIEmbeddings()
chroma_client = chromadb.Client()
collection = chroma_client.get_collection("peer_group_embeddings")  # Get the existing collection

# query = "Fetch peer groups like me"
contextual_query = f"{query} For a client with age {client_row['age']}, monthly income {client_row['monthly_income']}, living in {client_row['location']} in peer group category {client_row['peer_group_id']}."
# contextual_query = f"{client_row['peer_group_id']}"
print(f"Contextual Query: {contextual_query}")
query_embedding = embeddings.embed_query(contextual_query)
print(f"Query Embedding: {query_embedding}")
results = collection.query(query_embeddings=[query_embedding], n_results=3)
print(f"Query Results: {results}")
if not results['ids']:
    print("I'm sorry, I couldn't find any relevant peer groups.")
peer_group_id = results['ids'][0]
print(f"Found peer group ID: {peer_group_id}")

Contextual Query: Fetch peer groups like me For a client with age 50, monthly income 40400, living in Bellburgh, HI in peer group category PG_1181.
Query Embedding: [-0.013209436404447305, -0.011940733725508794, -0.0007272144534730077, -0.03324408770507008, -0.004474382804267524, 0.006988043159540623, -0.02092342276202658, 0.033705435149763346, -0.021561166444294108, -0.025319783544579134, -0.02023140159498667, 0.004454029213139214, 0.010393866387693141, -0.026608840280307286, -0.002671399871610179, 0.004850923075987948, 0.01449170845457891, -0.02420712304642524, 0.009939303628596416, 0.0014518846498473104, -0.01633709450806135, 0.016119990156907807, 0.0023169093624172033, -0.04399075029320238, -0.012849857381056918, -0.011513309708798259, 0.0024746493444156143, -0.014369587839131708, -0.008609540641934305, 0.004766116834337762, 0.015495815845833373, -0.020109280979539467, -0.012883780809039656, -0.004379399999883848, -0.012863426752250013, -0.01208320839774564, 0.009294776192055003, -

In [45]:
import chromadb
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

load_dotenv()

# 1.  Set OpenAI API Key
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")
if not os.environ["OPENAI_API_KEY"]:
    raise ValueError("OPENAI_API_KEY environment variable not set.")

# 2. Initialize OpenAI Embeddings
embeddings = OpenAIEmbeddings()

# 3. Chroma Client setup
chroma_client = chromadb.Client()
collection_name = "peer_group_embeddings"

def check_and_peek_collection(collection_name: str, client: chromadb.Client, peek_count: int = 10):
    """
    Checks if a Chroma collection exists and displays the top N results if it's not empty.

    Args:
        collection_name (str): The name of the Chroma collection.
        client (chromadb.Client): The Chroma client instance.
        peek_count (int): The number of top results to display if the collection is not empty.
    """
    try:
        collection = client.get_collection(name=collection_name)
        count = collection.count()
        print(f"\n--- Checking Chroma Collection: '{collection_name}' ---")
        print(f"Collection contains {count} documents.")

        if count > 0:
            print(f"\n--- Top {peek_count} Results from Collection ---")
            results = collection.peek(limit=peek_count)
            if results and results['ids']:
                for i in range(len(results['ids'])):
                    print(f"ID: {results['ids'][i]}")
                    if 'documents' in results and results['documents'] and len(results['documents']) > i:
                        print(f"  Document: {results['documents'][i]}")
                    if 'metadatas' in results and results['metadatas'] and len(results['metadatas']) > i:
                        print(f"  Metadata: {results['metadatas'][i]}")
                    # 'embeddings' and 'distances' are not included in peek by default
            else:
                print("No documents found in the collection.")
        else:
            print("Collection is empty.")

    except Exception as e:
        print(f"Error accessing Chroma collection '{collection_name}': {e}")
        print("Please ensure the collection name is correct.")

# Load your peer group definitions (this is just to ensure client_data_with_peers is available later if needed)
try:
    client_data_with_peers = pd.read_csv('/Users/vivekaradshan/Documents/AI peer benchmarking/ai-peer-benchmarching/client_data_with_peers.csv')
except FileNotFoundError:
    print("Error: client_data_with_peers.csv not found. Please check the path.")
    client_data_with_peers = pd.DataFrame()

# Check the Chroma collection
check_and_peek_collection(collection_name, chroma_client, peek_count=10)


--- Checking Chroma Collection: 'peer_group_embeddings' ---
Collection contains 7080 documents.

--- Top 10 Results from Collection ---
ID: PG_001
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Debt Repayment
  Metadata: {'peer_group_id': 'PG_001'}
ID: PG_002
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Retirement
  Metadata: {'peer_group_id': 'PG_002'}
ID: PG_003
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Emergency Fund Building
  Metadata: {'peer_group_id': 'PG_003'}
ID: PG_004
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Travel
  Metadata: {'peer_group_id': 'PG_004'}
ID: PG_005
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Home Purchase
  Metadata: {'peer_group_id': 'PG_005'}
ID: PG_006
  Document: Income: $8,759 - $144,500, Age: 17 - 33, State: HI, Goal: Investment Growth
  Metadata: {'peer_group_id': 'PG_006'}
ID: PG_007
  Document: Income: $8,759 - $144,500

In [44]:
import chromadb
import pandas as pd
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

# Load environment variables (ensure OPENAI_API_KEY is set in your .env file)
load_dotenv()

# --- Configuration ---
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set. Please set it in your .env file.")

COLLECTION_NAME = "peer_group_embeddings"
BATCH_SIZE = 2000 # Adjust this based on your ChromaDB and embedding model's limits
PEER_GROUP_DEFINITIONS_PATH = '/Users/vivekaradshan/Documents/AI peer benchmarking/ai-peer-benchmarching/peer_group_definitions.csv'

# --- Initialize ---
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY) # Pass API key directly
chroma_client = chromadb.Client() # Assumes in-memory or default persistent client path

# --- **CRITICAL: Delete and Recreate Collection for a Clean Start** ---
try:
    # List existing collections
    existing_collections = chroma_client.list_collections()
    existing_collection_names = [col.name for col in existing_collections]

    if COLLECTION_NAME in existing_collection_names:
        chroma_client.delete_collection(name=COLLECTION_NAME)
        print(f"Deleted existing collection: '{COLLECTION_NAME}' for a clean start.")
    else:
        print(f"Collection '{COLLECTION_NAME}' does not exist, proceeding to create.")
except Exception as e:
    print(f"An error occurred while trying to check/delete collection '{COLLECTION_NAME}': {e}")
    # Depending on the error, you might want to exit or handle differently
    # For a fresh start, if deletion fails, recreating will also likely fail.

# Create the collection
# This line will only be reached if the collection didn't exist or was successfully deleted
collection = chroma_client.create_collection(name=COLLECTION_NAME)
print(f"Created new collection: '{COLLECTION_NAME}'")

# --- Function to Add Data in Batches ---
def add_data_to_chroma_collection(
    collection: chromadb.api.models.Collection.Collection,
    peer_group_definitions_df: pd.DataFrame,
    embeddings_model: OpenAIEmbeddings,
    batch_size: int
):
    """
    Adds peer group descriptions to the Chroma collection in batches.

    Args:
        collection (chromadb.api.models.Collection.Collection): The Chroma collection object.
        peer_group_definitions_df (pd.DataFrame): DataFrame containing peer group definitions.
        embeddings_model (OpenAIEmbeddings): The initialized OpenAI embeddings model.
        batch_size (int): The maximum number of items to add in each batch.
    """
    documents = []
    metadatas = []
    ids = []

    # Prepare data for batching
    for _, row in peer_group_definitions_df.iterrows():
        # Ensure description is a string and handle potential NaNs
        desc = str(row['description']) if pd.notna(row['description']) else "No description available"
        documents.append(desc)
        metadatas.append({'peer_group_id': row['peer_group_id']})
        ids.append(str(row['peer_group_id'])) # Ensure ID is string

    total_documents = len(documents)
    print(f"Preparing to add {total_documents} documents to Chroma in batches of {batch_size}...")

    # Process data in batches
    for i in range(0, total_documents, batch_size):
        batch_documents = documents[i:i + batch_size]
        batch_metadatas = metadatas[i:i + batch_size]
        batch_ids = ids[i:i + batch_size]

        try:
            # Embed the current batch of documents
            batch_embeddings = embeddings_model.embed_documents(batch_documents)

            collection.add(
                documents=batch_documents,
                embeddings=batch_embeddings,
                metadatas=batch_metadatas,
                ids=batch_ids
            )
            print(f"Added batch {i // batch_size + 1}/{ (total_documents + batch_size - 1) // batch_size } ({len(batch_documents)} items).")
        except Exception as e:
            print(f"Error adding batch {i // batch_size + 1} to Chroma: {e}")
            print("Please ensure your OpenAI API key is valid and your internet connection is stable.")
            # Depending on error type, you might want to break or continue
            break # Stop processing if a batch fails

# --- Main Execution ---
try:
    peer_group_definitions = pd.read_csv(PEER_GROUP_DEFINITIONS_PATH)
    if peer_group_definitions.empty:
        print(f"Warning: The CSV file '{PEER_GROUP_DEFINITIONS_PATH}' is empty or contains no data. No embeddings will be added.")
    else:
        add_data_to_chroma_collection(
            collection=collection,
            peer_group_definitions_df=peer_group_definitions,
            embeddings_model=embeddings,
            batch_size=BATCH_SIZE
        )
        print(f"\nFinished adding data to Chroma. Total documents in collection: {collection.count()}")
except FileNotFoundError:
    print(f"Error: The file '{PEER_GROUP_DEFINITIONS_PATH}' was not found. Please check the path.")
except Exception as e:
    print(f"An unexpected error occurred during data loading or processing: {e}")

Deleted existing collection: 'peer_group_embeddings' for a clean start.
Created new collection: 'peer_group_embeddings'
Preparing to add 7080 documents to Chroma in batches of 2000...
Added batch 1/4 (2000 items).
Added batch 2/4 (2000 items).
Added batch 3/4 (2000 items).
Added batch 4/4 (1080 items).

Finished adding data to Chroma. Total documents in collection: 7080
