In [21]:
from IPython.core.display import display, HTML

display(HTML("<style>.jp-Cell-inputWrapper { overflow-x: auto; }</style>"))

  from IPython.core.display import display, HTML


In [22]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Initialize Faker for generating realistic-looking data
fake = Faker('en_US')

# Number of clients to generate
num_clients = 5000

# Median household income in the USA 2023 was around $80,610 (using a rounded figure)
MEDIAN_HOUSEHOLD_INCOME_2023 = 80600
MEDIAN_MONTHLY_HOUSEHOLD_INCOME_2023 = MEDIAN_HOUSEHOLD_INCOME_2023 / 12

# Function to generate income around the median
def generate_income():
    log_mean = np.log(MEDIAN_HOUSEHOLD_INCOME_2023)
    income = np.random.lognormal(mean=log_mean, sigma=0.6) # Adjust sigma for spread
    return int(round(income, -2))

# Lists to store client data
client_ids = [f"client_{i+1:04d}" for i in range(num_clients)]
ages = np.random.randint(18, 65, num_clients)
genders = np.random.choice(['Male', 'Female', 'Other'], num_clients, p=[0.49, 0.49, 0.02])
locations = [fake.city() + ", " + fake.state_abbr() for _ in range(num_clients)]

# Income Data (standardized to monthly)
income_frequencies = ['Monthly', 'Annually', 'Bi-Weekly']
income_frequency = np.random.choice(income_frequencies, num_clients, p=[0.7, 0.2, 0.1])
incomes = []
monthly_incomes = []
for freq in income_frequency:
    base_annual_income = generate_income() # Generate roughly annual income
    incomes.append(base_annual_income)
    if freq == 'Annually':
        monthly_incomes.append(int(round(base_annual_income / 12, -2)))
    elif freq == 'Bi-Weekly':
        monthly_incomes.append(int(round(base_annual_income * 26 / 12, -2)))
    else:  # Monthly
        monthly_incomes.append(int(round(base_annual_income / 1, -2)))

# Profession Data
professions = [
    'Software Engineer', 'Data Scientist', 'Financial Analyst', 'Registered Nurse',
    'Teacher', 'Accountant', 'Project Manager', 'Marketing Specialist',
    'Sales Manager', 'Business Analyst', 'Electrician', 'Plumber', 'Lawyer',
    'Doctor', 'Pharmacist', 'Truck Driver', 'Customer Service Representative',
    'Administrative Assistant', 'Retail Manager', 'Chef', 'Mechanic'
]
profession = np.random.choice(professions, num_clients)

# Loan Data (Corrected)
loan_types = ['Mortgage', 'Auto Loan', 'Personal Loan', 'Credit Card Debt', None]
has_loan = np.random.choice([True, False], num_clients, p=[0.4, 0.6])

loan_data = []
for i in range(num_clients):  # Iterate by index to access corresponding income
    if has_loan[i]:
        loan_type = np.random.choice(loan_types[:-1])
        principal_amount = np.random.uniform(5000, monthly_incomes[i] * 3 * 12, 1)[0] if loan_type != 'Credit Card Debt' else np.random.uniform(500, monthly_incomes[i] * 0.5 * 12, 1)[0]
        interest_rate = np.random.uniform(0.03, 0.15, 1)[0]
        loan_term_months = np.random.randint(12, 360, 1)[0] if loan_type != 'Credit Card Debt' else None
        if loan_type in ['Mortgage', 'Auto Loan', 'Personal Loan'] and loan_term_months is not None and interest_rate > 0:
            monthly_interest_rate = interest_rate / 12
            emi = (principal_amount * monthly_interest_rate * (1 + monthly_interest_rate)**loan_term_months) / ((1 + monthly_interest_rate)**loan_term_months - 1)
            emi = round(emi, 2)
        else:
            emi = None
        loan_data.append({
            'loan_type': loan_type,
            'principal_amount': round(principal_amount, 2),
            'interest_rate': round(interest_rate, 4),
            'loan_term_months': loan_term_months,
            'emi': emi
        })
    else:
        loan_data.append({
            'loan_type': None,
            'principal_amount': None,
            'interest_rate': None,
            'loan_term_months': None,
            'emi': None
        })

# Spending Pattern (adjusting based on monthly income and standardizing to monthly)
spending_categories = {
    'Housing': (0.25, 0.40),
    'Food': (0.10, 0.20),
    'Transportation': (0.05, 0.15),
    'Utilities': (0.03, 0.08),
    'Healthcare': (0.02, 0.10),
    'Debt Payments': (0.05, 0.20),
    'Entertainment': (0.02, 0.10),
    'Savings': (0.05, 0.35), # Increased max savings
    'Miscellaneous': (0.03, 0.15)
}

monthly_spending_patterns = []
for income in monthly_incomes:
    pattern = {}
    remaining_percentage = 1.0
    savings_percentage = np.random.uniform(0.05, min(0.35, income / (MEDIAN_MONTHLY_HOUSEHOLD_INCOME_2023 * 2))) # Savings increase with income (cap at 35%)
    pattern['Savings'] = round(income * savings_percentage, 2)
    remaining_percentage -= savings_percentage
    for category, (min_p, max_p) in spending_categories.items():
        if category == 'Savings':
            continue
        if category == 'Miscellaneous':
            percentage = remaining_percentage
        else:
            base_percentage = np.random.uniform(min_p, max_p)
            adjustment_factor = 0.9 + np.random.rand() * 0.2
            percentage = min(remaining_percentage, base_percentage * adjustment_factor)
            remaining_percentage -= percentage
        pattern[category] = round(income * percentage, 2)
    monthly_spending_patterns.append(pattern)

financial_goals_options = [
    'Retirement',
    'Home Purchase',
    'Education Funding (Child/Self)',
    'Debt Repayment',
    'Early Financial Freedom/FIRE',
    'Healthcare Savings',
    'Travel',
    'Investment Growth',
    'Emergency Fund Building',
    'Other'
]
financial_goal_probabilities = [
    0.25,
    0.15,
    0.12,
    0.10,
    0.08,
    0.07,
    0.08,
    0.08,
    0.05,
    0.02
]
financial_goals = np.random.choice(
    financial_goals_options,
    num_clients,
    p=financial_goal_probabilities
)

# Create a Pandas DataFrame
client_data = pd.DataFrame({
    'client_id': client_ids,
    'age': ages,
    'gender': genders,
    'location': locations,
    'income': incomes,
    'income_frequency': income_frequency,
    'monthly_income': monthly_incomes,
    'profession': profession,
    'financial_goal': financial_goals,
    'loan_type': [loan['loan_type'] for loan in loan_data],
    'principal_amount': [loan['principal_amount'] for loan in loan_data],
    'interest_rate': [loan['interest_rate'] for loan in loan_data],
    'loan_term_months': [loan['loan_term_months'] for loan in loan_data],
    'emi': [loan['emi'] for loan in loan_data]
})

# Add spending pattern columns (now monthly)
spending_df = pd.DataFrame(monthly_spending_patterns)
client_data = pd.concat([client_data, spending_df], axis=1)

print(client_data.head())
print(f"\nGenerated {len(client_data)} client data points with income and spending standardized to monthly.")

     client_id  age  gender         location  income income_frequency  \
0  client_0001   50    Male    Bellburgh, HI   40400          Monthly   
1  client_0002   40  Female    Smithport, CO  167700         Annually   
2  client_0003   46    Male  East Robert, DC   81700          Monthly   
3  client_0004   47    Male   Port Jesse, WA   52700          Monthly   
4  client_0005   26  Female    Greenbury, FM  135400          Monthly   

   monthly_income                       profession           financial_goal  \
0           40400                   Data Scientist           Debt Repayment   
1           14000  Customer Service Representative               Retirement   
2           81700                      Electrician               Retirement   
3           52700                          Plumber  Emergency Fund Building   
4          135400                          Teacher                   Travel   

  loan_type  ...       emi   Savings   Housing      Food  Transportation  \
0      Non

In [19]:
client_data

Unnamed: 0,client_id,age,gender,location,income,income_frequency,monthly_income,financial_goal,loan_type,principal_amount,...,emi,Savings,Housing,Food,Transportation,Utilities,Healthcare,Debt Payments,Entertainment,Miscellaneous
0,client_0001,27,Male,"Port Brittneymouth, CO",31500,Monthly,31500,Education Funding (Child/Self),,,...,,2734.97,12006.39,5027.94,3078.95,1376.45,3209.75,4065.56,0.00,0.00
1,client_0002,31,Male,"Christopherland, MI",118300,Monthly,118300,Retirement,Credit Card Debt,545036.04,...,,24223.06,43920.75,17726.06,10544.29,4485.76,8440.82,6380.91,2578.36,0.00
2,client_0003,56,Female,"Gardnermouth, VA",62600,Annually,5200,Retirement,Credit Card Debt,15409.96,...,,1704.19,2059.13,685.66,383.63,166.51,200.88,0.00,0.00,0.00
3,client_0004,45,Male,"South Rebeccamouth, PA",98000,Bi-Weekly,212300,Home Purchase,,,...,,63893.30,62743.01,28821.27,32625.32,11355.89,12861.21,0.00,0.00,0.00
4,client_0005,19,Female,"Lake Angelaton, MA",130800,Annually,10900,Home Purchase,Auto Loan,344761.34,...,2641.72,810.68,3548.54,1010.99,691.38,885.70,426.95,1780.21,1069.47,676.07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,client_4996,50,Female,"Schultzberg, NV",125500,Monthly,125500,Travel,,,...,,31747.97,53881.70,12704.83,9270.87,9439.34,8455.28,0.00,0.00,0.00
4996,client_4997,32,Female,"Lake Joseph, NY",111400,Annually,9300,Education Funding (Child/Self),,,...,,2533.25,2645.18,1442.28,691.07,664.74,420.50,902.99,0.00,0.00
4997,client_4998,49,Male,"Frankfort, PW",31000,Monthly,31000,Investment Growth,,,...,,6222.09,12411.17,6195.73,2661.02,1405.82,778.67,1325.49,0.00,0.00
4998,client_4999,22,Female,"Patelside, DC",122200,Monthly,122200,Other,,,...,,35144.53,35658.63,21852.21,13444.99,8878.07,7221.58,0.00,0.00,0.00


In [23]:
import pandas as pd

# Assuming you have the client_data DataFrame

def create_peer_groups(client_data: pd.DataFrame, income_bins=4, age_bins=3):
    """
    Creates peer group definitions based on monthly income, age, state, financial goals, and profession.

    Args:
        client_data (pd.DataFrame): DataFrame containing client data.
        income_bins (int): The number of income bins (applied to monthly income).
        age_bins (int): The number of age bins.

    Returns:
        tuple: A tuple containing:
            - pd.DataFrame: DataFrame with added 'peer_group_id' to client data.
            - pd.DataFrame: DataFrame defining peer group IDs and their descriptions.
    """
    income_cut = pd.cut(client_data['monthly_income'], bins=income_bins, labels=False, retbins=True)
    client_data['income_bin'] = income_cut[0]
    income_bin_edges = income_cut[1]

    age_cut = pd.cut(client_data['age'], bins=age_bins, labels=False, retbins=True)
    client_data['age_bin'] = age_cut[0]
    age_bin_edges = age_cut[1]

    unique_states = client_data['location'].str[-2:].unique()
    unique_goals = client_data['financial_goal'].unique()
    unique_professions = client_data['profession'].unique()  # Get unique professions

    peer_groups = []
    client_data['peer_group_id'] = ''
    group_counter = 1

    for i in range(income_bins):
        for j in range(age_bins):
            for state in unique_states:
                for goal in unique_goals:
                    for prof in unique_professions: # Iterate through professions
                        group_id = f"PG_{group_counter:03d}"
                        income_desc = f"Monthly Income: ${income_bin_edges[i]:,.0f} - ${income_bin_edges[i+1]:,.0f}"
                        age_desc = f"Age: {int(age_bin_edges[j])} - {int(age_bin_edges[j+1])}"
                        location_desc = f"State: {state}"
                        goal_desc = f"Goal: {goal}"
                        profession_desc = f"Profession: {prof}"  # Add profession to description
                        description = f"{income_desc}, {age_desc}, {location_desc}, {goal_desc}, {profession_desc}" # Include in combined description

                        peer_groups.append({'peer_group_id': group_id, 'description': description})

                        condition = (client_data['income_bin'] == i) & \
                                    (client_data['age_bin'] == j) & \
                                    (client_data['location'].str[-2:] == state) & \
                                    (client_data['financial_goal'] == goal) & \
                                    (client_data['profession'] == prof)  # Add profession to the condition

                        client_data.loc[condition, 'peer_group_id'] = group_id
                        group_counter += 1

    peer_group_df = pd.DataFrame(peer_groups)
    client_data = client_data.drop(columns=['income_bin', 'age_bin'])
    client_data['peer_group_id'] = client_data['peer_group_id'].replace('', None)

    return client_data, peer_group_df

# Generate sophisticated peer groups
client_data_with_peers, peer_group_definitions = create_sophisticated_peer_groups(client_data.copy(), income_bins=4, age_bins=3)

# Display the first few rows of client data with peer group IDs
print("Client Data with Peer Group IDs (including profession):")
print(client_data_with_peers[['client_id', 'age', 'monthly_income', 'location', 'profession', 'financial_goal', 'peer_group_id']].head())

# Display a sample of the peer group definitions
print("\nSample Peer Group Definitions (including profession):")
print(peer_group_definitions.head())

# Display the distribution of clients across the generated peer groups
print("\nDistribution of clients across peer groups (top counts, including profession):")
print(client_data_with_peers['peer_group_id'].value_counts().head())

print(f"\nTotal number of defined peer groups: {len(peer_group_definitions)}")
print(f"Number of clients assigned to a peer group: {client_data_with_peers['peer_group_id'].count()}")
print(f"Number of clients without a peer group: {client_data_with_peers['peer_group_id'].isnull().sum()}")


Client Data with Peer Group IDs (including profession):
     client_id  age  monthly_income         location  \
0  client_0001   50           40400    Bellburgh, HI   
1  client_0002   40           14000    Smithport, CO   
2  client_0003   46           81700  East Robert, DC   
3  client_0004   47           52700   Port Jesse, WA   
4  client_0005   26          135400    Greenbury, FM   

                        profession           financial_goal peer_group_id  
0                   Data Scientist           Debt Repayment       PG_1181  
1  Customer Service Representative               Retirement       PG_2372  
2                      Electrician               Retirement        PG_612  
3                          Plumber  Emergency Fund Building        PG_623  
4                          Teacher                   Travel        PG_044  

Sample Peer Group Definitions (including profession):
  peer_group_id  \
0        PG_001   
1        PG_002   
2        PG_003   
3        PG_004   
4

In [39]:
client_data_with_peers.to_csv('client_data_with_peers.csv', index=False)

In [41]:
peer_group_definitions.to_csv('peer_group_definitions.csv', index=False)

In [49]:
client_data_with_peers['peer_group_id'].value_counts()

peer_group_id
PG_622     11
PG_1242    11
PG_472     11
PG_342     11
PG_1655    10
           ..
PG_1867     1
PG_2591     1
PG_3421     1
PG_475      1
PG_844      1
Name: count, Length: 2136, dtype: int64

In [43]:
import pandas as pd
from langchain.graphs import Neo4jGraph
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings  # Or any other embedding model
from langchain.schema import Document
import os
from dotenv import load_dotenv

# Assuming you have client_data_with_peers and peer_group_definitions DataFrames

# 1. Knowledge Graph Setup (Neo4j)
# --------------------------------
#    * Ensure you have a Neo4j instance running and the Neo4j Python driver installed.
#    * Set up environment variables for your Neo4j connection:
#        * NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD

NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USER = os.environ["NEO4J_USER"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

def create_kg_nodes(client_data: pd.DataFrame):
    """
    Creates nodes in the knowledge graph for clients and peer groups.
    """
    for _, row in client_data.iterrows():
        client_id = row['client_id']
        age = row['age']
        gender = row['gender']
        location = row['location']
        income = row['income']
        income_frequency = row['income_frequency']
        profession = row['profession']
        financial_goal = row['financial_goal']

        #  Create Client node
        graph.query(f"""
            CREATE (c:Client {{
                client_id: '{client_id}',
                age: {age},
                gender: '{gender}',
                location: '{location}',
                income: {income},
                income_frequency: '{income_frequency}',
                profession: '{profession}',
                financial_goal: '{financial_goal}'
            }})
        """)

        # if row['peer_group_id']:
        #     peer_group_id = row['peer_group_id']
        #     # Link Client to PeerGroup
        #     graph.query(f"""
        #         MATCH (c:Client {{client_id: '{client_id}'}}),
        #               (pg:PeerGroup {{peer_group_id: '{peer_group_id}'}})
        #         CREATE (c)-[:BELONGS_TO]->(pg)
        #     """)

def create_kg_peer_groups(peer_group_definitions: pd.DataFrame):
    """
    Creates PeerGroup nodes in the knowledge graph.
    """
    for _, row in peer_group_definitions.iterrows():
        peer_group_id = row['peer_group_id']
        description = row['description']
        graph.query(f"""
            CREATE (pg:PeerGroup {{
                peer_group_id: '{peer_group_id}',
                description: '{description}'
            }})
        """)

# Create KG nodes
create_kg_nodes(client_data_with_peers)
create_kg_peer_groups(peer_group_definitions)


In [3]:
import pandas as pd
client_data_with_peers = pd.read_csv('client_data_with_peers.csv')
client_data_with_peers.columns


Index(['client_id', 'age', 'gender', 'location', 'income', 'income_frequency',
       'monthly_income', 'profession', 'financial_goal', 'loan_type',
       'principal_amount', 'interest_rate', 'loan_term_months', 'emi',
       'Savings', 'Housing', 'Food', 'Transportation', 'Utilities',
       'Healthcare', 'Debt Payments', 'Entertainment', 'Miscellaneous',
       'peer_group_id'],
      dtype='object')

In [48]:
from langchain_neo4j import Neo4jGraph
import os

# 1.  Connect to Neo4j
# ----------------------
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_USER = os.environ["NEO4J_USER"]
NEO4J_PASSWORD = os.environ["NEO4J_PASSWORD"]

graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_USER, password=NEO4J_PASSWORD)

# 2. Add Unique Constraints
# --------------------------
def add_unique_constraints():
    """
    Adds unique constraints to the Client and PeerGroup nodes.
    """
    try:
        graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (c:Client) REQUIRE c.client_id IS UNIQUE")
        print("Unique constraint added for Client: client_id")
    except Exception as e:
        print(f"Error adding unique constraint for Client: {e}")

    try:
        graph.query("CREATE CONSTRAINT IF NOT EXISTS FOR (pg:PeerGroup) REQUIRE pg.peer_group_id IS UNIQUE")
        print("Unique constraint added for PeerGroup: peer_group_id")
    except Exception as e:
        print(f"Error adding unique constraint for PeerGroup: {e}")

# 3. Establish Relationships (BELONGS_TO)
# -----------------------------------------
def create_relationships(client_data: pd.DataFrame):
    """
    Establishes BELONGS_TO relationships between Client and PeerGroup nodes
    based on the peer_group_id in the client_data DataFrame.

    Args:
        client_data (pd.DataFrame): DataFrame containing client data with 'client_id' and 'peer_group_id'.
    """
    for _, row in client_data.iterrows():
        client_id = row['client_id']
        peer_group_id = row['peer_group_id']

        if peer_group_id:  # Only create relationships if peer_group_id is not None/empty
            try:
                graph.query(f"""
                    MATCH (c:Client {{client_id: '{client_id}'}}),
                          (pg:PeerGroup {{peer_group_id: '{peer_group_id}'}})
                    CREATE (c)-[:BELONGS_TO]->(pg)
                """)
                print(f"Relationship created: Client '{client_id}' BELONGS_TO PeerGroup '{peer_group_id}'")
            except Exception as e:
                print(f"Error creating relationship for Client '{client_id}': {e}")
        else:
            print(f"Client '{client_id}' not assigned to any peer group.")

# Assuming you have the client_data DataFrame available
# Add constraints
add_unique_constraints()

# Create relationships (assuming client_data_with_peers is available)
create_relationships(client_data_with_peers)

Unique constraint added for Client: client_id
Unique constraint added for PeerGroup: peer_group_id
Relationship created: Client 'client_0001' BELONGS_TO PeerGroup 'PG_1181'
Relationship created: Client 'client_0002' BELONGS_TO PeerGroup 'PG_2372'
Relationship created: Client 'client_0003' BELONGS_TO PeerGroup 'PG_612'
Relationship created: Client 'client_0004' BELONGS_TO PeerGroup 'PG_623'
Relationship created: Client 'client_0005' BELONGS_TO PeerGroup 'PG_044'
Relationship created: Client 'client_0006' BELONGS_TO PeerGroup 'PG_1235'
Relationship created: Client 'client_0007' BELONGS_TO PeerGroup 'PG_651'
Relationship created: Client 'client_0008' BELONGS_TO PeerGroup 'PG_1256'
Relationship created: Client 'client_0009' BELONGS_TO PeerGroup 'PG_083'
Relationship created: Client 'client_0010' BELONGS_TO PeerGroup 'PG_3042'
Relationship created: Client 'client_0011' BELONGS_TO PeerGroup 'PG_1245'
Relationship created: Client 'client_0012' BELONGS_TO PeerGroup 'PG_1281'
Relationship creat

In [47]:
pip install openai

Collecting openai
  Downloading openai-1.79.0-py3-none-any.whl.metadata (25 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Downloading openai-1.79.0-py3-none-any.whl (683 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m683.3/683.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jiter-0.10.0-cp312-cp312-macosx_11_0_arm64.whl (320 kB)
Installing collected packages: jiter, openai
Successfully installed jiter-0.10.0 openai-1.79.0
Note: you may need to restart the kernel to use updated packages.


In [33]:
# 2. Vector Database Setup (Chroma)
# ----------------------------------
#    * Ensure you have Chroma installed:  `pip install chromadb`
#    * Run a Chroma server, or use the in-memory option for this example.
embeddings = OpenAIEmbeddings()  #  Or any other embedding model

def create_chroma_collection(peer_group_definitions: pd.DataFrame):
    """
    Creates a Chroma collection and adds peer group descriptions.
    """
    chroma_client = chromadb.Client()  #  Or use: chromadb.PersistentClient(path="./chroma_db") for persistence
    collection = chroma_client.create_collection("peer_groups")

    documents = []
    metadatas = []
    ids = []
    for _, row in peer_group_definitions.iterrows():
        documents.append(row['description'])
        metadatas.append({'peer_group_id': row['peer_group_id']})
        ids.append(row['peer_group_id'])  # Use peer_group_id as the ID

    collection.add(
        documents=documents,
        metadatas=metadatas,
        ids=ids
    )
    return collection

import chromadb
chroma_collection = create_chroma_collection(peer_group_definitions)


NameError: name 'OpenAIEmbeddings' is not defined