# RAG demo level 2
In this more advanced demonstration, we add hierarchical and graph-based approaches: extracting metadata, finding and storing relationships between documents, and adding summaries to support aggregate questions.

## Step 4 - Creating and storing embeddings (vectors)

In [22]:
import subprocess
import os
import json

from urllib.parse import quote

original_dir = os.getcwd()
# Terraform is run with cwd= pointed at its directory instead of os.chdir(), so
# the notebook's own working directory is never modified, even on failure.
terraform_dir = os.path.join(original_dir, 'terraform')


def _terraform_output(output_name):
    """Return one Terraform output value as text.

    Runs ``terraform output -raw <output_name>`` inside ``terraform_dir`` and
    decodes stdout as UTF-8.

    Raises:
        subprocess.CalledProcessError: terraform exited with a non-zero status.
            Failing loudly here avoids building a connection string or model
            configuration from empty values.
        FileNotFoundError: the terraform directory or executable is missing.
    """
    completed = subprocess.run(
        ['terraform', 'output', '-raw', output_name],
        cwd=terraform_dir,
        stdout=subprocess.PIPE,
        check=True,
    )
    return completed.stdout.decode('utf-8')


# Get the database connection string
PGHOST = _terraform_output('PGHOST')
PGDATABASE = _terraform_output('PGDATABASE')
PGUSER = _terraform_output('PGUSER')
PGPASSWORD = _terraform_output('PGPASSWORD')
# Percent-encode the user, password and database name: characters such as
# '@', ':', '/', '#' or '%' have a special meaning inside a connection URI.
db_uri = (
    f"postgresql://{quote(PGUSER, safe='')}:{quote(PGPASSWORD, safe='')}"
    f"@{PGHOST}/{quote(PGDATABASE, safe='')}?sslmode=require"
)

# Get the embedding model endpoint and key
model_configurations = _terraform_output('model_configurations')
model_config = json.loads(model_configurations)
embedding_model = model_config["models"]["text-embedding-3-large"]
EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
EMBEDDINGS_KEY = embedding_model["key"]
gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
gpt_4o_model = model_config["models"]["gpt-4o"]
GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
GPT_4O_KEY = gpt_4o_model["key"]

# Never echo the password: notebook outputs are saved with the file and shared.
_redacted_db_uri = f"postgresql://{PGUSER}:***@{PGHOST}/{PGDATABASE}?sslmode=require"
print(f"Using {_redacted_db_uri} as the database connection string")
print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

Using postgresql://psqladmin:***@psql-graphrag-psbv.postgres.database.azure.com/demo?sslmode=require as the database connection string
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o-mini model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o model endpoint


In [86]:
import psycopg2
from psycopg2 import sql
from openai import AzureOpenAI 
import pandas as pd
import age

# Open the PostgreSQL connection from the URI built in the previous cell.
conn = psycopg2.connect(db_uri)

# Azure OpenAI client used to generate embeddings.
gpt_embedding_client = AzureOpenAI(
    api_version="2025-02-01-preview",
    api_key=EMBEDDINGS_KEY,
    azure_endpoint=EMBEDDINGS_ENDPOINT,
)

# Enable AGE for this connection by putting ag_catalog first on the search path.
command = """
SET search_path = ag_catalog, "$user", public;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
    conn.commit()
except psycopg2.Error as exc:
    # Report the failure and reset the transaction so the connection stays usable.
    print(f"Error: {exc}")
    conn.rollback()


Define functions that create a standard table with an embedding (vector) column for each node type, and that generate and store embeddings for the text of every node of that type.

In [87]:
import time

def create_table(conn, name):
    """Create the embeddings table ``public.<name>`` when it does not exist yet.

    The table has two columns: ``id BIGINT`` and ``embedding vector(2000)``.
    A ``psycopg2.Error`` is printed and the transaction rolled back; nothing is
    raised to the caller and nothing is returned.

    Args:
        conn: Open database connection (provides ``cursor()``, ``commit()``
            and ``rollback()``).
        name: Table name. It is interpolated into the SQL text unquoted, so
            only trusted values must be passed.
    """
    ddl = f"""
    CREATE TABLE IF NOT EXISTS public.{name} (
        id BIGINT,
        embedding vector(2000)
    );
    """

    try:
        with conn.cursor() as cur:
            cur.execute(ddl)
        conn.commit()
    except psycopg2.Error as exc:
        print(f"Error: {exc}")
        conn.rollback()

def get_unprocessed_nodes(conn, node_name, text_column_name):
    """Fetch graph nodes that have text but no stored embedding yet.

    Runs a Cypher query against the ``movies_graph`` graph for nodes labelled
    ``node_name`` whose ``text_column_name`` property exists and is non-empty,
    and keeps only those whose id is not already present in
    ``public.<node_name>``.

    Args:
        conn: Open database connection.
        node_name: Node label; also the name of the embeddings table. It is
            interpolated into the query unquoted, so pass trusted values only.
        text_column_name: Node property that holds the text to embed
            (interpolated unquoted as well).

    Returns:
        A list of ``(content, id)`` rows, or ``[]`` when the query fails.
    """
    command = f"""
    SELECT * FROM cypher('movies_graph', $$
        MATCH (n:{node_name})
        WHERE exists(n.{text_column_name}) AND n.{text_column_name} <> ''
        RETURN n.{text_column_name} AS content, ID(n) AS id
    $$) as (content text, id bigint)
    WHERE id NOT IN (SELECT id FROM public.{node_name});
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(command)
            return cursor.fetchall()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        # A failed statement leaves the transaction aborted; without a rollback
        # every later statement on this connection would be rejected.
        conn.rollback()
        return []

def _embed_with_retry(client, texts, max_retries, retry_delay):
    """Request embeddings for ``texts``, retrying only on HTTP 429 responses.

    Returns the list of embedding vectors taken from the response, or ``None``
    when all ``max_retries`` attempts were rate limited. Any other exception is
    printed and re-raised unchanged.
    """
    for attempt in range(1, max_retries + 1):
        try:
            result = client.embeddings.create(
                input=texts,
                model="text-embedding-3-large",
                dimensions=2000,
            )
        except Exception as e:
            if getattr(e, "status_code", None) != 429:
                print(f"Embedding generation error: {e}")
                raise  # bare raise keeps the original traceback
            print(f"429 error encountered. Retry attempt {attempt} of {max_retries} in {retry_delay} seconds...")
            time.sleep(retry_delay)
        else:
            return [item.embedding for item in result.data]
    return None


def create_embeddings(conn, node_name, text_column_name, batch_size=100, max_retries=10, retry_delay=10):
    """Generate and store embeddings for all nodes of one type that lack them.

    Ensures the ``public.<node_name>`` table exists, fetches the nodes without
    an embedding, embeds their text in batches with the module-level
    ``gpt_embedding_client`` and inserts ``(id, embedding)`` rows, committing
    after each batch.

    Args:
        conn: Open database connection.
        node_name: Node label; also the name of the embeddings table.
        text_column_name: Node property that holds the text to embed.
        batch_size: Number of texts sent per embedding request (>= 1).
        max_retries: Attempts per batch when the service answers with 429.
        retry_delay: Seconds to wait between rate-limited attempts.

    Returns:
        None. Progress and errors are reported with ``print``. Processing
        stops early if a batch is still rate limited after ``max_retries``
        attempts; a failed insert is rolled back and the next batch continues.

    Raises:
        ValueError: ``batch_size`` is smaller than 1.
        Exception: any non-429 error from the embedding request is re-raised.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Create the table if it doesn't exist
    create_table(conn, node_name)

    # Get unprocessed nodes
    data_to_process = get_unprocessed_nodes(conn, node_name, text_column_name)
    if not data_to_process:
        print(f"No unprocessed nodes found for {node_name}.")
        return

    total = len(data_to_process)
    print(f"Found {total} unprocessed nodes for {node_name}.")
    print(f"Processing {total} nodes in batches of {batch_size}.")

    # The statement is the same for every batch, so build it once.
    insert_command = f"""
        INSERT INTO public.{node_name} (id, embedding) VALUES (%s, %s)
        """

    processed = 0
    failed = 0
    for start in range(0, total, batch_size):
        batch = data_to_process[start:start + batch_size]
        texts = [row[0] for row in batch]
        ids = [row[1] for row in batch]

        embeddings = _embed_with_retry(gpt_embedding_client, texts, max_retries, retry_delay)
        if embeddings is None:
            print("Failed to generate embeddings after multiple retries.")
            return

        # Insert the embeddings into the database
        try:
            with conn.cursor() as cursor:
                cursor.executemany(insert_command, list(zip(ids, embeddings)))
                conn.commit()
        except psycopg2.Error as e:
            print(f"Error: {e}")
            conn.rollback()
            # Rolled-back rows were not stored, so they must not be counted.
            failed += len(ids)
            continue

        processed += len(ids)
        print(f"Processed {processed}/{total} nodes.")

    if failed:
        print(f"Finished with errors: stored {processed}/{total} nodes; {failed} nodes failed to insert.")
    else:
        print(f"Finished processing {total} nodes.")


Generate embeddings

In [89]:
# Each pair is (node label, node property holding the text to embed).
nodes = [
    {"name": label, "text_column_name": text_property}
    for label, text_property in (
        ("Movie", "combined_text"),
        ("Setting", "summary"),
        ("Character", "summary"),
        ("Genre", "summary"),
        ("Theme", "summary"),
        ("Series", "summary"),
    )
]
# Embed one node type at a time; nodes that already have an embedding are skipped.
for node in nodes:
    create_embeddings(conn, node["name"], node["text_column_name"])

No unprocessed nodes found for Movie.


No unprocessed nodes found for Setting.
No unprocessed nodes found for Character.
No unprocessed nodes found for Genre.
No unprocessed nodes found for Theme.
No unprocessed nodes found for Series.
