# RAG demo level 2
In more advanced demonstration we will add hierarchical and graph approaches by extracting metadata, finding and storing relationships between documents and adding summarizations for aggregate questions.

## Step 4 - Creating and storing embeddings (vectors)

In [22]:
import subprocess
import os
import json

original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    PGHOST = subprocess.run(['terraform', 'output', '-raw', 'PGHOST'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGDATABASE = subprocess.run(['terraform', 'output', '-raw', 'PGDATABASE'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGUSER = subprocess.run(['terraform', 'output', '-raw', 'PGUSER'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGPASSWORD = subprocess.run(['terraform', 'output', '-raw', 'PGPASSWORD'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    db_uri = f"postgresql://{PGUSER}:{PGPASSWORD}@{PGHOST}/{PGDATABASE}?sslmode=require"

    # Get the embedding model endpoint and key
    model_configurations = subprocess.run(['terraform', 'output', '-raw', 'model_configurations'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]
    gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
    GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
    GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
    gpt_4o_model = model_config["models"]["gpt-4o"]
    GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
    GPT_4O_KEY = gpt_4o_model["key"]

    print(f"Using {db_uri} as the database connection string")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
    print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
    print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

finally:
    os.chdir(original_dir)

Using postgresql://psqladmin:)ycxlsxlLRKks*g#@psql-graphrag-psbv.postgres.database.azure.com/demo?sslmode=require as the database connection string
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o-mini model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o model endpoint


In [23]:
import psycopg2
from psycopg2 import sql
from openai import AzureOpenAI 
import pandas as pd
import age

conn = psycopg2.connect(db_uri)

gpt_embedding_client = AzureOpenAI(
    azure_endpoint=EMBEDDINGS_ENDPOINT,
    api_key=EMBEDDINGS_KEY,
    api_version="2025-02-01-preview",
)

# Enable AGE for this connection
command = """
SET search_path = ag_catalog, "$user", public;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()


In [37]:
def create_table(conn, name):
    command = f"""
    CREATE TABLE IF NOT EXISTS public.{name} (
        id BIGINT,
        embedding vector(2000)
    );
    """

    try:
        with conn.cursor() as cursor:
            cursor.execute(command)
            conn.commit()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        conn.rollback()

def create_embeddings(conn, node_name, text_column_name):
    create_table(conn, node_name)
    command = f"""
    INSERT INTO public.{node_name} (id, embedding)
    SELECT id,
           azure_openai.create_embeddings('text-embedding-3-large', content, 2000, max_attempts => 5, retry_delay_ms => 500)
    FROM (
        SELECT * FROM cypher('movies_graph', $$
            MATCH (n:{node_name})
            RETURN n.{text_column_name} AS content, ID(n) AS id
        $$) as (content text, id bigint)
    ) sub;
    """

    try:
        with conn.cursor() as cursor:
            cursor.execute(command)
            conn.commit()
    except psycopg2.Error as e:
        print(f"Error: {e}")
        conn.rollback()

In [38]:
create_embeddings(conn=conn, node_name="Movie", text_column_name="combined_text")

Error: server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.
server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.



InterfaceError: connection already closed

In [19]:
command = """
ALTER TABLE movies_graph._ag_label_vertex ADD COLUMN embedding vector(2000);
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

In [21]:
import psycopg2

batch_size = 500  # Number of rows to update in each batch

try:
    with conn.cursor() as cursor:
        # Get the total number of rows to update
        cursor.execute("SELECT COUNT(*) FROM movies_graph._ag_label_vertex WHERE embedding IS NULL;")
        total_rows = cursor.fetchone()[0]
        print(f"Total rows to update: {total_rows}")

        # Update rows in batches
        offset = 0
        while offset < total_rows:
            command = f"""
            UPDATE movies_graph._ag_label_vertex
            SET embedding = azure_openai.create_embeddings('text-embedding-3-large', combined_text, 2000, max_attempts => 5, retry_delay_ms => 500)
            WHERE id IN (
                SELECT id
                FROM movies_graph._ag_label_vertex
                WHERE embedding IS NULL
                LIMIT {batch_size} OFFSET {offset}
            );
            """
            cursor.execute(command)
            conn.commit()
            offset += batch_size
            print(f"Updated {min(offset, total_rows)} of {total_rows} rows")

except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

Total rows to update: 31730
Error: column "combined_text" does not exist
LINE 3: ...penai.create_embeddings('text-embedding-3-large', combined_t...
                                                             ^



In [None]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

def compute_embedding(text, gpt_embedding_client):
   embedding = gpt_embedding_client.embeddings.create(
      input=[text],
      model="text-embedding-3-large",
      dimensions=2000,
   )
   embedding = embedding.data[0].embedding
   embedding_literal = '[' + ','.join(map(str, embedding)) + ']'  # format as vector literal
   return embedding

def add_embeddings(conn, gpt_embedding_client, node_name, text_column_name):
   query = f"""
   SELECT * FROM cypher('movies_graph', $$
      MATCH (n:{node_name})
      WHERE NOT EXISTS(n.embedding) AND EXISTS(n.{text_column_name})
      RETURN n.{text_column_name} as {text_column_name}, n.id as id
   $$) as ({text_column_name} text, id text);
   """
   with conn.cursor() as cursor:
      cursor.execute(query)
      nodes = cursor.fetchall()
   print(f"{len(nodes)} {node_name} nodes need embedding.")

   for text, id in nodes:
      embedding = compute_embedding(text, gpt_embedding_client)
      add_embedding(conn, gpt_embedding_client, node_name, text, id)
      break

   # with ThreadPoolExecutor(max_workers=2) as executor:
   #    futures = {
   #          executor.submit(
   #             add_embedding,
   #             conn, gpt_embedding_client, node_name, text, id
   #          ): id for text, id in nodes
   #    }
   #    for idx, future in enumerate(as_completed(futures)):
   #          id = futures[future]
   #          try:
   #             future.result()
   #          except Exception as e:
   #             print(f"Error processing node {id}: {e}")
   #          if (idx + 1) % 1000 == 0:
   #             print(f"Processed {idx + 1} nodes for {node_name}")

def add_embedding(conn, gpt_embedding_client, node_name, text, id):
    embedding = compute_embedding(text, gpt_embedding_client)
    update_query = """
    UPDATE movies_graph._ag_label_vertex
    SET embedding_vector = %s
    WHERE 'id' = %s
    """
    with conn.cursor() as cursor:
         cursor.execute(update_query, (embedding, id))
    conn.commit()



In [8]:
compute_embedding("hello", gpt_embedding_client)

'[-0.02762768603861332, -0.008475443348288536, 0.0044931150041520596, 0.006744192913174629, 0.005727139767259359, 0.0002874587953556329, -0.007182655856013298, 0.05894388630986214, 0.010821446776390076, 0.0635002851486206, -0.004945138934999704, -0.023830687627196312, -0.024282710626721382, -0.011653169989585876, -0.020033689215779305, 0.02265542559325695, 0.02703101560473442, 0.0066899498924613, 0.016932806000113487, -0.01339798141270876, 0.011933424510061741, -0.009121837094426155, 0.0012532356195151806, 0.03938030079007149, 0.011273469775915146, 0.035077035427093506, -0.023396745324134827, -0.019382774829864502, -0.05540001764893532, -0.03789766505360603, 0.0009842815343290567, 0.029544265940785408, 0.0021437222603708506, 0.02144400216639042, 0.03992272913455963, 0.008018899708986282, 0.02985164150595665, 0.004945138934999704, -0.02679596282541752, 0.025168677791953087, 0.030014371499419212, 0.009790832176804543, -0.027464957907795906, -0.0023460027296096087, -0.0018092247191816568,

In [15]:
add_embeddings(conn=conn, gpt_embedding_client=gpt_embedding_client, node_name="Movie", text_column_name="combined_text")

8551 Movie nodes need embedding.
Got embedding: [-0.033203255385160446,0.021531369537115097,-0.011526894755661488,0.024170229211449623,-0.02753405272960663,0.016804616898298264,-0.022894296795129776,-0.0034616931807249784,-0.037669021636247635,0.024025237187743187,0.004647005815058947,-0.03149234503507614,-0.023532263934612274,-0.005049359519034624,-0.031028369441628456,-0.0039002951234579086,-0.011410900391638279,0.017732568085193634,0.03334824740886688,-0.04375870153307915,0.04361370950937271,0.015775172039866447,0.03532014414668083,0.005973685998469591,-0.03479817137122154,0.027041079476475716,-0.022357825189828873,0.014673229306936264,0.001603978220373392,-0.01731209084391594,0.05245824530720711,-0.02145887166261673,0.015644678846001625,-0.04303373768925667,-0.030622391030192375,0.020951399579644203,-0.00585769210010767,-0.012309852987527847,0.010388704016804695,0.027403559535741806,-0.03056439384818077,-0.02535916678607464,-0.019023001194000244,0.019138993695378304,0.0239382423460

InternalError_: typecast 'vector' not supported
LINE 2:     SELECT * FROM cypher('movies_graph', $$
                                                  ^


In [None]:
add_embeddings(conn=conn, gpt_embedding_client=gpt_embedding_client, node_name="Character", text_column_name="name")

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [None]:
update_movie_embeddings(conn)
for trait in ["Character", "Genre", "Setting", "Theme", "Series"]:
    update_trait_embeddings(conn, trait)