# RAG demo level 2
In more advanced demonstration we will add hierarchical and graph approaches by extracting metadata, finding and storing relationships between documents and adding summarizations for aggregate questions.

## Step 2 - Storing graph in PostgreSQL using AGE extension

In [None]:
import subprocess
import os
import json

original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    PGHOST = subprocess.run(['terraform', 'output', '-raw', 'PGHOST'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGDATABASE = subprocess.run(['terraform', 'output', '-raw', 'PGDATABASE'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGUSER = subprocess.run(['terraform', 'output', '-raw', 'PGUSER'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGPASSWORD = subprocess.run(['terraform', 'output', '-raw', 'PGPASSWORD'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    db_uri = f"postgresql://{PGUSER}:{PGPASSWORD}@{PGHOST}/{PGDATABASE}?sslmode=require"

    # Get the embedding model endpoint and key
    model_configurations = subprocess.run(['terraform', 'output', '-raw', 'model_configurations'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]
    gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
    GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
    GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
    gpt_4o_model = model_config["models"]["gpt-4o"]
    GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
    GPT_4O_KEY = gpt_4o_model["key"]

    print(f"Using {db_uri} as the database connection string")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
    print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
    print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

finally:
    os.chdir(original_dir)

Create clients for PostgreSQL and OpenAI models

In [36]:
import psycopg2
from psycopg2 import sql
from openai import AzureOpenAI 
import pandas as pd
import age

conn = psycopg2.connect(db_uri)

gpt_4o_client = AzureOpenAI(  
    azure_endpoint=GPT_4O_ENDPOINT,  
    api_key=GPT_4O_KEY,  
    api_version="2024-05-01-preview",
)

gpt_4o_mini_client = AzureOpenAI(
    azure_endpoint=GPT_4O_MINI_ENDPOINT,  
    api_key=GPT_4O_MINI_KEY,  
    api_version="2024-05-01-preview",
)

### Install and configure extensions

List extensions

In [20]:
command = """
SELECT * FROM pg_extension;
"""

with conn.cursor() as cursor:
    cursor.execute(command)
    result = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

pd.DataFrame(result, columns=columns)


Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,14258,plpgsql,10,11,False,1.0,,
1,24762,vector,10,2200,True,0.7.0,,
2,25082,pg_diskann,10,2200,False,0.4.0,,
3,25102,azure_ai,10,11,False,1.1.0,,
4,25184,age,10,25183,False,1.5.0,"[25185, 25197]","[, ]"


In [14]:
command = """
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_diskann CASCADE;
CREATE EXTENSION IF NOT EXISTS azure_ai;
CREATE EXTENSION IF NOT EXISTS age CASCADE;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

In [7]:
command = f"""
select azure_ai.set_setting('azure_openai.endpoint','{EMBEDDINGS_ENDPOINT}'); 
select azure_ai.set_setting('azure_openai.subscription_key', '{EMBEDDINGS_KEY}'); 
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

### Import data

In [37]:
# Enable AGE for this connection

command = """
SET search_path = ag_catalog, "$user", public;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

Load from file

In [25]:
movies_df = pd.read_json("data/test.json", orient="records")
print(f"Loaded {len(movies_df)} movies")

Loaded 3458 movies


Import movies

In [21]:
# Create movies graph

command = """
SELECT create_graph('movies_graph');
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

Error: graph "movies_graph" already exists



In [None]:
for idx, row in movies_df.iterrows():
    movie_id = str(row['id'])
    movie_title = str(row['title']).replace("'", "\\'") if row['title'] is not None else ""
    movie_overview = str(row['overview']).replace("'", "\\'") if row['overview'] is not None else ""

    command = f"""
    SELECT * FROM cypher('movies_graph', $$
        MERGE (m:Movie {{id: '{movie_id}'}})
        SET m.title = '{movie_title}',
            m.overview = '{movie_overview}'
        RETURN m
    $$) as (m agtype);
    """

    try:
        with conn.cursor() as cursor:
            cursor.execute(command)
            conn.commit()
    except Exception as e:
        print(f"Error: {e}")
        conn.rollback()

    if idx % 500 == 0:
        print(f"Inserted {idx+1}/{len(movies_df)} movies")

Insert traits (metadata, communities) into graph

In [41]:
traits = {
    "characters": ("Character", "FEATURES_CHARACTER"),
    "themes": ("Theme", "INCLUDES_THEME"),
    "setting": ("Setting", "SET_IN"),
    "series": ("Series", "PART_OF_SERIES"),
}

for idx, row in movies_df.iterrows():
    movie_id = str(row['id'])
    for trait_attr, (node_label, rel_type) in traits.items():
        trait_values = row.get(trait_attr)
        if not trait_values:
            continue
        for trait in trait_values:
            safe_trait = str(trait).replace("'", "\\'")
            cypher_query = f"""
            SELECT * FROM cypher('movies_graph', $$
                MERGE (g:{node_label} {{name: '{safe_trait}'}})
                WITH g
                MATCH (m:Movie {{id: '{movie_id}'}})
                MERGE (m)-[:{rel_type}]->(g)
                RETURN m, g$$) as (m agtype, g agtype);
            """
            try:
                with conn.cursor() as cursor:
                    cursor.execute(cypher_query)
                    conn.commit()
            except Exception as e:
                print(f"Error processing movie id {movie_id} for {trait_attr} '{trait}': {e}")
                conn.rollback()

    if idx % 200 == 0:
        print(f"Processed genres for {idx+1}/{len(movies_df)} movies")

Processed genres for 1/3458 movies
Processed genres for 201/3458 movies
Processed genres for 401/3458 movies
Processed genres for 601/3458 movies
Processed genres for 801/3458 movies
Processed genres for 1001/3458 movies
Processed genres for 1201/3458 movies
Processed genres for 1401/3458 movies
Processed genres for 1601/3458 movies
Processed genres for 1801/3458 movies
Processed genres for 2001/3458 movies
Processed genres for 2201/3458 movies
Processed genres for 2401/3458 movies
Processed genres for 2601/3458 movies
Processed genres for 2801/3458 movies
Processed genres for 3001/3458 movies
Processed genres for 3201/3458 movies
Processed genres for 3401/3458 movies
