# RAG demo level 2
In this more advanced demonstration, we add hierarchical and graph approaches: extracting metadata, finding and storing relationships between documents, and adding summaries to answer aggregate questions.

## Step 3 - Adding LLM summarizations to entities (communities)

In [7]:
import subprocess
import os
import json
from urllib.parse import quote


def _terraform_output(name):
    """Return the value of Terraform output `name` as text.

    Runs `terraform output -raw <name>` in the current working directory.

    Raises:
        subprocess.CalledProcessError: if terraform exits with a non-zero
            status (e.g. the output does not exist), instead of silently
            returning an empty string.
    """
    completed = subprocess.run(
        ['terraform', 'output', '-raw', name],
        stdout=subprocess.PIPE,
        check=True,
    )
    return completed.stdout.decode('utf-8')


original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    PGHOST = _terraform_output('PGHOST')
    PGDATABASE = _terraform_output('PGDATABASE')
    PGUSER = _terraform_output('PGUSER')
    PGPASSWORD = _terraform_output('PGPASSWORD')
    # Percent-encode the credentials: characters such as '@', ':', '/' or '#'
    # in a generated password would otherwise corrupt the connection URI.
    db_uri = (
        f"postgresql://{quote(PGUSER, safe='')}:{quote(PGPASSWORD, safe='')}"
        f"@{PGHOST}/{PGDATABASE}?sslmode=require"
    )

    # Get the embedding model endpoint and key
    model_configurations = _terraform_output('model_configurations')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]
    gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
    GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
    GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
    gpt_4o_model = model_config["models"]["gpt-4o"]
    GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
    GPT_4O_KEY = gpt_4o_model["key"]

    # Never echo the password: cell output is saved with the notebook.
    print(f"Using postgresql://{PGUSER}:***@{PGHOST}/{PGDATABASE}?sslmode=require as the database connection string")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
    print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
    print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

finally:
    # Always restore the working directory, even if terraform or JSON parsing fails
    os.chdir(original_dir)

Using postgresql://psqladmin:***@psql-graphrag-psbv.postgres.database.azure.com/demo?sslmode=require as the database connection string
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o-mini model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o model endpoint


Create clients for PostgreSQL and OpenAI models

In [13]:
import psycopg2
from psycopg2 import sql
from openai import AzureOpenAI 
import pandas as pd
import age

# PostgreSQL connection used by the rest of the notebook
conn = psycopg2.connect(db_uri)

# One Azure OpenAI client per deployment; both use the same API version.
gpt_4o_client, gpt_4o_mini_client = (
    AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-05-01-preview",
    )
    for endpoint, key in (
        (GPT_4O_ENDPOINT, GPT_4O_KEY),
        (GPT_4O_MINI_ENDPOINT, GPT_4O_MINI_KEY),
    )
)

In [14]:
# Enable AGE for this connection by putting ag_catalog first on the search path

command = """
SET search_path = ag_catalog, "$user", public;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
    # Persist the setting for subsequent statements on this connection
    conn.commit()
except psycopg2.Error as exc:
    # Report the failure and reset the transaction so the connection stays usable
    print(f"Error: {exc}")
    conn.rollback()

In [10]:
import os
import psycopg2
import jinja2
import json
import tiktoken
from concurrent.futures import ThreadPoolExecutor, as_completed

def get_distinct_trait(conn, trait):
    """Return the distinct `name` values of every `trait` vertex in 'movies_graph'.

    Args:
        conn: database connection providing `cursor()` as a context manager.
        trait: vertex label to match (interpolated directly into the Cypher text).

    Returns:
        list: the first column of each returned row, in result order.
    """
    cypher_sql = f"""
    SELECT * FROM cypher('movies_graph', $$
        MATCH (t:{trait})
        RETURN DISTINCT t.name as name
    $$) as (name text);
    """
    names = []
    with conn.cursor() as cur:
        cur.execute(cypher_sql)
        for record in cur.fetchall():
            names.append(record[0])
    return names

def get_distinct_unprocessed_trait(conn, trait):
    """Return distinct names of `trait` vertices that have no `summary` property yet.

    Args:
        conn: database connection providing `cursor()` as a context manager.
        trait: vertex label to match (interpolated directly into the Cypher text).

    Returns:
        list: the first column of each returned row, in result order.
    """
    pending_sql = f"""
    SELECT * FROM cypher('movies_graph', $$
        MATCH (t:{trait})
        WHERE NOT EXISTS(t.summary)
        RETURN DISTINCT t.name as name
    $$) as (name text);
    """
    with conn.cursor() as cur:
        cur.execute(pending_sql)
        fetched = cur.fetchall()

    pending_names = []
    for record in fetched:
        pending_names.append(record[0])
    return pending_names

def get_movies_by_trait(conn, trait, trait_name, edge_name):
    """Return the `combined_text` of every Movie linked to the named trait vertex.

    Args:
        conn: database connection providing `cursor()` as a context manager.
        trait: vertex label of the trait (interpolated into the Cypher text).
        trait_name: value of the trait's `name` property; embedded as an
            escaped Cypher string literal.
        edge_name: relationship type from Movie to the trait vertex
            (interpolated into the Cypher text).

    Returns:
        list: non-empty `combined_text` values, in result order.

    Raises:
        ValueError: if `trait_name` contains `$$`, which would terminate the
            dollar-quoted Cypher body and cannot be escaped inside it.
    """
    if "$$" in trait_name:
        raise ValueError(f"trait_name must not contain '$$': {trait_name!r}")
    # Escape backslashes BEFORE quotes; otherwise a name ending in a backslash
    # would swallow the closing quote of the literal.
    safe_trait_name = trait_name.replace("\\", "\\\\").replace("'", "\\'")
    query = f"""
    SELECT * FROM cypher('movies_graph', $$
        MATCH (m:Movie)-[:{edge_name}]->(t:{trait} {{name: '{safe_trait_name}'}})
        RETURN m.combined_text
    $$) as (combined_text text);
    """
    with conn.cursor() as cursor:
        cursor.execute(query)
        rows = cursor.fetchall()
    # Drop NULL / empty texts so they do not pollute the prompt
    return [row[0] for row in rows if row[0]]

def cap_tokens(input, max_tokens=128000, encoding='cl100k_base'):
    """Truncate `input` to at most `max_tokens` tokens of the given tiktoken encoding.

    The text is encoded, the token list is cut to `max_tokens`, and the
    result is decoded back to a string.

    Args:
        input: text to cap.
        max_tokens: maximum number of tokens to keep.
        encoding: name of the tiktoken encoding to use.

    Returns:
        str: the decoded, possibly truncated, text.
    """
    codec = tiktoken.get_encoding(encoding)
    # Slicing is a no-op when the text is already within the limit
    kept = codec.encode(input)[:max_tokens]
    return codec.decode(kept)

def get_summary(system_prompt_template, user_prompt_template, trait_name, combined_texts, llm_client, llm_model="gpt-4o"):
    """Ask the chat model for a summary of the texts belonging to one trait.

    Args:
        system_prompt_template: template object whose `render(name=...)`
            produces the system prompt.
        user_prompt_template: template object whose
            `render(combined_texts=...)` produces the user prompt.
        trait_name: name passed to the system prompt template.
        combined_texts: texts passed to the user prompt template.
        llm_client: client exposing `chat.completions.create(...)`.
        llm_model: model / deployment name sent with the request.

    Returns:
        str: the text content of the first completion choice.

    Raises:
        ValueError: if the model returns no choice or no text content
            (for example when the response was filtered), so the caller gets
            a clear error instead of failing later on a `None` summary.
    """
    system_prompt = system_prompt_template.render(name=trait_name)
    user_prompt = user_prompt_template.render(combined_texts=combined_texts)
    # Keep the user prompt under the model's context window
    capped_user_prompt = cap_tokens(user_prompt, max_tokens=120000)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": capped_user_prompt}
    ]
    # Plain text completion: use the stable `create` call. The beta `parse`
    # helper is meant for structured outputs and no response_format is used here.
    completion = llm_client.chat.completions.create(
        model=llm_model,
        messages=messages,
        max_tokens=4000,
        temperature=0.7,
    )
    if not completion.choices:
        raise ValueError(f"Model returned no choices for '{trait_name}'")
    first_choice = completion.choices[0]
    response_content = first_choice.message.content
    if response_content is None:
        finish_reason = getattr(first_choice, "finish_reason", None)
        raise ValueError(
            f"Model returned no text content for '{trait_name}' (finish_reason={finish_reason})"
        )
    return response_content

def store_summary(conn, trait, trait_name, summary):
    """Write `summary` to the `summary` property of the named trait vertex and commit.

    Args:
        conn: database connection providing `cursor()`, `commit()` and `rollback()`.
        trait: vertex label of the trait (interpolated into the Cypher text).
        trait_name: value of the vertex's `name` property.
        summary: text to store.

    Raises:
        ValueError: if `trait_name` or `summary` contains `$$`, which would
            terminate the dollar-quoted Cypher body and cannot be escaped inside it.
        Exception: any database error is re-raised after the transaction has
            been rolled back.
    """
    def _cypher_escape(value, label):
        # `$$` would end the dollar-quoted query text early.
        if "$$" in value:
            raise ValueError(f"{label} must not contain '$$'")
        # Escape backslashes BEFORE quotes so a trailing backslash in the text
        # cannot swallow the closing quote of the literal.
        return value.replace("\\", "\\\\").replace("'", "\\'")

    safe_summary = _cypher_escape(summary, "summary")
    safe_trait_name = _cypher_escape(trait_name, "trait_name")
    query = f"""
    SELECT * FROM cypher('movies_graph', $$
        MATCH (t:{trait} {{name: '{safe_trait_name}'}})
        SET t.summary = '{safe_summary}'
        RETURN t
    $$) as (t agtype);
    """
    try:
        with conn.cursor() as cursor:
            cursor.execute(query)
            cursor.fetchall()  # drain the result set; the rows themselves are not needed
        conn.commit()
    except Exception:
        # Leave the connection usable for the next statement
        conn.rollback()
        raise

def process_single_trait(trait, edge_name, trait_name, system_prompt_template, user_prompt_template, llm_client, db_conn, llm_model):
    """Summarize one trait value and store the result on its vertex.

    Fetches the movie texts linked to `trait_name`, asks the model for a
    summary, and writes that summary back through `db_conn`.

    Returns:
        The `trait_name` that was processed.
    """
    movie_texts = get_movies_by_trait(db_conn, trait, trait_name, edge_name)
    generated = get_summary(
        system_prompt_template=system_prompt_template,
        user_prompt_template=user_prompt_template,
        trait_name=trait_name,
        combined_texts=movie_texts,
        llm_client=llm_client,
        llm_model=llm_model,
    )
    store_summary(db_conn, trait, trait_name, generated)
    return trait_name

def process_trait_parallel(conn, trait, edge_name, system_prompt_template_path, user_prompt_template_path, llm_client, llm_model="gpt-4o", max_workers=400, progress_every=1000):
    """Summarize every not-yet-summarized value of `trait` using a thread pool.

    Loads both prompt templates, lists the trait names that still lack a
    summary, and runs `process_single_trait` for each of them concurrently.
    A failure for one trait name is printed and does not stop the others.

    Args:
        conn: database connection, shared by all worker threads.
        trait: vertex label to process.
        edge_name: relationship type from Movie to the trait vertex.
        system_prompt_template_path: path of the Jinja2 system prompt template.
        user_prompt_template_path: path of the Jinja2 user prompt template.
        llm_client: chat client handed to each worker.
        llm_model: model / deployment name handed to each worker.
        max_workers: maximum number of concurrent worker threads. Lower it
            if the model endpoint starts rejecting requests due to rate limits.
        progress_every: print a progress line after this many completed
            items; a falsy value disables progress lines.

    Returns:
        None
    """
    # Read templates as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(system_prompt_template_path, 'r', encoding='utf-8') as f:
        system_prompt_template = jinja2.Template(f.read())
    with open(user_prompt_template_path, 'r', encoding='utf-8') as f:
        user_prompt_template = jinja2.Template(f.read())
    traits = get_distinct_unprocessed_trait(conn, trait)
    total = len(traits)
    print(f"Processing {total} unprocessed {trait}")
    if total == 0:
        # Nothing to do; skip creating the pool
        return

    # NOTE(review): every worker uses the same `conn`, so all workers share one
    # transaction state; confirm this is acceptable for the driver in use.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(
                process_single_trait,
                trait,
                edge_name,
                trait_name,
                system_prompt_template,
                user_prompt_template,
                llm_client,
                conn,
                llm_model,
            ): trait_name
            for trait_name in traits
        }
        for done_count, future in enumerate(as_completed(futures), start=1):
            trait_name = futures[future]
            try:
                future.result()  # re-raises any exception from the worker
            except Exception as e:
                print(f"Error processing {trait_name}: {e}")
            if progress_every and done_count % progress_every == 0:
                print(f"Processed {done_count} out of {total} of {trait}")


Summarize traits

In [None]:
# Genre summaries are generated with the gpt-4o deployment
process_trait_parallel(
    conn=conn,
    trait="Genre",
    edge_name="IN_GENRE",
    system_prompt_template_path="prompts/summarize_genre.jinja2",
    user_prompt_template_path="prompts/summarize_user.jinja2",
    llm_client=gpt_4o_client,
    llm_model="gpt-4o",
)

In [None]:
# Series summaries are generated with the gpt-4o-mini deployment
process_trait_parallel(
    conn=conn,
    trait="Series",
    edge_name="PART_OF_SERIES",
    system_prompt_template_path="prompts/summarize_series.jinja2",
    user_prompt_template_path="prompts/summarize_user.jinja2",
    llm_client=gpt_4o_mini_client,
    llm_model="gpt-4o-mini",
)

In [None]:
# Character summaries are generated with the gpt-4o-mini deployment
process_trait_parallel(
    conn=conn,
    trait="Character",
    edge_name="FEATURES_CHARACTER",
    system_prompt_template_path="prompts/summarize_character.jinja2",
    user_prompt_template_path="prompts/summarize_user.jinja2",
    llm_client=gpt_4o_mini_client,
    llm_model="gpt-4o-mini",
)

In [None]:
# Theme summaries are generated with the gpt-4o-mini deployment
process_trait_parallel(
    conn=conn,
    trait="Theme",
    edge_name="INCLUDES_THEME",
    system_prompt_template_path="prompts/summarize_theme.jinja2",
    user_prompt_template_path="prompts/summarize_user.jinja2",
    llm_client=gpt_4o_mini_client,
    llm_model="gpt-4o-mini",
)

In [None]:
# Setting summaries are generated with the gpt-4o-mini deployment
process_trait_parallel(
    conn=conn,
    trait="Setting",
    edge_name="SET_IN",
    system_prompt_template_path="prompts/summarize_setting.jinja2",
    user_prompt_template_path="prompts/summarize_user.jinja2",
    llm_client=gpt_4o_mini_client,
    llm_model="gpt-4o-mini",
)