# RAG demo level 3

In [1]:
import subprocess
import os
import json

original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    SQL_DATABASE = subprocess.run(['terraform', 'output', '-raw', 'SQL_DATABASE'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    SQL_USERNAME = subprocess.run(['terraform', 'output', '-raw', 'SQL_USERNAME'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    SQL_PASSWORD = subprocess.run(['terraform', 'output', '-raw', 'SQL_PASSWORD'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    SQL_SERVER_FQDN = subprocess.run(['terraform', 'output', '-raw', 'SQL_SERVER_FQDN'], stdout=subprocess.PIPE).stdout.decode('utf-8')

    # Get the embedding model endpoint and key
    model_configurations = subprocess.run(['terraform', 'output', '-raw', 'model_configurations'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]
    gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
    GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
    GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
    gpt_4o_model = model_config["models"]["gpt-4o"]
    GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
    GPT_4O_KEY = gpt_4o_model["key"]

    print(f"Using {SQL_SERVER_FQDN} as the database server")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
    print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
    print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

finally:
    os.chdir(original_dir)

Using sql-graphrag-psbv.database.windows.net as the database server
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o-mini model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o model endpoint


In [2]:
from openai import AzureOpenAI 
import pandas as pd
from mssql_python import connect

# Azure SQL Entra login option
conn_str = f"SERVER={SQL_SERVER_FQDN};DATABASE={SQL_DATABASE};UID=tokubica@microsoft.com;Authentication=ActiveDirectoryInteractive;Encrypt=yes;"

# Azure SQL password login option
# conn_str = f"SERVER={SQL_SERVER_FQDN};DATABASE={SQL_DATABASE};UID={SQL_USERNAME};PWD={SQL_PASSWORD};Encrypt=yes;"

conn = connect(conn_str)

gpt_embedding_client = AzureOpenAI(
    azure_endpoint=EMBEDDINGS_ENDPOINT,
    api_key=EMBEDDINGS_KEY,
    api_version="2025-02-01-preview",
)

gpt_4o_client = AzureOpenAI(  
    azure_endpoint=GPT_4O_ENDPOINT,  
    api_key=GPT_4O_KEY,  
    api_version="2024-05-01-preview",
)

gpt_4o_mini_client = AzureOpenAI(
    azure_endpoint=GPT_4O_MINI_ENDPOINT,  
    api_key=GPT_4O_MINI_KEY,  
    api_version="2024-05-01-preview",
)

Create nodes

In [3]:
command = """
IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Movie]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Movie]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END

IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Character]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Character]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END

IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Theme]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Theme]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END

IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Setting]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Setting]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END

IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Series]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Series]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END

IF NOT EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'[dbo].[Genre]') AND type in (N'U'))
BEGIN
    CREATE TABLE [dbo].[Genre]
    (
        [Id] INT PRIMARY KEY,
        [Name] VARCHAR(200) NOT NULL,
        [Content] VARCHAR(5000) NULL,
        [Embedding] VECTOR(1998) NULL
    ) AS NODE;
END
"""

cursor = conn.cursor()
cursor.execute(command)

Load from file

In [4]:
movies_df = pd.read_json("data/movies_graph.json", orient="records")
print(f"Loaded {len(movies_df)} movies")

Loaded 8551 movies


Embedding function

In [None]:
import time

max_retries = 20
retry_delay = 10

def get_embedding(text):
    for attempt in range(1, max_retries + 1):
        try:
            result = gpt_embedding_client.embeddings.create(
                input=text,
                model="text-embedding-3-large",
                dimensions=1998,
            )
            break
        except Exception as e:
            if hasattr(e, "status_code") and e.status_code == 429:
                time.sleep(retry_delay)
            else:
                print(f"Embedding generation error: {e}")
                raise e
    return json.dumps(result.data[0].embedding)

In [11]:
import concurrent.futures

max_workers = 30

def process_row(row):
    movie_id = str(row['id'])
    movie_name = str(row['title']) if row['title'] is not None else ""
    movie_overview = str(row['overview']) if row['overview'] is not None else ""
    movie_content = f"TITLE: {movie_name} OVERVIEW: {movie_overview}"

    local_cursor = conn.cursor()
    local_cursor.execute("SELECT Embedding FROM [dbo].[Movie] WHERE Id = ?", movie_id)
    result = local_cursor.fetchone()
    if result is not None:   # Movie already exists
        if result[0] is not None:   # Embedding already exists
            return movie_id
        else:  # Embedding does not exist, update it
            embedding = get_embedding(movie_content)

            command = f"""
            UPDATE [dbo].[Movie]
            SET Embedding = '{embedding}'
            WHERE Id = ?
            """

            local_cursor.execute(command, movie_id)
            return movie_id
    else: # Movie does not exist, insert it
        embedding = get_embedding(movie_content)

        command = f"""
        INSERT INTO [dbo].[Movie] ([Id], [Name], [Content], [Embedding])
        VALUES (?, ?, ?, '{embedding}')
        """

        local_cursor.execute(command, movie_id, movie_name, movie_content)
        return movie_id

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {executor.submit(process_row, row): idx for idx, row in movies_df.iterrows()}
    completed = 0
    for future in concurrent.futures.as_completed(futures):
        try:
            _ = future.result()
        except Exception as e:
            print(f"Error processing row: {e}")
        completed += 1
        if completed % 500 == 0:
            print(f"Inserted {completed}/{len(movies_df)} movies")


Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another command
Error processing row: Driver Error: General error; DDBC Error: [Microsoft]Connection is busy with results for another 