# GraphRAG Guide with PostgreSQL

## Loading data, indexing, creating embeddings

Import data CSV data to Pandas DataFrame

In [100]:
import pandas as pd
df = pd.read_csv('./data/movies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,0,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,18.433,8.7,2763
1,1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,2020-07-31,8.439,8.7,1223
2,2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,65.57,8.7,18637
3,3,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,63.277,8.7,14052
4,4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,2020-11-19,26.691,8.7,773


Make sure to deploy Azure infrastructure using ```terraform apply``` command in terraform folder.

In [101]:
import subprocess
import os
import json

original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    PGHOST = subprocess.run(['terraform', 'output', '-raw', 'PGHOST'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGDATABASE = subprocess.run(['terraform', 'output', '-raw', 'PGDATABASE'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGUSER = subprocess.run(['terraform', 'output', '-raw', 'PGUSER'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGPASSWORD = subprocess.run(['terraform', 'output', '-raw', 'PGPASSWORD'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    db_uri = f"postgresql://{PGUSER}:{PGPASSWORD}@{PGHOST}/{PGDATABASE}?sslmode=require"

    # Get the embedding model endpoint and key
    model_configurations = subprocess.run(['terraform', 'output', '-raw', 'model_configurations'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]

    print(f"Using {db_uri} as the database connection string")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")

finally:
    os.chdir(original_dir)

Using postgresql://psqladmin:)ycxlsxlLRKks*g#@psql-graphrag-psbv.postgres.database.azure.com/demo?sslmode=require as the database connection string
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint


In [112]:
import psycopg2
from psycopg2 import sql

conn = psycopg2.connect(db_uri)

### Install and configure extensions

List extensions

In [103]:
command = """
SELECT * FROM pg_extension;
"""

with conn.cursor() as cursor:
    cursor.execute(command)
    result = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

pd.DataFrame(result, columns=columns)


Unnamed: 0,oid,extname,extowner,extnamespace,extrelocatable,extversion,extconfig,extcondition
0,14258,plpgsql,10,11,False,1.0,,
1,24760,vector,10,2200,True,0.7.0,,
2,25080,pg_diskann,10,2200,False,0.3.2,,
3,25099,azure_ai,10,11,False,1.1.0,,


Install extensions

In [104]:
command = """
CREATE EXTENSION IF NOT EXISTS vector;
CREATE EXTENSION IF NOT EXISTS pg_diskann CASCADE;
CREATE EXTENSION IF NOT EXISTS azure_ai;
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

Configure extensions

In [105]:
command = f"""
select azure_ai.set_setting('azure_openai.endpoint','{EMBEDDINGS_ENDPOINT}'); 
select azure_ai.set_setting('azure_openai.subscription_key', '{EMBEDDINGS_KEY}'); 
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

In [106]:
command = f"""
select azure_ai.get_setting('azure_openai.subscription_key');
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        result = cursor.fetchone()
        print(result)
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

('bfe0df31766a4bd7ad5eda631f0c2391',)


Create table movies

In [107]:
command = """
CREATE TABLE IF NOT EXISTS movies (
    id SERIAL PRIMARY KEY,
    title VARCHAR(255),
    overview TEXT,
    combined_text TEXT,
    embedding vector(2000),
    full_text_search tsvector GENERATED ALWAYS AS (to_tsvector('english', combined_text)) STORED
);
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()


In [108]:
command = """
CREATE INDEX IF NOT EXISTS diskann_idx ON movies USING diskann (embedding vector_cosine_ops)
"""

try:
    with conn.cursor() as cursor:
        cursor.execute(command)
        conn.commit()
except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()


Insert data into movies table

In [109]:
import psycopg2.extras

with conn.cursor() as cursor:
    for _, row in df.iterrows():
        combined_text = f"TITLE: {row['title']} OVERVIEW: {row['overview']}"
        cursor.execute(
            """
            INSERT INTO movies (id, title, overview, combined_text)
            VALUES (%s, %s, %s, %s)
            ON CONFLICT (id) DO NOTHING;
            """,
            (row['id'], row['title'], row['overview'], combined_text)
        )
    conn.commit()

In [114]:
command = """
SELECT * FROM movies LIMIT 10;
"""

with conn.cursor() as cursor:
    cursor.execute(command)
    result = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

pd.DataFrame(result, columns=columns)

Unnamed: 0,id,title,overview,combined_text,embedding,full_text_search
0,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",TITLE: Dilwale Dulhania Le Jayenge OVERVIEW: R...,"[-0.033203255,0.02153137,-0.011526895,0.024170...",'adher':38 'baldev':25 'begin':77 'carefre':11...
1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,TITLE: Gabriel's Inferno Part II OVERVIEW: Pro...,,'anoth':61 'arm':63 'back':53 'come':23 'dant'...
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,TITLE: The Shawshank Redemption OVERVIEW: Fram...,,'1940s':9 'account':36 'admir':54 'amor':42 'a...
3,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",TITLE: The Godfather OVERVIEW: Spanning the ye...,,'1945':8 '1955':10 'american':18 'attempt':32 ...
4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,TITLE: Gabriel's Inferno Part III OVERVIEW: Th...,,'adapt':14 'anonym':26 'author':28 'canadian':...
5,872,Singin' in the Rain,"In 1927 Hollywood, a silent film production co...",TITLE: Singin' in the Rain OVERVIEW: In 1927 H...,,'1927':8 'cast':16 'compani':14 'difficult':19...
6,696374,Gabriel's Inferno,An intriguing and sinful exploration of seduct...,TITLE: Gabriel's Inferno OVERVIEW: An intrigui...,,'captiv':22 'earn':41 'escap':31 'explor':10 '...
7,791373,Zack Snyder's Justice League,Determined to ensure Superman's ultimate sacri...,TITLE: Zack Snyder's Justice League OVERVIEW: ...,,'align':21 'approach':40 'bruce':19 'catastrop...
8,399566,Godzilla vs. Kong,"In a time when monsters walk the Earth, humani...",TITLE: Godzilla vs. Kong OVERVIEW: In a time w...,,'age':48 'battl':45 'collid':41 'collis':26 'c...
9,441130,Wolfwalkers,"In a time of superstition and magic, when wolv...",TITLE: Wolfwalkers OVERVIEW: In a time of supe...,,'apprentic':26 'come':28 'demon':16 'destroy':...


Create embeddings

In [115]:
import psycopg2

batch_size = 500  # Number of rows to update in each batch

try:
    with conn.cursor() as cursor:
        # Get the total number of rows to update
        cursor.execute("SELECT COUNT(*) FROM movies WHERE embedding IS NULL;")
        total_rows = cursor.fetchone()[0]
        print(f"Total rows to update: {total_rows}")

        # Update rows in batches
        offset = 0
        while offset < total_rows:
            command = f"""
            UPDATE movies
            SET embedding = azure_openai.create_embeddings('text-embedding-3-large', combined_text, 2000, max_attempts => 5, retry_delay_ms => 500)
            WHERE id IN (
                SELECT id
                FROM movies
                WHERE embedding IS NULL
                LIMIT {batch_size} OFFSET {offset}
            );
            """
            cursor.execute(command)
            conn.commit()
            offset += batch_size
            print(f"Updated {min(offset, total_rows)} of {total_rows} rows")

except psycopg2.Error as e:
    print(f"Error: {e}")
    conn.rollback()

Total rows to update: 8550
Updated 500 of 8550 rows
Updated 1000 of 8550 rows
Updated 1500 of 8550 rows
Updated 2000 of 8550 rows
Updated 2500 of 8550 rows
Updated 3000 of 8550 rows
Updated 3500 of 8550 rows
Updated 4000 of 8550 rows
Updated 4500 of 8550 rows
Updated 5000 of 8550 rows
Updated 5500 of 8550 rows
Updated 6000 of 8550 rows
Updated 6500 of 8550 rows
Updated 7000 of 8550 rows
Updated 7500 of 8550 rows
Updated 8000 of 8550 rows
Updated 8500 of 8550 rows
Updated 8550 of 8550 rows


In [116]:
command = """
SELECT * FROM movies LIMIT 10;
"""

with conn.cursor() as cursor:
    cursor.execute(command)
    result = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]

pd.DataFrame(result, columns=columns)

Unnamed: 0,id,title,overview,combined_text,embedding,full_text_search
0,423612,Never Look Away,"The story of Kurt, a young art student who fal...",TITLE: Never Look Away OVERVIEW: The story of ...,"[0.010460264,0.03285887,-0.019344991,0.0469737...",'ago':64 'alreadi':55 'art':12 'away':4 'boyfr...
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",TITLE: Dilwale Dulhania Le Jayenge OVERVIEW: R...,"[-0.033203255,0.02153137,-0.011526895,0.024170...",'adher':38 'baldev':25 'begin':77 'carefre':11...
2,9900,Grandma's Boy,"Even though he's 35, Alex acts more like he's ...",TITLE: Grandma's Boy OVERVIEW: Even though he'...,"[0.0030696925,0.02457246,-0.022080898,0.021558...",'13':17 '35':10 'act':12 'alex':11 'apart':46 ...
3,78383,Nurse 3-D,"Abby Russell, a beautiful, dedicated nurse wit...","TITLE: Nurse 3-D OVERVIEW: Abby Russell, a bea...","[0.002974399,0.016073475,-0.006025722,0.021817...",'3':3 'abbi':6 'beauti':9 'd':4 'dedic':10 'di...
4,12657,Soul Plane,Following a ridiculously awful flight that lea...,TITLE: Soul Plane OVERVIEW: Following a ridicu...,"[-0.016872132,0.002083541,-0.010495951,-0.0399...",'african':50 'african-american':49 'airlin':24...
5,10216,Species II,"Having just returned from a mission to Mars, C...",TITLE: Species II OVERVIEW: Having just return...,"[-0.014371347,0.0046571563,-0.024638902,-0.017...","'alien':25,45,60 'assassin':52 'baker':49 'bea..."
6,256274,"As Above, So Below",When a team of explorers ventures into the cat...,"TITLE: As Above, So Below OVERVIEW: When a tea...","[-0.045271445,0.0042207493,-0.015132168,-0.014...",'beneath':18 'catacomb':15 'citi':32 'dark':26...
7,10154,Mickey Blue Eyes,An English auctioneer proposes to the daughter...,TITLE: Mickey Blue Eyes OVERVIEW: An English a...,"[-0.019462595,0.012176347,-0.019544097,0.03455...",'ask':25 'auction':8 'blue':3 'certain':21 'da...
8,6538,Charlie Wilson's War,The true story of Texas congressman Charlie Wi...,TITLE: Charlie Wilson's War OVERVIEW: The true...,"[0.035006616,0.010568285,-0.017317668,0.016561...","'afghanistan':19 'assist':24 'charli':2,13 'co..."
9,486131,Shaft,"JJ, aka John Shaft Jr., may be a cyber securit...","TITLE: Shaft OVERVIEW: JJ, aka John Shaft Jr.,...","[0.016566234,0.0061599636,-0.014157039,0.00814...",'absent':41 'agre':54 'aka':5 'behind':25 'bes...


## Querying

In [None]:
question1 = "What movie was about Jayenge?"
question2 = "I have seen all Star Wars movies and would like tips for something similar I can watch next."
question3 = "What is the most common genre in the movies where one of key actors is called Mark?"

### Full-text search only

### Semantic search only

### Hybrid search

### Hybrid search with reranking

### Graph RAG

### Cleanup

In [None]:
# command = "DROP TABLE IF EXISTS movies;"
# try:
#     with conn.cursor() as cursor:
#         cursor.execute(command)
#         conn.commit()
#     print("Cleanup completed: Table 'movies' has been dropped.")
# except psycopg2.Error as e:
#     print(f"Error: {e}")
#     conn.rollback()