# RAG demo level 2
In more advanced demonstration we will add hierarchical and graph approaches by extracting metadata, finding and storing relationships between documents and adding summarizations for aggregate questions.

## Step 1 - Extracting metadata

Import data CSV data to Pandas DataFrame

In [1]:
import pandas as pd
df = pd.read_csv('./data/movies.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,0,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,18.433,8.7,2763
1,1,724089,Gabriel's Inferno Part II,Professor Gabriel Emerson finally learns the t...,2020-07-31,8.439,8.7,1223
2,2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,1994-09-23,65.57,8.7,18637
3,3,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,63.277,8.7,14052
4,4,761053,Gabriel's Inferno Part III,The final part of the film adaption of the ero...,2020-11-19,26.691,8.7,773


Make sure to deploy Azure infrastructure using ```terraform apply``` command in terraform folder.

In [2]:
import subprocess
import os
import json

original_dir = os.getcwd()
try:
    # Jump into the terraform directory
    os.chdir('terraform')

    # Get the database connection string
    PGHOST = subprocess.run(['terraform', 'output', '-raw', 'PGHOST'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGDATABASE = subprocess.run(['terraform', 'output', '-raw', 'PGDATABASE'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGUSER = subprocess.run(['terraform', 'output', '-raw', 'PGUSER'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    PGPASSWORD = subprocess.run(['terraform', 'output', '-raw', 'PGPASSWORD'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    db_uri = f"postgresql://{PGUSER}:{PGPASSWORD}@{PGHOST}/{PGDATABASE}?sslmode=require"

    # Get the embedding model endpoint and key
    model_configurations = subprocess.run(['terraform', 'output', '-raw', 'model_configurations'], stdout=subprocess.PIPE).stdout.decode('utf-8')
    model_config = json.loads(model_configurations)
    embedding_model = model_config["models"]["text-embedding-3-large"]
    EMBEDDINGS_ENDPOINT = embedding_model["endpoint"]
    EMBEDDINGS_KEY = embedding_model["key"]
    gpt_4o_mini_model = model_config["models"]["gpt-4o-mini"]
    GPT_4O_MINI_ENDPOINT = gpt_4o_mini_model["endpoint"]
    GPT_4O_MINI_KEY = gpt_4o_mini_model["key"]
    gpt_4o_model = model_config["models"]["gpt-4o"]
    GPT_4O_ENDPOINT = gpt_4o_model["endpoint"]
    GPT_4O_KEY = gpt_4o_model["key"]

    print(f"Using {db_uri} as the database connection string")
    print(f"Using {EMBEDDINGS_ENDPOINT} as the embedding model endpoint")
    print(f"Using {GPT_4O_MINI_ENDPOINT} as the gpt-4o-mini model endpoint")
    print(f"Using {GPT_4O_ENDPOINT} as the gpt-4o model endpoint")

finally:
    os.chdir(original_dir)

Using postgresql://:)ycxlsxlLRKks*g#@/?sslmode=require as the database connection string
Using https://graphrag-psbv.openai.azure.com/ as the embedding model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o-mini model endpoint
Using https://graphrag-psbv.openai.azure.com/ as the gpt-4o model endpoint


Setup LLM connection and define extraction function

In [3]:
from openai import AzureOpenAI 

gpt_4o_client = AzureOpenAI(  
    azure_endpoint=GPT_4O_ENDPOINT,  
    api_key=GPT_4O_KEY,  
    api_version="2024-08-01-preview",
)

gpt_4o_mini_client = AzureOpenAI(
    azure_endpoint=GPT_4O_MINI_ENDPOINT,  
    api_key=GPT_4O_MINI_KEY,  
    api_version="2024-08-01-preview",
)

In [None]:
import jinja2
from MoviesDataClasses import EnhancedMovie
from pydantic import BaseModel, ValidationError

template_loader = jinja2.FileSystemLoader(searchpath="./prompts")
template_env = jinja2.Environment(loader=template_loader)
template_extract = template_env.get_template("extract_from_movie.jinja2")

def extract(title: str, description: str):
    messages = [
        {"role": "user", "content": template_extract.render(title=title, description=description)}
    ]
    try:
        completion = gpt_4o_client.beta.chat.completions.parse(  
            model="gpt-4o",
            messages=messages,
            max_tokens=200,  
            temperature=0.7,
            response_format=EnhancedMovie
        )
        response_content = completion.choices[0].message.parsed
        return response_content
    except Exception as e:
        return EnhancedMovie(
            genres=[],
            characters=[],
            themes=[],
            setting=[],
            series=[],
        )

Extract metadata from movies

In [15]:
import os
import glob
import re
import pandas as pd

start_index = 0
batch_size = 200
results = []
batch_index = start_index // batch_size

# Process rows starting from start_index
for idx in range(start_index, len(df)):
    row = df.iloc[idx]
    try:
        print(f"{idx}: Extracting {row['title']}...")
        movie = extract(row["title"], row["overview"])
        enhanced_row = row.to_dict()
        enhanced_row.update(movie.model_dump())
        results.append(enhanced_row)
    except Exception as e:
        print(f"Error processing row {idx}: {e}")
    
    # Export current batch every batch_size rows
    if ((idx + 1 - start_index) % batch_size) == 0:
        batch_filename = f"data/movies_graph_batch_{batch_index}.json"
        pd.DataFrame(results).to_json(batch_filename, orient="records", indent=2)
        print(f">>> Exported batch {batch_index} up to row {idx}")
        batch_index += 1
        results = []

# Export any remaining movies if they exist
if results:
    batch_filename = f"data/movies_graph_batch_{batch_index}.json"
    pd.DataFrame(results).to_json(batch_filename, orient="records", indent=2)
    print(f">>> Exported final batch {batch_index}")

0: Extracting Dilwale Dulhania Le Jayenge...
1: Extracting Gabriel's Inferno Part II...
2: Extracting The Shawshank Redemption...
3: Extracting The Godfather...
4: Extracting Gabriel's Inferno Part III...
5: Extracting Gabriel's Inferno...
6: Extracting Zack Snyder's Justice League...
7: Extracting Godzilla vs. Kong...
8: Extracting Wolfwalkers...
9: Extracting Schindler's List...
10: Extracting The Godfather: Part II...
11: Extracting Your Name....
12: Extracting Spirited Away...
13: Extracting Parasite...
14: Extracting The Green Mile...
15: Extracting Pulp Fiction...
16: Extracting Hamilton...
17: Extracting Dedicated to my ex...
18: Extracting 12 Angry Men...
19: Extracting My Hero Academia: Heroes Rising...
20: Extracting Rascal Does Not Dream of a Dreaming Girl...
21: Extracting Forrest Gump...
22: Extracting The Dark Knight...
23: Extracting Life Is Beautiful...
24: Extracting The Lord of the Rings: The Return of the King...
25: Extracting KonoSuba: God's Blessing on this Wonder

In [16]:
# Merge all batch files into a single JSON file
batch_files = sorted(
    glob.glob("data/movies_graph_batch_*.json"),
    key=lambda f: int(re.search(r'(\d+)', f).group(1))
)

all_data = []
for file in batch_files:
    df_batch = pd.read_json(file)
    all_data.extend(df_batch.to_dict(orient="records"))
    
merged_df = pd.DataFrame(all_data)
merged_output_file = "data/movies_graph.json"
merged_df.to_json(merged_output_file, orient="records", indent=2)
print(f"Merged all batches into {merged_output_file}")

# Delete individual batch files after a successful merge
# for file in batch_files:
#     os.remove(file)
#     print(f"Deleted {file}")

Merged all batches into data/movies_graph.json
