## Synthetic RAG evaluation data using Azure OpenAI

### 1.1 Prepare source documents

In [13]:
from langchain_community.document_loaders import PyPDFLoader

# Load the owner's guide PDF into a list of documents.
# NOTE(review): load_and_split() presumably applies a default text splitter on top of
# per-page loading, so len(pages) may not equal the true page count — confirm
# (loader.load() is the per-page variant). Later cells slice `pages` by index.
file_path = './data/r1t-owner-guide.pdf'
loader = PyPDFLoader(file_path)
pages = loader.load_and_split()

print(f'The PDF has {len(pages)} pages.')


The PDF has 446 pages.


In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Work on a small slice of the loaded documents (indices 70-84) and break it
# into overlapping, character-counted chunks. No embeddings are computed here.
pages_sample = pages[70:85]

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)
chunks = text_splitter.split_documents(pages_sample)

In [15]:
# Show the text of the first chunk as a quick sanity check on the split
first_chunk = chunks[0]
print(first_chunk.page_content)

71OCCUPANCY  DETECTION
To ensure  that  the Occupant  Classification  System  (OCS)  is properly  classifying  the front  seat  occupant,  the 
occupant  must:
  lSit upright  on the center  of the seat  cushion,  with  shoulders  and  back  resting  against  the seat  back,  and  with  
legs  comfortably  extended  on the floor.
  lRemain  seated  on the center  of the seat  cushion  for the duration  of the trip.
The  OCS  may  not classify  the front  seat  occupant  properly  if the occupant  does  any of the following:
  lFails  to sit in an upright  position.
  lLeans  against  the door  or center  console.
  lSits  toward  the front  edge  of the seat.
  lPuts  their  legs  and  feet  on the dashboard.
  lPulls  the seat  belt  all the way  out activating  the Automatic  Locking  Retractor  (ALR).
If an occupant  of adult  size  is sitting  in the front  passenger  seat,  but the Passenger  Airbag  Off indicator  on the


### 1.2 Set up the Azure OpenAI client for question-answer generation

In [16]:
from openai import AzureOpenAI
import os
import dotenv

# Load variables from the local .env file; later cells read them via os.getenv / os.environ.
dotenv.load_dotenv('.env')


True

In [17]:

# Azure OpenAI client; the API key and endpoint are read from environment variables.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version="2024-02-01",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

In [18]:
# System prompt for QA generation: asks for one search-engine-style factoid question
# and its answer, returned as a JSON object with "question" and "answer" keys.
system_prompt = """
Your task is to write a factoid question and an answer as JSON given a context.
Your factoid question should be answerable with a specific, concise piece of factual information from the context.
Your factoid question should be formulated in the same style as questions users could ask in a search engine.
This means that your factoid question MUST NOT mention something like "according to the passage" or "context".

Provide your answer in JSON format as follows:

Output:::
{"question": (your factoid question), "answer": (your answer to the factoid question)}
"""

In [19]:
def call_azure_openai(client, system_prompt, context, model="gpt4-1106"):
    """Request a JSON-formatted chat completion about ``context``.

    Args:
        client: Object exposing ``chat.completions.create`` (e.g. an
            ``AzureOpenAI`` client).
        system_prompt: Instructions sent as the system message.
        context: Text embedded in the user message.
        model: Name of the Azure OpenAI *deployment* to call (not the base
            model name). Defaults to ``"gpt4-1106"``, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The message content of the first choice, or ``None`` when the
        request (or reading the response) raised. The exception is printed
        rather than propagated so one failed call does not abort a batch;
        callers must handle ``None``.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            # JSON mode: the service is asked to return a JSON object
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Here is the context. Context: {context} \n"},
            ],
        )
        return response.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: report and signal failure with None
        print(e)
        return None


### 1.3 Generating Q&A pairs 

In [20]:
import random
import pandas as pd
from tqdm.auto import tqdm
import json

n_samples = 20  # Kept small on purpose: every QA couple costs one Azure OpenAI call
sampling_seed = 42  # Fixed seed so the same chunks are drawn on every run

# random.sample raises ValueError when asked for more items than are available
n_samples = min(n_samples, len(chunks))

print(f"Generating {n_samples} QA couples...")

outputs = []
for sampled_context in tqdm(random.Random(sampling_seed).sample(chunks, n_samples)):
    # Generate QA couple; call_azure_openai returns None when the request failed
    raw_output = call_azure_openai(client, system_prompt, sampled_context.page_content)
    try:
        output_QA_couple = json.loads(raw_output)
        output_QA_couple["context"] = sampled_context.page_content
    except (TypeError, ValueError):
        # TypeError: raw_output is None, or the parsed JSON is not a dict.
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        print("Error in generating QA couple. Skipping...")
        continue
    outputs.append(output_QA_couple)

  from .autonotebook import tqdm as notebook_tqdm


Generating 20 QA couples...


100%|██████████| 20/20 [01:58<00:00,  5.92s/it]


In [21]:
import pandas as pd

# Collect the generated QA records into a DataFrame and preview up to 10 rows
samples_df = pd.DataFrame(data=outputs)
samples_df.head(n=10)

Unnamed: 0,question,answer,context
0,What is the recommended age for children to be...,12 years old and younger,74Children in Rear Seats\nAlthough your ve...
1,At what speed do side mirrors automatically un...,15 mph (25 km/h),81NOTES \n lIn below freezing temperature...
2,Where is the Tire and Loading Information labe...,on the driver's door pillar,85Load Capacity\nThe Tire and Loading Inf...
3,How do you express close a window in a car?,Pull the window switch up past the second notc...,76WINDOWS\nOpen or Close Windows\nOPEN OR C...
4,Where is the indicator light that shows the st...,on the center display,73\nDANGER \n lFollow the proper seating ...
5,What does the Window Locks feature remember on...,The previous setting,switch for that window on the driver's doo...
6,What triggers the side mirrors to automaticall...,The ambient lighting sensors in the rearview m...,the side mirrors unfold automatically when...
7,What should a Rivian owner do if the airbag st...,Contact Rivian immediately to have the airbag ...,Rivian immediately to have the airbag syst...
8,Where should a toll device be placed to avoid ...,On the interior of the windshield behind the r...,79Toll Devices\nTo determine where to place...
9,How do you lock both rear passenger windows us...,"Choose the Vehicle button, then in the Access ...",77Lock or Unlock Passenger Windows\nTo lock...


### 1.4 Saving outputs as a jsonl dataset

In [10]:
# Persist the QA couples as JSON Lines: one record (row) per line
file_path = "./data/qa_couples.jsonl"
samples_df.to_json(path_or_buf=file_path, lines=True, orient="records")

### 1.5 Write documents to Azure AI Search using LangChain

In [5]:
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential


# Azure AI Search connection settings, read from environment variables.
search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")  # None if the variable is unset
search_credential = os.environ["AZURE_SEARCH_ADMIN_KEY"]  # raises KeyError if the variable is unset
index_name = "rivian-owner-guide-recursivesplitter"  # index name passed to the vector store below

In [6]:
from langchain_openai import AzureOpenAIEmbeddings

# Embeddings client for the "text-embedding-ada-002" deployment, authenticated
# with the Azure OpenAI API key read from the environment.
embeddings = AzureOpenAIEmbeddings(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    openai_api_version="2024-02-01",
    azure_deployment="text-embedding-ada-002",
)

In [7]:
from langchain.vectorstores.azuresearch import AzureSearch

# Vector store bound to the Azure AI Search index, using embeddings.embed_query
# as its embedding function.
vector_store = AzureSearch(
    index_name=index_name,
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_credential,
    embedding_function=embeddings.embed_query,
    semantic_configuration_name="default",
)

In [11]:
# Upload every chunk to the vector store, then report how many were submitted
results = vector_store.add_documents(chunks)
print(f"Indexed {len(chunks)} chunks")

Indexed 25 chunks


### 1.6 Test the newly created Azure AI Search index

In [26]:
import random
# Draw one generated QA pair at random from samples_df
random_row = samples_df.sample(n=1)

# The sample holds a single row, so its first entry is the question to test with
test_question = random_row['question'].iloc[0]

print(test_question)

What may cause the OCS to not classify the front seat occupant properly?


In [27]:
# Perform a hybrid search and print the top matches
top_k = 3  # number of results to request and display

docs = vector_store.similarity_search(
    query=test_question,
    k=top_k,
    search_type="hybrid",
)
# The search is already capped at top_k results, so no extra slicing is needed
for doc in docs:
    print("-" * 80)
    # .get avoids a KeyError for documents stored without a 'source' entry
    print(f"Source: {doc.metadata.get('source', 'unknown')}")
    print(f"Chunk Content: {doc.page_content}")

--------------------------------------------------------------------------------
Source: ./data/r1t-owner-guide.pdf
Chunk Content: 71OCCUPANCY  DETECTION
To ensure  that  the Occupant  Classification  System  (OCS)  is properly  classifying  the front  seat  occupant,  the 
occupant  must:
  lSit upright  on the center  of the seat  cushion,  with  shoulders  and  back  resting  against  the seat  back,  and  with  
legs  comfortably  extended  on the floor.
  lRemain  seated  on the center  of the seat  cushion  for the duration  of the trip.
The  OCS  may  not classify  the front  seat  occupant  properly  if the occupant  does  any of the following:
  lFails  to sit in an upright  position.
  lLeans  against  the door  or center  console.
  lSits  toward  the front  edge  of the seat.
  lPuts  their  legs  and  feet  on the dashboard.
  lPulls  the seat  belt  all the way  out activating  the Automatic  Locking  Retractor  (ALR).
If an occupant  of adult  size  is sitting  in the fr