# Synthetic Data Generation Using RAGAS - RAG Evaluation with LangSmith

In [20]:
import os
import getpass

# Snapshot whichever credentials are already present in the process
# environment. A variable that is not set yields None instead of raising.
openai_api_key = os.environ.get("OPENAI_API_KEY")
langchain_api_key = os.environ.get("LANGCHAIN_API_KEY")
ragas_app_token = os.environ.get("RAGAS_APP_TOKEN")

# Switch on the tracing flag for the rest of the session.
os.environ.update(LANGCHAIN_TRACING_V2="true")

In [21]:
from uuid import uuid4

# A short random suffix gives every run of the notebook its own project name.
run_suffix = uuid4().hex[:8]
os.environ["LANGCHAIN_PROJECT"] = f"AIM - SDG - {run_suffix}"

In [22]:
from dotenv import load_dotenv

# Pull variables from a local .env file into os.environ. By default values
# that are already set in the environment are left untouched.
load_dotenv()

# Read the token only *after* load_dotenv(). The value captured earlier in the
# notebook was read before the .env file was loaded, so a token that lives
# only in .env was None there, and assigning None into os.environ raises
# "TypeError: str expected, not NoneType".
ragas_app_token = os.getenv("RAGAS_APP_TOKEN")
if ragas_app_token is None:
    raise RuntimeError(
        "RAGAS_APP_TOKEN is not set. Export it in your shell or add it to the "
        ".env file next to this notebook before running this cell."
    )
# No write-back to os.environ is needed: a non-None result from os.getenv
# means the variable is already present in the environment.

In [23]:
from langchain_community.document_loaders import DirectoryLoader

# Folder holding the source PDFs, relative to the notebook's working directory.
path = "data/"

# Only files matching *.pdf are picked up by the loader.
loader = DirectoryLoader(
    path,
    glob="*.pdf",
)
docs = loader.load()

In [24]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Build the LangChain models first, then wrap them in the adapter classes
# that the testset generator is given.
generator_chat_model = ChatOpenAI(model="gpt-4o")
generator_embedding_model = OpenAIEmbeddings()

generator_llm = LangchainLLMWrapper(generator_chat_model)
generator_embeddings = LangchainEmbeddingsWrapper(generator_embedding_model)

In [25]:
from ragas.testset import TestsetGenerator

# The generator is driven by the wrapped chat model and embeddings built above.
generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)
# Ask for a synthetic testset of size 10 built from the loaded documents.
# NOTE(review): generation is LLM-driven, so the questions presumably differ
# from run to run and every run incurs API cost — confirm before relying on
# specific rows.
dataset = generator.generate_with_langchain_docs(docs, testset_size=10)

Generating personas: 100%|██████████| 2/2 [00:00<00:00,  2.67it/s]                                           
Generating Scenarios: 100%|██████████| 3/3 [00:03<00:00,  1.31s/it]
Generating Samples: 100%|██████████| 12/12 [00:03<00:00,  3.39it/s]


In [26]:
# Render the generated testset as a table for a quick visual check.
dataset.to_pandas()

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,Cud yu pleese explane how Refersion is used in...,"[Subscription attributes id string, max chars=...",Refersion uses the 'created_from_ip' field to ...,single_hop_specifc_query_synthesizer
1,How does the Chargebee API handle subscription...,[updated with a new timestamp in milliseconds ...,The Chargebee API synchronizes subscription da...,single_hop_specifc_query_synthesizer
2,How can curl be used to create a subscription ...,[free period in association with the billing p...,Curl can be used to create a subscription for ...,single_hop_specifc_query_synthesizer
3,What are the prerequisites for backdating a su...,[called backdating the subscription creation a...,Backdating a subscription in Chargebee is allo...,single_hop_specifc_query_synthesizer
4,How does API versioning in Stripe ensure backw...,[<1-hop>\n\ndefines your place in the list. Fo...,API versioning in Stripe ensures backward comp...,multi_hop_abstract_query_synthesizer
5,How does the Stripe API handle authentication ...,"[<1-hop>\n\n2/23/25, 5:51 PM Stripe API Refere...",The Stripe API uses API keys to authenticate r...,multi_hop_abstract_query_synthesizer
6,How does API versioning in Stripe ensure backw...,[<1-hop>\n\ndefines your place in the list. Fo...,API versioning in Stripe ensures backward comp...,multi_hop_abstract_query_synthesizer
7,How does API authentication impact the handlin...,"[<1-hop>\n\n2/23/25, 5:51 PM Stripe API Refere...",API authentication in Stripe's RESTful API is ...,multi_hop_abstract_query_synthesizer
8,How can a developer manage subscription billin...,"[<1-hop>\n\nInput Parameters limit optional, i...",A developer can manage subscription billing cy...,multi_hop_specific_query_synthesizer
9,How does the use of PaymentIntents and idempot...,[<1-hop>\n\nRelated guide: File upload guide E...,The use of PaymentIntents in Stripe's API enha...,multi_hop_specific_query_synthesizer


In [27]:
# Upload the testset; on success the call prints and returns a dashboard URL.
# NOTE(review): presumably authenticated through RAGAS_APP_TOKEN — confirm.
dataset.upload()

Testset uploaded! View at https://app.ragas.io/dashboard/alignment/testset/1f5dfea3-8dba-4477-85ae-1e704c8edcf4


'https://app.ragas.io/dashboard/alignment/testset/1f5dfea3-8dba-4477-85ae-1e704c8edcf4'

In [30]:
from langsmith import Client

# Client() is built with no arguments, so credentials and endpoint must come
# from its defaults — presumably the LANGCHAIN_* environment variables; confirm.
client = Client()

dataset_name = "Billing API v2"

# NOTE(review): this creates a dataset on every run. Re-running with the same
# name presumably fails if that dataset already exists — confirm, and rename
# or remove the old dataset before a second run.
langsmith_dataset = client.create_dataset(
    dataset_name=dataset_name,
    description="Billing API v2"
)

In [31]:
# Push one LangSmith example per testset row: the question is the input, the
# reference answer is the output, and the reference contexts ride along as
# metadata.
testset_df = dataset.to_pandas()

for _, row in testset_df.iterrows():
  client.create_example(
      inputs={"question": row["user_input"]},
      outputs={"answer": row["reference"]},
      metadata={"context": row["reference_contexts"]},
      dataset_id=langsmith_dataset.id,
  )

In [32]:
# Bind the loaded documents to a second name. This is a plain assignment, so
# both names refer to the same list object (no copy is made).
rag_documents = docs

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters for the retrieval corpus.
# NOTE(review): sizes are presumably measured in characters — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 750,
    chunk_overlap = 100
)

# Split from the originally loaded `docs`, not from `rag_documents` itself.
# The previous self-referential form (rag_documents = split(rag_documents))
# made the cell non-idempotent: re-running it, e.g. after changing
# chunk_size, re-split the already-split chunks instead of the source
# documents, so the new settings were applied to stale input.
rag_documents = text_splitter.split_documents(docs)
len(rag_documents)

441

In [36]:
from langchain_openai import OpenAIEmbeddings

# Embedding model for the RAG vector store. The model name is pinned here,
# whereas the testset generator's OpenAIEmbeddings() was built with no
# explicit model argument.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [37]:
from langchain_community.vectorstores import Qdrant

# Embed the chunks and index them in a Qdrant collection.
# NOTE(review): location=":memory:" presumably means nothing is persisted, so
# the index is rebuilt (and every chunk re-embedded) on each run — confirm.
vectorstore = Qdrant.from_documents(
    documents=rag_documents,
    embedding=embeddings,
    location=":memory:",
    collection_name="Billing APIs"
)

In [38]:
# Retriever view over the vector store; search_kwargs requests k=10 per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 10})

In [49]:
def retrieve(state):
  """Look up documents for the question stored in the graph state.

  Args:
    state: Mapping that contains the user's query under the "question" key.

  Returns:
    A dict with a single "context" key holding whatever the module-level
    `retriever` returns for that question.
  """
  question = state["question"]
  return {"context": retriever.invoke(question)}

In [50]:
from langchain.prompts import ChatPromptTemplate

# Prompt text for the answer step. It has two placeholders, {context} and
# {question}, and tells the model to reply "I don't know" when the context
# is insufficient. The leading backslash after the opening quotes keeps the
# string from starting with a newline.
RAG_PROMPT = """\
Given a provided context and question, you must answer the question based only on context.

If you cannot answer the question based on the context - you must say "I don't know".

Context: {context}
Question: {question}
"""

# Template object built from the text above; `generate` fills it in.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [51]:
from langchain_openai import ChatOpenAI

# Chat model invoked by the `generate` step. Note this is "gpt-4o-mini",
# while the testset generator and the evaluator are built with "gpt-4o".
llm = ChatOpenAI(model="gpt-4o-mini")

In [52]:
def generate(state):
  """Answer the question in the graph state using its retrieved documents.

  Args:
    state: Mapping with "question" (the user's query) and "context" (an
      iterable of objects exposing a `page_content` attribute).

  Returns:
    A dict with a single "response" key holding the `content` of the
    module-level `llm`'s reply.
  """
  # Concatenate the document texts, separated by a blank line.
  page_texts = [doc.page_content for doc in state["context"]]
  docs_content = "\n\n".join(page_texts)

  # Fill the prompt template and send the resulting messages to the model.
  messages = rag_prompt.format_messages(
      question=state["question"],
      context=docs_content,
  )
  reply = llm.invoke(messages)
  return {"response": reply.content}

In [53]:
from langgraph.graph import START, StateGraph
from typing_extensions import List, TypedDict
from langchain_core.documents import Document

class State(TypedDict):
  """State dictionary passed between the nodes of the RAG graph."""
  # The user's question; supplied by the caller when the graph is invoked.
  question: str
  # Documents stored by `retrieve` and read by `generate`.
  context: List[Document]
  # Answer text stored by `generate`.
  response: str

In [54]:
# Wire the two steps into a linear pipeline: START -> retrieve -> generate.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")

# Compile the builder into the runnable graph used by the cells below.
graph = graph_builder.compile()

In [55]:
# Smoke-test the compiled graph with a single hand-written question.
response = graph.invoke({"question" : "How does Stripe handle errors?"})

In [56]:
# Display the answer text stored under the "response" key of the result.
response["response"]

'Stripe handles errors by raising exceptions for various reasons, such as failed charges, invalid parameters, authentication issues, and network unavailability. They recommend writing code that gracefully handles all possible API exceptions. The types of errors include api_error, card_error, idempotency_error, and invalid_request_error. Each error type provides specific information that can be used to diagnose and address the issues.'

In [57]:
# Run every synthetic question through the RAG graph and record, on the
# sample itself, both the answer and the text of the documents retrieved.
for test_row in dataset:
  sample = test_row.eval_sample
  response = graph.invoke({"question": sample.user_input})
  sample.response = response["response"]
  sample.retrieved_contexts = [doc.page_content for doc in response["context"]]

In [58]:
# Re-display the testset, which now also carries the response and
# retrieved_contexts values filled in by the loop above.
dataset.to_pandas()

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,synthesizer_name
0,Cud yu pleese explane how Refersion is used in...,"[2/24/25, 7:45 PM\n\nSubscriptions | Chargebee...","[Subscription attributes id string, max chars=...",I don't know.,Refersion uses the 'created_from_ip' field to ...,single_hop_specifc_query_synthesizer
1,How does the Chargebee API handle subscription...,[play_store The object data is synchronized wi...,[updated with a new timestamp in milliseconds ...,The Chargebee API handles subscription data sy...,The Chargebee API synchronizes subscription da...,single_hop_specifc_query_synthesizer
2,How can curl be used to create a subscription ...,"[2/24/25, 7:45 PM\n\nSubscriptions | Chargebee...",[free period in association with the billing p...,To create a subscription for items in Chargebe...,Curl can be used to create a subscription for ...,single_hop_specifc_query_synthesizer
3,What are the prerequisites for backdating a su...,[Backdating must be enabled for subscription r...,[called backdating the subscription creation a...,The prerequisites for backdating a subscriptio...,Backdating a subscription in Chargebee is allo...,single_hop_specifc_query_synthesizer
4,How does API versioning in Stripe ensure backw...,[01-27.acacia. For information on all API vers...,[<1-hop>\n\ndefines your place in the list. Fo...,API versioning in Stripe ensures backward comp...,API versioning in Stripe ensures backward comp...,multi_hop_abstract_query_synthesizer
5,How does the Stripe API handle authentication ...,[Server Errors\n\nSomething went wrong on Stri...,"[<1-hop>\n\n2/23/25, 5:51 PM Stripe API Refere...",The Stripe API handles authentication errors b...,The Stripe API uses API keys to authenticate r...,multi_hop_abstract_query_synthesizer
6,How does API versioning in Stripe ensure backw...,[01-27.acacia. For information on all API vers...,[<1-hop>\n\ndefines your place in the list. Fo...,API versioning in Stripe ensures backward comp...,API versioning in Stripe ensures backward comp...,multi_hop_abstract_query_synthesizer
7,How does API authentication impact the handlin...,[Server Errors\n\nSomething went wrong on Stri...,"[<1-hop>\n\n2/23/25, 5:51 PM Stripe API Refere...",I don't know.,API authentication in Stripe's RESTful API is ...,multi_hop_abstract_query_synthesizer
8,How can a developer manage subscription billin...,"[2/24/25, 7:45 PM\n\nSubscriptions | Chargebee...","[<1-hop>\n\nInput Parameters limit optional, i...",I don't know.,A developer can manage subscription billing cy...,multi_hop_specific_query_synthesizer
9,How does the use of PaymentIntents and idempot...,[Accept all\n\nhttps://docs.stripe.com/api?lan...,[<1-hop>\n\nRelated guide: File upload guide E...,The use of PaymentIntents in Stripe's API guid...,The use of PaymentIntents in Stripe's API enha...,multi_hop_specific_query_synthesizer


In [59]:
from ragas import EvaluationDataset

# Go through a DataFrame so that the responses and retrieved contexts written
# onto the samples become columns of the evaluation dataset.
testset_frame = dataset.to_pandas()
evaluation_dataset = EvaluationDataset.from_pandas(testset_frame)

In [60]:
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper

# Judge model handed to evaluate(). ChatOpenAI is not imported in this cell;
# it relies on the import made in an earlier cell.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))

In [61]:
from ragas.metrics import (
    LLMContextRecall,
    Faithfulness,
    FactualCorrectness,
    ResponseRelevancy,
    ContextEntityRecall,
    NoiseSensitivity,
)
from ragas import evaluate, RunConfig

# Raise the timeout for the evaluation run.
custom_run_config = RunConfig(timeout=360)

# One instance of each metric, all with default settings.
evaluation_metrics = [
    LLMContextRecall(),
    Faithfulness(),
    FactualCorrectness(),
    ResponseRelevancy(),
    ContextEntityRecall(),
    NoiseSensitivity(),
]

# NOTE(review): the recorded run logged "TypeError(ufunc 'invert' not
# supported ...)" for jobs 2, 44 and 50. With six metrics per row these
# presumably map to FactualCorrectness on the rows whose response was
# "I don't know." — confirm, and check whether those rows are dropped from
# the aggregate score.
result = evaluate(
    dataset=evaluation_dataset,
    metrics=evaluation_metrics,
    llm=evaluator_llm,
    run_config=custom_run_config,
)
result

Evaluating:  17%|█▋        | 12/72 [00:05<00:24,  2.44it/s]Exception raised in Job[2]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  56%|█████▌    | 40/72 [00:26<00:21,  1.52it/s]Exception raised in Job[44]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating:  67%|██████▋   | 48/72 [00:36<00:32,  1.35s/it]Exception raised in Job[50]: TypeError(ufunc 'invert' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe'')
Evaluating: 100%|██████████| 72/72 [03:27<00:00,  2.89s/it]


{'context_recall': 0.8000, 'faithfulness': 0.6648, 'factual_correctness': 0.7589, 'answer_relevancy': 0.7297, 'context_entity_recall': 0.3265, 'noise_sensitivity_relevant': 0.2287}