In [None]:
from dotenv import load_dotenv

# Load settings from a local .env file before any client below is created
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()

# Example Use Case: RAG Q/A System with Langchain

Based on [this tutorial](https://python.langchain.com/docs/tutorials/pdf_qa/) from LangChain.


In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Path is relative to the notebook's working directory.
file_path = "./nike-10k-2023.pdf"
loader = PyPDFLoader(file_path)

# Parse the PDF into LangChain documents.
docs = loader.load()

# Sanity check: how many documents the loader produced.
print(len(docs))

In [None]:
from langchain_openai import ChatOpenAI

# Chat model shared by every RAG chain built in this notebook.
llm = ChatOpenAI(model="gpt-4o")

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks (sizes are presumably
# measured in characters — confirm against the splitter's documentation).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
# Embed every chunk with OpenAI embeddings and keep the vectors in memory.
vectorstore = InMemoryVectorStore.from_documents(
    documents=splits, embedding=OpenAIEmbeddings()
)

# Retriever view of the store; the RAG chains use it to fetch context.
retriever = vectorstore.as_retriever()

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message for the question-answering chain. The retrieved documents
# are substituted into the `{context}` placeholder at the end.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Chat template: the system instructions plus the user's question (`{input}`).
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


# Chain that hands the retrieved documents to the LLM through `prompt`.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: `retriever` first, then the question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

results = rag_chain.invoke({"input": "What was Nike's revenue in 2023?"})

# Last expression of the cell, so the notebook displays it.
results

# Optimizing with Weavel


## Initializing Weavel


In [None]:
import os
from weavel import Weavel

# NOTE(review): test mode and a localhost base URL look like local-development
# settings — confirm the intended values before sharing this notebook.
os.environ["WEAVEL_TESTMODE"] = "true"
wv = Weavel(
    base_url="http://localhost:8000"
)

## Base Prompt to Optimize


In [None]:
from ape.common import Prompt

# Starting point for the optimization: the same system instructions as the
# LangChain chain above, with the user's question passed through `{input}`.
base_prompt = Prompt(
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": "{input}"},
    ],
    model="gpt-4o",
)

## Setting up the Generator

A Generator can be implemented by subclassing `BaseGenerator` and implementing the `generate` method.  
Given a `Prompt` and `inputs`, the `generate` method should run your LLM logic using them and return the output as a string or a dictionary.


In [None]:
from typing import Any, Dict, List, Optional

from typing_extensions import TypedDict
from langchain_core.documents import Document
from ape.common import Prompt, BaseGenerator

# Shape of what the generator returns: the text of every retrieved chunk
# under "context" and the model's reply under "answer".
AgentResponse = TypedDict(
    "AgentResponse",
    {
        "context": List[str],
        "answer": str,
    },
)

class LangchainAgentGenerator(BaseGenerator):
    """Generator that answers a question with the notebook's LangChain RAG chain.

    A new chain is built on every call because each call may receive a
    different ``prompt``; the module-level ``llm`` and ``retriever`` are
    reused as they are.
    """

    def generate(
        self,
        prompt: Prompt,
        inputs: Optional[Dict[str, Any]] = None,
    ) -> AgentResponse:
        """Run the RAG chain with ``prompt`` and return the context and answer.

        Args:
            prompt: Prompt whose formatted messages become the chat template.
            inputs: Mapping holding the question under ``"input"``. Defaults
                to an empty mapping.

        Returns:
            The text of the retrieved documents under ``"context"`` and the
            chain's answer under ``"answer"``.

        Raises:
            KeyError: If ``inputs`` has no ``"input"`` entry.
        """
        # ``None`` replaces the former ``{}`` default, which was one dict
        # object shared by every call.
        if inputs is None:
            inputs = {}

        # Convert the prompt's role/content messages into a LangChain template.
        lc_prompt = ChatPromptTemplate.from_messages(
            [(m["role"], m["content"]) for m in prompt.format().messages]
        )
        question_answer_chain = create_stuff_documents_chain(llm, lc_prompt)
        rag_chain = create_retrieval_chain(retriever, question_answer_chain)

        results = rag_chain.invoke({"input": inputs["input"]})

        return AgentResponse(
            context=[doc.page_content for doc in results["context"]],
            answer=results["answer"],
        )


A generator instance is a `functor` (an instance of a callable class), so you can instantiate the class and call the instance like a function.
Weavel uses this internally during prompt optimization.


In [None]:
generator = LangchainAgentGenerator()

# Smoke test: call the generator directly with the base prompt. The call is
# awaited even though `generate` is a plain method — presumably
# BaseGenerator's `__call__` is async; confirm.
res = await generator(
    prompt=base_prompt,
    inputs={"input": "What was Nike's revenue in 2023?"},
)
# Last expression of the cell, so the notebook displays it.
res

## Datasets

In this example, we'll evaluate the RAG system with precision and recall metrics, which don't require a ground truth label.

We'll create a trainset and testset of 20 queries each.


In [None]:
import asyncio
import textwrap
import random

# Prompt that turns one document chunk (`{context}`) into a single user
# query; used below to synthesize the questions for the datasets.
dataset_generator = Prompt(
    messages=[
        {
            "role": "system",
            "content": textwrap.dedent(
            """
            You are an AI assistant tasked with generating user queries.
            Given the following chunk of a document, generate a relevant
            user query that could be answered using the information in
            this chunk. The query should be specific and directly related
            to the content provided.

            Generate a single, concise user query based on this information.
            """
            )
        },
        {
            "role": "user",
            "content": textwrap.dedent(
            """
            Please generate a user query based on the given document chunk.

            Document chunk:
            {context}
            """
            )
        },
    ],
    model="gpt-4o-mini",
)

generated_queries = []

async def _task(context):
    res = await dataset_generator(context=context)
    generated_queries.append(res)

tasks = [
    _task(doc.page_content)
    for doc in random.sample(splits, 40)
]

await asyncio.gather(*tasks)

print(generated_queries)

In [None]:
from ape.common import DatasetItem

def _as_items(queries):
    """Wrap each generated query in a DatasetItem under the "input" key."""
    return [DatasetItem(inputs={"input": text}) for text in queries]


# The first 20 generated queries form the trainset; the rest are the testset.
trainset = _as_items(generated_queries[:20])
testset = _as_items(generated_queries[20:])

## Metrics

We'll use precision and recall metrics in this tutorial.  
A Metric can be implemented by subclassing `BaseMetric` and implementing the `compute` method.


We'll define LLM judge prompts for precision and recall in a `.prompt` file.  
`ape.common.Prompt` objects can load and dump from `.prompt` files.  
Syntax highlighting for `.prompt` files is supported with the [Promptfile Intellisense](https://marketplace.visualstudio.com/items?itemName=Weavel.promptfile-intellisense) extension.


In [None]:
from ape.common import BaseMetric, MetricResult
from pysbd import Segmenter
from pysbd.utils import TextSpan

class RAGMetric(BaseMetric):
    """LLM-judged metric that averages precision and recall of a RAG answer.

    The answer is split into sentences, an LLM prompt breaks those sentences
    into simpler statements, and two judge prompts score the statements
    against the retrieved context. The context returned with the prediction
    serves as the ground truth, so no labelled answer is needed.
    """

    def __init__(self):
        super().__init__()
        # char_span=True makes the segmenter return TextSpan objects.
        self.segmenter = Segmenter(language="en", clean=False, char_span=True)

        # Judge prompts are loaded from .prompt files under ./judge-prompts.
        self.analyze_statements = Prompt.load_file("./judge-prompts/statement-analysis.prompt")
        self.score_precision = Prompt.load_file("./judge-prompts/precision-judge.prompt")
        self.score_recall = Prompt.load_file("./judge-prompts/recall-judge.prompt")

    @staticmethod
    def _mean_verdict(verdicts):
        """Return the mean of the ``"verdict"`` values, or 0 when there are none."""
        if not verdicts:
            return 0
        # Verdicts are presumably 0/1 flags — confirm against the .prompt files.
        return sum(item["verdict"] for item in verdicts) / len(verdicts)

    async def compute(
        self,
        dataset_item: DatasetItem,
        pred: AgentResponse,
    ) -> MetricResult:
        """Score one prediction for the given dataset item.

        Args:
            dataset_item: Item whose ``["inputs"]["input"]`` is the question.
            pred: Generator output with the ``"answer"`` text and the
                retrieved ``"context"`` strings.

        Returns:
            A MetricResult whose score is the mean of precision and recall;
            both component scores are reported in ``trace``.
        """
        spans: List[TextSpan] = self.segmenter.segment(pred["answer"])
        # Only sentences that end with a period are analysed.
        kept_sentences: List[str] = [
            span.sent.strip()
            for span in spans
            if span.sent.strip().endswith(".")
        ]
        numbered_sentences = "\n".join(
            f"{i}:{x}" for i, x in enumerate(kept_sentences)
        )

        res = await self.analyze_statements(
            question=dataset_item["inputs"]["input"],
            answer=pred["answer"],
            sentences=numbered_sentences,
        )
        # Flatten the per-sentence analyses into one list of statements.
        statements = [
            statement
            for analysis in res["analysis"]
            for statement in analysis["simpler_statements"]
        ]

        if not statements:
            # Nothing to judge: skip both judge calls and report zero, the
            # same score used when a judge returns no verdicts.
            return MetricResult(
                score=0.0,
                trace={
                    "precision": 0,
                    "recall": 0,
                },
            )

        ground_truth = "\n\n".join(pred["context"])

        # Both judges receive the same inputs and run concurrently.
        precision_res, recall_res = await asyncio.gather(
            self.score_precision(
                ground_truth=ground_truth,
                statements=statements,
            ),
            self.score_recall(
                ground_truth=ground_truth,
                statements=statements,
            ),
        )

        precision_score = self._mean_verdict(precision_res["answer"])
        recall_score = self._mean_verdict(recall_res["answer"])

        return MetricResult(
            score=(precision_score + recall_score) / 2,
            trace={
                "precision": precision_score,
                "recall": recall_score,
            },
        )



In [None]:
metric = RAGMetric()

await metric(
    dataset_item=trainset[0],
    pred=await generator(
        prompt=base_prompt,
        inputs=trainset[0]["inputs"]
    )
)

## Optimize

`wv.optimize` returns an optimized prompt based on the provided parameters. If you pass in multiple models, it returns the highest-performing prompt and model combination, and one optimized prompt is saved for each model.

An optimization task can take a while to complete, ranging from 5 minutes to around an hour depending on the prompt, metric, and parameters.

You can view the optimized prompts on the [Weavel dashboard](https://app.weavel.ai/).


In [None]:
import logging
from weavel.utils import logger

# Show Weavel's debug-level log messages while the optimization runs.
logger.setLevel(logging.DEBUG)

We currently support the following algorithms: **dspy_mipro**, **few_shot**, **text_gradient**, **optuna**, **expel**

We are currently in the process of benchmarking these algorithms to determine the best for each use case. It will all be open sourced in [ape-core](https://github.com/weavel-ai/Ape).


In [None]:
optimized_prompt = await wv.optimize(
    base_prompt=base_prompt,
    models=["gpt-4o"],
    trainset=trainset,
    metric=metric,
    generator=generator,
)

with open("./optimized-prompt.prompt", "w") as f:
    f.write(optimized_prompt.dump())

## Evaluator

Use `Evaluator` to evaluate a prompt on a testset.


In [None]:
from ape.common import Evaluator

# Evaluator bound to the testset, using the same metric and generator that
# were passed to the optimization.
evaluate = Evaluator(
    testset=testset,
    metric=metric,
    generator=generator,
    display_table=True,
    display_progress=True,
)

In [None]:
# Baseline: evaluate the unoptimized prompt on the testset.
await evaluate(base_prompt)

In [None]:
# Evaluate the optimized prompt on the same testset to compare with the baseline.
await evaluate(optimized_prompt)