### 環境設定

In [2]:
from dotenv import load_dotenv
import os

# .env ファイルを読み込む
path_env="C:\\Users\\Yuichi Katogi\\.env"
load_dotenv(path_env)

# 環境変数を取得
langsmith_tracing_v2 = os.getenv("LANGCHAIN_TRACING_V2")
langsmith_endpoint = os.getenv("LANGSMITH_ENDPOINT")
langsmith_api_key = os.getenv("LANGSMITH_API_KEY")
langsmith_project = os.getenv("LANGSMITH_PROJECT")

# Langsmithがただしく読み込めているか確認。なぜかtracingがNoneだができてる
print(f"LANGSMITH_TRACING_V2: {langsmith_tracing_v2}")
print(f"LANGSMITH_ENDPOINT: {langsmith_endpoint}")
print(f"LANGSMITH_PROJECT: {langsmith_project}")

LANGSMITH_TRACING_V2: None
LANGSMITH_ENDPOINT: https://api.smith.langchain.com
LANGSMITH_PROJECT: agent-book


### 7.4 Ragasによる合成テストデータの生成

In [None]:
#%pip install langchain-core==0.2.30 langchain-openai==0.1.21 langchain-community==0.2.12 GitPython==3.1.43 langchain-chroma==0.1.2 chromadb==0.5.3 ragas==0.1.14 nest-asyncio==1.6.0

#### 検索対象のドキュメントのロード
- Langchainの公式ドキュメントを使用する

In [3]:
from langchain_community.document_loaders import GitLoader


def file_filter(file_path: str) -> bool:
    return file_path.endswith(".mdx")


loader = GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./langchain",
    branch="langchain==0.2.13",
    file_filter=file_filter,
)

documents = loader.load()
print(len(documents))

280


#### Ragas による合成テストデータ生成の実行

In [4]:
for document in documents:
    document.metadata["filename"] = document.metadata["source"]

In [7]:
import nest_asyncio
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

nest_asyncio.apply()

generator = TestsetGenerator.from_langchain(
    generator_llm=ChatOpenAI(model="gpt-4o-mini"),
    critic_llm=ChatOpenAI(model="gpt-4o-mini"),
    embeddings=OpenAIEmbeddings(),
)

testset = generator.generate_with_langchain_docs(
    documents,
    test_size=4,  # 生成するテストデータの数
    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25}, # テストデータの生成に使用するテストセットの分布
)

# テストセットは単純な質問2/4、推論が必要な質問1/4、回答に複数の情報源が必要な質問1/4となるように設定
# gpt-4oだと1分あたりのトークン制限に達してしまった。昔と比べてDocument loaderで参照する情報量が増えている。推論力はやや劣るがgpt-4o-miniを使用する。

Generating: 100%|██████████| 4/4 [00:12<00:00,  3.18s/it]         


In [8]:
testset.to_pandas()

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the purpose of PromptLayer in the cont...,[# PromptLayer\n\n>[PromptLayer](https://docs....,PromptLayer is a platform for prompt engineeri...,simple,[{'source': 'docs\docs\integrations\providers\...,True
1,What features does Telegram Messenger provide ...,[# Telegram\n\n>[Telegram Messenger](https://w...,Telegram Messenger provides features such as o...,simple,[{'source': 'docs\docs\integrations\providers\...,True
2,"How to set up PromptLayer with LangChain, incl...",[# PromptLayer\n\n>[PromptLayer](https://docs....,"To set up PromptLayer with LangChain, you need...",reasoning,[{'source': 'docs\docs\integrations\providers\...,True
3,How does LLM observability boost LangChain sec...,[# PromptLayer\n\n>[PromptLayer](https://docs....,The context does not provide specific informat...,multi_context,[{'source': 'docs\docs\integrations\providers\...,True


#### LangSmith の Dataset の作成
- LangSmithでデータセットを管理する「Dataset」というオブジェクトを作成する

In [9]:
from langsmith import Client

dataset_name = "agent-book"

client = Client()

if client.has_dataset(dataset_name=dataset_name):
    client.delete_dataset(dataset_name=dataset_name)

dataset = client.create_dataset(dataset_name=dataset_name)

#### 合成テストデータの保存

In [10]:
inputs = []
outputs = []
metadatas = []

for testset_record in testset.test_data:
    inputs.append(
        {
            "question": testset_record.question,
        }
    )
    outputs.append(
        {
            "contexts": testset_record.contexts,
            "ground_truth": testset_record.ground_truth,
        }
    )
    metadatas.append(
        {
            "source": testset_record.metadata[0]["source"],
            "evolution_type": testset_record.evolution_type,
        }
    )

In [11]:
client.create_examples(
    inputs=inputs,
    outputs=outputs,
    metadata=metadatas,
    dataset_id=dataset.id,
)

### 7.5. LangSmith と Ragas を使ったオフライン評価の実装

#### カスタムEvaluatorの実装

In [12]:
from typing import Any

from langchain_core.embeddings import Embeddings
from langchain_core.language_models import BaseChatModel
from langsmith.schemas import Example, Run
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM


class RagasMetricEvaluator:
    def __init__(self, metric: Metric, llm: BaseChatModel, embeddings: Embeddings):
        self.metric = metric

        # LLMとEmbeddingsをMetricに設定
        if isinstance(self.metric, MetricWithLLM):
            self.metric.llm = LangchainLLMWrapper(llm)
        if isinstance(self.metric, MetricWithEmbeddings):
            self.metric.embeddings = LangchainEmbeddingsWrapper(embeddings)

    def evaluate(self, run: Run, example: Example) -> dict[str, Any]:
        context_strs = [doc.page_content for doc in run.outputs["contexts"]]

        # Ragasの評価メトリクスのscoreメソッドでスコアを算出
        score = self.metric.score(
            {
                "question": example.inputs["question"],
                "answer": run.outputs["answer"],
                "contexts": context_strs,
                "ground_truth": example.outputs["ground_truth"],
            },
        )
        return {"key": self.metric.name, "score": score}

In [13]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.metrics import answer_relevancy, context_precision

metrics = [context_precision, answer_relevancy]

llm = ChatOpenAI(model="gpt-4o", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

evaluators = [
    RagasMetricEvaluator(metric, llm, embeddings).evaluate
    for metric in metrics
]

#### 推論の関数の実行

In [14]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = Chroma.from_documents(documents, embeddings)

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI

prompt = ChatPromptTemplate.from_template('''\
以下の文脈だけを踏まえて質問に回答してください。

文脈: """
{context}
"""

質問: {question}
''')

model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

retriever = db.as_retriever()

chain = RunnableParallel(
    {
        "question": RunnablePassthrough(),
        "context": retriever,
    }
).assign(answer=prompt | model | StrOutputParser())

In [16]:
def predict(inputs: dict[str, Any]) -> dict[str, Any]:
    question = inputs["question"]
    output = chain.invoke(question)
    return {
        "contexts": output["context"],
        "answer": output["answer"],
    }

#### オフライン評価の実行

In [None]:
from langsmith.evaluation import evaluate

evaluate(
    predict,
    data="agent-book",
    evaluators=evaluators,
)