In [None]:
!uv add ragas langchain-openai langchain-community pandas ipywidgets

## 1.1 下载和加载中文语料库

### 1.1.1 准备中文数据集（比如中国四大名著）

In [None]:
# !git clone https://github.com/tennessine/corpus.git

In [None]:
!tree

### 1.1.2 使用LangChain加载中文文档

In [None]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Directory that contains the Chinese text corpus (Markdown files).
path = "corpus/"
# Decode every file explicitly as UTF-8: without an encoding TextLoader falls back
# to the platform default, which fails on Chinese text under non-UTF-8 locales.
loader = DirectoryLoader(
    path,
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()
# Fail early with a clear message instead of letting test-set generation
# run against an empty corpus later on.
if not docs:
    raise ValueError(f"No Markdown files matched '**/*.md' under {path!r}")

In [None]:
import pandas as pd

# Build one row per document with explicit columns. Passing the Document objects
# straight to pd.DataFrame makes pandas iterate each object into (field, value)
# tuples, which yields an unreadable frame.
df = pd.DataFrame(
    [{**doc.metadata, "page_content": doc.page_content} for doc in docs]
)
df.head()

## 1.2 初始化支持中文的模型

In [None]:
from pydantic_settings import BaseSettings, SettingsConfigDict
from pydantic import SecretStr


class Settings(BaseSettings):
    """Notebook configuration read from environment variables and the ``.env`` file.

    Variable names are matched case-insensitively (``case_sensitive=False``) and
    variables that do not correspond to a field are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env", env_file_encoding="utf-8", extra="ignore", case_sensitive=False
    )
    openai_api_key: SecretStr
    openai_base_url: str
    openai_model: str  # chat model name
    openai_embedding_model: str  # embedding model name
    # Sampling temperature. Declared as float so that fractional values such as
    # 0.2 in .env pass validation. The default of 0 keeps generations as stable
    # as possible; note that it is a temperature, not a random seed.
    temperature: float = 0.0


# Instantiating the settings triggers loading and validation of all required fields.
config = Settings()

In [None]:
# Display the loaded settings as a dict to confirm that .env was picked up.
# NOTE(review): the base URL and model names are shown in plain text — confirm
# the cell output is safe to commit.
config.model_dump()

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI, OpenAIEmbeddings


# Endpoint and credentials shared by the chat client and the embedding client.
_openai_connection = {
    "base_url": config.openai_base_url,
    "api_key": config.openai_api_key,
}

# Chat model used to generate the test set.
llm = ChatOpenAI(
    model=config.openai_model,
    temperature=config.temperature,
    **_openai_connection,
)
# Embedding model served from the same endpoint.
embeddings = OpenAIEmbeddings(
    model=config.openai_embedding_model,
    **_openai_connection,
)

# Wrap the LangChain objects in the adapters that ragas expects.
generator_embedding = LangchainEmbeddingsWrapper(embeddings=embeddings)
generator_llm = LangchainLLMWrapper(llm)

## 1.3 设置中文角色和转换工具
### 1.3.1 定义中文场景的用户角色

In [None]:
from ragas.testset.persona import Persona

# Single persona describing the simulated user who asks the generated questions.
classics_learner = Persona(
    name="中文四大名著学习者",
    role_description=(
        "  一位对中国古典文学四大名著（《红楼梦》、《三国演义》、《水浒传》、《西游记》）"
        "感兴趣的学习者，希望通过查询深入了解这些作品的内容、背景和文学价值。"
    ),
)

personas = [classics_learner]

### 1.3.2 配置中文适用的转换工具（如标题分割、实体提取）

In [None]:
from ragas.testset.transforms.extractors.llm_based import NERExtractor
from ragas.testset.transforms.splitters import HeadlineSplitter

# Transform pipeline applied to the documents, in order: split on headlines,
# then extract named entities with the generator LLM.
# NOTE(review): HeadlineSplitter presumably relies on a "headlines" node property
# produced by a headlines extractor — confirm it works without one in the
# installed ragas version.
headline_splitter = HeadlineSplitter()
entity_extractor = NERExtractor(llm=generator_llm)
transforms = [headline_splitter, entity_extractor]

## 1.4 初始化测试生成器

In [None]:
from ragas.testset import TestsetGenerator

# Test-set generator wired to the wrapped LLM and embedding model, restricted to
# the personas defined earlier in this notebook.
generator = TestsetGenerator(
    llm=generator_llm,
    embedding_model=generator_embedding,
    persona_list=personas,
)

## 1.5 加载查询类型并适配中文
### 1.5.1 定义单跳查询生成器并适配中文

In [None]:
from ragas.testset.synthesizers.single_hop.specific import (
    SingleHopSpecificQuerySynthesizer,
)


distribution = [
    (SingleHopSpecificQuerySynthesizer(llm=generator_llm), 1.0),
]
# 将查询提示词适配为中文
for query, _ in distribution:
    prompts = await query.adapt_prompts(
        "chinese", llm=generator_llm
    )  # 指定目标语言为中文
    query.set_prompts(**prompts)

## 1.6 生成中文测试集
### 1.6.1 基于中文文档生成查询

In [None]:
# Number of Chinese queries to generate.
testset_size = 3

# Generate the synthetic test set from the loaded documents, using the custom
# transform pipeline and the Chinese-adapted query distribution.
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=testset_size,
    transforms=transforms,
    query_distribution=distribution,
)

### 1.6.2 转换为评估数据集并查看结果

In [None]:
# Convert the generated test set into an evaluation dataset.
eval_dataset = dataset.to_evaluation_dataset()

# Print the first generated query together with its reference answer.
first_sample = eval_dataset[0]
print("用户查询:", first_sample.user_input)
print("参考回答:", first_sample.reference)