In [None]:
%pip  install llama-index
%pip install llama-index-llms-ollama llama-index-embeddings-ollama

In [2]:
from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

# Global defaults: the chat model (qwen2.5:7b) and the embedding model (bge-m3),
# both served by Ollama.
Settings.llm = Ollama(model="qwen2.5:7b", request_timeout=360.0)
Settings.embed_model = OllamaEmbedding(model_name="bge-m3")

# Separate LLM instance used for the streaming demo below.
# NOTE(review): `streaming=True` is passed through as an extra keyword argument;
# confirm that the Ollama wrapper actually reads it, since the streaming call
# below goes through stream_complete() either way.
custom_llm = Ollama(
    model="qwen2.5:7b",
    request_timeout=120.0,  # avoid the request being cut off by a timeout
    temperature=0.7,        # controls how varied the output is
    streaming=True,
)

# Stream the completion and print each incremental piece as it arrives.
print("测试流式输出:")
token_stream = custom_llm.stream_complete(prompt="你是谁？")
for piece in token_stream:
    # `delta` holds only the newly generated text of this piece.
    print(piece.delta, end="", flush=True)


测试流式输出:
我是Qwen，一个由阿里云开发的语言模型助手。我在这里为了提供帮助和回答问题而设计的，希望能成为你与知识之间的桥梁！如果你有任何问题或需要帮助，请随时告诉我。

In [3]:
# Build a vector index from the files under ./data, persist it, and ask one
# question about the documents.
import os

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

PERSIST_DIR = "storage"

documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents)

# A later cell reloads the index from the "storage" directory, so it has to
# exist after a fresh Restart & Run All. Persist only when the directory is
# missing so an index that was saved earlier is never overwritten.
if not os.path.isdir(PERSIST_DIR):
    index.storage_context.persist(persist_dir=PERSIST_DIR)

query_engine = index.as_query_engine()
response = query_engine.query("请问文章内容是关于什么的？")
print(response)


这篇文章内容主要涉及代理人工智能（Agentic AI）及其在不同领域的应用，如自动化科研论文写作、果园机器人协同作业、医院ICU中的临床决策支持以及企业网络安全响应等。同时提到了多智能体游戏AI和自适应工作流的应用。


## 借助LlamaDebugHandler进行调试与跟踪
* 记录所有的调用过程，一些历史记录，时间，token消耗，输入输出等等

In [4]:
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler

# Debug handler; print_trace_on_end=True prints a trace summary automatically
# once a run has finished.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)

# Wrap the handler in a callback manager so it can be plugged into the
# LlamaIndex callback system.
callback_manager = CallbackManager(handlers=[llama_debug])

In [12]:
# Reload the index from the "storage" directory, then enable tracing globally.
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="storage")

# An embed_model override could optionally be passed to load_index_from_storage;
# it is important to use the same embed_model as the one used to build the index.
index = load_index_from_storage(storage_context)

# Attach the debug callback manager through the global Settings object
# (global rather than per-component tracing).
Settings.callback_manager = callback_manager

Loading llama_index.core.storage.kvstore.simple_kvstore from storage\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from storage\index_store.json.


In [21]:
# Run one query and one chat turn, then walk through the events recorded by
# the debug handler.
query_engine = index.as_query_engine()
# NOTE: this answer is overwritten by the chat response two lines below and is
# never printed.
response = query_engine.query("文档的主要内容是什么?")

chat_engine =  index.as_chat_engine()
response = chat_engine.chat("请问文章内容关于什么？")
print(response)

# Analyse the collected debug data.
print("\n===== 事件时间线 =====")

# Every entry is unpacked below as a (start_event, end_event) pair.
event_pairs = llama_debug.get_event_pairs()

for event_pair in event_pairs:
    start_event, end_event = event_pair
    if not end_event:
        # Skip pairs that have no end event.
        continue

    print(f"{end_event.time} _ {start_event.time}")

    if start_event.event_type != "llm":
        continue

    # For LLM events, report token usage when the payload carries it.
    print("="*10 + "LLM事件" + "="*10)
    # The payload of an event is optional; fall back to an empty dict so a
    # missing payload does not raise AttributeError on `.get`.
    # NOTE(review): confirm that the handler really stores a "token_usage"
    # entry in the payload; if it does not, both counts below stay 0.
    token_usage = (end_event.payload or {}).get("token_usage", {})
    prompt_tokens = token_usage.get("prompt_tokens", 0)
    completion_tokens = token_usage.get("completion_tokens", 0)
    print(f"  Prompt tokens: {prompt_tokens}")
    print(f"  Completion tokens: {completion_tokens}")

**********
Trace: query
    |_query -> 2.670604 seconds
      |_synthesize -> 2.264138 seconds
        |_templating -> 0.0 seconds
        |_llm -> 2.259139 seconds
**********
**********
Trace: chat
    |_agent_step -> 6.26025 seconds
      |_llm -> 0.949324 seconds
      |_function_call -> 0.79055 seconds
        |_query -> 0.789524 seconds
          |_synthesize -> 0.598661 seconds
            |_templating -> 0.0 seconds
            |_llm -> 0.589502 seconds
      |_llm -> 0.875682 seconds
      |_function_call -> 1.734953 seconds
        |_query -> 1.733937 seconds
          |_synthesize -> 1.567575 seconds
            |_templating -> 0.0 seconds
            |_llm -> 1.562576 seconds
      |_llm -> 1.905706 seconds
**********
这篇文章主要探讨了AI代理和生成性人工智能（Agentic AI）的相关领域，包括工具增强的大型语言模型如Toolfive、协同平台Gentopia及Metta GPT等多智能体框架的应用与挑战；还涉及到了Web代理、查询解析与分析以及科研辅助系统等方面的研究。这些研究旨在从基础理论到实际应用的各种探索，以推动下一代AI技术的发展。

===== 事件时间线 =====
06/10/2025, 16:05:37.931706 _ 06/10/2025, 16:05:35.444762
06/10/2025, 16:0

## 借助第三方的跟踪和调试平台

### langfuse
* 可以使用在线平台， 或者在本地搭建平台
* 与上文设置全局 callback_manager 的方式类似，可以直接集成
* https://langfuse.com/docs/integrations/llama-index/get-started

In [25]:
# Install the Langfuse SDK and the OpenInference instrumentation package for LlamaIndex.
%pip install --quiet langfuse openinference-instrumentation-llama-index

Note: you may need to restart the kernel to use updated packages.


In [24]:

# Legacy integration — `langfuse.callback` can no longer be used:
# from langfuse import LlamaIndexCallbackHandler

import os
from getpass import getpass

# SECURITY: API keys must never be committed inside a notebook. The keys that
# used to be hardcoded here should be treated as leaked and rotated in the
# Langfuse project settings. Values already present in the environment are
# used as-is; anything missing is prompted for without being echoed.
for env_name in ("LANGFUSE_SECRET_KEY", "LANGFUSE_PUBLIC_KEY"):
    if not os.environ.get(env_name):
        os.environ[env_name] = getpass(f"{env_name}: ")

# Keep the previous default host, but let an existing environment value win.
os.environ.setdefault("LANGFUSE_HOST", "https://us.cloud.langfuse.com")

from langfuse import get_client

langfuse = get_client()

# Verify the connection before any trace is sent.
if langfuse.auth_check():
    print("Langfuse client is authenticated and ready!")
else:
    print("Authentication failed. Please check your credentials and host.")

Langfuse client is authenticated and ready!


In [29]:
from openinference.instrumentation.llama_index import LlamaIndexInstrumentor

# Initialize LlamaIndex instrumentation exactly once. Re-running this cell used
# to call instrument() again, which only produced the warning
# "Attempting to instrument while already instrumented".
instrumentor = LlamaIndexInstrumentor()
if not instrumentor.is_instrumented_by_opentelemetry:
    instrumentor.instrument()

# Any LlamaIndex-wrapped component can be called inside the span, for example
# custom_llm.complete("Hello, world!"). Here the chat engine created in an
# earlier cell streams its answer.
with langfuse.start_as_current_span(name="llama-index-trace"):
    stream_res = chat_engine.stream_chat("文中关于Ai Agentic的区别与AI agent之间？")
    for chunk in stream_res.response_gen:
        print(chunk, end="", flush=True)

# Flush the Langfuse client once the span has closed.
langfuse.flush()

Attempting to instrument while already instrumented


**********
Trace: chat
    |_agent_step -> 0.475552 seconds
      |_llm -> 0.0 seconds
**********
 文章中提到，AI Agent 和 Agentic AI 的主要区别在于它们处理复杂任务的能力不同。AI Agents 是能够通过自然语言理解和生成来执行特定任务的智能体，但当面对需要持续上下文保持和动态环境适应能力的复杂任务时，单一的 AI Agent 可能会遇到挑战。而 Agentic AI 则是由多个具有特定功能的代理组成的架构，这些代理能够通过共享信息、协作以及在必要时角色互换来共同完成复杂的多步骤任务或目标。这意味着 Agentic AI 能够更好地适应复杂和动态的变化，提供更高的灵活性和可扩展性。

因此，简而言之：
- **AI Agents** 是独立执行任务的个体智能体。
- **Agentic AI** 则是通过多个代理间的协作来应对更为复杂的问题和环境变化。

### deepeval
* 快速结合：https://deepeval.com/integrations/frameworks/llamaindex

In [None]:
# Install deepeval, upgrading it if it is already present (-U).
%pip install --quiet -U deepeval

In [34]:
#!deepeval login 自行在控制台登录

^C


#### 测试用例


In [7]:
# Create one deepeval test case from a single RAG query and score the answer.
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.models import OllamaModel
from deepeval.test_case import LLMTestCase

query_engine = index.as_query_engine()

# Ollama-served model that deepeval uses as the evaluation model.
eval_model = OllamaModel(
        model="qwen2.5:32b"
    )

# An example input to your RAG application
user_input = "What is LlamaIndex?"

# LlamaIndex returns a response object that contains
# both the output string and retrieved nodes
response_object = query_engine.query(user_input)

# Fail with a clear message instead of a later NameError on
# `actual_output` / `retrieval_context` when no response came back.
if response_object is None:
    raise RuntimeError("query_engine.query() returned no response; cannot build a test case")

# Extract the output string and the text of every retrieved node.
actual_output = response_object.response
retrieval_context = [node.get_content() for node in response_object.source_nodes]

# Create a test case and metric as usual
test_case = LLMTestCase(
    input=user_input,
    actual_output=actual_output,
    retrieval_context=retrieval_context
)

# Use the separate evaluation model defined above for the metric.
answer_relevancy_metric = AnswerRelevancyMetric(
    model = eval_model
)

# Evaluate and show the score together with the metric's explanation.
answer_relevancy_metric.measure(test_case)
print(answer_relevancy_metric.score)
print(answer_relevancy_metric.reason)

0.0
The score is 0.00 because the output contains no relevant information about LlamaIndex and includes unrelated discussions on RAG and Tool-Augmented Reasoning, which do not help in defining what LlamaIndex is.


#### 单元测试


In [23]:
import pytest
from deepeval import assert_test
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset, Golden

# The test below calls `.query()`, which is the query-engine interface, so the
# application under test must be a query engine (a chat engine is driven
# through `.chat()` instead).
rag_application = index.as_query_engine()

example_golden = Golden(input="What is Agentic?")

dataset = EvaluationDataset(goldens=[example_golden])

@pytest.mark.parametrize(
    "golden",
    dataset.goldens,
)
def test_rag(golden: Golden):
    """Check that the RAG answer for one golden input is relevant enough.

    Queries ``rag_application`` with ``golden.input``, builds an
    ``LLMTestCase`` from the answer text and the retrieved node contents, and
    asserts it against ``AnswerRelevancyMetric`` with a threshold of 0.5.
    The metric is judged by ``eval_model``, which must already be defined in
    the notebook.
    """
    # LlamaIndex returns a response object that contains
    # both the output string and retrieved nodes
    response_object = rag_application.query(golden.input)

    # Fail the test explicitly rather than hitting a NameError further down
    # when no response object was produced.
    assert response_object is not None, "RAG application returned no response"

    actual_output = response_object.response
    retrieval_context = [node.get_content() for node in response_object.source_nodes]

    test_case = LLMTestCase(
        input=golden.input,
        actual_output=actual_output,
        retrieval_context=retrieval_context
    )
    answer_relevancy_metric = AnswerRelevancyMetric(threshold=0.5, model=eval_model)
    assert_test(test_case, [answer_relevancy_metric])


#### 整合llama_index

In [None]:
# NOTE: the integration code below has problems; use the LLMTestCase-based approach shown above instead.

# from deepeval.integrations.llama_index import DeepEvalFaithfulnessEvaluator

# # An example input to your RAG application
# user_input = "What is LlamaIndex?"

# # LlamaIndex returns a response object that contains
# # both the output string and retrieved nodes
# response_object = rag_application.query(user_input)

# evaluator = DeepEvalFaithfulnessEvaluator()
# evaluation_result = evaluator.evaluate_response(
#     query=user_input, response=response_object
# )
# print(evaluation_result)

#### 远端拉取数据集测试

In [21]:
#!deepeval login
# 建议使用.py命令行运行

^C
