# 环境构建

In [None]:
# 查看当前环境
!echo "当前环境:"
!echo $CONDA_DEFAULT_ENV

In [None]:
# 创建环境
!conda create --name evalscope python=3.11
!conda init
!source ~/.bashrc
!conda activate evalscope
!conda install jupyterlab
!conda install ipykernel
!python -m ipykernel install --user --name evalscope --display-name "evalscope"

In [None]:
# 查看当前环境
!echo "当前环境:"
!echo $CONDA_DEFAULT_ENV

In [None]:
# 安装evalscope和依赖项
!pip install evalscope # 安装 Native backend (默认)
!pip install 'evalscope[opencompass]' # 安装 OpenCompass backend
!pip install 'evalscope[vlmeval]' # 安装 VLMEvalKit backend
!pip install 'evalscope[rag]' # 安装 RAGEval backend
!pip install 'evalscope[perf]' # 安装 模型压测模块 依赖
!pip install 'evalscope[app]' # 安装 可视化 相关依赖

# 性能评估
    采用 EvalScope 专门为 Qwen3 准备的 modelscope/EvalScope-Qwen3-Test 数据集进行评测，
    围绕模型的推理、指令跟随、代理能力和多语言支持等方面进行测试。该数据集包含 mmlu_pro、ifeval、live_code_bench、math_500、aime24 等多个知名评测数据集。（注意：下方评测代码中使用的 dataset_id 为 evalscope/Qwen3-Test-Collection，请确认两者是否指向同一数据集。）
    数据集地址：https://modelscope.cn/datasets/modelscope/EvalScope-Qwen3-Test/summary

### 1.用vllm启动模型
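    !vllm serve ../output/sft_merge --port 8801
    注意：未指定 --served-model-name 时，vLLM 以启动时传入的模型路径（此处为 ../output/sft_merge）作为模型名；若后续请求使用 "Qwen/Qwen3-0.6B" 作为 model，需在启动命令中追加 --served-model-name Qwen/Qwen3-0.6B。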

### 2.访问启动的服务
    Qwen3 系列在 vLLM 中，/v1/chat/completions 接口依赖 chat template；下面使用 /v1/completions 接口做连通性测试。
    注意：请求端口需与启动 vLLM 时的 --port 保持一致（上文启动命令使用 8801，此处示例为 8002）。
    ! curl http://localhost:8002/v1/completions \
      -H "Content-Type: application/json" \
      -d '{
        "model": "Qwen/Qwen3-0.6B",
        "prompt": "Give me a short introduction to large language models.",
        "max_tokens": 128
      }'
### 3.问题
    现象：将 localhost 改为 127.0.0.1 后，服务无法访问：
    curl http://127.0.0.1:8002/v1/completions \
          -H "Content-Type: application/json" \
          -d '{
            "model": "Qwen/Qwen3-0.6B",
            "prompt": "Give me a short introduction to large language models.",
            "max_tokens": 128
          }'
    原因：
        1. curl 会自动读取这些环境变量：
            http_proxy
            https_proxy
            all_proxy
            no_proxy / NO_PROXY
        2.常见配置长这样（尤其是科研/公司/梯子环境）：
            http_proxy=http://127.0.0.1:7890
            https_proxy=http://127.0.0.1:7890
            NO_PROXY=localhost,::1

        3.注意这里：
            ✅ localhost 在 NO_PROXY → 不走代理
            ❌ 127.0.0.1 不在 NO_PROXY → 走代理
        4.于是：
            localhost  → 直连 vLLM → OK
            127.0.0.1 → 走代理 → 代理连不上 → 卡 / 失败
    解决：
        export NO_PROXY=localhost,127.0.0.1
        export no_proxy=localhost,127.0.0.1

In [None]:
from evalscope import TaskConfig, run_task

# Arguments for the 'data_collection' dataset entry.
collection_args = {
    'dataset_id': 'evalscope/Qwen3-Test-Collection',
    # Filter out the thinking content (everything up to '</think>').
    'filters': {'remove_until': '</think>'},
}

# Generation parameters; the sampling values follow the Qwen report recommendations.
generation_settings = {
    # Max number of generated tokens; a large value is suggested to avoid output truncation.
    'max_tokens': 1024,
    'temperature': 0.6,  # sampling temperature
    'top_p': 0.95,       # top-p sampling
    'top_k': 20,         # top-k sampling
    'n': 1,              # number of replies generated per request
}

task_cfg = TaskConfig(
    model='Qwen/Qwen3-0.6B',
    api_url='http://localhost:8801/v1/chat/completions',
    eval_type='openai_api',
    datasets=['data_collection'],
    dataset_args={'data_collection': collection_args},
    eval_batch_size=128,
    generation_config=generation_settings,
    timeout=60000,  # request timeout
    stream=True,    # use streaming output
    limit=100,      # evaluate only 100 samples for a quick test
)

run_task(task_cfg=task_cfg)

2025-12-22 16:11:29 - evalscope - INFO: Args: Task config is provided with TaskConfig type.
2025-12-22 16:11:29 - evalscope - INFO: Running with native backend
2025-12-22 16:11:29 - evalscope - INFO: Dump task config to ./outputs/20251222_161129/configs/task_config_0268b5.yaml
2025-12-22 16:11:29 - evalscope - INFO: {
    "model": "Qwen/Qwen3-0.6B",
    "model_id": "Qwen3-0.6B",
    "model_args": {},
    "model_task": "text_generation",
    "chat_template": null,
    "datasets": [
        "data_collection"
    ],
    "dataset_args": {
        "data_collection": {
            "dataset_id": "evalscope/Qwen3-Test-Collection",
            "filters": {
                "remove_until": "</think>"
            }
        }
    },
    "dataset_dir": "/home/liuxq/.cache/modelscope/hub/datasets",
    "dataset_hub": "modelscope",
    "repeats": 1,
    "generation_config": {
        "timeout": 60000,
        "batch_size": 128,
        "stream": true,
  

#### 查看模型能力评测结果
    执行：  evalscope app
    访问url：http://localhost:7860/

# 压力测试

### 压力测试命令：

    export NO_PROXY=localhost,127.0.0.1
    export no_proxy=localhost,127.0.0.1
    evalscope perf  --url "http://127.0.0.1:8002/v1/chat/completions"  --parallel 5 --model Qwen/Qwen3-0.6B --number 20 --api openai --dataset openqa --stream
    或
    evalscope perf  --url "http://localhost:8002/v1/chat/completions"  --parallel 5 --model Qwen/Qwen3-0.6B --number 20 --api openai --dataset openqa --stream

### 测试结果样例：

    Benchmarking summary:
    +-----------------------------------+----------+
    | Key                               |    Value |
    +===================================+==========+
    | Time taken for tests (s)          |  23.553  |
    +-----------------------------------+----------+
    | Number of concurrency             |   5      |
    +-----------------------------------+----------+
    | Total requests                    |  20      |
    +-----------------------------------+----------+
    | Succeed requests                  |  20      |
    +-----------------------------------+----------+
    | Failed requests                   |   0      |
    +-----------------------------------+----------+
    | Output token throughput (tok/s)   | 558.529  |
    +-----------------------------------+----------+
    | Total token throughput (tok/s)    | 583.366  |
    +-----------------------------------+----------+
    | Request throughput (req/s)        |   0.8492 |
    +-----------------------------------+----------+
    | Average latency (s)               |   5.0502 |
    +-----------------------------------+----------+
    | Average time to first token (s)   |   0.0325 |
    +-----------------------------------+----------+
    | Average time per output token (s) |   0.0077 |
    +-----------------------------------+----------+
    | Average inter-token latency (s)   |   0.0076 |
    +-----------------------------------+----------+
    | Average input tokens per request  |  29.25   |
    +-----------------------------------+----------+
    | Average output tokens per request | 657.75   |
    +-----------------------------------+----------+
    2025-12-22 15:23:25 - evalscope - INFO:
    Percentile results:
    +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
    | Percentiles | TTFT (s) | ITL (s) | TPOT (s) | Latency (s) | Input tokens | Output tokens | Output (tok/s) | Total (tok/s) |
    +-------------+----------+---------+----------+-------------+--------------+---------------+----------------+---------------+
    |     10%     |  0.0221  | 0.0064  |  0.0072  |   2.4394    |      21      |      332      |    114.631     |   120.2604    |