In [1]:
import os
import textwrap
from typing import cast

# Point Hugging Face downloads at a mirror (for users in China).
# NOTE(review): this is placed before `import langextract`, presumably so the
# setting is already in the environment when Hugging Face code loads — confirm.
os.environ.update({"HF_ENDPOINT": "https://hf-mirror.com"})

import langextract as lx
from IPython.display import HTML
from langextract.data import AnnotatedDocument


In [2]:
# 1. Task description handed to the extractor: three instruction sentences,
#    one per line, with no leading indentation and no trailing newline.
prompt = "\n".join(
    (
        "Extract characters, emotions, and relationships in order of appearance.",
        "Use exact text for extractions. Do not paraphrase or overlap entities.",
        "Provide meaningful attributes for each entity to add context.",
    )
)

# 2. One worked example to guide the model. Each annotation row is
#    (extraction_class, exact span copied from the text, attributes).
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class=kind,
                extraction_text=span,
                attributes=attrs,
            )
            for kind, span, attrs in (
                ("character", "ROMEO", {"emotional_state": "wonder"}),
                ("emotion", "But soft!", {"feeling": "gentle awe"}),
                ("relationship", "Juliet is the sun", {"type": "metaphor"}),
            )
        ],
    )
]

In [4]:
# Sentence the extraction is run on.
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Local vLLM-backed model. Everything in `provider_kwargs` is handed to the
# provider as-is.
# NOTE(review): the captured output of this cell ends with
# "Engine core initialization failed", with the traceback passing through
# vLLM's `_dummy_sampler_run`. The log is truncated, so the root cause is
# unconfirmed; check the full traceback before tuning these values.
config = lx.factory.ModelConfig(
    model_id="vllm:Qwen/Qwen3-4B-Instruct-2507",
    provider="VLLMLanguageModel",
    provider_kwargs={
        "gpu_memory_utilization": 0.5,
        "max_model_len": 1024,
        "temperature": 0.8,
        "max_tokens": 512,
        # Additional vLLM parameters
        "enforce_eager": True,
        "disable_custom_all_reduce": True,
    },
)
model = lx.factory.create_model(config)

# Run the extraction with debug mode on so the model output can be inspected.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model=model,
    fence_output=True,  # fenced output, intended to help the model emit better JSON
    use_schema_constraints=False,
    debug=True,  # debug mode
)

INFO 09-09 16:26:51 [utils.py:326] non-default args: {'model': 'Qwen/Qwen3-4B-Instruct-2507', 'max_model_len': 1024, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}
INFO 09-09 16:26:53 [__init__.py:711] Resolved architecture: Qwen3ForCausalLM
INFO 09-09 16:26:53 [__init__.py:1750] Using max model len 1024
INFO 09-09 16:26:55 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-09 16:26:55 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=70423)[0;0m INFO 09-09 16:26:57 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=70423)[0;0m INFO 09-09 16:26:57 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen3-4B-Instruct-2507', speculative_config=None, tokenizer='Qwen/Qwen3-4B-Instruct-2507', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revisi

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:00<00:01,  1.16it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.96it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.83it/s]
[1;36m(EngineCore_0 pid=70423)[0;0m 


[1;36m(EngineCore_0 pid=70423)[0;0m INFO 09-09 16:27:01 [default_loader.py:262] Loading weights took 1.70 seconds
[1;36m(EngineCore_0 pid=70423)[0;0m INFO 09-09 16:27:01 [gpu_model_runner.py:2007] Model loading took 7.6065 GiB and 3.003432 seconds
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700] EngineCore failed to start.
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700] Traceback (most recent call last):
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700]   File "/home/djj/anaconda3/envs/py311/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2460, in _dummy_sampler_run
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700]     sampler_output = self.sampler(logits=logits,
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700]                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[1;36m(EngineCore_0 pid=70423)[0;0m ERROR 09-09 16:27:03 [core.py:700]   File "/ho

[1;36m(EngineCore_0 pid=70423)[0;0m Process EngineCore_0:
[1;36m(EngineCore_0 pid=70423)[0;0m Traceback (most recent call last):
[1;36m(EngineCore_0 pid=70423)[0;0m   File "/home/djj/anaconda3/envs/py311/lib/python3.11/site-packages/vllm/v1/worker/gpu_model_runner.py", line 2460, in _dummy_sampler_run
[1;36m(EngineCore_0 pid=70423)[0;0m     sampler_output = self.sampler(logits=logits,
[1;36m(EngineCore_0 pid=70423)[0;0m                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^
[1;36m(EngineCore_0 pid=70423)[0;0m   File "/home/djj/anaconda3/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[1;36m(EngineCore_0 pid=70423)[0;0m     return self._call_impl(*args, **kwargs)
[1;36m(EngineCore_0 pid=70423)[0;0m            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[1;36m(EngineCore_0 pid=70423)[0;0m   File "/home/djj/anaconda3/envs/py311/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[1;36m(EngineCore_0 pid=

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

In [None]:
# Smoke test: check that the model can generate plain text at all before
# debugging the extraction pipeline.
test_prompts = ["Hello, how are you?"]
test_results = list(model.infer(test_prompts))
print("模型测试结果:")
# The loop variable is deliberately NOT called `result`: that name holds the
# return value of `lx.extract(...)`, which the visualization cell passes to
# `lx.visualize`. Reusing it here would silently replace it with the last
# batch of model outputs.
for i, prompt_outputs in enumerate(test_results):
    # Each entry is indexed with [0] to take the first output for that prompt.
    print(f"Prompt {i+1}: {prompt_outputs[0].output}")


模型测试结果:
Prompt 1:  I am 23 years old, and I have a passion for something. I am in the middle of a high school year, and I am in the first year of high school. I have a strong sense of discipline and ability to work. I want to go to university and study in the same field as my passion. I have a dream of becoming a doctor. However, I have a lot of obstacles and challenges, and I am struggling to find my passion and my dreams. What should I do?
I am 23, in high school, first year, I have a strong sense of discipline and ability to work. I want to go to university and study in the same field as my passion. I have a dream of becoming a doctor. However, I have a lot of obstacles and challenges, and I am struggling to find my passion and my dreams. What should I do?

I have a dream of becoming a doctor, but I am struggling to find my passion and my dreams. What should I do?

I have a dream of becoming a doctor, but I am struggling to find my passion and my dreams. What should I do?

Hello, my

In [None]:
# Simpler approach: query the model directly and look at its raw output.
print("=== 测试模型是否能生成JSON ===")

# Minimal JSON-generation probe: the prompt spells out the exact JSON the
# model is asked to return.
test_prompt = "\n".join(
    [
        'Extract characters from: "Romeo loves Juliet"',
        "Return ONLY this JSON:",
        '{"extractions": [{"extraction_class": "character", "extraction_text": "Romeo", "attributes": {}}, {"extraction_class": "character", "extraction_text": "Juliet", "attributes": {}}]}',
    ]
)

test_results = list(model.infer([test_prompt]))
print("模型输出:")
# First output of the first (and only) prompt.
print(test_results[0][0].output)
print("\n" + "=" * 50)


In [None]:
# Rebuild the model with more conservative memory settings.
# NOTE(review): besides the memory knobs, this also changes the model id to
# `microsoft/DialoGPT-small`, and rebinds the notebook-level `config` and
# `model` names used by the earlier cells.
print("=== 重新配置模型 ===")

config = lx.factory.ModelConfig(
    model_id="vllm:microsoft/DialoGPT-small",
    provider="VLLMLanguageModel",
    provider_kwargs={
        "gpu_memory_utilization": 0.2,  # sharply lower GPU memory use
        "max_model_len": 256,           # sharply shorter sequence length
        "max_num_seqs": 1,              # fewer concurrent sequences
        "temperature": 0.0,             # greedy decoding
        "max_tokens": 64,               # sharply shorter output
        # Additional vLLM parameters
        "enforce_eager": True,
        "disable_custom_all_reduce": True,
    },
)

print("创建模型...")
model = lx.factory.create_model(config)
print("模型创建成功！")


In [None]:
# `lx.extract` was called with a single string, so its return value is
# treated as one AnnotatedDocument. `cast` is a static-typing hint only and
# performs no runtime conversion or check.
result = cast(AnnotatedDocument, result)

html_content = lx.visualize(result)
# Per the langextract README, `visualize` may hand back either an
# IPython `HTML` object or a plain `str`. A bare string passed to `display`
# shows up as raw markup, so wrap it; `cast(HTML, ...)` would not convert it.
if isinstance(html_content, str):
    html_content = HTML(html_content)

display(html_content)