In [None]:
import os
import textwrap
from typing import cast



import langextract as lx
from IPython.display import HTML
from langextract.data import AnnotatedDocument


In [2]:
# 1. Prompt: the instructions given to the model describing which entity
#    classes to extract and the rules the extractions must follow.
prompt = textwrap.dedent(
    """\
    Extract characters, emotions, and relationships in order of appearance.
    Use exact text for extractions. Do not paraphrase or overlap entities.
    Provide meaningful attributes for each entity to add context."""
)

# 2. One worked example to guide the model: a source passage together with
#    the extractions expected from it, one per entity class named in the prompt.
examples = [
    lx.data.ExampleData(
        text="ROMEO. But soft! What light through yonder window breaks? It is the east, and Juliet is the sun.",
        extractions=[
            lx.data.Extraction(
                extraction_class=entity_class,
                extraction_text=verbatim_span,
                attributes=entity_attributes,
            )
            # Each row is (class label, exact substring of `text`, attributes),
            # listed in the order the spans occur in the passage.
            for entity_class, verbatim_span, entity_attributes in (
                ("character", "ROMEO", {"emotional_state": "wonder"}),
                ("emotion", "But soft!", {"feeling": "gentle awe"}),
                ("relationship", "Juliet is the sun", {"type": "metaphor"}),
            )
        ],
    )
]

In [3]:
# Text the extraction is run on.
input_text = "Lady Juliet gazed longingly at the stars, her heart aching for Romeo"

# Model configuration: Phi-3-mini served through the vLLM provider.
# Alternative model id kept for reference: "vllm:Qwen/Qwen3-4B-Instruct-2507".
config = lx.factory.ModelConfig(
    model_id="vllm:microsoft/Phi-3-mini-4k-instruct",
    provider="VLLMLanguageModel",
    provider_kwargs={
        "gpu_memory_utilization": 0.5,
        "max_model_len": 1024,
        "temperature": 0.8,
        "max_tokens": 1024,
        # Additional vLLM parameters
        "tensor_parallel_size": 2,
        "enforce_eager": True,
        "disable_custom_all_reduce": True,
    },
)

# Instantiate the language model described by `config`.
model = lx.factory.create_model(config)

# Run the extraction on `input_text` using the prompt and the worked example.
# NOTE(review): an earlier comment on this call described fence output and
# debug mode as enabled, but both flags are passed as False -- confirm which
# values are intended.
result = lx.extract(
    text_or_documents=input_text,
    prompt_description=prompt,
    examples=examples,
    model=model,
    fence_output=False,  # fenced output is turned off
    use_schema_constraints=False,
    debug=False,  # debug mode is turned off
)

INFO 09-09 22:55:55 [__init__.py:241] Automatically detected platform cuda.
INFO 09-09 22:55:57 [utils.py:326] non-default args: {'model': 'microsoft/Phi-3-mini-4k-instruct', 'max_model_len': 1024, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.5, 'disable_log_stats': True, 'enforce_eager': True, 'disable_custom_all_reduce': True}
INFO 09-09 22:56:05 [__init__.py:711] Resolved architecture: Phi3ForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-09 22:56:05 [__init__.py:1750] Using max model len 1024
INFO 09-09 22:56:07 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 09-09 22:56:07 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=93769)[0;0m INFO 09-09 22:56:08 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=93769)[0;0m INFO 09-09 22:56:08 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='microsoft/Phi-3-mini-4k-instruct', speculative_config=None, tokenizer='microsoft/Phi-3-mini-4k-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConf

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP1 pid=93793)[0;0m INFO 09-09 22:56:20 [weight_utils.py:312] Time spent downloading weights for microsoft/Phi-3-mini-4k-instruct: 0.646233 seconds
[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP0 pid=93791)[0;0m INFO 09-09 22:56:20 [default_loader.py:262] Loading weights took 0.95 seconds
[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP0 pid=93791)[0;0m INFO 09-09 22:56:20 [gpu_model_runner.py:2007] Model loading took 3.5911 GiB and 3.298806 seconds
[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP1 pid=93793)[0;0m INFO 09-09 22:56:21 [default_loader.py:262] Loading weights took 0.89 seconds
[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP1 pid=93793)[0;0m INFO 09-09 22:56:22 [gpu_model_runner.py:2007] Model loading took 3.5911 GiB and 4.400769 seconds
[1;36m(EngineCore_0 pid=93769)[0;0m [1;36m(VllmWorker TP1 pid=93793)[0;0m INFO 09-09 22:56:25 [gpu_worker.py:276] Available KV

In [4]:
# `result` is already a single AnnotatedDocument, so no indexing is needed.
# `cast` only narrows the static type; it does nothing at runtime.
result = cast(AnnotatedDocument, result)

# Build the visualization and render it in the cell output. The cast to HTML
# is likewise for the type checker only.
html_content = cast(HTML, lx.visualize(result))
display(html_content)

