# Day 7 HW

## Define IDP

In [None]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
from docling.pipeline.vlm_pipeline import VlmPipeline


# Host and base path of the remote inference endpoint, written without a URL
# scheme; process_olmocr passes it to olmocr2_vlm_options as hostname_and_port.
vllm_hostname = "ws-01.wade0426.me/v1"
# Model identifier that process_olmocr forwards as the "model" request parameter.
model_name = "allenai/olmOCR-2-7B-1025-FP8"

def process_rapidocr(source_file):
    """
    Convert ``source_file`` to Markdown using docling's PDF pipeline with OCR on.

    The PDF pipeline is configured with ``do_ocr=True``; no OCR engine is
    selected explicitly here, so the engine is whatever docling uses by
    default (the function name suggests RapidOCR -- confirm against the
    installed docling configuration).

    Args:
        source_file: Document source handed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The converted document exported as a Markdown string.
    """
    # OCR-enabled options, attached to the PDF input format only.
    ocr_enabled_options = PdfPipelineOptions(do_ocr=True)
    pdf_format = PdfFormatOption(pipeline_options=ocr_enabled_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    # Run the conversion and export the resulting document as Markdown.
    conversion = converter.convert(source_file)
    markdown = conversion.document.export_to_markdown()
    return markdown


def olmocr2_vlm_options(
    model: str = "allenai/olmOCR-2-7B-1025-FP8",
    hostname_and_port: str = "https://ws-01.wade0426.me/v1/",
    prompt: str = "Convert this page to markdown.",
    max_tokens: int = 4096,
    temperature: float = 0.0,
    api_key: str = "",) -> ApiVlmOptions:
    """
    Build the ``ApiVlmOptions`` used to call an olmOCR chat-completions endpoint.

    Args:
        model: Model identifier sent in the request parameters.
        hostname_and_port: Base address of the API. May be given with or
            without a URL scheme and with or without a trailing slash;
            ``http://`` is assumed when no scheme is present.
        prompt: Instruction sent along with each page.
        max_tokens: Value sent as the ``max_tokens`` request parameter.
        temperature: Sampling temperature passed to the options object.
        api_key: When non-empty, sent as a ``Bearer`` token in the
            ``Authorization`` header.

    Returns:
        An ``ApiVlmOptions`` whose ``url`` is ``<base>/chat/completions`` and
        whose response format is Markdown.
    """
    headers = {}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Normalise the base address. Blindly prefixing "http://" would turn a value
    # that already carries a scheme (such as this function's own default) into
    # "http://https://...", and a trailing slash would produce "//chat/...".
    base_url = hostname_and_port.rstrip("/")
    if "://" not in base_url:
        base_url = f"http://{base_url}"

    options = ApiVlmOptions(
        url=f"{base_url}/chat/completions",
        params=dict(
            model=model,
            max_tokens=max_tokens,
        ),
        headers=headers,
        prompt=prompt,
        timeout=120,  # olmOCR may need a long processing time per page
        scale=2.0,  # image scaling factor
        temperature=temperature,
        response_format=ResponseFormat.MARKDOWN,
    )
    return options

def process_olmocr(source_file):
    """
    Convert ``source_file`` to Markdown through the remote olmOCR VLM pipeline.

    The endpoint and model come from the module-level ``vllm_hostname`` and
    ``model_name`` values.

    Args:
        source_file: Document source handed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The converted document exported as a Markdown string.
    """
    # Configure the VLM pipeline; remote services must be enabled for the
    # pipeline to call an external API.
    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True
    )

    # Attach the olmOCR-specific API options.
    pipeline_options.vlm_options = olmocr2_vlm_options(
        model=model_name,
        hostname_and_port=vllm_hostname,
        prompt="Convert this page to clean, readable markdown format.",
        temperature=0.0,  # a low temperature is recommended for olmOCR
    )

    # Build a converter that routes PDFs through the VLM pipeline.
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
                pipeline_cls=VlmPipeline,
            )
        }
    )

    # Convert the file the caller asked for; a hard-coded sample path here
    # would silently ignore the ``source_file`` argument.
    olm_result = doc_converter.convert(source=source_file)

    return olm_result.document.export_to_markdown()




## Define Qdrant

In [2]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct


# Name of the Qdrant collection used by this notebook.
collection_name = "hwd7"

# Client for the Qdrant instance at localhost:6333.
client = QdrantClient(host="localhost", port=6333)

# Create the collection only on first use so re-running the cell is harmless.
if client.collection_exists(collection_name=collection_name):
    print(f"Collection '{collection_name}' already exists. Skipping creation.")
else:
    # Vector size and distance metric are fixed here; adjust them as needed.
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=100, distance=Distance.COSINE),
    )
    print(f"Collection '{collection_name}' created.")



Collection 'hwd7' created.


## Define LLMGuard

In [25]:
from llm_guard.input_scanners import PromptInjection, Anonymize
from llm_guard.input_scanners.prompt_injection import MatchType

class SimplePDFScanner:
    """Extract a PDF's text with docling and scan it for prompt injection.

    The document is converted to Markdown, split into sections, and each
    section is passed to an llm_guard ``PromptInjection`` scanner.
    """

    # Sections with this many characters or fewer (after stripping) are
    # discarded by ``_split_content`` and therefore never scanned.
    _MIN_SECTION_CHARS = 50

    def __init__(self):
        """Create the document converter and the prompt-injection scanner."""
        self.converter = DocumentConverter()
        # Lowered threshold to increase sensitivity.
        self.scanner = PromptInjection(threshold=0.75, match_type=MatchType.SENTENCE)

    def scan_pdf(self, pdf_path):
        """Scan a PDF file for prompt-injection attacks and print a report.

        Args:
            pdf_path: Document source handed to the converter.

        Returns:
            A dict with the keys ``safe`` (True when no section was flagged),
            ``max_risk_score`` (highest risk score among flagged sections,
            0.0 when none), ``detections`` (number of flagged sections),
            ``content`` (the full extracted Markdown) and ``details`` (one
            dict per flagged section).
        """
        print(f"\n{'='*60}")
        print(f"[*] 正在處理: {pdf_path}")
        print(f"{'='*60}\n")

        # Step 1: extract the PDF content as Markdown.
        print("[1/3] 提取文檔內容中...")
        result = self.converter.convert(pdf_path)
        content = result.document.export_to_markdown()
        print(f"[OK] 提取完成 ({len(content)} 字元)\n")

        # Step 2: scan section by section to improve the detection rate.
        print("[2/3] 掃描 Prompt Injection (分段檢測)...\n")
        sections = self._split_content(content)
        detections = []
        max_risk = 0.0
        total_sections = len(sections)

        for i, section in enumerate(sections, 1):
            # The sanitized text returned by the scanner is not needed here.
            _, is_safe, risk_score = self.scanner.scan(section)

            # Flag hard failures and also "warning" sections whose score is
            # above 0.3 even though the scanner considered them safe.
            if not is_safe or risk_score > 0.3:
                detections.append({
                    'section': i,
                    'risk_score': risk_score,
                    'content': section[:200],
                    'is_safe': is_safe
                })
                max_risk = max(max_risk, risk_score)
                print(f"  [!] 段落 {i}/{total_sections}: 風險分數 {risk_score:.2f} {'(未通過)' if not is_safe else '(警告)'}")

        # Step 3: print the summary.
        print(f"\n{'='*60}")
        print("[3/3] 掃描結果")
        print(f"{'='*60}")
        print(f"總段落數: {total_sections}")
        print(f"可疑段落數: {len(detections)}")
        print(f"最高風險分數: {max_risk:.2f}")
        print(f"安全狀態: {'[PASS] 安全' if len(detections) == 0 else '[FAIL] 偵測到風險'}")

        if detections:
            print("\n--- 可疑內容詳情 ---")
            for det in detections[:5]:  # show at most 5 entries
                print(f"\n[段落 {det['section']}] 風險分數: {det['risk_score']:.2f}")
                print(f"內容預覽: {det['content'][:150]}...")
        else:
            print("\n[INFO] 未發現可疑內容")
            print("內容預覽 (前 300 字):")
            print("-"*60)
            print(content[:300] + "...")

        print(f"\n{'='*60}\n")

        return {
            'safe': len(detections) == 0,
            'max_risk_score': max_risk,
            'detections': len(detections),
            'content': content,
            'details': detections
        }

    def _hard_split(self, text, chunk_size):
        """Cut ``text`` into consecutive pieces of ``chunk_size`` characters.

        A trailing piece too short to survive the minimum-length filter is
        appended to the previous piece so that its text is still scanned.
        """
        pieces = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
        if len(pieces) > 1 and len(pieces[-1].strip()) <= self._MIN_SECTION_CHARS:
            tail = pieces.pop()
            pieces[-1] += tail
        return pieces

    def _split_content(self, content, chunk_size=1000):
        """Split ``content`` into sections to improve the detection rate.

        Paragraphs (separated by blank lines) are greedily merged into
        sections of at most ``chunk_size`` characters. A single paragraph
        longer than ``chunk_size`` is cut at character boundaries instead of
        being kept as one oversized section.

        Args:
            content: Text to split.
            chunk_size: Target maximum number of characters per section.

        Returns:
            A list of stripped sections; sections of ``_MIN_SECTION_CHARS``
            characters or fewer are dropped.
        """
        sections = []
        current_section = ""

        for para in content.split('\n\n'):
            # Hard-split only when the pieces are long enough to pass the
            # final length filter; otherwise keep the paragraph whole.
            if chunk_size > self._MIN_SECTION_CHARS and len(para) > chunk_size:
                if current_section:
                    sections.append(current_section.strip())
                pieces = self._hard_split(para, chunk_size)
                sections.extend(piece.strip() for piece in pieces[:-1])
                # The last piece may still merge with following paragraphs.
                current_section = pieces[-1]
            elif len(current_section) + len(para) > chunk_size:
                if current_section:
                    sections.append(current_section.strip())
                current_section = para
            else:
                current_section += "\n\n" + para if current_section else para

        if current_section:
            sections.append(current_section.strip())

        # NOTE(review): very short sections are dropped and never scanned.
        return [s for s in sections if len(s.strip()) > self._MIN_SECTION_CHARS]

# Build a scanner and run it on the sample PDF. The call is the last expression
# of the cell, so its returned result dict is what the notebook displays.
scan_pdf = SimplePDFScanner()
scan_pdf.scan_pdf(pdf_path="./HW/3.pdf")


[2m2026-02-11 16:31:26[0m [[32m[1mdebug    [0m] [1mInitialized classification model[0m [36mdevice[0m=[35mdevice(type='cpu')[0m [36mmodel[0m=[35mModel(path='protectai/deberta-v3-base-prompt-injection-v2', subfolder='', revision='89b085cd330414d3e7d9dd787870f315957e1e9f', onnx_path='ProtectAI/deberta-v3-base-prompt-injection-v2', onnx_revision='89b085cd330414d3e7d9dd787870f315957e1e9f', onnx_subfolder='onnx', onnx_filename='model.onnx', kwargs={}, pipeline_kwargs={'batch_size': 1, 'device': device(type='cpu'), 'return_token_type_ids': False, 'max_length': 512, 'truncation': True}, tokenizer_kwargs={})[0m


Device set to use cpu



[*] 正在處理: ./HW/3.pdf

[1/3] 提取文檔內容中...


[32m[INFO] 2026-02-11 16:31:26,619 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-11 16:31:26,650 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\Winter4T2H\source\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-11 16:31:26,651 [RapidOCR] main.py:53: Using C:\Users\Winter4T2H\source\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2026-02-11 16:31:26,915 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2026-02-11 16:31:26,925 [RapidOCR] download_file.py:60: File exists and is valid: C:\Users\Winter4T2H\source\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2026-02-11 16:31:26,927 [RapidOCR] main.py:53: Using C:\Users\Winter4T2H\source\repo\nutc2504lab_hw\.venv\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32

[OK] 提取完成 (0 字元)

[2/3] 掃描 Prompt Injection (分段檢測)...


[3/3] 掃描結果
總段落數: 0
可疑段落數: 0
最高風險分數: 0.00
安全狀態: [PASS] 安全

[INFO] 未發現可疑內容
內容預覽 (前 300 字):
------------------------------------------------------------
...




{'safe': True,
 'max_risk_score': 0.0,
 'detections': 0,
 'content': '',
 'details': []}