In [1]:
%pip install sentence_transformers chromadb gdown

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting chromadb
  Downloading chromadb-1.3.5-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence_transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting scikit-learn (from sentence_transformers)
  Downloading scikit_learn-1.7.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting scipy (from sentence_transformers)
  Downloading scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (62 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence_transformers)
  Downloading regex-2025.11.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<=0.23

In [2]:
from pathlib import Path

import gdown

# Google Drive id of the preprocessed patent dump and the local file name it is saved under.
file_id = "1jhmp7gYItMN4Vq3XDy7aQpdYxzHFcvRr"
output = "preprocessed_data.jsonl"

# The dump is ~289 MB: skip the download when the file is already on disk so that
# Restart & Run All does not fetch it again.
if not Path(output).exists():
    gdown.download(id=file_id, output=output, quiet=False)

# Display the local file name as the cell result (same value gdown.download returns on success).
output

Downloading...
From (original): https://drive.google.com/uc?id=1jhmp7gYItMN4Vq3XDy7aQpdYxzHFcvRr
From (redirected): https://drive.google.com/uc?id=1jhmp7gYItMN4Vq3XDy7aQpdYxzHFcvRr&confirm=t&uuid=1f4822b6-75e8-4496-bc51-c5ac14c4b119
To: /workspace/preprocessed_data.jsonl
100%|██████████| 289M/289M [00:09<00:00, 32.0MB/s] 


'preprocessed_data.jsonl'

In [4]:
import os
# 1. (Important) Setting that reduces GPU memory fragmentation (must be set before torch is imported)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import chromadb
from sentence_transformers import SentenceTransformer
import json
import torch
from tqdm.auto import tqdm
import gc # used for explicit garbage collection

# =========================================================
# 2. Load the embedding model
# =========================================================
# Prefer the GPU when torch can see one; otherwise run on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

print("Loading model...")
model = SentenceTransformer("dragonkue/BGE-m3-ko", device=device)

# =========================================================
# 3. Prepare ChromaDB
# =========================================================
# On-disk client rooted at ./chroma_db; the collection is created on first use
# and reused afterwards. "hnsw:space" selects cosine as the index distance.
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(
    name="patent_claims",
    metadata={"hnsw:space": "cosine"},
)

# =========================================================
# 4. Batched ingestion function (hardened version)
# =========================================================
def batch_add_jsonl_to_chroma_safe(
    jsonl_path,
    collection,
    model,
    batch_size=10,
    processing_chunk_size=5000,
):
    """Embed every patent claim of a JSONL file and store it in a collection.

    Each non-empty line of ``jsonl_path`` is parsed as one JSON object. The keys
    read are ``"id"``, ``"embed"`` (its ``"claim"`` mapping and ``"name"``) and the
    optional ``"metadata"`` (``"priority"``, ``"register"``, ``"link"``, ``"ipc"``).
    Every entry of ``embed["claim"]`` becomes one document whose id is
    ``"<id>_claim<n>"``, where ``n`` is the claim key with ``"claim"`` removed.

    Documents are buffered and flushed every ``processing_chunk_size`` items:
    the buffer is embedded with ``model.encode`` and written with
    ``collection.add``. Handling is best-effort: a chunk that fails to embed or
    store is reported and dropped, invalid JSON lines are skipped, and a line
    that raises while being parsed is reported (claims buffered before the error
    are still ingested). All of these are tallied and summarised at the end.

    Args:
        jsonl_path: Path of the UTF-8 JSONL file to read.
        collection: Object exposing ``add(ids=, documents=, embeddings=, metadatas=)``.
        model: Object exposing ``encode(texts, batch_size=, show_progress_bar=,
            convert_to_numpy=)`` that returns an array with ``tolist()``. The model
            runs on whatever device it already lives on.
        batch_size: Batch size forwarded to ``model.encode``.
        processing_chunk_size: Number of buffered documents that triggers a flush.

    Returns:
        int: Number of documents successfully written to ``collection``.
    """
    all_ids = []
    all_documents = []
    all_metadatas = []
    total_processed_count = 0
    failed_doc_count = 0     # documents dropped because their chunk failed
    bad_json_count = 0       # lines that were not valid JSON
    line_error_count = 0     # lines that raised while being parsed
    skipped_claim_count = 0  # claims whose text was not a string

    print(f"Reading data from {jsonl_path}...")

    # --- Inner helper: embed and store the buffered chunk, then reset the buffers ---
    def process_current_chunk():
        nonlocal all_ids, all_documents, all_metadatas
        nonlocal total_processed_count, failed_doc_count

        if not all_documents:
            return

        current_count = len(all_documents)
        print(f"\n[Processing] Embedding {current_count} docs (Batch size: {batch_size})...")

        chunk_embeddings = None
        try:
            # 1. Embed. No explicit device is passed, so the model's own device is
            #    used and the function does not depend on a notebook-level global.
            with torch.no_grad():
                chunk_embeddings = model.encode(
                    all_documents,
                    batch_size=batch_size,
                    show_progress_bar=True,
                    convert_to_numpy=True,
                )

            # 2. Store in the collection.
            collection.add(
                ids=all_ids,
                documents=all_documents,
                embeddings=chunk_embeddings.tolist(),
                metadatas=all_metadatas,
            )

            total_processed_count += current_count
            print(f"[Done] Saved. Total: {total_processed_count}")

        except Exception as e:
            # Best-effort: report, remember how much was lost, and move on.
            failed_doc_count += current_count
            print(f"\n[CRITICAL ERROR] Chunk processing failed: {e}")
            print("Skipping this chunk to prevent infinite loop...")

        finally:
            # 3. Always reset the buffers, success or failure, so a bad chunk is
            #    never retried forever and memory does not accumulate.
            all_ids = []
            all_documents = []
            all_metadatas = []
            chunk_embeddings = None

            # Collect Python garbage first so the CUDA cache release below can
            # also return blocks that were only kept alive by dead objects.
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    # --- File reading loop ---
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Reading JSONL"):
            line = line.strip()
            if not line:
                continue

            try:
                patent = json.loads(line)
            except json.JSONDecodeError:
                bad_json_count += 1
                continue

            try:
                # Parse the fields shared by every claim of this patent.
                pat_id = patent["id"]
                embed_data = patent["embed"]
                claim_dict = embed_data.get("claim", {})
                meta_base = patent.get("metadata", {})

                ipc_str = ", ".join(meta_base.get("ipc", []))
                base_metadata = {
                    "patent_id": str(pat_id),
                    "priority": meta_base.get("priority", ""),
                    "register": meta_base.get("register", ""),
                    "link": meta_base.get("link", ""),
                    "ipc": ipc_str,
                    "title": embed_data.get("name", ""),
                }

                for claim_key, claim_text in claim_dict.items():
                    # A non-string document would make the whole chunk fail and be
                    # dropped; skip just this claim instead.
                    if not isinstance(claim_text, str):
                        skipped_claim_count += 1
                        continue

                    claim_no = int(claim_key.replace("claim", ""))
                    doc_id = f"{pat_id}_claim{claim_no}"

                    metadata = base_metadata.copy()
                    metadata["claim_no"] = claim_no

                    all_ids.append(doc_id)
                    all_documents.append(claim_text)
                    all_metadatas.append(metadata)

                    # Flush as soon as the buffer is full.
                    if len(all_documents) >= processing_chunk_size:
                        process_current_chunk()

            except Exception as e:
                line_error_count += 1
                print(f"Line Error: {e}")

    # Flush whatever is left in the buffer.
    if all_documents:
        print("\nProcessing final remaining chunk...")
        process_current_chunk()

    print(f"\n🎉 All Done! Total {total_processed_count} documents.")
    if failed_doc_count or bad_json_count or line_error_count or skipped_claim_count:
        print(
            f"[Warning] Dropped {failed_doc_count} docs in failed chunks and "
            f"{skipped_claim_count} non-text claims; "
            f"{bad_json_count} invalid JSON lines, {line_error_count} lines with errors."
        )

    return total_processed_count

# =========================================================
# 5. Run the ingestion
# =========================================================
if __name__ == "__main__":
    # batch_size=16 is the value used for this run; the function default of 10
    # is the more conservative choice if the GPU runs out of memory.
    batch_add_jsonl_to_chroma_safe(
        "preprocessed_data.jsonl",  # must match the file name written by the download cell
        collection,
        model,
        batch_size=16,
        processing_chunk_size=5000,
    )

Using device: cuda
Loading model...
Reading data from preprocessed_data.jsonl...


Reading JSONL: 0it [00:00, ?it/s]


[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 5000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 10000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 15000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 20000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 25000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 30000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 35000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 40000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 45000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 50000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 55000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 60000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 65000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 70000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 75000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 80000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 85000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 90000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 95000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 100000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 105000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 110000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 115000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 120000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 125000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 130000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 135000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 140000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 145000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 150000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 155000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 160000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 165000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 170000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 175000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 180000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 185000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 190000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 195000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 200000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 205000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 210000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 215000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 220000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 225000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 230000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 235000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 240000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 245000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 250000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 255000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 260000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 265000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 270000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 275000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 280000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 285000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 290000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 295000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 300000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 305000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 310000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 315000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 320000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 325000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 330000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 335000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 340000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 345000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 350000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 355000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 360000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 365000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 370000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 375000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 380000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 385000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 390000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 395000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 400000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 405000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 410000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 415000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 420000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 425000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 430000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 435000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 440000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 445000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 450000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 455000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 460000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 465000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 470000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 475000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 480000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 485000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 490000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 495000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 500000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 505000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 510000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 515000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 520000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 525000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 530000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 535000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 540000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 545000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 550000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 555000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 560000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 565000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 570000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 575000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 580000

[Processing] Embedding 5000 docs (Batch size: 16)...


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

[Done] Saved. Total: 585000

Processing final remaining chunk...

[Processing] Embedding 4049 docs (Batch size: 16)...


Batches:   0%|          | 0/254 [00:00<?, ?it/s]

[Done] Saved. Total: 589049

🎉 All Done! Total 589049 documents.


In [7]:
# NOTE(review): huggingface_hub is not imported by any cell visible here — presumably a
# later cell uses it; consider moving this install into the first install cell.
%pip install huggingface_hub

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0mNote: you may need to restart the kernel to use updated packages.


In [8]:
# Compress the chroma_db folder into chroma_db.tar.gz
!tar -czvf chroma_db.tar.gz ./chroma_db

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


./chroma_db/
./chroma_db/chroma.sqlite3
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/header.bin
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/data_level0.bin
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/length.bin
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/link_lists.bin
./chroma_db/692dc437-fd5d-4a09-8fdc-494a305e500a/index_metadata.pickle


In [6]:
query = "컴퓨터 비전으로 알츠하이머 진단"
# Embed the query with the same model that produced the stored vectors and
# search by embedding.
query_emb = model.encode(query).tolist()

search_result = collection.query(query_embeddings=[query_emb], n_results=5)

# A single query was sent, so index 0 selects its list of hits.
for document in search_result["documents"][0]:
    print(document)
    print()

print(search_result["distances"])

for metadata in search_result["metadatas"][0]:
    print(metadata["patent_id"])
    print(metadata["title"])

치료를 적용(administer)하고 알츠하이머병을 진단받은 피험자의 상기 치료의 예상되는 임상 결과 점수를 계산하는 방법에 있어서, 컴퓨터 구현 방법은:(a) 상기 피험자의 뇌에 감마 진동 유도 비침습적 감각 자극의 치료 용량(therapeutic dose)을 적용하는 단계;(b) 복수의 생체 전기 신호를 측정하기 위해 상기 피험자의 뇌에 대해 뇌파 검사(encephalogram)를 수행하는 단계; (c) 머신 러닝 알고리즘을 사용하여, 상기 복수의 생체 전기 신호에 기초하여 상기 피험자의 예상되는 임상 결과 점수를 계산하는 단계; 및(d) 상기 예상되는 임상 결과 점수에 적어도 부분적으로 기초하여 상기 치료 용량을 조정하는 단계를 포함하는, 방법.

컴퓨팅 디바이스는 컴퓨터 비전 및 기계 학습을 이용하여 각각의 로봇식 암, 상기 손잡이 조작기 및 상기 내시경 액추에이터의 이동 및 포지션을 제어하는, 로봇-제어식 이중-암 가요성 내시경.

컴퓨터 비전 및 기계 학습을 이용하여 각각의 로봇식 암, 상기 손잡이 조작기 및 상기 내시경 액추에이터의 이동 및 포지션을 제어하는 단계를 더 포함하는, 내시경 시술 수행 방법.

상기 측정은 간이 정신 상태 검사(Mini-Mental State Examination; MMSE), 알츠하이머병 평가 척도(Alzheimer's Disease Assessment Scale; ADAS-Cog), 임상적 치매 등급(Clinical Dementia Rating; CDR), 알츠하이머병 협력 연구 - 일상 생활의 활동(Alzheimer's Disease Cooperative Study - Activities of Daily Living; ADCS-ADL), 신경정신행동검사(Neuropsychiatric Inventory; NPI), 양전자 방출 단층 촬영(positron emission tomography; PET), 또는 자기 공명 이미징(magnetic resonance imaging; MRI) 부피 데이터 평가를 포함하는 것인,

In [10]:
def group_results_by_patent(results):
    """Count how many hits of the first query belong to each patent.

    Args:
        results: Mapping whose ``"metadatas"`` entry is a list of per-query
            lists of metadata dicts. Only the first query's list is read, and
            every dict in it must contain a ``"patent_id"`` key.

    Returns:
        dict: ``patent_id -> hit count``, keyed in first-seen order.
    """
    first_query_metas = results["metadatas"][0]
    hits_per_patent = {}

    for meta in first_query_metas:
        patent_id = meta["patent_id"]
        if patent_id in hits_per_patent:
            hits_per_patent[patent_id] += 1
        else:
            hits_per_patent[patent_id] = 1

    return hits_per_patent

In [11]:
# 1. Define the query text
query = "편광판을 이용한 액정"

# 2. Embed the query directly with the *same model* that was used to store the data
#    (the `model` variable must already have been loaded by an earlier cell)
query_emb = model.encode(query).tolist()

# 3. Use the query_embeddings parameter instead of query_texts
#    NOTE(review): presumably because query_texts would be embedded by the collection's
#    own embedding function rather than by `model` — confirm.
results = collection.query(query_embeddings=[query_emb], n_results=10)

# 4. The rest of the logic is unchanged: count hits per patent and print the map
patent_hit_map = group_results_by_patent(results)
print("특허별 히트 수:", patent_hit_map)

특허별 히트 수: {'1020247042361': 1, '1020160153262': 1, '1020170024706': 2, '1020247013771': 1, '1020257027309': 1, '1020237001747': 1, '1020250024294': 1, '1020220037297': 1, '1020257023376': 1}


In [14]:
# Display the raw query result from the previous cell for inspection.
results

{'ids': [['1020247042361_claim8',
   '1020160153262_claim5',
   '1020170024706_claim6',
   '1020247013771_claim4',
   '1020257027309_claim10',
   '1020237001747_claim3',
   '1020250024294_claim2',
   '1020220037297_claim7',
   '1020170024706_claim1',
   '1020257023376_claim7']],
 'embeddings': None,
 'documents': [['제 6 항에 기재한 편광판을 갖는 액정 표시 장치.',
   '액정 셀의 적어도 한쪽의 면에, 제1항 또는 제2항에 기재된 편광판이 배치된 액정 패널.',
   '제1항 내지 제3항 중 어느 한 항에 기재한 편광판 세트와 액정 셀을 구비하고, 상기 액정 셀의 두께가 0.4 mm 이하인 액정 패널.',
   '상기 산성 용액이, 염산, 황산 또는 질산인, 편광판의 제조 방법.',
   '상기 액정 배향막이, 편광 자외선을 조사하여 얻어지는, 액정 배향막.',
   '제1항 또는 상기 편광자층이, 중합성 액정 화합물과 상기 2색성 유기 색소를 함유하는 것을 특징으로 하는 편광판.',
   '편광판용인, 접착제 조성물.',
   '편광판인, 광학 부재.',
   '액정 셀의 시인측에 배치되는 제1 편광판과, 상기 액정 셀의 배면측에 배치되는 제2 편광판을 갖는 편광판 세트로서, 상기 제2 편광판은 반사형 편광 필름을 갖고, 상기 제1 편광판을 80℃에서 4시간 유지했을 때의 흡수축 방향에 있어서의 폭 2 mm당 수축력을 F1로 하고, 투과축 방향에 있어서의 폭 2 mm당 수축력을 F2로 하며, 상기 제2 편광판을 80℃에서 4시간 유지했을 때의 흡수축 방향에 있어서의 폭 2 mm당 수축력을 F3으로 하고, 투과축 방향에 있어서의 폭 2 mm당 수축력을 F4로 했을 때, (F1×F2)/(F3×F4)가 0.5∼