In [1]:
import os
import json
import chromadb
from chromadb.utils import embedding_functions

# --- Configuration ---------------------------------------------------------
# Both locations are resolved against the current working directory, so the
# notebook is expected to be started from ipc/rag.
CHROMA_DB_DIR = os.path.abspath("../chroma_db")  # on-disk Chroma store
IPC_JSON_PATH = os.path.abspath("../data/processed_data.json")  # IPC records
COLLECTION_NAME = "ipc_clean"

# Echo the resolved locations so a wrong working directory is easy to spot.
print("현재 CWD:", os.getcwd())
print("Chroma DB 경로:", CHROMA_DB_DIR)
print("IPC JSON 경로:", IPC_JSON_PATH)

# Create the database directory when it does not exist yet.
os.makedirs(CHROMA_DB_DIR, exist_ok=True)

# --- Embedding function ----------------------------------------------------
# The OpenAI key is read from the OPENAI_API_KEY environment variable.
openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    model_name="text-embedding-3-small",
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# --- Chroma client and collection ------------------------------------------
client = chromadb.PersistentClient(path=CHROMA_DB_DIR)

# Reuse the collection when it already exists, otherwise create it.
collection = client.get_or_create_collection(
    embedding_function=openai_ef,
    name=COLLECTION_NAME,
)

print("현재 컬렉션들:", client.list_collections())
print("초기 count:", collection.count())


현재 CWD: c:\projects\SKN19-3rd-2team\ipc\rag
Chroma DB 경로: c:\projects\SKN19-3rd-2team\ipc\chroma_db
IPC JSON 경로: c:\projects\SKN19-3rd-2team\ipc\data\processed_data.json
현재 컬렉션들: [Collection(name=ipc_clean)]
초기 count: 0


In [2]:
# Read the IPC records from the UTF-8 encoded JSON file into memory.
with open(IPC_JSON_PATH, encoding="utf-8") as ipc_file:
    ipc_list = json.loads(ipc_file.read())

# Report how many records were loaded, then display the first one.
print("IPC 개수:", len(ipc_list))
ipc_list[0]


IPC 개수: 69337


{'ipc_code': 'A',
 'title_en': 'HUMAN NECESSITIES',
 'kind': 's',
 'parent': None,
 'path': ['A']}

In [4]:
BATCH_SIZE = 400  # maximum number of records per batch


def gen_batches(data, size):
    """Split ``data`` into consecutive slices of at most ``size`` items.

    Args:
        data: A sequence that supports ``len()`` and slicing (list, tuple, str).
        size: Maximum number of items per batch; must be at least 1.

    Returns:
        A lazy iterator over ``data[i:i + size]`` for ``i = 0, size, 2*size, ...``.
        The last batch may be shorter; empty ``data`` produces no batches.

    Raises:
        ValueError: If ``size`` is smaller than 1. The check runs when the
            function is called, not on first iteration, so a bad batch size
            cannot silently result in zero batches being processed.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size!r}")
    return (data[start:start + size] for start in range(0, len(data), size))

# Upsert the IPC records into the collection batch by batch: the IPC code is
# the id, the English title is the embedded document, and the remaining
# fields become metadata.
for batch in gen_batches(ipc_list, BATCH_SIZE):
    ids = [row["ipc_code"] for row in batch]
    texts = [row["title_en"] for row in batch]
    metas = []

    for row in batch:
        # Flatten the path list into a single string such as "A > A01 > A01B".
        path_value = row.get("path", [])
        if isinstance(path_value, list):
            path_str = " > ".join(path_value)
        elif path_value is not None:
            path_str = str(path_value)
        else:
            path_str = None

        meta = {
            "kind": row.get("kind"),      # usually a string such as 's', 'c', 'm', 'g'
            "parent": row.get("parent"),  # string, or None for a top-level entry
            "path": path_str,
            "type": "ipc",
        }
        # Chroma documents metadata values as str/int/float/bool. Omit keys
        # whose value is None so the upsert does not depend on how the
        # installed chromadb version treats None. "type" is always present,
        # so the metadata dict is never empty.
        metas.append({key: value for key, value in meta.items() if value is not None})

    collection.upsert(
        ids=ids,
        documents=texts,
        metadatas=metas,
    )

print("업서트 후 count:", collection.count())


업서트 후 count: 69337


In [16]:
# Free-text description to match against the stored IPC titles.
query_text = (
    "computer vision pipeline with image/video transformation, object detection, input/output setup, and integrated processing"
)

# The embedding function is called with a list of documents and returns one
# embedding per document. Wrapping the single query in a list guarantees it
# is embedded as one text, instead of relying on how a bare string is handled.
query_embedding = openai_ef([query_text])[0]

# Accept either an array-like result (has .tolist()) or a plain sequence.
if hasattr(query_embedding, "tolist"):
    query = query_embedding.tolist()
else:
    query = list(query_embedding)

# Retrieve the five nearest IPC entries for the query embedding.
raw_results = collection.query(
    query_embeddings=[query],
    n_results=5,
)

print("raw ids:", raw_results["ids"])
print("raw docs:", raw_results["documents"])
print("raw metas:", raw_results["metadatas"])




raw ids: [['G06V10/20', 'G06V10/10', 'G06V10/25', 'G06V10/22', 'G06V10/24']]
raw docs: [['PHYSICS COMPUTING CALCULATING OR COUNTING IMAGE OR VIDEO RECOGNITION OR UNDERSTANDING Arrangements for image or video recognition or understanding Image preprocessing', 'PHYSICS COMPUTING CALCULATING OR COUNTING IMAGE OR VIDEO RECOGNITION OR UNDERSTANDING Arrangements for image or video recognition or understanding Image acquisition', 'PHYSICS COMPUTING CALCULATING OR COUNTING IMAGE OR VIDEO RECOGNITION OR UNDERSTANDING Arrangements for image or video recognition or understanding Image preprocessing Determination of region of interest [ROI] or a volume of interest [VOI]', 'PHYSICS COMPUTING CALCULATING OR COUNTING IMAGE OR VIDEO RECOGNITION OR UNDERSTANDING Arrangements for image or video recognition or understanding Image preprocessing by selection of a specific region containing or referencing a pattern Locating or processing of specific regions to guide the detection or recognition', 'PHYSICS C