# Chroma

## 1. Chroma + Ensemble retrieve

In [1]:
from langchain_chroma import Chroma
from langchain.retrievers import EnsembleRetriever
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader

In [2]:
emb = OpenAIEmbeddings(model='text-embedding-3-small')

In [3]:
samsung_vision_2024_path = '../data/Sustainability_report_2024_kr.pdf'
samsung_vision_2025_path = '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf'

docs_2024 = PyPDFLoader(samsung_vision_2024_path).load()
docs_2025 = PyPDFLoader(samsung_vision_2025_path).load()

In [4]:
print(len(docs_2024))
print(len(docs_2025))

83
87


In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

chunks_2024 = text_splitter.split_documents(docs_2024)
chunks_2025 = text_splitter.split_documents(docs_2025)

print(len(chunks_2024))
print(len(chunks_2025))

207
237


In [22]:
db_2024 = Chroma.from_documents(documents=chunks_2024, collection_name='sv_2024', persist_directory='../vectorstore/chroma_samsung', embedding=emb)
db_2025 = Chroma.from_documents(documents=chunks_2025, collection_name='sv_2025', persist_directory='../vectorstore/chroma_samsung', embedding=emb)

NameError: name 'chunks_2024' is not defined

In [9]:
retriever_2024 = db_2024.as_retriever()
retriever_2025 = db_2025.as_retriever()

In [None]:
result_2024 = retriever_2024.invoke('삼성 경영 전망에 대해 알려줘', k=3)
result_2025 = retriever_2025.invoke('삼성 경영 전망에 대해 알려줘', k=7)

In [None]:
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_2024, retriever_2025], weights=[0.5, 0.5]) # 앞에서 문서수로 가중치를 조절
result_total = ensemble_retriever.invoke('앞으로 삼성 경영 전망에 대해 알려줘')
print(result_total)

[Document(id='843edbad-aea6-4df3-a4a9-c4ce239551b1', metadata={'creator': 'Adobe InDesign 15.1 (Macintosh)', 'trapped': '/False', 'page_label': '30', 'moddate': '2024-11-25T11:10:46+09:00', 'source': '../data/Sustainability_report_2024_kr.pdf', 'creationdate': '2024-11-25T11:10:32+09:00', 'producer': 'Adobe PDF Library 15.0', 'page': 29, 'total_pages': 83}, page_content='삼성전자 지속가능경영보고서 2024 30Our Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People\n사회적 책임을 다하며 미래로 함께 나아갑니다.\nPeople\n31    임직원\n39    공급망\n45    사회공헌\n48    개인정보보호/보안\n50    고객의 안전/품질'), Document(id='0f5be0eb-372c-432f-9cc0-4a82663232ad', metadata={'creator': 'Adobe InDesign 15.1 (Macintosh)', 'page': 2, 'producer': 'Adobe PDF Library 15.0', 'creationdate': '2025-07-10T16:11:16+09:00', 'moddate': '2025-09-04T16:51:11+09:00', 'page_label': '3', 'trapped': '/False', 'total_pages': 87, 'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf'}, page_content='삼성전자 지속가능경영보고서 2025\n03\

In [16]:
print(result_total[0].page_content)

삼성전자 지속가능경영보고서 2024 30Our Company AppendixMateriality Assessment Facts & Figures PrinciplePlanet People
사회적 책임을 다하며 미래로 함께 나아갑니다.
People
31    임직원
39    공급망
45    사회공헌
48    개인정보보호/보안
50    고객의 안전/품질


## 2. db 병합(자주 쓰이진 않음)

In [None]:
import chromadb
from chromadb.config import Settings

client = chromadb.PersistentClient(path="../vectorstore/chroma_samsung")

src24 = client.get_collection("sv_2024")
src25 = client.get_collection("sv_2025")

# 빈 db 생성
sv_all = Chroma(
    collection_name="samsung_all",
    persist_directory="../vectorstore/chroma_samsung",
    embedding_function=emb,
)

dst = client.get_collection("samsung_all")
dst

Collection(name=samsung_all)

In [5]:
# 2. samsung_2024 → samsung_all 복사
batch = 1000
offset = 0
while True:
    batch_data = src24.get(
        include=["embeddings", "metadatas", "documents"],
        limit=batch, offset=offset
    )
    ids = batch_data["ids"]
    if not ids:
        break

    # ID 충돌 방지: 접두사 붙이기
    new_ids = [f"{src24.name}::{i}" for i in ids]

    dst.upsert(
        ids=new_ids,
        documents=batch_data["documents"],
        embeddings=batch_data["embeddings"],   # 이미 계산된 벡터 재사용
        metadatas=batch_data["metadatas"],
    )
    offset += len(ids)
    print(f"[{src24.name}] moved {offset} docs so far...")
    
# 3. samsung_2025 → samsung_all 복사
batch = 1000
offset = 0
while True:
    batch_data = src25.get(
        include=["embeddings", "metadatas", "documents"],
        limit=batch, offset=offset
    )
    ids = batch_data["ids"]
    if not ids:
        break

    new_ids = [f"{src25.name}::{i}" for i in ids]

    dst.upsert(
        ids=new_ids,
        documents=batch_data["documents"],
        embeddings=batch_data["embeddings"],
        metadatas=batch_data["metadatas"],
    )
    offset += len(ids)
    print(f"[{src25.name}] moved {offset} docs so far...")

[sv_2024] moved 207 docs so far...
[sv_2025] moved 237 docs so far...


## 

## 3. 병합된 db 불러와서 활용

In [15]:
from langchain_community.vectorstores import Chroma

# 특정 컬렉션 명으로 로드
total_db = Chroma(
    persist_directory="../vectorstore/chroma_samsung",      # DB 폴더 경로
    collection_name="samsung_all",     
    embedding_function=emb         
)

total_db

<langchain_community.vectorstores.chroma.Chroma at 0x1ccd218d490>

In [18]:
# 검색 예시
query = "각각 2024년과 2025년 삼성 경영 전망"
total_retriever = total_db.as_retriever()
results = total_retriever.invoke(query, k=10)
print(len(results))

10


In [19]:
for result in results:
    print(result.metadata['source'])

../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Sustainability_report_2024_kr.pdf
../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Sustainability_report_2024_kr.pdf
../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Sustainability_report_2024_kr.pdf
../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf
../data/Sustainability_report_2024_kr.pdf
