# Hybrid / Compress

In [2]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key for the clients created in later cells — confirm.
load_dotenv()

True

In [3]:
# Output formatting helpers
from rich.console import Console
from rich.table import Table

# Shared console instance; rich_docs() prints its result table to it.
console = Console()

def rich_docs(docs, max_len=140, title="Retriever Results"):
    """Print retrieved documents as a table of source file, page and text preview.

    Args:
        docs: Iterable of document-like objects exposing ``metadata`` (a dict or
            None) and ``page_content`` (a string or None).
        max_len: Maximum number of preview characters; longer text is cut and
            suffixed with an ellipsis.
        title: Title rendered above the table.

    Returns:
        None. The table is printed to the module-level ``console``.
    """
    table = Table(title=title)
    table.add_column("#", justify="right")
    table.add_column("Source")
    table.add_column("Page", justify="right")
    table.add_column("Preview")

    for row_no, doc in enumerate(docs, 1):
        meta = doc.metadata or {}

        # Keep only the file name. Both "/" and "\" are treated as separators so
        # Windows-style paths are shortened as well.
        source = str(meta.get("source") or "").replace("\\", "/").rsplit("/", 1)[-1]

        # Prefer the page label. Fall back to the 0-based "page" index (+1) only
        # when no label exists, and only do arithmetic on real integers: the old
        # eager `m.get("page", 0) + 1` raised TypeError for a None/str page even
        # when a usable page_label was present.
        page = meta.get("page_label")
        if page is None:
            page_index = meta.get("page", 0)
            if isinstance(page_index, int):
                page = page_index + 1
            elif page_index is None:
                page = ""
            else:
                page = page_index

        # Collapse line breaks so each preview stays on one table row.
        text = (doc.page_content or "").strip().replace("\n", " ")
        preview = text[:max_len] + ("…" if len(text) > max_len else "")

        table.add_row(str(row_no), source, str(page), preview)

    console.print(table)

## 1. db 생성 및 로드

In [4]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

# Name and on-disk location of the persisted Chroma collection.
collection_name = "samsung_2025"
persist_directory = "../vectorstore/samsung_2025_re"

# Embedding model used when building and querying the vector store.
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sustainability report PDF into a list of Document objects.
docs = PyPDFLoader(
    "../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf"
).load()
len(docs)

87

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks (size 1000, overlap 100) so that text
# straddling a chunk boundary still appears in both neighbours.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
chunks = splitter.split_documents(docs)
len(chunks)

237

In [7]:
# Open (or create) the persisted collection first, then embed and insert the
# chunks only while it is still empty. Calling Chroma.from_documents() on every
# run adds the same chunks again under fresh ids, which duplicates search
# results and pays for the embeddings a second time.
# NOTE: if the source PDF or the splitter settings change, delete
# `persist_directory` (or pick a new collection name) to rebuild the index.
vectorstore = Chroma(
    collection_name=collection_name,
    persist_directory=persist_directory,
    embedding_function=embedding,
)

# A one-record probe is enough to tell whether the collection has content.
if not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(chunks)

vectorstore

<langchain_chroma.vectorstores.Chroma at 0x22c27c4f290>

In [8]:
# Re-open the persisted collection from disk instead of reusing the in-memory
# handle; the same embedding function is passed for use at query time.
load_vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding,
    persist_directory=persist_directory,
)
load_vectorstore

<langchain_chroma.vectorstores.Chroma at 0x22c2f7c3810>

## 2. Ensemble Retriever (Dense + Sparse)

In [9]:
# Dense retriever: similarity search over the persisted store, top 5 results.
sim_retriever = load_vectorstore.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

In [10]:
# Read every stored chunk (text + metadata) back from the persisted collection.
# Uses the public Chroma.get() wrapper rather than the private `_collection`
# attribute, which is an implementation detail of langchain_chroma.
db_docs = load_vectorstore.get(include=["documents", "metadatas"])
db_docs["metadatas"][:2]

[{'creator': 'Adobe InDesign 15.1 (Macintosh)',
  'creationdate': '2025-07-10T16:11:16+09:00',
  'page_label': '1',
  'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf',
  'producer': 'Adobe PDF Library 15.0',
  'total_pages': 87,
  'trapped': '/False',
  'page': 0,
  'moddate': '2025-09-04T16:51:11+09:00'},
 {'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf',
  'total_pages': 87,
  'trapped': '/False',
  'page': 1,
  'page_label': '2',
  'creationdate': '2025-07-10T16:11:16+09:00',
  'producer': 'Adobe PDF Library 15.0',
  'creator': 'Adobe InDesign 15.1 (Macintosh)',
  'moddate': '2025-09-04T16:51:11+09:00'}]

In [11]:
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_core.documents import Document

# Rebuild Document objects from what is stored in the persisted DB rather than
# reusing the in-memory `chunks`, so BM25 indexes exactly what the vector store
# holds.
# Chroma returns None as the metadata of a record saved without any, while
# Document expects a dict — hence the `or {}` fallback.
bm_docs = [
    Document(page_content=content, metadata=meta or {})
    for content, meta in zip(db_docs["documents"], db_docs["metadatas"])
]

bm_docs[:3]

[Document(metadata={'creator': 'Adobe InDesign 15.1 (Macintosh)', 'creationdate': '2025-07-10T16:11:16+09:00', 'page_label': '1', 'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf', 'producer': 'Adobe PDF Library 15.0', 'total_pages': 87, 'trapped': '/False', 'page': 0, 'moddate': '2025-09-04T16:51:11+09:00'}, page_content='삼성전자 지속가능경영보고서 2025\nA Journey  Towards \n a Sustainable Future\nA Journey  Towards\n a Sustainable Future'),
 Document(metadata={'source': '../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf', 'total_pages': 87, 'trapped': '/False', 'page': 1, 'page_label': '2', 'creationdate': '2025-07-10T16:11:16+09:00', 'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.1 (Macintosh)', 'moddate': '2025-09-04T16:51:11+09:00'}, page_content='삼성전자 지속가능경영보고서 2025 02AppendixFacts & Figures PrinciplePlanet PeopleOur Company삼성전자 지속가능경영보고서 2025 02\nA Journey  Towards \n a Sustainable Future\nA Journey  Towards \n a Sustainable Future\

In [12]:
# Sparse (keyword-based BM25) retriever built over the documents read back from the DB.
# NOTE(review): no preprocess_func is passed, so the retriever's default
# tokenization applies — presumably plain whitespace splitting, which may match
# Korean text poorly; confirm and consider a Korean-aware tokenizer.
bm25_retriever = BM25Retriever.from_documents(bm_docs)
# Return the top 5 matches, the same k as the dense retriever.
bm25_retriever.k = 5

In [13]:
# Hybrid retrieval: combine the dense and BM25 retrievers.
# Weights are positional: 0.7 -> sim_retriever, 0.3 -> bm25_retriever.
hybrid_retriever = EnsembleRetriever(
    weights=[0.7, 0.3],
    retrievers=[sim_retriever, bm25_retriever],
)

In [14]:
# Run one query through the hybrid retriever and show the hits as a table.
question = "삼성전자의 2025년 전망은?"
result = hybrid_retriever.invoke(question)
rich_docs(result, title="hybrid result")

## 3. 압축 Retriever
- 검색 결과의 내용이 너무 길 때
- 검색 결과가 파편화(띄어쓰기/문단 구분)되어 있는 경우
- => 비용 문제는 고려할 것
- => 전처리 단계에서 해결하거나 로컬 요약을 사용하는 것도 고려

In [15]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor, LLMChainFilter
from langchain_openai import ChatOpenAI

# LLM-based compressor built from a chat model; temperature 0 is used for the
# extraction step.
compressor = LLMChainExtractor.from_llm(
    ChatOpenAI(temperature=0, model_name="gpt-4.1-mini")
)

# Wrap the dense retriever so its results pass through the compressor.
com_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=sim_retriever,
)

In [16]:
# Query the compression retriever and display the compressed documents.
question = "삼성 전자의 목표와 기준 년도만 간단히 알려줘"

com_result = com_retriever.invoke(question)
rich_docs(com_result, title="compress result")

## 3-1. 압축 Retriever (Embedding 필터 - LLM 호출 비용 없음 : 잘 안 씀)

In [22]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Embedding-based filter: no chat model is involved, documents are kept or
# dropped by comparing embeddings against `similarity_threshold`.
# NOTE(review): `embedding` is OpenAIEmbeddings, so the filter presumably still
# issues embedding API calls — "no cost" here means no LLM cost; confirm.
compressor = EmbeddingsFilter(similarity_threshold=0.2, embeddings=embedding)

# MMR search is useful when the results contain many near-duplicates.
mmr_retriever = load_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 5,
        "fetch_k": 10,
        "lambda_mult": 0.5,  # 1: relevance, 0: diversity
    },
)

comp_embed = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=mmr_retriever,
)

# `question` is the query string defined in an earlier cell.
comp_embed_result = comp_embed.invoke(question)
rich_docs(comp_embed_result)