# Reranker / Reorder
- Rerank/Reorder는 검색 결과가 최소 10건 정도는 되어야 적용하는 의미가 있음
![re.png](attachment:re.png)

In [1]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key needed by the
# embedding cells further down — confirm against the project's .env.
load_dotenv()

True

In [2]:
# Output formatting
from rich.console import Console
from rich.table import Table

# Shared console instance that rich_docs() prints its tables to.
console = Console()

def rich_docs(docs, max_len=140, title="Retriever Results"):
    """Print retrieved documents as a rich table.

    Each row shows the 1-based rank, the source file name, the page and a
    single-line preview of the content.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a mapping or None)
            and ``page_content`` (a string or None).
        max_len: Maximum number of preview characters; longer previews are
            cut and suffixed with an ellipsis.
        title: Title shown above the table.
    """
    # Imported locally so the function stays self-contained within its cell.
    from rich.markup import escape

    table = Table(title=title)
    table.add_column("#", justify="right")
    table.add_column("Source")
    table.add_column("Page", justify="right")
    table.add_column("Preview")

    for i, d in enumerate(docs, 1):
        m = d.metadata or {}

        # Accept both "/" and "\" separators so only the file name is shown,
        # including for paths produced on Windows.
        src = str(m.get("source") or "").replace("\\", "/").split("/")[-1]

        # Prefer the human-readable label. Only fall back to the zero-based
        # "page" index (shown 1-based) when no label exists, and only do the
        # arithmetic when the index really is an integer.
        page = m.get("page_label")
        if page is None:
            page_index = m.get("page", 0)
            page = page_index + 1 if isinstance(page_index, int) else page_index
        page = "" if page is None else str(page)

        text = (d.page_content or "").strip().replace("\n", " ")
        preview = text[:max_len] + ("…" if len(text) > max_len else "")

        # Escape cell text so brackets in the document are rendered literally
        # instead of being parsed as rich console markup.
        table.add_row(str(i), escape(src), escape(page), escape(preview))

    console.print(table)

## 1. DB 생성 및 로드

In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

# Embedding model used both when building and when re-opening the vector store.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')
# On-disk location and collection name of the Chroma store; the cells below
# pass these to both the build step and the load step.
persist_directory = '../vectorstore/samsung_2025_re'
collection_name = 'samsung_2025'

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_community.document_loaders import PyPDFLoader

# Load the sustainability report PDF into a list of documents
# (the recorded output below shows 87 of them).
docs = PyPDFLoader("../data/Samsung_Electronics_Sustainability_Report_2025_KOR.pdf").load()
# Display how many documents were loaded.
len(docs)

87

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents with chunk_size=1000 and chunk_overlap=100,
# then display how many chunks were produced.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_documents(docs)

len(chunks)

237

In [6]:
# Give every chunk a stable, unique ID. Without explicit IDs each run of this
# cell inserts the whole corpus again under fresh random IDs, so the persisted
# collection fills up with duplicates that then crowd the top-k results.
# With fixed IDs a re-run overwrites the same records instead.
chunk_ids = [f"{collection_name}-{i}" for i in range(len(chunks))]

# Embed the chunks and store them in the persisted Chroma collection.
vectorstore = Chroma.from_documents(
    documents=chunks,
    ids=chunk_ids,
    collection_name=collection_name,
    persist_directory=persist_directory,
    embedding=embedding
)

vectorstore

<langchain_chroma.vectorstores.Chroma at 0x1e85d45ecd0>

In [7]:
# Open the persisted collection from disk using the same directory,
# collection name and embedding function as the build step.
load_vectorstore = Chroma(
    collection_name=collection_name,
    embedding_function=embedding,
    persist_directory=persist_directory,
)

load_vectorstore

<langchain_chroma.vectorstores.Chroma at 0x1e85ce24410>

## 2. Retriever

In [None]:
# Query used by the retrieval, rerank and reorder cells.
question = '삼성전자의 2025년 전망은?'

# Dense retriever: similarity search configured with k=20, giving the
# reranking step a pool of candidates larger than its final top_n.
sim_retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 20},
)

In [10]:
# Run the dense retriever on the question and show the hits as a table.
result = sim_retriever.invoke(question)
rich_docs(result)

## 3. Rerank

In [14]:
from langchain_community.cross_encoders.huggingface import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever

In [12]:
import torch

# Cross-encoder used to score (query, document) pairs for reranking.
# Run on the GPU when one is available and fall back to the CPU otherwise,
# so the notebook also works on machines without CUDA.
# NOTE(review): this checkpoint appears to be trained on English MS MARCO,
# while the corpus and question here are Korean — consider evaluating a
# multilingual reranker.
hf_ce = HuggingFaceCrossEncoder(
    model_name = 'cross-encoder/ms-marco-MiniLM-L6-v2',
    model_kwargs = {
        'device' : 'cuda' if torch.cuda.is_available() else 'cpu',
        'max_length' : 512
    }
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [19]:
# Reranker built on the cross-encoder, configured with top_n=10.
compressor = CrossEncoderReranker(model=hf_ce, top_n=10)

# Retriever that passes the dense retriever's candidates through the reranker.
rerank_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=sim_retriever,
)

# Run the reranking pipeline on the question and display the result.
rerank_result = rerank_retriever.invoke(question)
rich_docs(rerank_result, title='rerank result')

## 4. Reorder

In [18]:
from langchain_community.document_transformers import LongContextReorder

# Reorder the reranked documents with LongContextReorder and display them.
# NOTE(review): per the LangChain docs this transformer is meant to place the
# most relevant documents at the start and end of the list — confirm that the
# input order (reranker output) is what it expects.
reorder  = LongContextReorder()
reordered_result = reorder.transform_documents(rerank_result)

rich_docs(reordered_result, title='reorder result')