# Retriever

In [None]:
# uv add rich : formatting dependency (used for pretty-printed tables)

In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the OpenAI API key needed by later cells — verify).
load_dotenv()

True

In [11]:
# Pretty-print retriever output
from rich.console import Console
from rich.table import Table

# Shared console that rich_docs() prints its tables to.
console = Console()

def rich_docs(docs, max_len=140, title="Retriever Results"):
    """Print retrieved documents as a rich table.

    One row is rendered per document with four columns: the 1-based rank,
    the file name taken from ``metadata["source"]``, the page, and a
    single-line preview of ``page_content``.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict or ``None``)
            and ``page_content`` (a string or ``None``). ``None`` is treated
            as an empty result set.
        max_len: Maximum number of preview characters; longer text is cut
            and suffixed with an ellipsis.
        title: Title shown above the table.

    Returns:
        None. The table is written to the module-level ``console``.
    """
    # Local import keeps this cell self-contained; rich is already a
    # dependency of this notebook.
    from rich.markup import escape

    table = Table(title=title)
    table.add_column("#", justify="right")
    table.add_column("Source")
    table.add_column("Page", justify="right")
    table.add_column("Preview")

    for index, doc in enumerate(docs or [], 1):
        meta = doc.metadata or {}

        # Keep only the file name; accept both "/" and "\\" separators so
        # sources recorded on Windows are shortened as well.
        source = str(meta.get("source") or "")
        file_name = source.replace("\\", "/").split("/")[-1]

        # Prefer the human-readable label. Only fall back to the 0-based
        # "page" index when no label exists, and never let a missing or
        # non-numeric index abort the whole table.
        page = meta.get("page_label")
        if page is None:
            try:
                page = int(meta.get("page", 0)) + 1
            except (TypeError, ValueError):
                page = "-"

        text = (doc.page_content or "").strip().replace("\n", " ")
        preview = text if len(text) <= max_len else text[:max_len] + "…"

        # Escape cell values so bracketed document text such as "[1]" or
        # "[/b]" is shown literally instead of being parsed as rich markup.
        table.add_row(
            str(index),
            escape(file_name),
            escape(str(page)),
            escape(preview),
        )

    console.print(table)

## retriever 설정
- 일반 RAG 기본값 : similarity or mmr
- 중복이 많을 경우 : mmr
- 그외 필터링이 필요한 경우 : search_kwargs
- 길이가 긴 경우 : compressed_retriever | parent_child
- 용어가 중요할 경우 : hybrid(vec + bm25)
- 정확도 극대화 : similarity -> rerank -> reorder

In [3]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma

In [5]:
# Location and name of the persisted Chroma collection.
db_path = "../vectorstore/chroma_samsung"
collection_name = "samsung_all"

# Embedding model used for queries against the store
# (presumably the same model the collection was built with — TODO confirm).
emb = OpenAIEmbeddings(model="text-embedding-3-small")

vecstore = Chroma(
    collection_name=collection_name,
    embedding_function=emb,
    persist_directory=db_path,
)

# Sanity check: number of stored records.
# NOTE: this goes through the private `_collection` handle.
vecstore._collection.count()

444

In [6]:
# Embed a throwaway query and measure the length of the returned vector.
probe_vector = emb.embed_query("test")
dim_size = len(probe_vector)
dim_size

1536

## 1. 벡터 기반 검색기(유사도/mmr/score_threshold/filter)

In [7]:
# Sample query passed to the retrievers in this notebook
# (Korean: "Tell me about Samsung's sustainability").
question = '삼성의 지속 가능성에 대해 알려줘'

In [None]:
# similarity: retriever using the "similarity" search type, asking for k=10 results
sim_kwargs = {"k": 10}
ret_sim = vecstore.as_retriever(search_type="similarity", search_kwargs=sim_kwargs)

sim_docs = ret_sim.invoke(question)
rich_docs(sim_docs, title="similarity")

In [18]:
# mmr: retriever using the "mmr" search type
mmr_kwargs = {
    "k": 10,
    "fetch_k": 20,
    "lambda_mult": 0.5,  # 1: relevance, 0: diversity
}
ret_mmr = vecstore.as_retriever(search_type="mmr", search_kwargs=mmr_kwargs)

mmr_docs = ret_mmr.invoke(question)
rich_docs(mmr_docs, title="mmr")


In [None]:
# similarity_score_threshold: retriever with a score cut-off
threshold_kwargs = {
    "k": 8,
    "score_threshold": 0.4,  # filter results by this similarity threshold
}
ret_threshold = vecstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=threshold_kwargs,
)

threshold_docs = ret_threshold.invoke(question)
rich_docs(threshold_docs, title="similarity_score_threshold")

In [24]:
# filter: restrict retrieval with a metadata filter

# Peek at one stored record to see which metadata keys/values are available
# for filtering. Uses the public Chroma.get() instead of reaching into the
# private `_collection` attribute.
print(vecstore.get(limit=1))

# No search_type is given, so the retriever's default search type applies;
# only records whose "source" metadata equals this path are considered.
ret_filter = vecstore.as_retriever(
    search_kwargs={
        "k": 10,
        "filter": {"source": "../data/Sustainability_report_2024_kr.pdf"},
    },
)

rich_docs(ret_filter.invoke(question), title="filter")

{'ids': ['sv_2024::c70cb805-6d44-4ca1-9061-27a9071dfc8e'], 'embeddings': None, 'documents': ['A Journey Towards  \na Sustainable Future\n삼성전자 지속가능경영보고서 2024'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'creator': 'Adobe InDesign 15.1 (Macintosh)', 'page_label': '1', 'moddate': '2024-11-25T11:10:46+09:00', 'total_pages': 83, 'producer': 'Adobe PDF Library 15.0', 'creationdate': '2024-11-25T11:10:32+09:00', 'page': 0, 'source': '../data/Sustainability_report_2024_kr.pdf', 'trapped': '/False'}]}
