In [None]:
# 벡터검색 bm25 키워드 검색을 rrf 알고리즘으로 결합
from typing import List
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever

class FusionRetriveal:
    '''Fusion Retriveal 기법을 구현한 클래스'''
    def __init__(self,documents:List[Document],retriever_k : int =5):
        '''
        Args:
            documents : 전체문서 리스트
            retriever_k : 각 검색 방식 당 반환할 문서 개수
        '''
        self.documents=documents
        self.retriever_k = retriever_k
        #BM25 Retriever 초기화
        self.bm25_retriever = BM25Retriever.from_documents(documents)
        self.bm25_retriever.k = retriever_k
        print('Fusion Retrieval 초기화 완료')
    def fusion_retrieval(self, question:str , vector_retriever) -> List[Document]:
        '''
        벡터 겸색과 bm25 검색 결과를 rrf 알고리즘으로 결합
        Args:
            question : 사용자 질문
            vector_retriever : 벡터검색 retriever 객체
        returns :
            문서리스트
        '''
        # 벡터 검색
        vector_docs = vector_retriever.invoke(question)
        #bm25 검색
        bm25_docs = self.bm25_retriever.invoke(question)
        print('fusion_retrieval...')
        print(f' 벡터검색 : {len(vector_docs)}개 문서')
        print(f' bm25검색 : {len(bm25_docs)}개 문서')
        # RRF (reciprocal rank fusion) 점수 게싼
        fusion_scores = {}
        # 벡터 검색 결과 점수화
        for rank, doc in enumerate(vector_docs):
            doc_key = doc.page_content[:50]
            1/(60+rank)
            fusion_scores[doc_key] = fusion_scores.get(doc_key,0) + score
        # bm25 검색 결과 점수화
        for rank, doc in enumerate(bm25_docs):
            doc_key = doc.page_content[:50]
            1/(60+rank)
            fusion_scores[doc_key] = fusion_scores.get(doc_key,0) + score
        # 점수로 정렬
        sorted_docs = sorted(
            fusion_scores.items(),key=lambda x : x[1], reverse = True
        )
        # 문서객체 반환
        result = []
        for doc_text, score in sorted_docs[:self.retriever_k]:
            if doc.page_content.startswith(doc_text):
                result.append(doc)
                break
        print(f' RRF 통합결과: {len(result)}개 문서')
        print(f' RRF 각 검색 방식의 순위 역수를 합산해서 최종 순서를 결정')
        return result
    
    def get_detail_fusion_info(self,question:str, vector_retriever) -> dict :
        '''
        상세한 Fusion Retrieval 정보 반환
        Args:
            question : 사용자 질문
            vector_retrieval : 벡터 검색 vector_retrieval 객체
        Returns :
            Fusion Retrieval 상세 정보 Dict
        '''
        vector_doc = vector_retriever.invoke(question)
        bm25_docs = self.bm25_retriever.invoke(question)
        return {
            'vector_search_count' : len(vector_doc),
            'bm25_search_count' : len(bm25_docs),
            'vector_docs' : vector_doc,
            'bm25_docs' : bm25_docs
        }

In [None]:
%pip install langchain-community

In [None]:
if __name__ == '__main__':
    from langchain_community.document_loaders import TextLoader
    from langchain_text_splitters import RecursiveCharacterTextSplitter
    print('문서 준비')
    loader = TextLoader('document.txt',encoding= 'utf-8')
    Documents = loader.load()
    print('청크 생성')
    splitter = RecursiveCharacterTextSplitter(
        chunk_size = 500,
        chunk_overlap = 70
    )
chunks = splitter.split_documents(Documents)
print(f'{len(chunks)}개 청크 생성')
print('벡터 스토어 준비')
from langchain_openai import OpenAIEmbeddings
from langchain.chroma import Chroma
import os
embeddings = OpenAIEmbeddings(
    model = 'text-embedding-3-small',
    api_key = os.environ.get('OPENAI_API_KEY')
)
vectorstore = Chroma.from_documents(
    chunks,
    embeddings,
    persist_directory = './chroma_db'
)

print('벡터스토어 준비완료')
print('리트리버 생성')
retrieval = vectorstore.as_retriever(search_kwargs = {'k':3})

question = 'Langchain의 요소는 무엇인가요?'

print('Fusion Retrieval 사용')
fusion = FusionRetriveal(chunks,3)
fusion_docs = fusion.fusion_retrieval(question,retrieval)
print(f'fusion 검색 결과 : {len(fusion_docs)}개')

문서 준비
청크 생성
1개 청크 생성
벡터 스토어 준비


ModuleNotFoundError: No module named 'langchain.chroma'

In [12]:
%pip install langchain_chroma

Collecting langchain_chroma
  Using cached langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Using cached langchain_chroma-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: langchain_chroma
Successfully installed langchain_chroma-1.0.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
%pip install langchain_openai

Collecting langchain_openai
  Using cached langchain_openai-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Using cached langchain_openai-1.1.0-py3-none-any.whl (84 kB)
Installing collected packages: langchain_openai
Successfully installed langchain_openai-1.1.0
Note: you may need to restart the kernel to use updated packages.
