In [70]:
from dotenv import load_dotenv
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

load_dotenv()

gpt_35_model = 'dev-gpt-35-turbo-sample'
embedding_model = 'dev-text-embedding-ada-002-01' 
urls = [
    'https://www.law.go.kr/lsInfoP.do?lsiSeq=39593&efYd=19971231#0000',
    'https://www.law.go.kr/lsInfoP.do?lsiSeq=258015&efYd=20240101#0000',
    'https://www.law.go.kr/lsInfoP.do?lsiSeq=260889&efYd=20240301#0000',
    'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000'
]

human_question = '''소득세법 제2조(납세의무) 에 대해 설명해줘'''

In [71]:


from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai.embeddings.azure import AzureOpenAIEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever


docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]


In [72]:

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=50
)
doc_splits = text_splitter.split_documents(docs_list)


In [73]:
from langchain_experimental.text_splitter import SemanticChunker

In [74]:
# SemanticChunker 를 생성합니다.
semantic_text_splitter = SemanticChunker(
    AzureOpenAIEmbeddings(
        model=embedding_model# Azure OpenAIEmbedding model명
        ), add_start_index=True)

In [75]:

# 벡터 데이터베이스에 문서 추가
vectorstore = Chroma.from_documents(
    documents=doc_splits,
    collection_name="rag-chroma",
    embedding=AzureOpenAIEmbeddings(
        model=embedding_model# Azure OpenAIEmbedding model명
        ),
    persist_directory="./chroma_db"
)
retriever = vectorstore.as_retriever(search_type="similarity")
search_result = retriever.get_relevant_documents(human_question)



In [76]:

retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 10})
search_result = retriever.get_relevant_documents(human_question)
print(search_result)

[Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', 'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문'}), Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', 'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문'}), Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', 'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문'}), Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', 'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문'}), Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', 'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문'}), Document(page_content='소득세법 시행규칙', metadata={'language': 'ko', '

In [77]:
k = 3

# (Sparse) bm25 retriever and (Dense) faiss retriever 를 초기화 합니다.
bm25_retriever = BM25Retriever.from_documents(doc_splits)
bm25_retriever.k = k

faiss_vectorstore = FAISS.from_documents(doc_splits, AzureOpenAIEmbeddings(model=embedding_model))
faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": k})

# initialize the ensemble retriever
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]
)


In [78]:
relevant_docs = ensemble_retriever.get_relevant_documents(human_question)



In [79]:
print(relevant_docs)

[Document(page_content='소득세법 시행령', metadata={'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=260889&efYd=20240301#0000', 'title': '소득세법 시행령 | 국가법령정보센터 | 법령 > 본문', 'language': 'ko'}), Document(page_content='소득세법 시행규칙', metadata={'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=261251&efYd=20240322#0000', 'title': '소득세법 시행규칙 | 국가법령정보센터 | 법령 > 본문', 'language': 'ko'}), Document(page_content='소득세법 | 국가법령정보센터 | 법령 > 본문', metadata={'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=258015&efYd=20240101#0000', 'title': '소득세법 | 국가법령정보센터 | 법령 > 본문', 'language': 'ko'}), Document(page_content='소득세법\n\n          [시행 2024. 1. 1.] [법률 제19933호, 2023. 12. 31., 일부개정] \n         \n\n\n\n본문', metadata={'source': 'https://www.law.go.kr/lsInfoP.do?lsiSeq=258015&efYd=20240101#0000', 'title': '소득세법 | 국가법령정보센터 | 법령 > 본문', 'language': 'ko'})]


In [80]:
len(relevant_docs)

4

In [81]:
from langchain import hub

prompt = hub.pull("rlm/rag-prompt")


In [82]:

# 단계 6: 프롬프트 생성(Create Prompt)
# 프롬프트를 생성합니다.
prompt = hub.pull("rlm/rag-prompt")

# 단계 7: 언어모델 생성(Create LLM)
# 모델(LLM) 을 생성합니다.
llm = AzureOpenAIEmbeddings(model_name=gpt_35_model, temperature=0)


def format_docs(docs):
    # 검색한 문서 결과를 하나의 문단으로 합쳐줍니다.
    return "\n\n".join(doc.page_content for doc in docs)



                    model_name was transferred to model_kwargs.
                    Please confirm that model_name is what you intended.
                    temperature was transferred to model_kwargs.
                    Please confirm that temperature is what you intended.


In [83]:
rag_chain = (
    {"context": ensemble_retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

TypeError: Expected a Runnable, callable or dict.Instead got an unsupported type: <class 'langchain_openai.embeddings.azure.AzureOpenAIEmbeddings'>