In [None]:
# %pip install --upgrade --quiet  docx2txt langchain-community
%pip install python-dotenv langchain langchain-upstage langchain-community langchain-text-splitters

In [1]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the source Word document from the working directory.
loader = Docx2txtLoader('./tax.docx')

# Chunks of at most 1500 units with a 200-unit overlap between neighbours
# (units are whatever the splitter's length function counts).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200)

# Load the file, then cut it into chunks; the result feeds the vector store.
documentList = text_splitter.split_documents(loader.load())

In [12]:

from dotenv import load_dotenv
from langchain_upstage import UpstageEmbeddings
# Load environment variables from the given file before building the client.
# NOTE(review): '.venv/pyvenv.cfg' is normally the virtualenv's own metadata
# file, not a dotenv file — presumably the API keys were added to it; confirm,
# and consider moving them to a dedicated .env file.
load_dotenv(dotenv_path='.venv/pyvenv.cfg')
# Embedding client used by the vector-store cells.
embedding = UpstageEmbeddings(model='solar-embedding-1-large')

In [13]:
from langchain_chroma import Chroma

# First run: embed the split documents with `embedding` (the Upstage
# 'solar-embedding-1-large' client) and store them in the 'chroma-tax'
# collection persisted under ./chroma.
# NOTE(review): re-running this cell presumably inserts the same chunks again
# into the persisted collection — confirm before re-executing.
database = Chroma.from_documents(
    documents=documentList,
    embedding=embedding,
    collection_name='chroma-tax',
    persist_directory="./chroma",
)

In [None]:
# Sample question (Korean): "How much income tax does an employee with a
# 50 million won annual salary pay?"
query = '연봉 5천만원인 직장인의 소득세는 얼마인가요?'


# Fetch the stored chunks most similar to the question.
# (The name `retrived_docs` is misspelled but is read by the prompt cell, so it is kept.)
retrived_docs = database.similarity_search(query)

# Display the retrieved documents as the cell output.
retrived_docs

In [15]:
from langchain_openai import ChatOpenAI

# Chat model that answers the manually built prompt.
llm = ChatOpenAI(model='gpt-3.5-turbo')

In [16]:
# Plain text of the retrieved chunks, separated by blank lines. Interpolating
# the list directly would embed the Python repr of each Document (metadata
# included) instead of readable context.
context_text = "\n\n".join(doc.page_content for doc in retrived_docs)

# Manual RAG prompt. The instruction tells the model to consult "[Context]",
# so the retrieved text is placed under an explicit [Context] header.
prompt = f"""[Identity]

- 당신은 최고의 한국 소득세 전문가 입니다.
- [Context]를 참고해서 사용자의 질문에 답변하세요.

[Context]
{context_text}

Question: {query}
"""

In [None]:
# Send the assembled prompt to the chat model.
ai_message = llm.invoke(prompt)


# Show only the text of the model's reply.
ai_message.content

In [18]:

# ============= RetrievalQA using the LangChain hub =============
from langchain_chroma import Chroma

# Re-open the 'chroma-tax' collection persisted under ./chroma, using the same
# embedding client for queries.
database = Chroma(
    persist_directory="./chroma",
    collection_name='chroma-tax',
    embedding_function=embedding,
)

In [19]:
from langchain_openai import ChatOpenAI
# Question for the RetrievalQA chain (Korean): income tax for an employee with
# a 50 million won annual salary.
query = "연봉 5천만원의 직장인의 소득세는 얼마인가요?"
# Chat model passed to the RetrievalQA chain.
llm = ChatOpenAI(model='gpt-3.5-turbo')

In [20]:

from dotenv import load_dotenv
from langchain import hub
import os
# Load environment variables (API keys) from the file this notebook uses as
# its dotenv source.
load_dotenv(dotenv_path='.venv/pyvenv.cfg')
# Fail fast with KeyError when the OpenAI key is missing. The key is consumed
# by ChatOpenAI through the environment and must not be handed to the hub.
api_key = os.environ['OPENAI_API_KEY']
# `hub.pull(..., api_key=...)` expects a LangChain Hub / LangSmith key.
# Passing the OpenAI secret there is the wrong credential and sends it to a
# different service. "rlm/rag-prompt" is a public prompt, so no key is passed.
prompt = hub.pull("rlm/rag-prompt")

In [21]:
# Whichever vector database is used, the same code works as long as it is
# accessed through LangChain.

from langchain.chains import RetrievalQA

# Retrieval QA chain: documents come from the Chroma retriever and are
# answered by `llm` using the prompt pulled from the hub.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": prompt},
    retriever=database.as_retriever(),
)

In [None]:
# Run the chain through `.invoke`, matching `llm.invoke` earlier in the
# notebook. Calling a chain object directly is deprecated in LangChain; the
# input dict is unchanged.
ai_message = qa_chain.invoke({"query": query})

In [None]:
# Display the chain's output as the cell result.
ai_message