from dotenv import load_dotenv

# Read key/value pairs from a .env file and export them as environment variables (python-dotenv).
# NOTE(review): presumably this provides the OpenAI / LangSmith credentials used by later cells — confirm.
load_dotenv()

In [12]:
from dotenv import load_dotenv

# Read key/value pairs from a .env file and export them as environment variables (python-dotenv).
# The call is the cell's last expression, so its boolean return value is what the notebook displays.
# NOTE(review): presumably this provides the OpenAI / LangSmith credentials used by later cells — confirm.
load_dotenv()

True

In [13]:
import os

# Set LANGCHAIN_PROJECT for this process.
# NOTE(review): presumably the LangSmith project name that traces are filed under — confirm.
os.environ["LANGCHAIN_PROJECT"] = "RAG TUTORIAL"

In [14]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [15]:
# PyPDFLoader is imported from langchain_community, matching the other
# loader / vector-store imports in this notebook; the legacy
# `langchain.document_loaders` path is a deprecated re-export.
from langchain_community.document_loaders import PyPDFLoader

# Loader for the PDF that sits next to this notebook.
loader = PyPDFLoader("./test.pdf")

# load() returns a list of Document objects.
docs = loader.load()
print(f"문서의 수: {len(docs)}")  # runtime label means "number of documents"
docs

문서의 수: 41


[Document(page_content="Title:Gaston de Latour  \nAuthor:Pater, Walter  \nGender of Author:Male  \nPublication Year:1896  \nPublisher:N/A  \nPublication Year of the Edition Used:2003  \nPublisher of the Edition Used:N/A  \nIndex Number: A116  \nSource Link: https://www.gutenberg.org/ebooks/4062  \n  \n*** START OF THIS TEXT Gaston de Latour ***  \n  \nCHAPTER I.  A CLERK IN ORDERS  \n  \nThe white walls of the Château of Deux-manoirs, with its precincts, composed, before its dismantling at \nthe Revolution, the one prominent object which towards the southwest broke the pleasant level of La \nBeauce, the great corn-land of central France.  Abode in those days of the family of Latour, nesting there \ncentury after century, it recorded significantly the effectiveness of their brotherly union, less by way of \ninvasion of the rights of others than by the improvement of all gentler sentiments within.  From the \nsumptuous monuments of their last resting-place, backwards to every object whic

In [16]:
# Configure the splitter: chunk_size=1000 with chunk_overlap=100 between
# neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)

# Break the loaded documents into chunks.
splits = text_splitter.split_documents(docs)

# Display how many chunks were produced.
len(splits)

254

In [18]:
# Build the vector store: embed every chunk with OpenAIEmbeddings and
# index the resulting vectors in FAISS.
vectorstore = FAISS.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
)

# Expose the vector store as a retriever so the chain can look up the
# chunks relevant to a question.
retriever = vectorstore.as_retriever()

In [21]:
# Fetch the "rlm/rag-prompt" prompt via langchain's hub module.
prompt = hub.pull("rlm/rag-prompt")
# Bare expression so the notebook displays the pulled prompt object.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))])

In [22]:
# BaseCallbackHandler is defined in langchain_core; import it from there
# (as the rest of this notebook does for core types) rather than through
# the legacy `langchain.callbacks.base` re-export.
from langchain_core.callbacks import BaseCallbackHandler


class StreamCallback(BaseCallbackHandler):
    """Callback handler that echoes each newly generated LLM token to stdout."""

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        """Print ``token`` immediately, without appending a newline.

        Args:
            token: The newly generated piece of text.
            **kwargs: Additional callback arguments; accepted and ignored.
        """
        # flush=True so each token appears as soon as it is received
        # instead of waiting for the output buffer to fill.
        print(token, end="", flush=True)


# Chat model used as the generation step of the RAG chain.
# A StreamCallback instance is attached so every token is printed as it arrives.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
    callbacks=[StreamCallback()],
)


def format_docs(docs):
    """Merge retrieved documents into a single block of text.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated
        by a blank line. An empty iterable yields an empty string.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Build the chain. The input question is passed through unchanged as
# "question", while "context" is produced by piping the same input
# through the retriever and then format_docs. The resulting mapping is
# piped into the prompt, then the LLM, then StrOutputParser.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [23]:
# Run the chain on a question. Tokens are printed by StreamCallback while
# they stream, and the returned string is displayed as the cell result.
rag_chain.invoke("who are you?")

I am an assistant for question-answering tasks.

'I am an assistant for question-answering tasks.'

In [24]:
# Run the chain on a second question. Tokens are printed by StreamCallback
# while they stream, and the returned string is displayed as the cell result.
rag_chain.invoke("do you know about airplain?")

I don't know.

"I don't know."