# RAG Basic

## 프로세스
1. 문서 로드
2. 텍스트 분할
3. 임베딩
4. 벡터DB 저장
5. 검색기
6. 증강된 프롬프트
7. LLM

- API 키 환경변수로 관리

In [None]:
from dotenv import load_dotenv

# Load API key information into the environment.
# NOTE(review): presumably read from a local .env file — confirm one exists.
load_dotenv()

- 관련 LangChain 라이브러리 설치 및 임포트

In [None]:
!pip install langchain langchain_community langchain-openai faiss-cpu pymupdf

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

## 1. 문서 로드

In [None]:
# Location of the source PDF, relative to the notebook's working directory.
pdf_path = "data/초보 투자자를 위한 증권과 투자 따라잡기.pdf"

# Load the PDF into a list of documents and report how many were produced
# (the message labels this count as the number of pages).
loader = PyMuPDFLoader(pdf_path)
docs = loader.load()
print(f"문서의 페이지수: {len(docs)}")

In [None]:
# Inspect one sample document (index 10) to see what the loader produced.
sample_doc = docs[10]

print("===페이지 컨텐츠===")
print(sample_doc.page_content)
print("===metadata===")
# Print only the metadata mapping. Dumping __dict__ would show every attribute
# of the object (including the page content again), not just the metadata
# announced by the header above.
print(sample_doc.metadata)

## 2. 텍스트 분할

In [None]:
# Split the loaded documents into chunks of up to 500 characters,
# with a 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
split_documents = text_splitter.split_documents(docs)

# Report how many chunks the split produced.
print(f"분할된 청크의수: {len(split_documents)}")

## 3. 임베딩(Embedding)

In [None]:
# Create the OpenAI embeddings object with its default settings.
embeddings = OpenAIEmbeddings()
# Bare expression: the notebook displays the object as the cell output.
embeddings

## 4. 벡터DB

In [None]:
# Build a FAISS vector store from the split chunks, using the embeddings
# object created earlier to vectorise them.
vectorstore = FAISS.from_documents(
    documents=split_documents,
    embedding=embeddings,
)
# Bare expression: the notebook displays the store object as the cell output.
vectorstore

In [None]:
# Query the vector store directly and print each chunk it returns.
similar_chunks = vectorstore.similarity_search("성장주")
for i, doc in enumerate(similar_chunks):
    header = f"\n---문서 청크 {i}---"
    print(header)
    print(doc.page_content)
## 5. 검색기(Retriever)

In [None]:
# Wrap the vector store in a retriever (default settings).
retriever = vectorstore.as_retriever()
# Ask the retriever a question; as the last expression, the result is shown as the cell output.
retriever.invoke("1980년대의 성장주는 무엇인가?")

## 6. 증강된 프롬프트

In [None]:
# Prompt template for the question-answering step. It has two placeholders:
# {context} and {question}. The template instructs the model to answer in
# Korean and to say it does not know when the answer is unknown.
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Answer in Korean.

#Context: 
{context}

#Question:
{question}

#Answer:"""
)

## 7. LLM

In [None]:
# Chat model used to generate the answer: gpt-4o-mini with temperature 0
# (presumably chosen for more repeatable answers — confirm).
llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

In [None]:
def format_docs(retrieved_docs):
    """Join the text of the retrieved documents into one context string.

    Passing the retriever output straight into the prompt would insert the
    repr of a list of Document objects (metadata and escaped newlines
    included). Only the page text, separated by blank lines, belongs in the
    prompt's context slot.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# Create the chain: retrieve context for the question, fill the prompt,
# call the LLM, and parse the model output into a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the full chain on a sample question and print the generated answer.
question = "1980년대의 성장주는 무엇인가??"
response = chain.invoke(question)
print(response)