### 1. 환경 설정

In [11]:
#uv add langgraph

In [None]:
from dotenv import load_dotenv
# Load settings from a .env file before any client is created.
# override=True asks python-dotenv to replace variables that are already set
# in the process environment with the values from the file.
load_dotenv(override=True)


In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

### 2. Document Loader
- PyPDF로 문서에서 텍스트 추출

In [None]:
# Read the PDF at data/arxiv_paper.pdf into a list of documents.
loader = PyPDFLoader(file_path="data/arxiv_paper.pdf")
docs = loader.load()
# Notebook output: number of loaded documents (presumably one per PDF page — confirm).
len(docs)

In [None]:
# Preview each loaded document: the first 500 characters of its text, its
# metadata, and a 100-dash separator line, each on its own line.
for doc in docs:
    print(doc.page_content[:500], doc.metadata, "-" * 100, sep="\n")

### 3. Text Splitter
-  RecursiveCharacterTextSplitter 사용

In [None]:
# Split the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
splits = text_splitter.split_documents(docs)

# Sanity check: show the text of the first chunk.
print(splits[0].page_content)

In [None]:
# Notebook output: the text length of every chunk, in order.
[len(piece.page_content) for piece in splits]

### 4. Embedding and Vector Store
- 텍스트를 벡터로 변환
- Qdrant Vector Store에 저장

In [None]:
embeddings = OllamaEmbeddings(model="bge-m3") # select the embedding model (bge-m3)
# Notebook output: echo the configured embeddings object.
embeddings

In [None]:
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance
from uuid import uuid4

# Create an in-memory Qdrant client.
client = QdrantClient(":memory:")

# Derive the vector size from the embedding model itself instead of
# hard-coding it: the size depends on the model, so a hard-coded value breaks
# the collection as soon as a different embedding model is configured.
embedding_dim = len(embeddings.embed_query("dimension probe"))

# Create the collection that will hold the chunk vectors (cosine distance).
client.create_collection(
    collection_name="rag_collection",
    vectors_config=VectorParams(size=embedding_dim, distance=Distance.COSINE),
)

# Wrap the collection in a LangChain vector store.
vectorstore = QdrantVectorStore(
    client=client,
    collection_name="rag_collection",
    embedding=embeddings,
    retrieval_mode=RetrievalMode.DENSE,  # retrieval mode: DENSE (default), SPARSE or HYBRID
)

# One random UUID string per chunk, used as the stored id.
uuids = [str(uuid4()) for _ in splits]

# Embed and store every chunk under its generated id.
vectorstore.add_documents(
    documents=splits,
    ids=uuids,
)

### 5. Retrieval
- 데이터 검색

In [None]:
# Expose the vector store through the retriever interface (no extra options given).
retriever = vectorstore.as_retriever()

# Run a sample query against the retriever.
# NOTE(review): k=5 is passed at invoke time; whether the retriever forwards it
# to the underlying search depends on the installed langchain-core version —
# confirm, or configure it with as_retriever(search_kwargs={"k": 5}) instead.
search_result = retriever.invoke("Embodied Agent가 뭐야?", k=5)

# Preview each retrieved document: the first 500 characters of its text, its
# metadata, and a 100-dash separator line, each on its own line.
for doc in search_result:
    print(doc.page_content[:500], doc.metadata, "-" * 100, sep="\n")

In [10]:
# RAG prompt template: takes the retrieved text as {context} and the user's
# question as {question}.
# ChatPromptTemplate is imported from langchain_core, the package that defines
# it and that this notebook already depends on; the `langchain.prompts` path is
# only a legacy re-export and is not available in every langchain release.
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template("""
당신은 Q&A 전문 AI 어시스턴트입니다. 주어진 컨텍스트를 사용하여 질문에 답변해주세요.

컨텍스트:
{context}

질문:
{question}

답변:
 """)

In [12]:
# Langgraph State, node, edge 선언하기
from langgraph.graph import StateGraph, START, END
from typing import TypedDict, List
from langchain_core.documents import Document

class State(TypedDict):
    """State dictionary shared by the graph nodes."""
    # The user's question; read by both retrieve and generate.
    question: str
    # Documents returned by the retriever; written by retrieve, read by generate.
    context: List[Document]
    # Text of the model's response; written by generate.
    answer: str

# Define application steps
def retrieve(state: State) -> State:
    """Look up documents for the question stored in ``state``.

    Returns a partial state update that contains only the ``context`` key,
    holding whatever the module-level ``retriever`` returned.
    """
    question = state["question"]
    return {"context": retriever.invoke(question)}

def generate(state: State) -> State:
    """Answer the question in ``state`` using the retrieved context.

    The page contents of the context documents are joined with blank lines,
    filled into the module-level ``prompt`` together with the question, and
    sent to a ChatOllama model. Returns a partial state update that contains
    only the ``answer`` key.
    """
    chat_model = ChatOllama(model="gemma3:4b", temperature=0)
    context_text = "\n\n".join(doc.page_content for doc in state["context"])
    prompt_value = prompt.invoke(
        {
            "context": context_text,
            "question": state["question"],
        }
    )
    return {"answer": chat_model.invoke(prompt_value).content}

# compile application and test
# Build the graph step by step: register retrieve -> generate as a sequence,
# make "retrieve" the entry point, then compile.
graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

In [None]:
from IPython.display import Image,display
# Render the compiled graph as a Mermaid PNG and show it inline.
display(
    Image(data=graph.get_graph().draw_mermaid_png())
)

In [None]:
# Stream the answer: print each streamed message's content as it arrives,
# without newlines between pieces, flushing so the text shows up immediately.
for message, metadata in graph.stream(
    {"question": "Embodied Agent가 뭐야?"},
    stream_mode="messages",
):
    print(message.content, end="", flush=True)