# 02-rag.ipynb

In [4]:
from dotenv import load_dotenv

# Load environment variables from a local .env file before any API client is created
# (presumably the OpenAI API key used by later cells — TODO confirm).
load_dotenv()

True

In [5]:
# %pip install pypdf langchain-community

## 저장 파트

In [6]:
# 1. Document Load (PDF)
# Supported document loaders: https://docs.langchain.com/oss/python/integrations/document_loaders

from langchain_community.document_loaders import PyPDFLoader

# Location of the PDF to ingest
file_path = './nke-10k-2023.pdf'

# The loader converts the PDF into Document objects usable from Python
# (one PDF page -> one Document); load() reads the whole file.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Should equal the page count of the source PDF
print(len(docs))

107


In [7]:
# 2. Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter that cuts each Document into overlapping chunks.
# add_start_index=True records each chunk's offset as `start_index` in its metadata.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)

# Split every page-level Document into chunks
chunks = text_splitter.split_documents(docs)

# Total number of chunks, then the raw text of the first chunk
print(len(chunks))
print(chunks[0].page_content)

516
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)


In [8]:
# 3. Embedding (turn text into numbers)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Demo only: show how real text becomes a vector by embedding the first two chunks.
v1 = embeddings.embed_query(chunks[0].page_content)
v2 = embeddings.embed_query(chunks[1].page_content)

# Both vectors must have the same number of dimensions.
print(len(v1) == len(v2))
# Eyeball the first few components of the vector.
print(v1[:10])

True
[0.05371641367673874, 0.04882446676492691, 0.01884348876774311, -0.0048622614704072475, 0.025029662996530533, -0.008186884224414825, -0.00638802582398057, 0.02413913980126381, -0.0011487760348245502, 0.010846583172678947]


In [9]:
# 4. Save the chunks into a vector store
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store, meant for testing/development
vector_store = InMemoryVectorStore(embeddings)

# Store the PDF chunks; the call returns the ids assigned to the stored documents.
ids = vector_store.add_documents(chunks)

## 검색 파트

In [10]:
# Expose the vector store as a retriever:
# similarity search, returning the 3 best-matching chunks.
retriever = vector_store.as_retriever(
    search_type='similarity',
    search_kwargs=dict(k=3),
)

# Run a search (the Korean query asks for the number of Nike stores in the US);
# left as the last expression so the notebook displays the returned Documents.
retriever.invoke('나이키의 미국 영업점 개수?')

[Document(id='d249d5e6-a691-42b7-a6bb-97d511fd8737', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further informati

## PDF RAG를 Agent 에 통합

In [11]:
# Wrap the vector-store lookup in a plain function so it can be handed to an agent as a tool.
# NOTE(review): `vector_store` is resolved at call time, and a later cell rebinds that
# name to a different store — re-running cells out of order changes what this searches.

# Takes the search query as its only argument.
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about Nike"""
    # Query the vector store directly instead of going through the retriever (top 2 chunks only).
    hits = vector_store.similarity_search(query, k=2)
    # Each chunk's text is followed by a blank line, including the last one.
    return ''.join(hit.page_content + '\n\n' for hit in hits)


# Smoke test: call the tool function directly with a Korean query ("number of Nike stores").
print(search_vectorstore('나이키 영업점 개수'))

direct to consumer operations sell products through the following number of retail stores in the United States:
U.S. RETAIL STORES NUMBER
NIKE Brand factory stores 213 
NIKE Brand in-line stores (including employee-only stores) 74 
Converse stores (including factory stores) 82 
TOTAL 369 
In the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further information.
2023 FORM 10-K 2

Table of Contents
ITEM 1B. UNRESOLVED STAFF COMMENTS
None.
ITEM 2. PROPERTIES
The following is a summary of principal properties owned or leased by NIKE:
The NIKE World Campus, owned by NIKE and located near Beaverton, Oregon, USA, is an approximately 400-acre site consisting of over 40 buildings which, together
with adjacent leased properties, functions as our world headquarters and is occupied by approximately 11,400 employees engaged in management, research, design,
development, marketing, finance and other administrative functions serving nearly all of our s

In [12]:
from langchain.agents import create_agent

# System prompt (Korean): tells the model it has a tool that searches the 2023 Nike 10-K
# report, to use it when needed to answer the user, and to answer like an economic-analysis
# expert. The first line ends with a space before the newline, made explicit here.
prompt = (
    "너는 2023 나이키 10k 보고서를 검색하는 도구를 다룰 수 있어. \n"
    "사용자 질문에 답변하기 위해 필요하면 사용해. 경제분석 전문가처럼 답변해."
)

# Agent backed by gpt-4.1-mini with the PDF search function as its only tool.
agent = create_agent(
    "openai:gpt-4.1-mini",
    tools=[search_vectorstore],
    system_prompt=prompt,
)

In [13]:
# User question (Korean): asks for the number of Nike stores and the average revenue per store.
content = "나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함."

# Send a single user message; the call is the last expression so the notebook
# displays the returned state (all messages, including tool calls).
agent.invoke({"messages": [{"role": "user", "content": content}]})

{'messages': [HumanMessage(content='나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함.', additional_kwargs={}, response_metadata={}, id='e99210ee-e5dd-4206-bd77-b1d60c2f1ad9'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 117, 'total_tokens': 143, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_a391f2cee0', 'id': 'chatcmpl-DCx7K4MfVCTnnJDH5WdSdECNWgeba', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c9236-3ffc-7391-b9c3-b9e3ecdaaa01-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': 'Nike number of stores and average sales per store 2023'}, 'id': 'call_kRwvlEoAnxzeMQC9OsQ5spJ2', 'type': 'tool_call'}], inval

## Web문서(HTML) RAG + Agent

In [14]:
# %pip install beautifulsoup4

In [15]:
# HTML pages carry a lot of content besides the article body, so pre-filtering is required.
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Pre-filter: only parse elements whose class is one of the listed post classes.
bs4_strainer = bs4.SoupStrainer(class_=('post-title', 'post-header', 'post-content'))

# Loader for the blog post, with the strainer handed to BeautifulSoup.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/",
    bs_kwargs=dict(parse_only=bs4_strainer),
)

docs = loader.load()

# Number of Documents loaded, and the character count of the first one
print(len(docs), len(docs[0].page_content))


1 43047


### Todos
1. Web Base Loader 로 불러온 docs Split
2. Split 된 내용들을 OpenAI Embedding Model 을 활용하여 Vectorstore에 저장
3. Agent 에 통합하기

In [16]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# NOTE(review): this cell rebinds `text_splitter`, `chunks`, `embeddings`, `vector_store`
# and `ids` from the PDF section; anything that reads those names afterwards sees the
# web-page data instead.

# Split the loaded web page into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
chunks = text_splitter.split_documents(docs)
print(len(chunks))

# Embedding model used to vectorize the chunks
embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# Fresh in-memory store that holds only the web-page chunks
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(chunks)

63


In [17]:
from langchain.agents import create_agent
from langchain.tools import tool


@tool
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a user's query """
    # Docstring kept verbatim: presumably it is surfaced to the model as the tool
    # description by the decorator — confirm before editing it.
    # Look up the 4 most similar chunks in whatever `vector_store` currently refers to.
    hits = vector_store.similarity_search(query, k=4)
    # Each chunk's text is followed by a blank line, including the last one.
    return ''.join(hit.page_content + '\n\n' for hit in hits)


# System prompt (Korean): tells the model it has a search tool for answering user
# questions, to use it, and to base its answer on what was retrieved.
# The first line ends with a space before the newline, made explicit here.
prompt = (
    "사용자 질문에 답변하기 위한 검색하는 도구를 다룰 수 있어. \n"
    "사용자 질문에 답변하기 위해 사용해. 검색한 내용을 기반으로 답변해"
)


# Wire the search tool into the agent
agent = create_agent(
    "openai:gpt-4.1-mini",
    tools=[search_vectorstore],
    system_prompt=prompt,
)

In [19]:
# User question (Korean): asks what an "autonomous agent" ultimately means.
content = "자율 Agent라는게 결국 무엇을 의미하는가"

# Stream the run and pretty-print the last message of every emitted event,
# so the question, the tool call, the tool result and the answer appear as they happen.
for event in agent.stream(
    {"messages": [{"role": "user", "content": content}]},
    stream_mode='values',
):
    event['messages'][-1].pretty_print()


자율 Agent라는게 결국 무엇을 의미하는가
Tool Calls:
  search_vectorstore (call_Gy3InY9I1QIlKVG9VTABtkgV)
 Call ID: call_Gy3InY9I1QIlKVG9VTABtkgV
  Args:
    query: 자율 Agent 정의
Name: search_vectorstore

}
]
Then after these clarification, the agent moved into the code writing mode with a different system message.
System message:

}
]
Challenges#
After going through key ideas and demos of building LLM-centered agents, I start to see a couple common limitations:

This benchmark evaluates the agent’s tool use capabilities at three levels:

Level-1 evaluates the ability to call the API. Given an API’s description, the model needs to determine whether to call a given API, call it correctly, and respond properly to API returns.
Level-2 examines the ability to retrieve the API. The model needs to search for possible APIs that may solve the user’s requirement and learn how to use them by reading documentation.
Level-3 assesses the ability to plan API beyond retrieve and call. Given unclear user requests (e.g