LangChain, RAG

In [1]:
# 출처 : https://wikidocs.net/231393
# pip install langchain_community


# 웹 로더더
import bs4
from langchain_community.document_loaders import WebBaseLoader

# 여러 개의 url 지정 가능
url1 = "https://blog.langchain.dev/customers-replit/"
url2 = "https://blog.langchain.dev/langgraph-v0-2/"

loader = WebBaseLoader(
    web_paths=(url1, url2),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("article-header", "article-content")
        )
    ),
)
docs = loader.load()
len(docs)

USER_AGENT environment variable not set, consider setting it to identify your requests.


2

In [2]:
docs[0]   # 첫 번째 문서 내용 확인

Document(metadata={'source': 'https://blog.langchain.dev/customers-replit/'}, page_content='\nReplit is at the forefront of AI innovation with its platform that simplifies writing, running, and collaborating on code for over 30+ million developers. They recently released Replit Agent, which immediately went viral due to the incredible applications people could easily create with this tool.Behind the scenes, Replit Agent has a complex workflow built on LangGraph, which enables a highly custom agentic workflow with a high-degree of control and parallel execution. A major benefit of using LangGraph was the seamless integration with LangSmith, which gave Replit deep visibility into their agent interactions to debug tricky issues.\xa0The level of complexity required for Replit Agent also pushed the boundaries of LangSmith. The LangChain and Replit teams worked closely together to add functionality to LangSmith that would satisfy their LLM observability needs. Specifically, there were three 

In [3]:
docs[1]

Document(metadata={'source': 'https://blog.langchain.dev/langgraph-v0-2/'}, page_content="\nToday, we’re excited to announce the stable release of LangGraph v0.2, which introduces a new ecosystem of LangGraph checkpointer libraries. These simplify the creation and customization of checkpointers, which allows users to build more resilient LLM applications with smooth session memory, robust error recovery, and human-in-the-loop features.Why we built LangGraph v0.2One of the key pillars of LangGraph is its built-in persistence layer, implemented through checkpointers. When you use a checkpointer with a graph, you can interact with and manage the graph's state. The checkpointer saves a checkpoint of the graph state at each step, enabling several powerful capabilities, including:Session memory: Store history (checkpoints) of user interactions and resume from a saved checkpoint in follow up interactionsError recovery: Recover from failures at any given step in the graph execution by continui

LangChain 실습

In [7]:
# 각 문자로 구분하여 분할

from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator='',
    chunk_size = 500,
    chunk_overlap = 100,
    length_function = len,
)

texts = text_splitter.split_text(docs[0].page_content)

len(texts)

12

In [8]:
len(texts[0])

498

In [9]:
texts[0]

'Replit is at the forefront of AI innovation with its platform that simplifies writing, running, and collaborating on code for over 30+ million developers. They recently released Replit Agent, which immediately went viral due to the incredible applications people could easily create with this tool.Behind the scenes, Replit Agent has a complex workflow built on LangGraph, which enables a highly custom agentic workflow with a high-degree of control and parallel execution. A major benefit of using'

In [None]:
# 대규모 언어 모델(LLM)을 사용할 때 모델이 처리할 수 있는 토큰 수의 한계 
# 입력 데이터를 모델의 제한을 초과하지 않도록 적절히 분할하는 것이 중요
# 이때, LLM 모델에 적용되는 토크나이저를 기준으로 텍스트를 토큰을 분할하고,
# 이 토큰들의 수를 기준으로 텍스트를 Chunk 단위로 나누면 모델 입력 토큰 수를 조절할 수 있음.

In [11]:
# 참고 : https://littlefoxdiary.tistory.com/114
# LangChain에서 문서를 분할할 수 있는 여러가지 TextSplitter

# pip install langchain
# pip install tiktoken : 에러 메세지 해결

from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 600,
    chunk_overlap = 200,
    encoding_name = 'cl100k_base'
)

docs_sp = text_splitter.split_documents(docs)
len(docs)

2

In [12]:
docs_sp[0]

Document(metadata={'source': 'https://blog.langchain.dev/customers-replit/'}, page_content='Replit is at the forefront of AI innovation with its platform that simplifies writing, running, and collaborating on code for over 30+ million developers. They recently released Replit Agent, which immediately went viral due to the incredible applications people could easily create with this tool.Behind the scenes, Replit Agent has a complex workflow built on LangGraph, which enables a highly custom agentic workflow with a high-degree of control and parallel execution. A major benefit of using LangGraph was the seamless integration with LangSmith, which gave Replit deep visibility into their agent interactions to debug tricky issues.\xa0The level of complexity required for Replit Agent also pushed the boundaries of LangSmith. The LangChain and Replit teams worked closely together to add functionality to LangSmith that would satisfy their LLM observability needs. Specifically, there were three ma

In [13]:
docs_sp[1]

Document(metadata={'source': 'https://blog.langchain.dev/langgraph-v0-2/'}, page_content="Today, we’re excited to announce the stable release of LangGraph v0.2, which introduces a new ecosystem of LangGraph checkpointer libraries. These simplify the creation and customization of checkpointers, which allows users to build more resilient LLM applications with smooth session memory, robust error recovery, and human-in-the-loop features.Why we built LangGraph v0.2One of the key pillars of LangGraph is its built-in persistence layer, implemented through checkpointers. When you use a checkpointer with a graph, you can interact with and manage the graph's state. The checkpointer saves a checkpoint of the graph state at each step, enabling several powerful capabilities, including:Session memory: Store history (checkpoints) of user interactions and resume from a saved checkpoint in follow up interactionsError recovery: Recover from failures at any given step in the graph execution by continuing

In [None]:
# 텍스트 파일 로더

from langchain_community.document_loaders import TextLoader

loader = TextLoader('history.txt')
data = loader.load()

print(type(data))
print(len(data)) 

In [None]:
# 폴더 순회

import os
from glob import glob

files = glob(os.path.join('./', '*.txt'))
files

from langchain_community.document_loaders import DirectoryLoader

loader = DirectoryLoader(path='./', glob='*.txt', loader_cls=TextLoader)

data = loader.load()

len(data)

In [None]:
# csv 파일

from langchain_community.document_loaders.csv_loader import CSVLoader

loader = CSVLoader(file_path='한국주택금융공사_주택금융관련_지수_20160101.csv', encoding='cp949')
data = loader.load()

len(data)

In [None]:
# pdf 문서 
# https://wikidocs.net/231565

## 문서 페이지 별 로드
## 형식이 없는 pdf 문서 로드
## 문서의 메타 데이터를 상세하게 추출 - 파일의 경로, 페이지, 정보, 키워드 등 상세 정보
## 온라인 문서 로드
## 특정 폴더 내부의 문서 로드