In [7]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser



# --- Indexing stage: PDF -> chunks -> embeddings -> FAISS -> retriever ---

# Load the source PDF into LangChain documents.
loader = PyMuPDFLoader("data/SPRi AI Brief_10월호_산업동향_F.pdf")
docs = loader.load()

# Break the loaded documents into overlapping chunks (size 800, overlap 100).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
split_documents = text_splitter.split_documents(docs)

# Embed every chunk and store the vectors in an in-memory FAISS index.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(split_documents, embeddings)

# Expose the index as a retriever; no arguments are passed, so library defaults apply.
retriever = vectorstore.as_retriever()


# Prompt template for the RAG chain. It has two placeholders, {context} and
# {question}, and instructs the model to answer from the supplied context in a
# warm, gentle tone, to admit when it does not know, and to reply in Korean.
prompt = PromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question.
    Use a very kind and gentle tone like a kindergarten teacher talking to a child.
    Speak in a warm and friendly way.
    If you don't know the answer, just say that you don't know. 
    Answer in Korean.

    #Context: 
    {context}

    #Question:
    {question}

    #Answer:
    """
)


# Chat model used to generate the answer.
llm = ChatOpenAI(model="gpt-4o", temperature=0)


def format_pdf_docs(documents):
    """Join retrieved chunks into a single plain-text context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The chunk texts concatenated with a blank line between them.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# The retriever output is piped through format_pdf_docs so that {context} receives
# only the chunk text. Passing the retriever directly would insert the string form
# of a list of Document objects (metadata and repr noise included) into the prompt.
chain = (
    {"context": retriever | format_pdf_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Run the chain: the question string is used both as the retrieval query and as
# the {question} value in the prompt.
question = "메타버스"
response = chain.invoke(question)

print(response)

미안하지만, 메타버스에 대한 정보는 제공된 문서에서 찾을 수 없어요. 다른 질문이 있으면 언제든지 물어봐 주세요!


In [9]:
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
import bs4

# Load environment variables via python-dotenv.
load_dotenv()

# Load the news article, parsing only the <div> elements selected by the class filter.
loader = WebBaseLoader(
    web_path=("https://n.news.naver.com/article/437/0000416134"),
    bs_kwargs=dict(
        # Restrict parsing to the listed elements instead of the whole page.
        # NOTE(review): "media_end_head  go_trans" contains two consecutive spaces;
        # confirm this matches the page's actual class attribute.
        parse_only=bs4.SoupStrainer(
            "div",
            attrs={"class" : ["newsct_article _article_body","media_end_head  go_trans"]}
        )
    )
)

docs = loader.load()

# Split the article into small overlapping chunks (size 100, overlap 10).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

splits = text_splitter.split_documents(docs)

# Fail early with a clear message if the class filter matched nothing, instead of
# letting index construction fail later on an empty chunk list.
if not splits:
    raise ValueError(
        "No text was extracted from the article; check the URL and the "
        "SoupStrainer class filter."
    )

# Create the embedding model.
embedding = OpenAIEmbeddings()

# Build the FAISS vector store from the chunks.
vectorstore = FAISS.from_documents(documents=splits, embedding=embedding)

# Create the retriever. The name `retriver` is kept as spelled because the chain
# built later in this cell refers to it.
retriver = vectorstore.as_retriever()

# Runtime stage: build the prompt, the LLM and the chain, then ask a question


# Prompt template (Korean) with {question} and {context} placeholders. It tells
# the model to answer from the supplied context, to say it does not know when the
# context has no answer, and to reply in Korean.
# Two typos in the "does not know" instruction were corrected ("문잭" -> "문맥",
# "겨우" -> "경우") so that sentence reads as intended.
prompt = PromptTemplate.from_template(
    """
    당신은 질문-답변을 수행하는 AI 어시스턴트이다.
    주어진 문맥에 검색된 다음문맥(context)를 사용해 질문에 답해야한다.
    만약 주어진 문맥에서 답을 찾을 수 없는 경우, 모른다고 이야기하세요
    한글로 답변해주세요.
    
    #Question:
    {question}
    
    #Context:
    {context}
    
    #Answer:
    """
)

# Chat model used to generate the answer.
llm = ChatOpenAI(model="gpt-4o", temperature=0)


def format_news_docs(documents):
    """Join retrieved article chunks into a single plain-text context string.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string
            (the documents returned by the retriever).

    Returns:
        The chunk texts concatenated with a blank line between them.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# Build the chain. The retriever output is piped through format_news_docs so that
# {context} receives only the chunk text rather than the string form of a list of
# Document objects.
news_chain = (
    {"context": retriver | format_news_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Ask the question: the string is used both as the retrieval query and as the
# {question} value in the prompt.
answer = news_chain.invoke("아파트 가사 알려줘")

print(answer)

모르겠습니다. 주어진 문맥에서는 '아파트'의 가사에 대한 정보가 없습니다.
