In [1]:
from dotenv import load_dotenv
# Call python-dotenv's load_dotenv(); as the cell's last expression its return
# value is what the notebook displays.
# NOTE(review): presumably this supplies the API keys used by the embedding and
# chat-model clients created in later cells — confirm against the .env file.
load_dotenv()

True

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# Read the source PDF into a list of LangChain documents.
loader = PyMuPDFLoader("dongguk_update.pdf")
docs = loader.load()

# Report how many documents the loader produced.
doc_count = len(docs)
print(doc_count)



240


In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into overlapping chunks so each piece is small
# enough to embed and retrieve individually.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)
split_documents = text_splitter.split_documents(docs)

# Report the number of chunks produced (message text is user-facing Korean).
chunk_count = len(split_documents)
print(f"분할된 청크의수: {chunk_count}")

분할된 청크의수: 265


In [4]:
from langchain_upstage import UpstageEmbeddings
# Embedding client using the "solar-embedding-1-large" model; it is handed to
# the Chroma vector store as its embedding function in a later cell.
embeddings = UpstageEmbeddings(model="solar-embedding-1-large")

In [5]:
from langchain_chroma import Chroma

# Open the persisted collection under ./chroma (it is created if it does not
# exist yet). Queries and any newly added documents are embedded with
# `embeddings`.
database = Chroma(collection_name='chroma-dongguk', persist_directory="./chroma", embedding_function=embeddings)

# Index the chunks only when the collection is still empty. This replaces the
# previous "uncomment on first run" toggle: a fresh checkout now builds the
# index automatically, while later runs reuse the stored vectors without
# re-embedding (and without inserting duplicates).
if not database.get(limit=1)["ids"]:
    database.add_documents(split_documents)

In [6]:
from langchain_core.prompts import PromptTemplate

# Prompt for the retrieval QA chain.
#   {context}  - retrieved document text inserted by the chain
#   {question} - the user's question
# The wording fixes typos and an incomplete language rule from the earlier
# draft, and keeps the citation instruction next to the citation format.
# The Korean "# format" line is the literal sentence the model must emit,
# so it is intentionally left untranslated.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    You are an AI assistant who answers questions related to school life of Dongguk University students.
    Use the following pieces of retrieved context to answer the question.
    Please answer in the language of the question. For example, if the question is in English, answer in English, and if the question is in Chinese, answer in Chinese.
    If the answer to the user's question varies depending on the situation or environment, please encourage the user to ask a more specific question.
    You should never answer with inaccurate or incorrect content. Let's think step by step.
    Please present the page of the document you referred to when answering in the following format. If you can't answer the question, don't provide a page.
    # format : 이 답변은 2024 신입생 학업이수가이드 ??? 페이지를 참고해 작성되었습니다.

    Retrieved Context: {context}

    Question: {question}

    Answer:
    """
)

In [8]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model used to generate answers; temperature is pinned to 0.
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Retrieval QA chain: fetch chunks from the Chroma store, insert them into the
# custom prompt, and ask the chat model. "stuff" is the library default chain
# type, written out here for clarity.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=database.as_retriever(),
    chain_type_kwargs=dict(prompt=prompt),
)

In [10]:
# Sample question, in Korean: "Tell me Dongguk University's founding philosophy."
query='동국대학교 건학 이념 알려줘'

In [11]:
# Run the QA chain on the question; the chain input is passed under the 'query' key.
ai_message=qa_chain.invoke({'query':query})

In [12]:
# Display the generated answer text, stored under the 'result' key of the chain output.
ai_message['result']

'동국대학교의 건학 이념은 불교정신을 바탕으로 학술과 인격을 연마하고, 민족과 인류사회 및 자연에 이르기까지 지혜와 자비를 충만케 하여 서로 신뢰하고 공경하는 이상 세계의 구현을 목표로 하고 있습니다. \n\n이 답변은 2024 신입생 학업이수가이드 3 페이지를 참고해 작성되었습니다.'