## input PDF

In [None]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import SpacyTextSplitter

In [None]:
# Load the sample PDF from the relative path below. `documents` holds whatever
# the loader returns and is the input to the splitting step.
# NOTE(review): presumably one document per PDF page — confirm against the
# PyMuPDFLoader documentation.
loader = PyMuPDFLoader("../data/sample.pdf")
documents = loader.load()

In [None]:
# Display how many documents the loader produced.
len(documents)

In [None]:
# Split the loaded documents into smaller chunks with a spaCy-based splitter
# configured with the "ja_core_news_sm" (Japanese) pipeline and chunk_size=1000.
# NOTE(review): the spaCy model must be installed separately — confirm the
# environment provides it.
text_splitter = SpacyTextSplitter(chunk_size=1000, pipeline="ja_core_news_sm")
split_documents = text_splitter.split_documents(documents)

In [None]:
# Print the number of chunks, then display the first chunk for inspection.
# Indexing [0] raises IndexError if the splitter produced no chunks.
print(len(split_documents))
split_documents[0]

## store vectors

In [None]:
from langchain.embeddings import OpenAIEmbeddings
import chromadb
from chromadb.config import Settings
from langchain.vectorstores import Chroma

In [None]:
# OpenAI embedding model handed to the vector store.
# NOTE(review): presumably reads the API key from the environment
# (e.g. OPENAI_API_KEY) — confirm.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002"
)

# HTTP client for a Chroma server at host "chromadb", port 8000, created with
# allow_reset=True and anonymized_telemetry=False.
# NOTE(review): "chromadb" looks like a container/service hostname — confirm
# it resolves in the environment where this notebook runs.
client = chromadb.HttpClient(
    host="chromadb",
    port=8000,
    settings=Settings(allow_reset=True, anonymized_telemetry=False),
)

In [None]:
# Create a new DB: a Chroma store bound to the "langchain_store" collection on
# the remote client, using `embeddings` as its embedding function.
db = Chroma(
    collection_name="langchain_store",
    embedding_function=embeddings,
    client=client,
)
# Index the chunks produced by the text splitter rather than the un-split
# `documents`; otherwise the splitting step has no effect on retrieval.
# The store embeds with the `embedding_function` given above, so no separate
# embedding argument is passed here.
db.add_documents(documents=split_documents)

## RAG

In [None]:
from langchain_openai import ChatOpenAI

# Chat model used as the generation step of the RAG chain.
llm = ChatOpenAI(model="gpt-4o-mini")

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# Parser used as the last step of the chain.
# NOTE(review): presumably converts the chat model's message into a plain
# string — confirm against the StrOutputParser documentation.
output_parser = StrOutputParser()

In [None]:
# Expose the Chroma store as a retriever; no search options are passed, so the
# library defaults apply.
retriever = db.as_retriever()

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt template with two placeholders, {context} and {question}.
# The Japanese text reads roughly: "Answer the question based only on the
# following context.", followed by the context and "Question: {question}".
template = """次の文脈（context）のみに基づいて質問（question）に答えてください。:
{context}

質問: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Build the chain with LCEL (the result is obtained when the chain is invoked).
# The chain input is sent to the retriever to fill {context} and is passed
# through unchanged as {question}; the filled prompt then goes to the chat
# model and finally to the output parser.
# NOTE(review): the retriever output is inserted into the prompt as-is, with no
# explicit formatting of the retrieved documents — confirm this is intended.
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [None]:
# NOTE(review): `query` is an empty string here — presumably a placeholder to
# be replaced with a real question before running; confirm.
query = ""
# Run the chain on the query and print the answer.
result = chain.invoke(query)
print(result)
# query_vector = embeddings.embed_query(query)

In [None]:
# searched_documents = db.similarity_search_by_vector(query_vector)

In [None]:
# for searched_document in searched_documents:
#     print(searched_document.page_content)