In [35]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.callbacks import get_openai_callback
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

# Chat model that will answer the question from the retrieved context.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# On-disk byte store holding previously computed embedding vectors.
cache_dir = LocalFileStore("./tmp/embeddings")

# Split the document on newlines; chunk size and overlap are counted with the
# tiktoken encoder rather than in raw characters.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredFileLoader("./tmp/read3.pdf")

# Load the PDF and cut it into chunks with the splitter above.
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model name so that vectors produced by one
# model are never served for another model sharing the same cache directory.
# NOTE: entries cached earlier without a namespace are stored under different
# keys and will be recomputed once.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Question put to the document: the annual premium amount and the name of the
# insured person / covered party.
question = "年間保険料をいくらだと記載されている、被保険者名または対象者はなんと記載されている"

# Everything inside this block runs under the OpenAI callback, whose summary
# (tokens used, request count, cost) is printed at the end.
with get_openai_callback() as usage:
    # Index the chunks in Chroma using the cache-backed embeddings.
    vectorstore = Chroma.from_documents(
        embedding=cached_embeddings,
        documents=docs,
    )
    retriever = vectorstore.as_retriever()

    # Retrieval QA chain of type "stuff" over the Chroma retriever.
    chain = RetrievalQA.from_chain_type(
        retriever=retriever,
        chain_type="stuff",
        llm=llm,
    )

    result = chain.run(question)
    print(result)
    print(usage)


年間保険料は14,325,500円と記載されていますが、被保険者名や対象者に関する情報は提供されていません。
Tokens Used: 1651
	Prompt Tokens: 1601
	Completion Tokens: 50
Successful Requests: 1
Total Cost (USD): $0.0025015
