In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback


# Text: question answering over a plain-text file

In [None]:
# Disabled alternative: build the vector index and run the query in one step
# with VectorstoreIndexCreator. Before uncommenting, note that it uses
# `loader`, `save_path` and `query`, which the visible notebook only assigns
# in the following cell.
# from langchain.indexes import VectorstoreIndexCreator

# index = VectorstoreIndexCreator(
#     vectorstore_kwargs={"persist_directory": save_path},
# ).from_loaders([loader])

# index.query(query)

In [None]:
# Index a plain-text novel in Chroma and answer one question about it with a
# "stuff" RetrievalQA chain.

# Explicit encoding: the file contains Chinese text, and without it the loader
# depends on the platform's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
loader = TextLoader('./data/小说.txt', encoding='utf-8')

save_path = 'novel' # db

# Split the loaded document into chunks of at most 300 characters, no overlap.
# text_splitter = CharacterTextSplitter(separator = "\n", chunk_size=300, chunk_overlap=0)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=0)
texts = text_splitter.split_documents(loader.load())

embeddings = OpenAIEmbeddings()

db = Chroma.from_documents(texts, embeddings, persist_directory=save_path)
# Flush the index to `save_path` now; in a notebook the store object stays
# alive, so the data is not reliably written to disk without an explicit call.
db.persist()

retriever = db.as_retriever()

qa = RetrievalQA.from_chain_type(llm=OpenAI(temperature=0), chain_type="stuff", retriever=retriever)
query = "张小凡使用什么方式打败了鬼王？"

# Last expression of the cell: the answer is shown as the cell output.
qa.run(query)

# PDF: question answering over a PDF document

- Method 1: extract the text with PyPDF2 and index it in FAISS

In [None]:
from PyPDF2 import PdfReader
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS
from langchain.chains.question_answering import load_qa_chain

# Answer a question about a PDF: extract the text of every page, split it into
# chunks, index the chunks in an in-memory FAISS store, and run a "stuff" QA
# chain over the chunks most similar to the query.

reader = PdfReader('./data/1705.07750.pdf')

# Collect the text of each page, skipping pages that yield no text.
page_texts = []
for page in reader.pages:
    text = page.extract_text()
    if text:
        page_texts.append(text)

# Join pages with a newline so the last word of one page does not fuse with the
# first word of the next, and so the splitter has a break at page boundaries.
raw_text = '\n'.join(page_texts)

print(len(raw_text))

# Chunks of at most 500 characters, no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_text(raw_text)

embeddings = OpenAIEmbeddings()

docsearch = FAISS.from_texts(texts, embeddings)

chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff")

query = "who are the authors of the article?"
docs = docsearch.similarity_search(query)
# Last expression of the cell: the answer is shown as the cell output.
chain.run(input_documents=docs, question=query)

- Method 2: load the PDF with UnstructuredPDFLoader and index it in Chroma

In [6]:
# System packages to install before running this cell:
#   sudo apt-get update
#   sudo apt-get install tesseract-ocr
# NOTE(review): presumably needed for the OCR partitioning fallback that the
# loader reports in this cell's output - confirm.

from langchain.document_loaders import UnstructuredPDFLoader
from langchain.chains.question_answering import load_qa_chain

# Directory in which the Chroma index for this paper is stored.
save_path = 'cv_paper'

# Load the PDF and split it into chunks of at most 500 characters, no overlap.
loader = UnstructuredPDFLoader('./data/1705.07750.pdf')
pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0))

embeddings = OpenAIEmbeddings()

# Keep a handle on the store (instead of chaining .as_retriever() directly) so
# the index can be flushed to `save_path` explicitly.
paper_db = Chroma.from_documents(pages, embeddings, persist_directory=save_path)
paper_db.persist()

# Retriever over the persisted index.
docsearch = paper_db.as_retriever()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with another strategy.
Falling back to partitioning with ocr_only.
Using embedded DuckDB with persistence: data will be stored in: cv_paper


In [None]:
# Ask (in Chinese) what the paper's main contributions are, answer it with a
# verbose "stuff" QA chain over the retrieved chunks, and report the OpenAI
# token usage and cost recorded for that single call.
query = "这篇文章主要的贡献有哪些"

chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff", verbose=True)
docs = docsearch.get_relevant_documents(query)

# Only the chain call runs inside the callback context, so the counters on
# `cb` cover exactly this request.
with get_openai_callback() as cb:
    response = chain.run(input_documents=docs, question=query)

# The answer line carries its own trailing newline, which leaves one blank
# line between the answer and the usage figures.
report_lines = [
    f"Answer: {response}\n",
    f"Total Tokens: {cb.total_tokens}",
    f"Prompt Tokens: {cb.prompt_tokens}",
    f"Completion Tokens: {cb.completion_tokens}",
    f"Successful Requests: {cb.successful_requests}",
    f"Total Cost (USD): ${cb.total_cost}",
]
print("\n".join(report_lines))