In [1]:
import os

import pinecone
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone

  from tqdm.autonotebook import tqdm


In [2]:
# Load the PDF through the Unstructured-based loader. Reference: https://python.langchain.com/en/latest/modules/indexes/document_loaders/examples/pdf.html
pdf_file = UnstructuredPDFLoader(
    file_path="path to file",  # Replace "path to file" with the PDF's location on your local machine
)
# `content` holds whatever the loader returns; the splitting cell consumes it.
content = pdf_file.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [3]:
# Split the loaded document into chunks (chunk_size=1000, chunk_overlap=0).
# Reference: https://python.langchain.com/en/latest/reference/modules/text_splitter.html
file_chunks = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
# `text_chunks` is the split form of `content`; each item's `page_content` is embedded later.
text_chunks = file_chunks.split_documents(content)

In [4]:
# Credentials are read from environment variables so that real keys never have
# to be saved inside the notebook. Export OPENAI_API_KEY, PINECONE_API_KEY and
# PINECONE_API_ENV before starting Jupyter. When a variable is not set, the
# placeholder string is used instead and must be replaced by hand.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your_openai_key")  # OpenAI API key
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "your_pinecone_api")  # Pinecone API key
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "your_pinecone_env")  # Pinecone environment name

In [5]:
# Embedding object used for semantic search. Reference: https://python.langchain.com/en/latest/modules/models/text_embedding/examples/openai.html
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Initialise the Pinecone client with the key and environment configured above.
# Reference: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/pinecone.html
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

index_name = "Write pinecone index name"  # Replace "Write pinecone index name" with Pinecone index name

# Only the text of each chunk is passed on to the vector store.
texts = [chunk.page_content for chunk in text_chunks]

# Build the vector store from the chunk texts and the embedding object.
# Reference: https://python.langchain.com/en/latest/reference/modules/vectorstore.html
# NOTE(review): re-running this cell presumably uploads the same chunks again —
# confirm whether duplicate vectors matter for your index.
docsearch = Pinecone.from_texts(
    texts,
    embeddings,
    index_name=index_name,
)

In [6]:
# OpenAI LLM client configured with temperature=0. Reference: https://python.langchain.com/en/latest/modules/models/llms/integrations/openai.html
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)
# Question-answering chain built on that LLM with chain_type="stuff".
# Reference: https://python.langchain.com/en/latest/use_cases/question_answering.html
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [13]:
# Ask the user for a question interactively.
query = input("Write your query")
# Fetch the stored documents most similar to the question.
# Reference: https://python.langchain.com/en/latest/modules/indexes/vectorstores/examples/pinecone.html
# NOTE(review): `include_metadata` is passed as an extra keyword argument —
# confirm the installed LangChain version's `similarity_search` accepts it.
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [14]:
# Run the QA chain on the retrieved documents and the user's question.
# The call is the cell's last expression, so its result is displayed as the cell output.
chain.run(
    question=query,
    input_documents=docs,
)

' The IEEE definition of software engineering is "the application of a systematic, disciplined, quantifiable approach to the development, operation, and maintenance of software; that is, the application of engineering to software."'