# Chat with all PDF files and OpenAI

### Library Imports

In [2]:
!pip install PyPDF2



In [11]:
import os
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI


# Fetch the OpenAI API key from the process environment (None when the variable is not set)
openai_key = os.environ.get("OPENAI_API_KEY")

### Load a PDF file and Read its contents

In [4]:
# Root directory of the datasets, configured through the AI_DATASETS_PATH environment variable
DATA_DIR = os.getenv("AI_DATASETS_PATH")
if DATA_DIR is None:
    # Fail early with a clear message: os.path.join(None, ...) would otherwise raise an opaque TypeError
    raise RuntimeError(
        "Environment variable AI_DATASETS_PATH is not set; "
        "point it at the directory that contains genai_datasets/"
    )
PDF_PATH = os.path.join(DATA_DIR, "genai_datasets/Docs/yolov7paper.pdf")

# Initialize PDF Reader
reader = PdfReader(PDF_PATH)

# Extract the text of every page and concatenate it into raw_text.
# Pages for which extract_text() yields nothing (None or an empty string) are skipped.
page_texts = [page.extract_text() for page in reader.pages]
raw_text = "".join(page_text for page_text in page_texts if page_text)


### Split Text into Smaller Chunks

In [7]:
# Break the extracted text into smaller chunks so that the context handed to the model
# during information retrieval stays below its maximum input length.
# OpenAI models such as GPT 3.5 or GPT 4 have a maximum token limit:
# 4096 tokens for gpt-3.5-turbo, 8192 for gpt-4-8k and 32768 for gpt-4-32k.

# Sizes are measured with len(), i.e. in characters (not tokens)
CHUNK_SIZE = 1000     # upper bound for the length of one chunk
CHUNK_OVERLAP = 200   # characters shared between two consecutive chunks

textsplitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
    separator="\n",   # the text is split on line breaks
)

# Turn the raw text into the list of chunks
texts = textsplitter.split_text(raw_text)

print(f"Length of the chunks(number of chunks splitted) - {len(texts)}")

Length of the chunks(number of chunks splitted) - 84


In [9]:
# Inspect a single chunk (index 3) to sanity-check the split
sample_chunk = texts[3]
print(sample_chunk)

Figure 1: Comparison with other real-time object detectors, our
proposed methods achieve state-of-the-arts performance.
opment of MCUNet [49, 48] and NanoDet [54] focused on
producing low-power single-chip and improving the infer-
ence speed on edge CPU. As for methods such as YOLOX
[21] and YOLOR [81], they focus on improving the infer-
ence speed of various GPUs. More recently, the develop-
ment of real-time object detector has focused on the de-
sign of efﬁcient architecture. As for real-time object de-
tectors that can be used on CPU [54, 88, 84, 83], their de-
sign is mostly based on MobileNet [28, 66, 27], ShufﬂeNet
[92, 55], or GhostNet [25]. Another mainstream real-time
object detectors are developed for GPU [81, 21, 97], they
mostly use ResNet [26], DarkNet [63], or DLA [87], and
then use the CSPNet [80] strategy to optimize the architec-
ture. The development direction of the proposed methods in
this paper are different from that of the current mainstream


### Convert to embeddings

In [10]:
# Embedding model used to turn the text chunks into vectors
embeddings = OpenAIEmbeddings()

# LangChain supports a number of vector stores; FAISS is the one used here.
# from_texts computes the embedding of each chunk and stores it in the
# document-search index that the queries below run against.
docsearch = FAISS.from_texts(texts=texts, embedding=embeddings)

### Build the Query - Answer using OpenAI

In [12]:
# Question-answering chain of type "stuff", backed by the OpenAI LLM
llm = OpenAI(api_key=openai_key)
chain = load_qa_chain(llm, chain_type="stuff")

In [13]:
query = "Who are the authors of this article YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors"
# Retrieve the chunks most similar to the question from the FAISS index
docs = docsearch.similarity_search(query)

# Pass the retrieved chunks together with the question to the QA chain and have the LLM answer back.
# NOTE: keyword arguments to run() are treated as chain inputs, so a `verbose=True` here would not
# switch on verbose logging; to get verbose output, pass verbose=True to load_qa_chain instead.
chain.run(input_documents=docs, question=query)

' The authors of this article are Chien-Yao Wang, Alexey Bochkovskiy, and Hong-Yuan Mark Liao.'

In [14]:
# Ask which models YOLOv7 outperforms
query = "YOLOv7 surpasses which models"
# Look up the chunks most similar to the question
docs = docsearch.similarity_search(query=query)
# Let the QA chain answer from the retrieved chunks
chain.run(question=query, input_documents=docs)

' YOLOv7 surpasses YOLOv4, YOLOR-CSP, YOLOv4-tiny-31.'

In [18]:
# Ask about the training dataset
query = "What dataset YOLOv7 is trained on?"
# Look up the chunks most similar to the question
docs = docsearch.similarity_search(query=query)
# Let the QA chain answer from the retrieved chunks
chain.run(question=query, input_documents=docs)

' YOLOv7 is trained on MS COCO dataset from scratch without using any other datasets or pre-trained weights.'

In [19]:
# Ask a question about a topic other than YOLOv7
query = "Do you know about Google Bard?"
# Look up the chunks most similar to the question
docs = docsearch.similarity_search(query=query)
# Let the QA chain answer from the retrieved chunks
chain.run(question=query, input_documents=docs)

" No, I don't know about Google Bard."