In [None]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken

In [2]:
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS

from dotenv import load_dotenv
import os
# Load variables from a .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message when the key is missing: os.environ only
# accepts string values, so copying a missing (None) value into it would raise
# an opaque TypeError instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in a .env file or export it "
        "in the environment before running this notebook."
    )

# Change this to the local path where your PDF file is located.
local_pdf_path = "PDF/GPT4_MED_sample_paper.pdf"

reader = PdfReader(local_pdf_path)

# Concatenate the extracted text of every page; pages that yield no text
# (None or an empty string) contribute nothing.
raw_text = "".join(page.extract_text() or "" for page in reader.pages)

# Split the text into smaller chunks so that during information retrieval
# we don't hit the token size limits.
text_splitter = RecursiveCharacterTextSplitter(
    # CJK sentence terminators are listed first, then newline, then space.
    separators=["。", "！", "？", "\n", " "],
    chunk_size=1000,       # measured with length_function, i.e. in characters
    chunk_overlap=100,     # characters shared between neighbouring chunks
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

# An image-only (scanned) PDF produces no text; stop here with a clear message
# rather than handing an empty list to the vector store.
if not texts:
    raise ValueError(
        f"No extractable text found in {local_pdf_path!r}; cannot build the search index."
    )

# Create the OpenAI embeddings client used to vectorize each chunk.
embeddings = OpenAIEmbeddings()

# Embed every chunk and build the FAISS index used for similarity search.
docsearch = FAISS.from_texts(texts, embeddings)

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

# Build the question-answering chain on top of a default-configured OpenAI LLM.
# NOTE(review): chain_type="stuff" presumably places all retrieved documents
# into a single prompt — confirm against the LangChain documentation.
llm = OpenAI()
chain = load_qa_chain(llm, chain_type="stuff")

In [3]:
# Ask a broad question: look up the chunks most similar to it in the index,
# then have the QA chain answer using those chunks as context.
query = "Explain the paper"
docs = docsearch.similarity_search(query)
qa_inputs = {"input_documents": docs, "question": query}
chain.run(**qa_inputs)

" This paper investigates GPT-4's capabilities in the medical domain, specifically focusing on its performance on multiple-choice exam questions. It also looks at other aspects of GPT-4's behavior, such as its ability to provide explanations and conduct counterfactual analyses. The paper also examines the implications of these findings, such as the potential for reliable information in healthcare applications."

In [4]:
# Ask about a specific term: look up the chunks most similar to the question,
# then have the QA chain answer using those chunks as context.
# The question is kept in plain ASCII: a full-width "Ｗ" (U+FF37) is a different
# character from "W" and changes the text that gets embedded.
query = "What is 5-shot in this paper"
docs = docsearch.similarity_search(query)
chain.run(input_documents=docs, question=query)

' 5-shot in this paper refers to leave-one-out cross validation (LOOCV) accuracy, where for each evaluation sample, we draw the 5 few-shot exemplars randomly from the remainder of the dataset.'