In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load data

In [2]:
# Create a loader for the PDF "gpt4-tech-report.pdf". The path is relative, so
# the file is looked up relative to the notebook's working directory. The
# document itself is obtained by the `loader.load()` call in the following cell.
loader = UnstructuredPDFLoader("gpt4-tech-report.pdf")

In [3]:
# Run the loader and keep the resulting documents in `data` (indexed and
# measured with len() in the next cell).
# In the recorded run this printed a notice that detectron2 is not installed,
# so partitioning fell back from the "hi_res" strategy to the "fast" strategy.
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [4]:
# Sanity check on the load step: how many documents came back, and how large
# the first one's text is.
loaded_doc_count = len(data)
print(f'You have {loaded_doc_count} document(s) in your data')

first_doc_char_count = len(data[0].page_content)
print(f'There are {first_doc_char_count} characters in your document')

You have 1 document(s) in your data
There are 263628 characters in your document


## Chunk the data

In [5]:
# Split the loaded document(s) into smaller pieces before they are embedded.
# chunk_size=1000 with chunk_overlap=0, i.e. neighbouring chunks are not meant
# to share any text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(data)

In [6]:
# Show how many chunks the splitter produced.
chunk_count = len(texts)
print(f'Now you have {chunk_count} documents')

Now you have 301 documents


In [7]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import os

  from tqdm.autonotebook import tqdm


In [8]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError in this cell.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
# Note: the environment variable is named PINECONE_KEY, not PINECONE_API_KEY.
PINECONE_API_KEY = os.environ['PINECONE_KEY']
# Pinecone environment (shown next to the API key in the Pinecone console).
# It can be overridden by setting the PINECONE_API_ENV environment variable;
# when that variable is unset, the previously hard-coded value is used.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'us-east4-gcp')

In [9]:
# OpenAI embeddings object, authenticated with the key read from the
# environment above. It is handed to `Pinecone.from_texts` further down.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [10]:
# Initialize the Pinecone client.
#   api_key     -> find at app.pinecone.io
#   environment -> next to api key in console
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index used when building the vector store below.
index_name = "langchain"

In [11]:
# Build the Pinecone-backed vector store from the chunk texts.
# Only each chunk's `page_content` is passed, so no per-chunk metadata is
# supplied (the recorded search results below show `metadata={}`).
chunk_texts = [chunk.page_content for chunk in texts]
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [14]:
# First retrieval check: run a similarity search for the question against the
# vector store and keep the matching chunks in `docs`.
query = "Which areas does GPT-4 work best?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [15]:
# Display the retrieved chunks to inspect what the similarity search returned.
docs

[Document(page_content='I can help you with, feel free to ask.GPT-4', metadata={}),
 Document(page_content='Askell et al.2022Askell et al.2022gpt-3.5-basegpt-3.5-basegpt-3.5-turbogpt-4-basegpt-4-basegpt-40%10%20%30%40%50%60%70%ModelAccuracyAccuracy on adversarial questions (TruthfulQA mc1)Anthropic-LMgpt-3.5gpt-4Figure8:PerformanceofGPT-4onTruthfulQA.Accuracyisshownonthey-axis,higherisbetter.WecompareGPT-4underzero-shotprompting,few-shotprompting,andafterRLHFﬁne-tuning.GPT-4signiﬁcantlyoutperformsbothGPT-3.5andAskelletal[100].ﬁxestoplotlegendandtitle65', metadata={}),
 Document(page_content='wide scoring bins. For example although GPT-4 attains the highest possible score on AP Biology (5/5),\n\nthis is only shown in the plot as 85th percentile because 15 percent of test-takers achieve that score.\n\nGPT-4 exhibits human-level performance on the majority of these professional and academic exams.\n\nNotably, it passes a simulated version of the Uniform Bar Examination with a score in the

### Query docs to get answers back

In [16]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [17]:
# OpenAI completion model with temperature=0, using the same API key as the
# embeddings.
llm = OpenAI(
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)

# Question-answering chain of type "stuff" built on top of that model.
# NOTE(review): per the LangChain docs, "stuff" places all input documents
# into a single prompt - confirm this fits the model's context window.
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
# Question asked in Chinese; in English: "In which areas does GPT-4 surpass
# GPT-3?". Retrieve the chunks most similar to it for the QA chain.
query = "GPT-4 在哪些领域超过 GPT-3?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [19]:
# Run the QA chain, passing the chunks retrieved in the previous cell as the
# input documents. In the recorded run the answer came back in English even
# though the question was asked in Chinese.
chain.run(input_documents=docs, question=query)

' GPT-4 outperforms GPT-3 in areas such as reasoning, knowledge retention, coding, simulated bar exams, traditional NLP benchmarks, and the MMLU benchmark.'

In [24]:
# Broad, summary-style question: retrieve the chunks most similar to it.
query = "What are the major topics for this paper?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [25]:
# Run the QA chain on the chunks retrieved for the "major topics" question.
# NOTE(review): the recorded answer is a long list of exam names and is cut
# off mid-word, rather than a summary of the paper's topics. The chain only
# sees the chunks in `docs`, so the retrieved chunks likely drove this -
# inspect `docs` to confirm.
chain.run(input_documents=docs, question=query)

' The major topics for this paper are the different types of exams and tests, such as the AP Art History (FRQ), AP World History (MCQ), Graduate Record Examination (GRE) Verbal, AP US Government (FRQ), AP Physics 2 (FRQ), AP US Government (MCQ), SAT EBRW – Reading Portion, MKSAP Questions (MCQ), AP Chemistry (MCQ), AP Statistics (FRQ), AP Psychology (MCQ), AP Chemistry (FRQ), AP Macroeconomics (MCQ), AP Statistics (MCQ), Certiﬁed Sommelier (theory knowledge), SAT Math (MCQ), AP Calculus BC (MCQ), AP Environmental Science (MCQ), Introductory Sommelier (theory knowledge), USNCO Local Section Exam 2022, Advanced Sommelier, (theory knowledge), AMC 12, AMC 10, AP Microeconomics (MCQ), USA Biolympiad Semiﬁnal Exam 2020, AP Biology (MCQ), AP Art History (MCQ), Uniform Bar Exam (MBE+MEE+MPT), SAT EBRW – Writing Portion, Leetcode (medium), Leetcode (hard), Le'