In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

### Load your data

In [None]:
# IPython introspection: shows the help/docstring for UnstructuredPDFLoader.
# Development aid only; it is not needed for the rest of the notebook to run.
?UnstructuredPDFLoader

In [4]:
# Build a loader for the PDF stored in the local data directory.
loader = UnstructuredPDFLoader("../data/Smart Contract and DeFi Security.pdf")
# Disabled alternative: load a PDF from a URL instead of a local file.
# loader = OnlinePDFLoader("https://wolfpaulus.com/wp-content/uploads/2017/05/field-guide-to-data-science.pdf")

In [5]:
# Run the loader. The following cells treat the result as a sequence of
# documents, each exposing its text through `page_content`.
data = loader.load()

In [6]:
# Sanity check after loading: number of documents and size of the first one.
print(f"You have {len(data)} document(s) in your data")
print(f"There are {len(data[0].page_content)} characters in your document")

You have 1 document(s) in your data
There are 71269 characters in your document


### Chunk your data up into smaller documents

In [7]:
# Split the loaded documents into smaller chunks (chunk_size=1000, no overlap
# between neighbouring chunks).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(data)

In [8]:
# Report how many chunks the splitter produced.
print(f"Now you have {len(texts)} documents")

Now you have 97 documents


### Create embeddings of your documents to get ready for semantic search

In [9]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [10]:
import os
import warnings

# Credentials are read from the environment so that secrets never live in the
# notebook or its version history. The keys that used to be hard-coded in this
# cell must be treated as compromised and revoked in the provider consoles.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# The Pinecone environment/region is not a secret; the previous value is kept
# as the default so existing setups keep working without extra configuration.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "asia-southeast1-gcp")

# Surface a missing key here instead of deep inside a later API call.
# When a variable is unset its value is None; the OpenAI/Pinecone clients are
# then expected to fall back to their own configuration lookup or raise --
# NOTE(review): confirm this for the installed library versions.
_missing_credentials = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not value
]
if _missing_credentials:
    warnings.warn(
        "Missing environment variable(s): "
        + ", ".join(_missing_credentials)
        + ". Set them before starting the notebook kernel."
    )

In [11]:
# Embeddings client, configured with OPENAI_API_KEY; it is handed to the
# Pinecone vector store when the chunks are indexed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [12]:
# Connect the Pinecone client. Both values are shown next to each other in the
# console at app.pinecone.io.
pinecone.init(
    environment=PINECONE_API_ENV,
    api_key=PINECONE_API_KEY,
)

# Name of the Pinecone index used by the vector store.
index_name = "langchain2"

In [13]:
# Embed the text of every chunk and store the vectors in the Pinecone index.
# NOTE(review): re-running this cell presumably uploads the same chunks again;
# confirm whether the index de-duplicates them.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [14]:
# Retrieve the chunks that are most similar to the question.
query = "What are vulnerabilities based on Smart Contract Layer, and Protocol Layer"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [15]:
# Display the second element of the search results for a quick manual check.
docs[1]

Document(page_content='We chose this dataset because it reflects the real-world attacks that have occurred in the smart contract and DeFi ecosystem. While other related works [14, 39] have employed datasets of known vul- nerable contracts or contracts with induced vulnerabilities, we believe that our selection of real-world attacks provides a more representative sample of the types of vulnerabilities smart con- tract developers and auditors should be aware of. Furthermore, the contracts in the dataset have greater complexity than minimal examples, making reasoning about them more challenging.\n\nFigure 2: Summary of vulnerability categories and the num- ber of corresponding exploits in the Zhou et al. dataset [61]. ● indicates tool support for a corresponding vulnerability type. SC: Smart Contract Layer, PRO: Protocol Layer. We ex- clude vulnerability types that (1) the tools cannot support and (2) do not exist in the dataset.\n\nfor security tools in GitHub repositories. The above pro

### Query those docs to get your answer back

In [16]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [17]:
# Completion model (temperature 0) and a question-answering chain of type
# "stuff" built on top of it.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
# Question to answer, together with the chunks retrieved for it; these are the
# inputs of the QA chain call in the next cell.
query = "What are vulnerabilities based on Smart Contract Layer, and Protocol Layer"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [19]:
# Ask the QA chain to answer `query`, passing the retrieved chunks as input documents.
chain.run(input_documents=docs, question=query)

' Vulnerabilities based on the Smart Contract Layer include absence of coding logic or sanity check, reentrancy, function/state visibility error, inconsistent, improper or unprotected access control, logic errors, direct call to untrusted contract, delegatecall to untrusted callee, and improper asset locks or frozen asset. Vulnerabilities based on the Protocol Layer include on-chain oracle manipulation, liquidity borrow, purchase, mint, deposit, camouflage a token contract, token standard incompatibility, other unsafe DeFi protocol dependency, unfair slippage protection, unfair liquidity providing, other protocol vulnerabilities, transaction order dependence, other unfair or unsafe DeFi protocol interaction, camouflage a non-token contract, weak randomness, unhandled or mishandled exception, and unbounded or gas costly operation.'

In [20]:
# Follow-up question: retrieve matching chunks, then let the chain answer.
query = "What are the most common vulnerabilities?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)
chain.run(
    question=query,
    input_documents=docs,
)

' The most common vulnerabilities in the dataset were integer overflow/underflow, function/state visibility errors, timestamp dependency, token standard incompatibility, reentrancy, unhandled/mishandled exceptions, absence of coding logic, improper asset locks/frozen asset, and logic errors. Oracle manipulation was also identified as a common vulnerability.'