# Smart Contract Security Assistant - Exploration

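Use this notebook to explore the vulnerability-findings dataset and to test the retrieval-augmented generation (RAG) system: semantic search, the Q&A chain, and LLM-based code analysis.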

## Setup

In [None]:
import json
import sys

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Add the parent directory to the import path so the local `src` package can be imported.
sys.path.append('..')

from src.database import load_vulnerability_database, search_similar_vulnerabilities

# Read variables from a local .env file into the process environment
# (presumably the OpenAI API key used by ChatOpenAI below — confirm).
load_dotenv()

## Explore the Dataset

In [None]:
# Load a sample finding and preview its main fields.
# JSON is UTF-8 by specification, so the encoding is pinned explicitly instead
# of relying on the platform default (which is not UTF-8 on every system).
with open('../sample-smart-contract-dataset/finding_62000.json', encoding='utf-8') as f:
    sample_finding = json.load(f)

print("Sample Finding:")
print(f"ID: {sample_finding['id']}")
print(f"Title: {sample_finding['title']}")
print(f"Impact: {sample_finding['impact']}")
# Only the first 200 characters of the write-up are shown to keep the output short.
print(f"\nContent Preview: {sample_finding['content'][:200]}...")

## Load Vector Database

In [None]:
# Build or load the vector store that every search / Q&A cell below reuses.
# This will create embeddings on first run (takes 5-15 minutes)
# Subsequent runs will load from saved database (fast)
vectorstore = load_vulnerability_database()

## Test Semantic Search

In [None]:
# Search for reentrancy vulnerabilities
query = "reentrancy attack"
top_k = 5  # maximum number of findings to retrieve
results = search_similar_vulnerabilities(vectorstore, query, k=top_k)

# Report how many findings were actually returned: the store may hold fewer
# than `top_k` matches, and a hard-coded "Top 5" would drift if `top_k` changes.
print(f"Top {len(results)} results for '{query}':\n")
for i, doc in enumerate(results, 1):
    # Show only the first 150 characters of each finding.
    print(f"{i}. {doc.page_content[:150]}...\n")

In [None]:
# Try different queries
queries = [
    "input validation",
    "overflow underflow",
    "access control",
    "missing checks"
]

# Use cell-specific names so the `query` / `results` left by the reentrancy
# search cell are not silently overwritten when this cell runs.
for candidate_query in queries:
    candidate_results = search_similar_vulnerabilities(vectorstore, candidate_query, k=3)
    print(f"Query: {candidate_query}")
    print(f"Results: {len(candidate_results)} findings\n")

## Test Q&A Chain

In [None]:
# RetrievalQA is imported in the Setup cell together with the other imports.

# Initialize LLM (temperature=0 requests the least-random completions)
llm = ChatOpenAI(model="gpt-4", temperature=0)

# Create Q&A chain
# - chain_type="stuff": the retrieved findings are placed together into one prompt
# - the retriever returns the 5 most similar findings per question
# - return_source_documents=True: the result dict also carries 'source_documents'
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True
)

In [None]:
# Run one question through the retrieval Q&A chain and show the answer.
question = "What is a reentrancy attack and how do I prevent it?"
result = qa_chain(question)

print("Question:", question)
# Heading and answer text go out on consecutive lines.
print("\nAnswer:", result['result'], sep="\n")
# Number of retrieved findings the chain handed to the LLM.
print("\nSources used:", len(result['source_documents']))

## Test Code Analysis

In [None]:
# Deliberately vulnerable Solidity snippet used as the search input:
# `withdraw` sends funds before it decrements the caller's balance.
vulnerable_code = """
contract MyToken {
    mapping(address => uint) public balances;
    
    function withdraw(uint amount) public {
        require(balances[msg.sender] >= amount);
        msg.sender.call{value: amount}("");
        balances[msg.sender] -= amount;
    }
}
"""

# Use the raw source text as the query and fetch the 5 nearest findings.
similar_vulns = vectorstore.similarity_search(vulnerable_code, k=5)

print("Similar vulnerabilities found:")
for rank, hit in enumerate(similar_vulns, start=1):
    preview = hit.page_content[:200]  # first 200 characters only
    print(f"\n{rank}. {preview}...")

In [None]:
# Ask the LLM for an analysis, supplying the retrieved findings as context.
# The findings are separated by blank lines inside the prompt.
context = "\n\n".join(doc.page_content for doc in similar_vulns)

analysis_prompt = f"""
Analyze this Solidity code for vulnerabilities:

{vulnerable_code}

Known similar vulnerabilities:
{context}

List all security issues with severity levels.
"""

analysis = llm.predict(analysis_prompt)
# Heading first, then the model's response on the following line(s).
print("Analysis:", analysis, sep="\n")

## Experiment with Different Models

In [None]:
# Compare GPT-4 vs GPT-3.5-turbo
# ChatOpenAI is already imported in the Setup cell, so it is not re-imported here.
models = [
    ChatOpenAI(model="gpt-4", temperature=0),
    ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
]

# Cell-specific names keep the `question` / `result` produced by the Q&A cell
# intact, so re-inspecting them after this cell still shows the reentrancy answer.
comparison_question = "What is input validation?"

for model in models:
    # Same retrieval setup for each model; only the LLM differs.
    # NOTE: the retriever uses its default search settings here, unlike the
    # k=5 retriever configured for `qa_chain`.
    chain = RetrievalQA.from_chain_type(
        llm=model,
        retriever=vectorstore.as_retriever()
    )
    comparison_result = chain(comparison_question)
    print(f"Model: {model.model_name}")
    # Show only the first 200 characters of each answer for a side-by-side skim.
    print(f"Answer: {comparison_result['result'][:200]}...\n")

## Your Experiments

Use the cells below to experiment with your own queries and code!

In [None]:
# Your code here