In [None]:

!pip install --quiet llama-index  # main llamaindex library

!pip install --quiet llama-index-vector-stores-MongoDB # mongodb vector database

!pip install --quiet llama-index-llms-anthropic # anthropic LLM provider

!pip install --quiet llama-index-embeddings-openai # openai embedding provider

!pip install --quiet beautifulsoup4

!pip install --quiet pymongo pandas datasets # others


In [None]:
import os
import config

# Copy each API credential from the local `config` module into the process
# environment under the same name.
for credential_name in ("ANTHROPIC_API_KEY", "HF_TOKEN", "OPENAI_API_KEY"):
    os.environ[credential_name] = getattr(config, credential_name)

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.anthropic import Anthropic
from llama_index.core import Settings
# Claude model shared by the query engine and the agent built later on.
llm = Anthropic(model="claude-3-5-sonnet-20240620")

# OpenAI embedding model producing 256-dimensional vectors, 10 texts per batch.
# The key is passed through `api_key`, the constructor's documented parameter.
# NOTE(review): the Atlas vector index presumably has to be defined with the
# same number of dimensions (256) - confirm against the index definition.
embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=256,
    embed_batch_size=10,
    api_key=os.environ["OPENAI_API_KEY"],
)

# Register both models as the LlamaIndex-wide defaults.
Settings.embed_model = embed_model
Settings.llm = llm


from datasets import load_dataset
import pandas as pd

# Stream the training split of the BigVul dataset and keep only the first
# 4000 records of the stream.
# https://huggingface.co/datasets/bstee615/bigvul/viewer/default/train
# NOTE(review): the link above names `bstee615/bigvul` while the code loads
# `benjis/bigvul` - confirm both refer to the same dataset.
dataset = load_dataset("benjis/bigvul", split="train", streaming=True).take(4000)

# Materialise the streamed records as a pandas DataFrame and preview the
# first five rows.
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

In [None]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

# Round-trip the DataFrame through JSON to obtain one plain dict per record.
documents_json = dataset_df.to_json(orient='records')
documents_list = json.loads(documents_json)

# Record fields that are re-encoded as JSON strings before being stored as
# document metadata.
JSON_ENCODED_FIELDS = (
    "CVE ID",
    "CVE Page",
    "CWE ID",
    "codeLink",
    "commit_id",
    "commit_message",
    "func_after",
    "func_before",
    "lang",
    "project",
    "vul",
)

# Metadata keys hidden from both the LLM and the embedding model.
EXCLUDED_METADATA_KEYS = (
    "CVE ID",
    "CWE ID",
    "codeLink",
    "commit_id",
    "lang",
    "project",
    "vul",
)

# Documents whose metadata string has this many characters or more are skipped.
MAX_METADATA_CHARS = 10000

llama_documents = []

# Longest metadata string seen so far, counting skipped documents as well.
maxSize = 0

for document in documents_list:
    # Convert the selected fields to JSON strings
    for field in JSON_ENCODED_FIELDS:
        document[field] = json.dumps(document[field])

    # Create a Document object; each exclusion list gets its own copy so the
    # two settings never share one mutable list.
    llama_document = Document(
        text=document["CVE Page"],
        metadata=document,
        excluded_llm_metadata_keys=list(EXCLUDED_METADATA_KEYS),
        excluded_embed_metadata_keys=list(EXCLUDED_METADATA_KEYS),
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )

    # Build the metadata string once and reuse its length for both the size
    # filter and the running maximum.
    metadata_length = len(llama_document.get_metadata_str())

    if metadata_length < MAX_METADATA_CHARS:
        llama_documents.append(llama_document)

    maxSize = max(maxSize, metadata_length)

# Fail with an explicit message instead of a bare IndexError when the size
# filter rejected every record.
if not llama_documents:
    raise ValueError(
        f"No documents have a metadata string shorter than {MAX_METADATA_CHARS} "
        f"characters (longest seen: {maxSize})."
    )

# Observing input examples
print("\nThe LLM sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))
print("\nThe Embedding model sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

In [None]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import MetadataMode
from tqdm import tqdm

# Split the documents into nodes of at most 10000 tokens with a 200-token overlap.
# NOTE(review): 10000 tokens is above the 8191-token input limit documented
# for text-embedding-3-small - confirm no node's EMBED content gets that large.
base_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=200)

nodes = base_splitter.get_nodes_from_documents(llama_documents)

# Embed the nodes in batches of `embed_batch_size` texts per call rather than
# one call per node.
batch_size = embed_model.embed_batch_size

# The context manager closes the progress bar even if an embedding call raises.
with tqdm(total=len(nodes), desc="Embedding Progress", unit="node") as pbar:
    for batch_start in range(0, len(nodes), batch_size):
        batch_nodes = nodes[batch_start:batch_start + batch_size]

        batch_embeddings = embed_model.get_text_embedding_batch(
            [
                batch_node.get_content(metadata_mode=MetadataMode.EMBED)
                for batch_node in batch_nodes
            ]
        )

        # Embeddings come back in input order; attach each to its node.
        for node, node_embedding in zip(batch_nodes, batch_embeddings):
            node.embedding = node_embedding

        # Update the progress bar by the number of nodes just embedded
        pbar.update(len(batch_nodes))

In [None]:
# Report that the embedding loop has finished.
print("Embedding process completed!")

import pymongo

# Export the MongoDB connection string from the local `config` module.
# The client created further down is built from config.MONGO_URI directly,
# not from this environment variable.
os.environ["MONGO_URI"] = config.MONGO_URI

def get_mongo_client(mongo_uri):
    """Establish and validate connection to the MongoDB.

    Creates a client for ``mongo_uri`` and sends a ``ping`` command to the
    ``admin`` database to confirm the server is reachable.

    Args:
        mongo_uri: MongoDB connection string passed to ``pymongo.MongoClient``.

    Returns:
        The connected ``pymongo.MongoClient`` when the ping reply reports
        ``ok == 1.0``; ``None`` when the client cannot be created, the ping
        raises a PyMongo error, or the reply is not ok. On every failure path
        a message is printed and any client that was created is closed.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri, appname="devrel.showcase.python")

        # Validate the connection
        ping_result = client.admin.command('ping')
    except pymongo.errors.PyMongoError as exc:
        # A PyMongo error raised here is reported through the documented None
        # return instead of propagating out of the function.
        print(f"Connection to MongoDB failed: {exc}")
        if client is not None:
            client.close()
        return None

    if ping_result.get('ok') == 1.0:
        # Connection successful
        print("Connection to MongoDB successful")
        return client

    print("Connection to MongoDB failed")
    # Do not leave an unusable client open.
    client.close()
    return None


# Connect to MongoDB using the connection string from the local `config` module.
mongo_client = get_mongo_client(config.MONGO_URI)

# get_mongo_client returns None when the connection check fails; stop here
# with a clear message rather than an AttributeError on the next lines.
if mongo_client is None:
    raise RuntimeError(
        "Could not connect to MongoDB; check config.MONGO_URI before continuing."
    )

# Database and collection that hold the embedded BigVul nodes.
DB_NAME = "Claude"
COLLECTION_NAME = "BigVulData"

db = mongo_client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

In [None]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

# Options selecting the database, collection and search index used by the
# vector store.
vector_store_options = {
    "db_name": DB_NAME,
    "collection_name": COLLECTION_NAME,
    "index_name": "vector_index",
}

# Wrap the existing MongoDB client in a LlamaIndex vector store.
vector_store = MongoDBAtlasVectorSearch(mongo_client, **vector_store_options)

# Add the embedded nodes to the vector store.
vector_store.add(nodes)

In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Build an index over the vector store and a query engine that retrieves the
# five most similar nodes and answers with the Claude LLM.
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm)

# Expose the query engine as a tool. The description is what tells the agent
# when to call it, so it has to describe the data that was actually indexed:
# the BigVul records (CVE page, commit message, function code before and
# after the fix), not the Airbnb data of the original text.
query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="knowledge_base",
        description=(
            "Provides information about known software vulnerabilities from "
            "the BigVul dataset: CVE page links, commit messages, and the "
            "function code before and after each fix. "
            "Use a detailed plain text question as input to the tool."
        ),
    ),
)

In [None]:
from llama_index.core.agent import FunctionCallingAgentWorker

# Tools the agent may call; only the knowledge-base query engine here.
agent_tools = [query_engine_tool]

# Create a function-calling agent worker around the Claude LLM (verbose mode
# on) and wrap it as an agent.
agent_worker = FunctionCallingAgentWorker.from_tools(
    agent_tools,
    llm=llm,
    verbose=True,
)
agent = agent_worker.as_agent()

In [None]:
# Prompt sent to the agent: a small C program followed by the question about
# its vulnerability and CVE ID.
vulnerability_prompt = """
#include <unistd.h>
#include <stdlib.h>
#include <stdlib.h>
void *mymalloc(unsigned int size) { return malloc(size); }

int main()
{
    char *buf;
    size_t len;
    read(0, &len, sizeof(len));
    /* we forgot to check the maximum length */
    /* 64-bit size_t gets truncated to 32-bit unsigned int */
    buf = mymalloc(len);
    read(0, buf, len);
    return 0;
}
                      
                      There is a vulnerability in this code, what is it and specify the CVE ID
                      """

# Ask the agent and print its answer as text.
response = agent.chat(vulnerability_prompt)
print(response)