In [1]:

!pip install --quiet llama-index  # main llamaindex library

!pip install --quiet llama-index-vector-stores-MongoDB # mongodb vector database

!pip install --quiet llama-index-llms-anthropic # anthropic LLM provider

!pip install --quiet llama-index-embeddings-openai # openai embedding provider

!pip install --quiet beautifulsoup4

!pip install --quiet pymongo pandas datasets # others


In [2]:
import os
import config

os.environ["ANTHROPIC_API_KEY"] = config.ANTHROPIC_API_KEY
os.environ["HF_TOKEN"] = config.HF_TOKEN
os.environ["OPENAI_API_KEY"] = config.OPENAI_API_KEY

In [4]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.anthropic import Anthropic
from llama_index.core import Settings
llm = Anthropic(model="claude-3-5-sonnet-20240620")

embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=256,
    embed_batch_size=10,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

Settings.embed_model = embed_model
Settings.llm = llm


In [6]:
from datasets import load_dataset
import pandas as pd

# https://huggingface.co/datasets/bstee615/bigvul/viewer/default/train
dataset = load_dataset("benjis/bigvul", split="train", streaming=True)
dataset = dataset.take(10000)

# Convert the dataset to a pandas dataframe
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

Unnamed: 0,CVE ID,CVE Page,CWE ID,codeLink,commit_id,commit_message,func_after,func_before,lang,project,vul
0,CVE-2017-7586,https://www.cvedetails.com/cve/CVE-2017-7586/,CWE-119,https://github.com/erikd/libsndfile/commit/708...,708e996c87c5fae77b104ccfeb8f6db784c32074,src/ : Move to a variable length header buffer...,"psf_get_date_str (char *str, int maxlen)\n{\tt...","psf_get_date_str (char *str, int maxlen)\n{\tt...",C,libsndfile,0
1,CVE-2018-18352,https://www.cvedetails.com/cve/CVE-2018-18352/,CWE-732,https://github.com/chromium/chromium/commit/a9...,a9cbaa7a40e2b2723cfc2f266c42f4980038a949,"Simplify ""WouldTaintOrigin"" concept in media/b...",void MultibufferDataSource::CreateResourceLoad...,void MultibufferDataSource::CreateResourceLoad...,C,Chrome,0
2,CVE-2010-1166,https://www.cvedetails.com/cve/CVE-2010-1166/,CWE-189,https://cgit.freedesktop.org/xorg/xserver/comm...,d2f813f7db157fc83abc4b3726821c36ee7e40b1,,"fbStore_a2r2g2b2 (FbBits *bits, const CARD32 *...","fbStore_a2r2g2b2 (FbBits *bits, const CARD32 *...",C,xserver,0
3,,,,https://github.com/chromium/chromium/commit/61...,610f904d8215075c4681be4eb413f4348860bf9f,Retrieve per host storage usage from QuotaMana...,void UsageTracker::DidGetClientGlobalUsage(Sto...,void UsageTracker::DidGetClientGlobalUsage(Sto...,C,Chrome,0
4,,,,https://github.com/chromium/chromium/commit/95...,957973753ec4159003ff7930d946b7e89c7e09f3,Make NotifyHeadersComplete the last call in th...,void BlobURLRequestJob::DidRead(int result) {\...,void BlobURLRequestJob::DidRead(int result) {\...,C,Chrome,0


In [7]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

documents_json = dataset_df.to_json(orient='records')
documents_list = json.loads(documents_json)

llama_documents = []

maxSize = 0

for document in documents_list:
    # Convert complex objects to JSON strings
    for field in [
        "CVE ID",
        "CVE Page",
        "CWE ID",
        "codeLink",
        "commit_id",
        "commit_message",
        "func_after",
        "func_before",
        "lang",
        "project",
        "vul"
    ]:
        document[field] = json.dumps(document[field])

    # Create a Document object
    llama_document = Document(
        text=document["CVE Page"],
        metadata=document,
        excluded_llm_metadata_keys=["CVE ID", "CWE ID", "codeLink", "commit_id", "lang", "project", "vul"],
        excluded_embed_metadata_keys=["CVE ID", "CWE ID", "codeLink", "commit_id", "lang", "project", "vul"],
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )

    if len(llama_document.get_metadata_str()) < 10000:
        llama_documents.append(llama_document)

    maxSize = max(maxSize, len(llama_document.get_metadata_str()))

# Observing input examples
print("\nThe LLM sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))
print("\nThe Embedding model sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))


The LLM sees this: 
 Metadata: CVE Page=>"https://www.cvedetails.com/cve/CVE-2017-7586/"
commit_message=>"src/ : Move to a variable length header buffer\n\nPreviously, the `psf->header` buffer was a fixed length specified by\n`SF_HEADER_LEN` which was set to `12292`. This was problematic for\ntwo reasons; this value was un-necessarily large for the majority\nof files and too small for some others.\n\nNow the size of the header buffer starts at 256 bytes and grows as\nnecessary up to a maximum of 100k."
func_after=>"psf_get_date_str (char *str, int maxlen)\n{\ttime_t\t\tcurrent ;\n\tstruct tm\ttimedata, *tmptr ;\n\n\ttime (&current) ;\n\n#if defined (HAVE_GMTIME_R)\n\t/* If the re-entrant version is available, use it. */\n\ttmptr = gmtime_r (&current, &timedata) ;\n#elif defined (HAVE_GMTIME)\n\t/* Otherwise use the standard one and copy the data to local storage. */\n\ttmptr = gmtime (&current) ;\n\tmemcpy (&timedata, tmptr, sizeof (timedata)) ;\n#else\n\ttmptr = NULL ;\n#endif\n\n\ti

In [8]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import MetadataMode
from tqdm import tqdm

base_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=200)

nodes = base_splitter.get_nodes_from_documents(llama_documents)

# Progress bar
pbar = tqdm(total=len(nodes), desc="Embedding Progress", unit="node")

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode=MetadataMode.EMBED)
    )
    node.embedding = node_embedding
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()

Embedding Progress: 100%|██████████| 9679/9679 [41:00<00:00,  3.93node/s]  


In [9]:
print("Embedding process completed!")

import pymongo

os.environ["MONGO_URI"] = config.MONGO_URI

def get_mongo_client(mongo_uri):
    """Establish and validate connection to the MongoDB."""
    
    client = pymongo.MongoClient(mongo_uri, appname="devrel.showcase.python")

    # Validate the connection
    ping_result = client.admin.command('ping')
    if ping_result.get('ok') == 1.0:
        # Connection successful
        print("Connection to MongoDB successful")
        return client
    else:
        print("Connection to MongoDB failed")
    return None


mongo_client = get_mongo_client(config.MONGO_URI)

DB_NAME = "Claude"
COLLECTION_NAME = "BigVulData"

db = mongo_client.get_database(DB_NAME)
collection = db.get_collection(COLLECTION_NAME)

Embedding process completed!
Connection to MongoDB successful


In [10]:
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

vector_store = MongoDBAtlasVectorSearch(
    mongo_client, 
    db_name=DB_NAME, 
    collection_name=COLLECTION_NAME, 
    index_name="vector_index"
)

vector_store.add(nodes)

index_name is deprecated. Please use vector_index_name
vector_index_name and index_name both specified. Will use vector_index_name


['3fe0f590-c9a3-4293-988b-885c4b7e0d14',
 '628c57d6-51c1-44de-81a0-099535e4cb77',
 '59d631a4-88df-4b3e-8dec-7db77e2d1dc6',
 'f1ee3905-5a8e-44f5-96ff-665e8802ac15',
 'e1dfcf41-5f00-4b97-95e0-b7b66deeac19',
 '91e78888-508f-4004-a66d-a3df098c8176',
 '1cfb8b19-122f-4c35-94b0-19e5f6ebdfa8',
 'dd32696d-7e46-409f-b294-1aceb2b09cc1',
 'a119cc63-1c10-4413-b6b1-a154840e90de',
 '2aac595f-582c-408b-bd24-c0301af09972',
 '7ea5da79-30e9-485f-9ec7-f9b741aaa449',
 'e49a2f5a-ead8-4158-996c-dd144b3b90a5',
 '141c3092-7c29-4c8f-8c87-bf6d1f1172db',
 '9c20a01a-c050-4caa-9cbb-76c731227051',
 '10ec54c1-aef2-44d6-8333-97430b3ff219',
 '78f2a778-28d2-40b1-9615-8c54e08def4f',
 'ec9f4086-c618-4436-a06e-0011d2fe1733',
 '21436989-fa34-4f9a-85ed-e26524bdc065',
 'c8f4042e-ac3d-4d8d-b30f-702643e88796',
 '3e285b50-511f-45d6-a4a0-f644696199fe',
 '0da54743-fbf6-4cee-8621-61f27dff3cf7',
 'ceccd100-3765-448e-a1e8-9e85d961f9ad',
 '9806b51c-7b9d-460a-8def-556e73b8f6e8',
 '8eca036a-7e9f-48bd-a003-a1eb1edaf1d9',
 '47a2f19a-e51c-

In [11]:
from llama_index.core import VectorStoreIndex
from llama_index.core.tools import QueryEngineTool, ToolMetadata

index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine(similarity_top_k=5, llm=llm)

query_engine_tool = QueryEngineTool(
    query_engine=query_engine,
    metadata=ToolMetadata(
        name="knowledge_base",
        description=(
            "Provides information about Airbnb listings and reviews."
            "Use a detailed plain text question as input to the tool."
        ),
    ),
)

In [12]:
from llama_index.core.agent import FunctionCallingAgentWorker

agent_worker = FunctionCallingAgentWorker.from_tools(
    [query_engine_tool], llm=llm, verbose=True
)
agent = agent_worker.as_agent()

In [15]:
response = agent.chat("""

psf_get_date_str (char *str, int maxlen) {
    time_t current ; 
    struct tm timedata, *tmptr ;
    time (&current) ; 
    
    #if defined (HAVE_GMTIME_R) /* If the re-entrant version is available, use it. */
        tmptr = gmtime_r (&current, &timedata) ; 
    #elif defined (HAVE_GMTIME) /* Otherwise use the standard one and copy the data to local storage. */ 
        tmptr = gmtime (&current) ; memcpy (&timedata, tmptr, sizeof (timedata)) ; 
    #else
        tmptr = NULL ; 
    #endif 
     
    if (tmptr) 
        snprintf (str, maxlen, "%4d-%02d-%02d %02d:%02d:%02d UTC", 1900 + timedata.tm_year, timedata.tm_mon, timedata.tm_mday, timedata.tm_hour, timedata.tm_min, timedata.tm_sec) ; 
    else snprintf (str, maxlen, "Unknown date") ; 
    
    return ; 
} /* psf_get_date_str */
       
There is a vulnerability in this code, what is it and specify database entries used to come to this conclusion.
""")
print(str(response))

Added user message to memory: 

psf_get_date_str (char *str, int maxlen) {
    time_t current ; 
    struct tm timedata, *tmptr ;
    time (&current) ; 
    
    #if defined (HAVE_GMTIME_R) /* If the re-entrant version is available, use it. */
        tmptr = gmtime_r (&current, &timedata) ; 
    #elif defined (HAVE_GMTIME) /* Otherwise use the standard one and copy the data to local storage. */ 
        tmptr = gmtime (&current) ; memcpy (&timedata, tmptr, sizeof (timedata)) ; 
    #else
        tmptr = NULL ; 
    #endif 
     
    if (tmptr) 
        snprintf (str, maxlen, "%4d-%02d-%02d %02d:%02d:%02d UTC", 1900 + timedata.tm_year, timedata.tm_mon, timedata.tm_mday, timedata.tm_hour, timedata.tm_min, timedata.tm_sec) ; 
    else snprintf (str, maxlen, "Unknown date") ; 
    
    return ; 
} /* psf_get_date_str */
       
There is a vulnerability in this code, what is it and specify database entries used to come to this conclusion.

=== LLM Response ===
To answer your question about