In [7]:

!pip install --quiet llama-index  # main llamaindex library

!pip install --quiet llama-index-vector-stores-MongoDB # mongodb vector database

!pip install --quiet llama-index-llms-anthropic # anthropic LLM provider

!pip install --quiet llama-index-embeddings-openai # openai embedding provider

!pip install beautifulsoup4

!pip install --quiet pymongo pandas datasets # others




In [4]:
import os
import config

os.environ["ANTHROPIC_API_KEY"] = config.ANTHROPIC_API_KEY
os.environ["HF_TOKEN"] = config.HF_TOKEN
os.environ["OPENAI_API_KEY"] = config.OPENAI_API_KEY

In [5]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.anthropic import Anthropic
from llama_index.core import Settings
llm = Anthropic(model="claude-3-5-sonnet-20240620")

embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=256,
    embed_batch_size=10,
    openai_api_key=os.environ["OPENAI_API_KEY"]
)

Settings.embed_model = embed_model
Settings.llm = llm


In [9]:
from datasets import load_dataset
import pandas as pd

# https://huggingface.co/datasets/MongoDB/airbnb_embeddings
dataset = load_dataset("benjis/bigvul", split="train", streaming=True)
dataset = dataset.take(4000)

# Convert the dataset to a pandas dataframe
dataset_df = pd.DataFrame(dataset)
dataset_df.head(5)

Unnamed: 0,CVE ID,CVE Page,CWE ID,codeLink,commit_id,commit_message,func_after,func_before,lang,project,vul
0,CVE-2017-7586,https://www.cvedetails.com/cve/CVE-2017-7586/,CWE-119,https://github.com/erikd/libsndfile/commit/708...,708e996c87c5fae77b104ccfeb8f6db784c32074,src/ : Move to a variable length header buffer...,"psf_get_date_str (char *str, int maxlen)\n{\tt...","psf_get_date_str (char *str, int maxlen)\n{\tt...",C,libsndfile,0
1,CVE-2018-18352,https://www.cvedetails.com/cve/CVE-2018-18352/,CWE-732,https://github.com/chromium/chromium/commit/a9...,a9cbaa7a40e2b2723cfc2f266c42f4980038a949,"Simplify ""WouldTaintOrigin"" concept in media/b...",void MultibufferDataSource::CreateResourceLoad...,void MultibufferDataSource::CreateResourceLoad...,C,Chrome,0
2,CVE-2010-1166,https://www.cvedetails.com/cve/CVE-2010-1166/,CWE-189,https://cgit.freedesktop.org/xorg/xserver/comm...,d2f813f7db157fc83abc4b3726821c36ee7e40b1,,"fbStore_a2r2g2b2 (FbBits *bits, const CARD32 *...","fbStore_a2r2g2b2 (FbBits *bits, const CARD32 *...",C,xserver,0
3,,,,https://github.com/chromium/chromium/commit/61...,610f904d8215075c4681be4eb413f4348860bf9f,Retrieve per host storage usage from QuotaMana...,void UsageTracker::DidGetClientGlobalUsage(Sto...,void UsageTracker::DidGetClientGlobalUsage(Sto...,C,Chrome,0
4,,,,https://github.com/chromium/chromium/commit/95...,957973753ec4159003ff7930d946b7e89c7e09f3,Make NotifyHeadersComplete the last call in th...,void BlobURLRequestJob::DidRead(int result) {\...,void BlobURLRequestJob::DidRead(int result) {\...,C,Chrome,0


In [18]:
import json
from llama_index.core import Document
from llama_index.core.schema import MetadataMode

documents_json = dataset_df.to_json(orient='records')
documents_list = json.loads(documents_json)

llama_documents = []

maxSize = 0

for document in documents_list:
    # Convert complex objects to JSON strings
    for field in [
        "CVE ID",
        "CVE Page",
        "CWE ID",
        "codeLink",
        "commit_id",
        "commit_message",
        "func_after",
        "func_before",
        "lang",
        "project",
        "vul"
    ]:
        document[field] = json.dumps(document[field])

    # Create a Document object
    llama_document = Document(
        text=document["CVE Page"],
        metadata=document,
        excluded_llm_metadata_keys=["CVE ID", "CWE ID", "codeLink", "commit_id", "lang", "project", "vul"],
        excluded_embed_metadata_keys=["CVE ID", "CWE ID", "codeLink", "commit_id", "lang", "project", "vul"],
        metadata_template="{key}=>{value}",
        text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
    )

    if len(llama_document.get_metadata_str()) < 10000:
        llama_documents.append(llama_document)

    maxSize = max(maxSize, len(llama_document.get_metadata_str()))

# Observing input examples
print("\nThe LLM sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.LLM))
print("\nThe Embedding model sees this: \n", llama_documents[0].get_content(metadata_mode=MetadataMode.EMBED))

print("\nGreatest Metadata Size:", maxSize)



The LLM sees this: 
 Metadata: CVE Page=>"https://www.cvedetails.com/cve/CVE-2017-7586/"
commit_message=>"src/ : Move to a variable length header buffer\n\nPreviously, the `psf->header` buffer was a fixed length specified by\n`SF_HEADER_LEN` which was set to `12292`. This was problematic for\ntwo reasons; this value was un-necessarily large for the majority\nof files and too small for some others.\n\nNow the size of the header buffer starts at 256 bytes and grows as\nnecessary up to a maximum of 100k."
func_after=>"psf_get_date_str (char *str, int maxlen)\n{\ttime_t\t\tcurrent ;\n\tstruct tm\ttimedata, *tmptr ;\n\n\ttime (&current) ;\n\n#if defined (HAVE_GMTIME_R)\n\t/* If the re-entrant version is available, use it. */\n\ttmptr = gmtime_r (&current, &timedata) ;\n#elif defined (HAVE_GMTIME)\n\t/* Otherwise use the standard one and copy the data to local storage. */\n\ttmptr = gmtime (&current) ;\n\tmemcpy (&timedata, tmptr, sizeof (timedata)) ;\n#else\n\ttmptr = NULL ;\n#endif\n\n\ti

In [19]:
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.schema import MetadataMode
from tqdm import tqdm

base_splitter = TokenTextSplitter(chunk_size=10000, chunk_overlap=200)

nodes = base_splitter.get_nodes_from_documents(llama_documents)

# Progress bar
pbar = tqdm(total=len(nodes), desc="Embedding Progress", unit="node")

for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode=MetadataMode.EMBED)
    )
    node.embedding = node_embedding
    
    # Update the progress bar
    pbar.update(1)

# Close the progress bar
pbar.close()

print("Embedding process completed!")


Embedding Progress: 100%|██████████| 3878/3878 [17:40<00:00,  3.66node/s] 

Embedding process completed!



