# Identify a vulnerable function based on a git diff

In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import torch
from torch import nn
from transformers import LlamaTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, LlamaForCausalLM, AutoTokenizer
from types import SimpleNamespace
from patchparser import github_parser
import requests

In [2]:
## pull the git-diff
## Target: https://pkg.go.dev/vuln/GO-2023-1882
## VFC: https://github.com/cometbft/cometbft/commit/f6ea09171a2bf9f695f59b65f5c51e4a8c168015
# parsed = github_parser.commit(repo_owner="cometbft",
#                                   repo_name="cometbft",
#                                   sha="f6ea09171a2bf9f695f59b65f5c51e4a8c168015")

# parsed = pd.DataFrame(parsed) 

# vuln_desc = "An internal modification to the way PeerState is serialized to JSON introduced a deadlock when the new function MarshalJSON is called. This function can be called in two ways. The first is via logs, by setting the consensus logging module to debug level (which should not happen in production), and setting the log output format to JSON. The second is via RPC dump_consensus_state. For detailed information about this vulnerability, visit https://github.com/cometbft/cometbft/security/advisories/GHSA-mvj3-qrqh-cjvr."

In [3]:
## pull the git-diff
## Target: https://pkg.go.dev/vuln/GO-2023-1882
## VFC: https://github.com/cometbft/cometbft/commit/f6ea09171a2bf9f695f59b65f5c51e4a8c168015
# parsed = github_parser.commit(repo_owner="bnb-chain",
#                                   repo_name="tss-lib",
#                                   sha="bb6fb30bd3ebd35c755109836aa1a5ee6126c8a0")

# parsed = pd.DataFrame(parsed) 

# vuln_desc = "Collision of hash values in github.com/bnb-chain/tss-lib."

In [4]:
## pull the git-diff
## Target: https://pkg.go.dev/vuln/GO-2023-1859
## VFC: https://github.com/lestrrat-go/jwx/commit/6c41e3822485fc7e11dd70b4b0524b075d66b103
# parsed = github_parser.commit(repo_owner="lestrrat-go",
#                                   repo_name="jwx",
#                                   sha="6c41e3822485fc7e11dd70b4b0524b075d66b103")

# parsed = pd.DataFrame(parsed) 

# vuln_desc = "Collision of hash values in github.com/bnb-chain/tss-lib."

In [5]:
## pull the git-diff
## Target: https://github.com/golang/vulndb/blob/14d0da162b75a829b2a37bc5cde76b5bc8bf501d/data/reports/GO-2021-0228.yaml#L4
## VFC: https://github.com/unknwon/cae/commit/07971c00a1bfd9dc171c3ad0bfab5b67c2287e11
# parsed = github_parser.commit(repo_owner="unknwon",
#                                   repo_name="cae",
#                                   sha="07971c00a1bfd9dc171c3ad0bfab5b67c2287e11")

# parsed = pd.DataFrame(parsed) 

# vuln_desc = "Path traversal in github.com/unknwon/cae. The ExtractTo function doesn't securely escape file paths in zip archives which include leading or non-leading."

In [6]:
## pull the git-diff
## Target: https://github.com/golang/vulndb/blob/master/data/reports/GO-2021-0090.yaml
## VFC: https://github.com/tendermint/tendermint/commit/480b995a31727593f58b361af979054d17d84340
parsed = github_parser.commit(repo_owner="tendermint",
                                  repo_name="tendermint",
                                  sha="480b995a31727593f58b361af979054d17d84340")

parsed = pd.DataFrame(parsed) 

vuln_desc = "Proposed commits may contain signatures for blocks not contained within the commit. Instead of skipping these signatures, they cause failure during verification. A malicious proposer can use this to force consensus failures."

In [7]:
parsed = parsed[~parsed['file_name'].str.contains("test")]
git_diff = '\n'.join(parsed.drop_duplicates('raw_file_patch')['raw_file_patch'].values.tolist())

In [8]:
# load the context and template
context = git_diff
template = open("./templates/vulnerable_function_template.txt",'r').read()

# generate a prompt based on the template and context
prompt = template.replace('[CONTEXT]', context)
prompt = prompt.replace('[VULN_DESCRIPTION]', vuln_desc)

In [9]:
print(prompt)

<s>[INST] <<SYS>>
You are an vulnerable function identifying assistant. Given the user provided context, the git-diff, please identify the vulnerable functions. The git-diff was known to fix the following vulnerability:

Proposed commits may contain signatures for blocks not contained within the commit. Instead of skipping these signatures, they cause failure during verification. A malicious proposer can use this to force consensus failures.

Do not give an explanation of the fix! Only provide vulnerable functions from the git-diff. You must only respond with the vulnerable functions as the fully qualified function names in a list [struct.Function, struct.Function].
<</SYS>>

@@ -549,9 +549,11 @@ func (voteSet *VoteSet) sumTotalFrac() (int64, int64, float64) {
 //--------------------------------------------------------------------------------
 // Commit
 
-// MakeCommit constructs a Commit from the VoteSet.
-// Panics if the vote type is not PrecommitType or if
-// there's no +2/3 vote

In [10]:
# load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("./../../Models/CodeLlama-7b-Instruct-hf")

model = AutoModelForCausalLM.from_pretrained("./../../Models/CodeLlama-7b-Instruct-hf",
                                            # load_in_4bit=True,
                                            device_map='cuda:0',
                                            # torch_dtype=torch.float32,
                                            torch_dtype=torch.float16,
                                            )


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:

# tokenize the prompt
tok_time = time.time()
batch = tokenizer(prompt, return_tensors="pt")
print(f"TOKENIZER TIME: {(time.time() - tok_time):.2f} seconds")

# generate the response
# Temperature settings: 
# https://github.com/facebookresearch/llama/blob/main/example_chat_completion.py#L11

model_time = time.time()
response = model.generate(batch["input_ids"].cuda(),
                          attention_mask = batch["attention_mask"].cuda(),
                            do_sample=True,
                            top_k=50,
                            max_length=5000,
                            max_new_tokens=300,
                            top_p=0.9,
                            temperature=0.1)[0]

# the response needs to be decoded
decode_response = tokenizer.decode(response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Both `max_new_tokens` (=300) and `max_length`(=5000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


TOKENIZER TIME: 0.00 seconds


In [12]:
print(decode_response.split("/INST")[1])

]  The following functions are vulnerable to the attack:

* `func (voteSet *VoteSet) MakeCommit() *Commit`

This function constructs a `Commit` from the `VoteSet`, but it does not properly filter out precommits for blocks that do not have a 2/3 majority. This means that a malicious proposer could include precommits for blocks that do not have a 2/3 majority, which could cause the consensus algorithm to fail.</s>
