In [1]:
import getpass

chatgpt_token = getpass.getpass(prompt="Enter ChatGPT API token...")

In [None]:
# Import the manifest data
manifest_file = './benchmark/ground_truth.csv'
with open(manifest_file, 'r') as input_fh:
    raw_data = input_fh.readlines()

# Parse the data
manifest = {}
for line in raw_data[1:]:
    line_split = line.strip().split(',')

    if line_split[2] == 'true':
        true_positive = True
    else:
        true_positive = False

    manifest.update({line_split[0]: {"category": line_split[1],
                                     "true_positive": true_positive,
                                     "cwe": 'CWE-' + line_split[3]}})
    
print(manifest)

# Baseline Analysis

In [None]:
import re
import glob
import pathlib
from openai import OpenAI

# Initialize variables
testcase_dir = './benchmark'

# Load the API token
client = OpenAI(
  api_key=chatgpt_token
)

# Print some header information
print("{:<25} | {:<10} | {:<15} | {:<15} | {:<15}".format("Testcase", "True CWE", "True Positive?", "Detected CWE", "Result"))
print("="*100)

# Iterate through the list of files and perform analysis
for file_path in glob.glob(testcase_dir + '/*.java'):
    # Get the textcase identifier and associated information
    textcase_id = pathlib.Path(file_path).stem
    testcase_true_positive = manifest[textcase_id]['true_positive']
    testcase_cwe = manifest[textcase_id]['cwe']

    # Import the file data
    with open(file_path, 'r') as input_fh:
        # Srtip out the block comment data
        file_data = re.sub(r"\/\*(.|\n)+\*\/", "", input_fh.read())

    # Define the prompt
    generic_prompt = "Manually review the code snippet shown below and identify any potential security issues. Do not write or execute any code during your analysis. There may or may not be any security issues in the code. If there is an issue, return only the most closely associated CWE number and nothing else in the form CWE-XXX. If there is no security issue, return None.\n\n{}".format(file_data)

    # Send a prompt
    completion = client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[
        {"role": "system", "content": "Do not ever write or execute any code to perform analysis."},
        {"role": "user", "content": generic_prompt}
    ]
    )

    # Capture the response
    response = completion.choices[0].message.content

    # CWE identified?
    if 'CWE' in response:
        cwe_found = True
    else:
        cwe_found = False

    # Correct CWE found?
    if testcase_cwe in response:
        correct_cwe_found = True
    else:
        correct_cwe_found = False

    # Compute the result
    if correct_cwe_found and testcase_true_positive:
        result = 'True Positive'
    elif cwe_found and not testcase_true_positive:
        result = 'False Positive'
    elif cwe_found and testcase_true_positive and not correct_cwe_found:
        result = 'Potential True Positive'
    elif not cwe_found and not testcase_true_positive:
        result = 'True Negative'
    else:
        result = 'False Negative'

    # Print a status message
    print('{:<25} | {:<10} | {:<15} | {:<15} | {:<15}'.format(textcase_id, testcase_cwe, testcase_true_positive, response, result))
    

Testcase                  | True CWE   | True Positive?  | Detected CWE    | Result         
BenchmarkTest01224        | CWE-643    | 1               | CWE-20          | Potential True Positive
BenchmarkTest00321        | CWE-501    | 1               | CWE-352         | Potential True Positive
BenchmarkTest00008        | CWE-89     | 1               | CWE-89          | True Positive  
BenchmarkTest00090        | CWE-78     | 0               | CWE-78          | False Positive 
BenchmarkTest00028        | CWE-22     | 1               | CWE-22          | True Positive  
BenchmarkTest00252        | CWE-501    | 0               | CWE-20          | False Positive 
BenchmarkTest00069        | CWE-328    | 0               | CWE-276         | False Positive 
BenchmarkTest00086        | CWE-330    | 1               | CWE-336         | Potential True Positive
BenchmarkTest00012        | CWE-90     | 1               | CWE-88          | Potential True Positive
BenchmarkTest00442        | CWE-643   

# Targeted Analysis

In [None]:
import re
import glob
import pathlib
from openai import OpenAI

# Initialize variables
testcase_dir = './TestCases'

# Load the API token
client = OpenAI(
  api_key=chatgpt_token
)

# Print some header information
print("{:<25} | {:<10} | {:<15} | {:<15} | {:<15}".format("Testcase", "True CWE", "True Positive?", "Detection?", "Result"))
print("="*100)

# Iterate through the list of files and perform analysis
for file_path in glob.glob(testcase_dir + '/*.java'):
    # Get the textcase identifier and associated information
    textcase_id = pathlib.Path(file_path).stem
    testcase_true_positive = manifest[textcase_id]['true_positive']
    testcase_cwe = manifest[textcase_id]['cwe']

    # Import the file data
    with open(file_path, 'r') as input_fh:
        # Srtip out the block comment data
        file_data = re.sub(r"\/\*(.|\n)+\*\/", "", input_fh.read())

    # Define the prompt
    generic_prompt = "Manually review the code snippet shown below and determine if {} is present in any part of the code. This CWE may or may not be present in the code. Return only a True/False response and nothing else. Do not write or execute any code during your analysis.\n\n{}".format(testcase_cwe, file_data)

    # Send a prompt
    completion = client.chat.completions.create(
    model="gpt-4o-mini",
    store=True,
    messages=[
        {"role": "system", "content": "Do not ever write or execute any code to perform analysis."},
        {"role": "user", "content": generic_prompt}
    ]
    )

    # Capture the response
    response = completion.choices[0].message.content

    # Determine result
    if 'true' in response.lower() and testcase_true_positive:
        result = 'True Positive'
    elif 'true' in response.lower() and not testcase_true_positive:
        result = 'False Positive'
    elif 'false' in response.lower() and testcase_true_positive:
        result = 'False Negative'
    elif 'false' in response.lower() and not testcase_true_positive:
        result = 'True Negative'
    else:
        result = 'unknown'

    # Print a status message
    print('{:<25} | {:<10} | {:<15} | {:<15} | {:<15}'.format(textcase_id, testcase_cwe, testcase_true_positive, response, result))

Testcase                  | True CWE   | True Positive?  | Detection?      | Result         


BenchmarkTest01224        | CWE-643    | 1               | False           | False Negative 
BenchmarkTest00321        | CWE-501    | 1               | False           | False Negative 
BenchmarkTest00008        | CWE-89     | 1               | True            | True Positive  
BenchmarkTest00090        | CWE-78     | 0               | True            | False Positive 
BenchmarkTest00028        | CWE-22     | 1               | True            | True Positive  
BenchmarkTest00252        | CWE-501    | 0               | False           | True Negative  
BenchmarkTest00069        | CWE-328    | 0               | False           | True Negative  
BenchmarkTest00086        | CWE-330    | 1               | False           | False Negative 
BenchmarkTest00012        | CWE-90     | 1               | True            | True Positive  
BenchmarkTest00442        | CWE-643    | 1               | True            | True Positive  
BenchmarkTest00100        | CWE-89     | 1               | True       