In [2]:
import pandas as pd
import numpy as np
import os
import json
import re

In [3]:
# Pickled SpotBugs dataset; the path is resolved relative to the working
# directory of the notebook. Later cells read its "name" and
# "real vulnerability" columns as ground truth.
SPOTBUGS_DATASET_PATH = '../../owasp_benchmark/spotbugs_dataset.pkl'

# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
spotbugs_dataset = pd.read_pickle(filepath_or_buffer=SPOTBUGS_DATASET_PATH)

In [41]:
# Result files of the six prompting runs, grouped by prompt type.
# The loading cell turns "CoT" / "Default" in a filename into the key prefix
# "cot" (Chain-of-Thought prompting) or "default" (default prompting).
cot_result_files = [
    "experiment_2_config2__0_shot_CoT__gpt-35-turbo_2024.06.14_21-39-08_1562.json",
    "experiment_2_config2__3_shot_CoT__gpt-35-turbo_2024.06.16_17-22-11_1557.json",
    "experiment_2_config2__5_shot_CoT__gpt-35-turbo_2024.06.16_19-18-11_1557.json",
]
default_result_files = [
    "experiment_2_config2__0_shot_Default__gpt-35-turbo_2024.06.14_20-24-10_1562.json",
    "experiment_2_config2__3_shot_Default__gpt-35-turbo_2024.06.16_14-22-55_1557.json",
    "experiment_2_config2__5_shot_Default__gpt-35-turbo_2024.06.16_15-35-09_1557.json",
]

# Order matters for the printed reports: all CoT runs first, then the default runs.
filenames = cot_result_files + default_result_files

In [43]:
def _parse_run_key(filename):
    """Build the `dataframes` key "<prompt>_<shots>_shot" from a result filename.

    The second "__"-separated segment of the filename (e.g. "3_shot_CoT")
    carries both the shot count and the prompt type. Any part that cannot be
    recognised is reported as "unknown".
    """
    parts = filename.split('__')
    shot_part = parts[1].lower() if len(parts) > 1 else ""

    # Accept any shot count instead of only 0/3/5; matching the full digit run
    # also prevents "10_shot" from being read as "0_shot".
    shot_match = re.search(r"(\d+)_shot", shot_part)
    number = shot_match.group(1) if shot_match else "unknown"

    if "cot" in shot_part:
        prompt = "cot"
    elif "default" in shot_part:
        prompt = "default"
    else:
        prompt = "unknown"

    return f"{prompt}_{number}_shot"


# One dataframe per result file, keyed by "<prompt>_<shots>_shot".
dataframes = {}

for filename in filenames:
    # JSON is UTF-8 by specification; do not depend on the platform's locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        json_data = json.load(f)

    # Flatten {"<name>": {...}} into records that carry their name as a column.
    vulnerabilities_list = [
        {**value, "name": key} for key, value in json_data["vulnerabilities"].items()
    ]
    llm_df = pd.json_normalize(vulnerabilities_list)
    llm_df["Date"] = json_data["Date"]

    key = _parse_run_key(filename)
    if key in dataframes:
        # Silently overwriting would drop a whole run from the comparison.
        raise ValueError(f"Duplicate result key {key!r} derived from {filename!r}")
    dataframes[key] = llm_df

print(dataframes.keys())


dict_keys(['cot_0_shot', 'cot_3_shot', 'cot_5_shot', 'default_0_shot', 'default_3_shot', 'default_5_shot'])


In [46]:
# Ground truth lookup: entry name -> "real vulnerability" value of the
# SpotBugs dataset (for repeated names the last occurrence wins).
real_vulnerability_map = {
    name: is_real
    for name, is_real in zip(
        spotbugs_dataset["name"], spotbugs_dataset["real vulnerability"]
    )
}

# Attach the ground truth to every LLM result frame; names that are missing
# from the lookup end up as NaN.
for df in dataframes.values():
    ground_truth = df["name"].map(real_vulnerability_map)
    df["real_vulnerability"] = ground_truth


In [49]:
# Minimum `threshold_value` (inclusive) at which an LLM answer is counted as
# "vulnerable" by the decision cell.
THRESHOLD = 2

# Backward-compatible alias: the misspelled name is still referenced by other cells.
TRESHOLD = THRESHOLD

In [51]:
# Binary LLM verdict: True when the reported threshold value reaches the cut-off.
for df in dataframes.values():
    reaches_cutoff = df['threshold_value'].ge(TRESHOLD)
    df['llm_vul_decision'] = reaches_cutoff

In [54]:
# Confusion-matrix labels, in the same order as `conditions` below. Defined at
# cell level (not inside the loop) because the reporting cell reuses `choices`
# to fix the display order, and it must exist even if `dataframes` is empty.
choices = ['TP', 'FP', 'TN', 'FN']

for _, df in dataframes.items():
    conditions = [
        (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == True),    # TP
        (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == True),   # FP
        (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == False),  # TN
        (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == False),   # FN
    ]
    # Give np.select an explicit string default: its implicit default is the
    # integer 0, which does not share a dtype with the string labels (recent
    # NumPy versions reject that mix) and would otherwise label rows without
    # ground truth (NaN) as "0".
    df['llm_classification'] = np.select(conditions, choices, default='UNKNOWN')

In [55]:
# Print the TP/FP/TN/FN counts of every run, always in the order given by
# `choices`; labels that do not occur are shown with a count of 0.
for prompting_technique, df in dataframes.items():
    label_counts = df["llm_classification"].value_counts()
    ordered_counts = label_counts.reindex(choices, fill_value=0)
    print(f"Confusion Matrix for {prompting_technique}:", ordered_counts, "", sep="\n")

Confusion Matrix for cot_0_shot:
TP    1068
FP     493
TN       0
FN       1
Name: llm_classification, dtype: int64

Confusion Matrix for cot_3_shot:
TP    1049
FP     465
TN      28
FN      15
Name: llm_classification, dtype: int64

Confusion Matrix for cot_5_shot:
TP    1032
FP     437
TN      56
FN      32
Name: llm_classification, dtype: int64

Confusion Matrix for default_0_shot:
TP    1060
FP     482
TN      11
FN       9
Name: llm_classification, dtype: int64

Confusion Matrix for default_3_shot:
TP    1001
FP     445
TN      48
FN      63
Name: llm_classification, dtype: int64

Confusion Matrix for default_5_shot:
TP    924
FP    391
TN    102
FN    140
Name: llm_classification, dtype: int64



The 3-shot and 5-shot CoT runs show the best ratio of TN to FN classifications (28:15 and 56:32, respectively). Although these two ratios are comparable, 5-shot CoT prompting produces more than twice as many FN classifications as 3-shot CoT (32 vs. 15). Furthermore, 5-shot CoT prompting is computationally more expensive: it requires approximately 54% more input tokens (+4,320,159), which makes it nearly 46% more expensive in terms of OpenAI’s API costs.

For this reason, we decided to continue our experiments using the 3-shot CoT prompting technique.

