In [2]:
import pandas as pd
import numpy as np
import json
import os

In [3]:
SPOTBUGS_DATASET_PATH = '../../owasp_benchmark/spotbugs_dataset.pkl'
spotbugs_dataset = pd.read_pickle(SPOTBUGS_DATASET_PATH)

In [4]:
filenames = [
  "experiment_2_config2__0_shot_CoT__gpt-35-turbo_2024.06.14_21-39-08_1562.json",
  "experiment_2_config2__3_shot_CoT__gpt-35-turbo_2024.06.16_17-22-11_1557.json",
  "experiment_2_config2__5_shot_CoT__gpt-35-turbo_2024.06.16_19-18-11_1557.json",
  "experiment_2_config2__0_shot_Default__gpt-35-turbo_2024.06.14_20-24-10_1562.json",
  "experiment_2_config2__3_shot_Default__gpt-35-turbo_2024.06.16_14-22-55_1557.json",
  "experiment_2_config2__5_shot_Default__gpt-35-turbo_2024.06.16_15-35-09_1557.json" # 5 shot default prompting results contain one invalid response (trehsold_value = -1.0)
]

In [5]:
dataframes = {}
results_directory = "results/"

filenames = [os.path.join(results_directory, filename) for filename in filenames]
for filename in filenames:
    with open(filename, 'r') as f:
        json_data = json.load(f)
    
    vulnerabilities_list = [
        {**value, "name": key} for key, value in json_data["vulnerabilities"].items()
    ]
    llm_df = pd.json_normalize(vulnerabilities_list)
    llm_df["Date"] = json_data["Date"]

    parts = filename.split('__')
    shot_part = parts[1].lower() if len(parts) > 1 else ""

    if "0_shot" in shot_part:
        number = "0"
    elif "3_shot" in shot_part:
        number = "3"
    elif "5_shot" in shot_part:
        number = "5"
    else:
        number = "unknown"

    if "cot" in shot_part:
        prompt = "cot"
    elif "default" in shot_part:
        prompt = "default"
    else:
        prompt = "unknown"

    key = f"{prompt}_{number}_shot"
    dataframes[key] = llm_df

print(dataframes.keys())


dict_keys(['cot_0_shot', 'cot_3_shot', 'cot_5_shot', 'default_0_shot', 'default_3_shot', 'default_5_shot'])


In [19]:
print(dataframes['cot_0_shot'].head())

                                            question  \
0  \nAnalyze the following potential vulnerabilit...   
1  \nAnalyze the following potential vulnerabilit...   
2  \nAnalyze the following potential vulnerabilit...   
3  \nAnalyze the following potential vulnerabilit...   
4  \nAnalyze the following potential vulnerabilit...   

                                            response confidence_of_llm  \
0  Let's think step by step...\n\nThe identified ...     Decision: 7.0   
1  Let's think step by step...\n\nThe identified ...     Decision: 9.0   
2  Let's think step by step...\n\nThe identified ...     Decision: 8.0   
3  Let's think step by step...\n\nThe identified ...     Decision: 7.5   
4  Let's think step by step...\n\nThe vulnerabili...     Decision: 9.0   

   computation_time  threshold_value  prompt_tokens  response_tokens  \
0          1.275691              7.0           1069              178   
1          1.422901              9.0           1969              166   
2 

In [6]:
real_vulnerability_map = dict(zip(spotbugs_dataset["name"], spotbugs_dataset["real vulnerability"]))

for _, df in dataframes.items():
    df["real_vulnerability"] = df["name"].map(real_vulnerability_map)

In [7]:
TRESHOLD = 2

In [8]:
for _, df in dataframes.items():
  df['llm_vul_decision'] = np.where(df['threshold_value'] < 0.0, True, df['threshold_value'] >= TRESHOLD) # due to handling invalid LLM responses we label threshold_value < 0 as True, as those weaknesses must be reviewed by humans

In [9]:
for _, df in dataframes.items():
  conditions = [
    (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == True),
    (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == True),
    (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == False),
    (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == False)
  ]
  choices = ['TP', 'FP', 'TN', 'FN']
  df['llm_classification'] = np.select(conditions, choices)

In [20]:
for prompting_technique, df in dataframes.items():
  print(f"Confusion Matrix for {prompting_technique}:")
  print(df["llm_classification"].value_counts().reindex(choices, fill_value=0))
  print(f"Total samples: {len(df)}")
  print(f"Total prompt tokens: {df['prompt_tokens'].sum():,}")
  print(f"Total response tokens: {df['response_tokens'].sum():,}")
  print(f"Total tokens: {df['prompt_tokens'].sum() + df['response_tokens'].sum():,}")
  print("-" * 40)

Confusion Matrix for cot_0_shot:
TP    1063
FP     493
TN       0
FN       1
Name: llm_classification, dtype: int64
Total samples: 1557
Total prompt tokens: 1,566,463
Total response tokens: 273,211
Total tokens: 1,839,674
----------------------------------------
Confusion Matrix for cot_3_shot:
TP    1049
FP     465
TN      28
FN      15
Name: llm_classification, dtype: int64
Total samples: 1557
Total prompt tokens: 7,969,063
Total response tokens: 628,048
Total tokens: 8,597,111
----------------------------------------
Confusion Matrix for cot_5_shot:
TP    1032
FP     437
TN      56
FN      32
Name: llm_classification, dtype: int64
Total samples: 1557
Total prompt tokens: 12,289,222
Total response tokens: 615,808
Total tokens: 12,905,030
----------------------------------------
Confusion Matrix for default_0_shot:
TP    1055
FP     482
TN      11
FN       9
Name: llm_classification, dtype: int64
Total samples: 1557
Total prompt tokens: 1,535,323
Total response tokens: 93,499
Total to

It can be seen that the results of 3-shot and 5-shot CoT presented the best ratio of TN to FN classifications. While the ratio between the two is comparable, 5-shot CoT prompting results in more than double the number of FN classifications compared to 3-shot CoT. Furthermore, 5-shot CoT prompting is computationally more expensive, requiring approximately 54% more input tokens (+4,320,159) and therefore being nearly 46% more expensive in terms of OpenAI’s API costs.

For this reason, we decided to continue our experiments using the 3-shot CoT prompting technique.

## Self-Consistency

In [11]:
json_files = [f for f in os.listdir('self_consistency_results/') if f.endswith('.json')]

dataframes_sc = {}
def get_key(filename):
  return os.path.splitext(filename)[0]

for filename in json_files:
    with open('self_consistency_results/' + filename, 'r') as f:
        json_data = json.load(f)
    
    vulnerabilities_list = [
        {**value, "name": key} for key, value in json_data["vulnerabilities"].items()
    ]
    
    llm_df = pd.json_normalize(vulnerabilities_list)
    llm_df["Date"] = json_data["Date"]
    dataframes_sc[get_key(filename)] = llm_df


In [12]:
print(dataframes_sc.keys())

dict_keys(['experiment_2_sc_config2__few_shot_CoT__gpt-35-turbo_2024.06.19_17-40-34_1557', 'experiment_2_sc_config2__few_shot_CoT__gpt-35-turbo_2024.06.19_21-07-00_1557', 'experiment_2_sc_config2__few_shot_CoT__gpt-35-turbo_2024.06.20_00-11-47_1557', 'experiment_2_sc_config2__few_shot_CoT__gpt-35-turbo_2024.06.19_19-26-51_1557', 'experiment_2_sc_config2__few_shot_CoT__gpt-35-turbo_2024.06.19_22-41-24_1557'])


In [13]:
for _, df in dataframes_sc.items():
    df["real_vulnerability"] = df["name"].map(real_vulnerability_map)

In [14]:
TRESHOLD = 2

In [15]:
for _, df in dataframes_sc.items():
  df['llm_vul_decision'] = np.where(df['threshold_value'] < 0.0, True, df['threshold_value'] >= TRESHOLD) # due to handling invalid LLM responses we label threshold_value < 0 as True, as those weaknesses must be reviewed by humans
  print(df.tail(1)['llm_vul_decision'])

1556    True
Name: llm_vul_decision, dtype: bool
1556    True
Name: llm_vul_decision, dtype: bool
1556    False
Name: llm_vul_decision, dtype: bool
1556    True
Name: llm_vul_decision, dtype: bool
1556    True
Name: llm_vul_decision, dtype: bool


In [16]:
df_sc_consistent = list(dataframes_sc.values())[0].copy()

dfs = list(dataframes_sc.values())
dfs_other = [df for df in dfs if df is not list(dataframes_sc.values())[0]]

for idx in df_sc_consistent.index:
  decisions = [df.loc[idx, 'llm_vul_decision'] for df in dfs]
  count_true = decisions.count(True)
  count_false = decisions.count(False)
  consensus = df_sc_consistent.loc[idx, 'llm_vul_decision'] if count_true == count_false else (True if count_true > count_false else False)
  df_sc_consistent.at[idx, 'llm_vul_decision'] = consensus

In [17]:
for _, _ in dataframes_sc.items():
  conditions = [
    (df_sc_consistent['real_vulnerability'] == True) & (df_sc_consistent['llm_vul_decision'] == True),
    (df_sc_consistent['real_vulnerability'] == False) & (df_sc_consistent['llm_vul_decision'] == True),
    (df_sc_consistent['real_vulnerability'] == False) & (df_sc_consistent['llm_vul_decision'] == False),
    (df_sc_consistent['real_vulnerability'] == True) & (df_sc_consistent['llm_vul_decision'] == False)
  ]
  choices = ['TP', 'FP', 'TN', 'FN']
  df_sc_consistent['llm_classification'] = np.select(conditions, choices)

In [18]:
print(f"Confusion Matrix for SC dataframe")
print(df_sc_consistent["llm_classification"].value_counts().reindex(choices, fill_value=0))
print(len(df_sc_consistent))

Confusion Matrix for SC dataframe
TP    1055
FP     467
TN      26
FN       9
Name: llm_classification, dtype: int64
1557
