In [10]:
import pandas as pd
import numpy as np
import os
import json
import re

In [11]:
SPOTBUGS_DATASET_PATH = '../../owasp_benchmark/spotbugs_dataset.pkl'
spotbugs_dataset = pd.read_pickle(SPOTBUGS_DATASET_PATH)

In [12]:
results_dir = './results/'
json_files = [os.path.join(results_dir, f) for f in os.listdir(results_dir) if f.endswith('.json')]

dataframes = {}

for filename in json_files:
    print(filename)
    with open(filename, 'r') as f:
        json_data = json.load(f)
    
    vulnerabilities_list = [
        {**value, "name": key} for key, value in json_data["vulnerabilities"].items()
    ]
    
    llm_df = pd.json_normalize(vulnerabilities_list)
    llm_df["Date"] = json_data["Date"]
    
    config_match = re.search(r'(config\d+)', filename)
    if config_match:
        config_number = config_match.group(1).replace('config', 'config_')
        dataframes[config_number] = llm_df

print(dataframes["config_1"].columns)
print(dataframes["config_1"].head(1).threshold_value)  

./results/experiment_1_config3__zero_shot_Default__gpt-35-turbo_2024.06.12_17-36-59_1557.json
./results/experiment_1_config5__zero_shot_Default__gpt-35-turbo_2024.06.12_21-56-18_1557.json
./results/experiment_1_config1__zero_shot_Default__gpt-35-turbo_2024.06.12_16-11-45_1557.json
./results/experiment_1_config4__zero_shot_Default__gpt-35-turbo_2024.06.12_18-26-23_1557.json
./results/experiment_1_config2__zero_shot_Default__gpt-35-turbo_2024.06.12_17-02-13_1557.json
Index(['question', 'response', 'confidence_of_llm', 'computation_time',
       'threshold_value', 'prompt_tokens', 'response_tokens', 'name', 'Date'],
      dtype='object')
0    9.0
Name: threshold_value, dtype: float64


In [13]:
for _, df in dataframes.items():
  real_vulnerability_map = dict(zip(spotbugs_dataset["name"], spotbugs_dataset["real vulnerability"]))
  df["real_vulnerability"] = df["name"].map(real_vulnerability_map)

In [14]:
TRESHOLD = 2

In [15]:
for _, df in dataframes.items():
  df['llm_vul_decision'] = np.where(df['threshold_value'] < 0.0, True, df['threshold_value'] >= TRESHOLD) # due to handling invalid LLM responses we label threshold_value < 0 as True, as those weaknesses must be reviewed by humans

In [16]:
for _, df in dataframes.items():
  conditions = [
    (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == True),
    (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == True),
    (df['real_vulnerability'] == False) & (df['llm_vul_decision'] == False),
    (df['real_vulnerability'] == True) & (df['llm_vul_decision'] == False)
  ]
  choices = ['TP', 'FP', 'TN', 'FN']
  df['llm_classification'] = np.select(conditions, choices)

In [17]:
for config, df in dataframes.items():
  print(f"Confusion Matrix for {config}:")
  print(df["llm_classification"].value_counts().reindex(choices, fill_value=0))
  print()

Confusion Matrix for config_3:
TP    1020
FP     464
TN      29
FN      44
Name: llm_classification, dtype: int64

Confusion Matrix for config_5:
TP    1051
FP     488
TN       5
FN      13
Name: llm_classification, dtype: int64

Confusion Matrix for config_1:
TP    1047
FP     482
TN      11
FN      17
Name: llm_classification, dtype: int64

Confusion Matrix for config_4:
TP    1047
FP     489
TN       4
FN      17
Name: llm_classification, dtype: int64

Confusion Matrix for config_2:
TP    1057
FP     485
TN       8
FN       7
Name: llm_classification, dtype: int64



It can be seen that config 2 of the contextual information is the only config that comes with a higher number of TNs then FNs. This means it detected more FPs in the dataset then missing real weaknesses. 

For that reason config 2 was choosen to be moved on with.