In [1]:
import torch
import numpy as np
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, matthews_corrcoef, roc_curve
from datasets import load_metric
metric_accuracy = load_metric("accuracy")
metric_f1 = load_metric("f1")
import warnings; warnings.simplefilter('ignore')
import os

  metric_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [2]:
def calculate_vul_det_score(predictions, ground_truth, target_fpr=0.005):
    """
    Calculate the vulnerability detection score (VD-S) given a tolerable FPR.

    VD-S is computed here as the false negative rate, FN / (FN + TP), obtained
    after thresholding the predictions at the last ROC point whose FPR does not
    exceed ``target_fpr``.

    Args:
    - predictions: Model prediction probabilities for the positive class. Either an
      object exposing ``.tolist()`` (tensor, array) or a plain Python sequence.
    - ground_truth: Ground truth labels (same container types as ``predictions``),
      where 1 means vulnerable class, and 0 means benign class.
    - target_fpr: The tolerable false positive rate.

    Returns:
    - vds: Calculated vulnerability detection score at the tolerable FPR
      (0 when ``ground_truth`` contains no vulnerable sample).
    - threshold: The classification threshold for vulnerable prediction.
    """
    # Tensors/arrays are converted through .tolist(); plain sequences (which have
    # no .tolist()) are simply copied into a list.
    predictions = predictions.tolist() if hasattr(predictions, "tolist") else list(predictions)
    ground_truth = ground_truth.tolist() if hasattr(ground_truth, "tolist") else list(ground_truth)

    # Calculate FPR and thresholds using the ROC curve (TPR is not needed here)
    fpr, _tpr, thresholds = roc_curve(ground_truth, predictions)

    # Indices of the ROC points whose FPR is less than or equal to the target FPR
    valid_indices = np.where(fpr <= target_fpr)[0]

    if len(valid_indices) > 0:
        # Last index where FPR is below or equal to the target FPR
        idx = valid_indices[-1]
    else:
        # No ROC point satisfies the target (presumably only for a negative
        # target_fpr or an undefined FPR - TODO confirm); fall back to the
        # point closest to the target FPR.
        idx = np.abs(fpr - target_fpr).argmin()

    chosen_threshold = thresholds[idx]

    # Classify with the chosen threshold and count outcomes on the vulnerable class
    flagged = np.asarray(predictions) >= chosen_threshold
    is_vulnerable = np.asarray(ground_truth) == 1
    tp = int(np.count_nonzero(is_vulnerable & flagged))
    fn = int(np.count_nonzero(is_vulnerable & ~flagged))

    # VD-S is the miss rate on the vulnerable class; guarded against an empty class
    vds = fn / (fn + tp) if (fn + tp) > 0 else 0

    return vds, chosen_threshold

def calmetrics(prediction_file, label_file):
    """
    Load saved predictions and labels, print one LaTeX table row of metrics,
    and return the metrics.

    Args:
    - prediction_file: Path of the torch-saved predictions.
    - label_file: Path of the torch-saved ground truth labels.

    Returns:
    - (accuracy, f1, precision, recall, fpr, vds); accuracy, f1, precision,
      recall and fpr are fractions in [0, 1] (the printed row shows percentages).
    """
    # Every metric below is computed on the CPU, so load straight onto it instead
    # of loading onto the GPU and copying back for each call.
    cpu = torch.device('cpu')
    all_predictions = torch.load(prediction_file, map_location=cpu)
    all_labels = torch.load(label_file, map_location=cpu)

    accuracy = accuracy_score(all_labels, all_predictions)
    f1 = f1_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions)
    recall = recall_score(all_labels, all_predictions)

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs, so the
    # four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(all_labels, all_predictions, labels=[0, 1]).ravel()
    # Guard against a label set without any benign sample
    fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

    # NOTE(review): all_predictions is also fed to precision/recall, so it presumably
    # holds hard 0/1 labels, while calculate_vul_det_score documents positive-class
    # probabilities - confirm what the saved files contain.
    vds, threshold = calculate_vul_det_score(all_predictions, all_labels, target_fpr=0.005)

    # LaTeX table row: accuracy, F1, precision, recall, FPR (all in %), then TP, TN, FP, FN
    print(f"{accuracy*100:.2f} & {f1*100:.2f} & {precision*100:.2f} & {recall*100:.2f} & {fpr*100:.2f} & {tp} & {tn} & {fp} & {fn} \\\\")
    return accuracy, f1, precision, recall, fpr, vds

In [3]:
directory = "pred_label"  # Replace with your actual path

cwes = ["119", "125", "787", "20", "476", "200", "416", "703", "190", "399"]
models = {"unixcoder": "unixcoder-base", "codebert": "codebert-base"}
# Best F1 score and the identifier of the run that reached it, keyed by (model_name, cwe)
best_results = {}

# Evaluate every available run of each model/CWE pair
for model_name in models.values():
    for cwe in cwes:
        best_f1 = 0  # Best F1 score seen so far for this model-CWE pair
        best_identifier = None  # Stays None when no run files exist for this pair

        for identifier in range(10):  # Assuming identifiers are in the range 0-9
            # Construct file paths
            prediction_file = f"{model_name}_{cwe}_{identifier}_predictions.pt"
            label_file = f"{model_name}_{cwe}_{identifier}_labels.pt"
            pred_path = os.path.join(directory, prediction_file)
            label_path = os.path.join(directory, label_file)

            # Runs without both files on disk are skipped
            if os.path.isfile(pred_path) and os.path.isfile(label_path):
                print(f"Processing {model_name} for CWE {cwe}, identifier {identifier}")

                # Calculate metrics (calmetrics also prints the table row)
                accuracy, f1, precision, recall, fpr, vds = calmetrics(pred_path, label_path)

                # The first evaluated run always becomes the current best, so a pair
                # whose runs all score F1 == 0 still ends up with a valid identifier.
                if best_identifier is None or f1 > best_f1:
                    best_f1 = f1
                    best_identifier = identifier

        # Store the best identifier and F1 score for this model and CWE
        best_results[(model_name, cwe)] = {"best_identifier": best_identifier, "best_f1": best_f1}

# Report the best run per model/CWE pair and print its metrics row once more
for (model_name, cwe), result in best_results.items():
    if result["best_identifier"] is None:
        # Nothing was evaluated for this pair, so there is no file to re-load
        print(f"No prediction/label files found for {model_name} with CWE {cwe}; skipping best-run report")
        continue
    print(f"Best identifier for {model_name} with CWE {cwe} is {result['best_identifier']} with F1 score {result['best_f1']:.4f}")
    prediction_file = f"{model_name}_{cwe}_{result['best_identifier']}_predictions.pt"
    label_file = f"{model_name}_{cwe}_{result['best_identifier']}_labels.pt"
    pred_path = os.path.join(directory, prediction_file)
    label_path = os.path.join(directory, label_file)
    calmetrics(pred_path, label_path)

Processing unixcoder-base for CWE 119, identifier 0
97.41 & 10.39 & 80.00 & 5.56 & 0.04 & 4 & 2588 & 1 & 68 \\
Processing unixcoder-base for CWE 119, identifier 1
97.86 & 51.28 & 66.67 & 41.67 & 0.58 & 30 & 2574 & 15 & 42 \\
Processing unixcoder-base for CWE 119, identifier 2
97.75 & 58.90 & 58.11 & 59.72 & 1.20 & 43 & 2558 & 31 & 29 \\
Processing unixcoder-base for CWE 119, identifier 3
97.75 & 58.33 & 58.33 & 58.33 & 1.16 & 42 & 2559 & 30 & 30 \\
Processing unixcoder-base for CWE 119, identifier 4
97.78 & 52.80 & 62.26 & 45.83 & 0.77 & 33 & 2569 & 20 & 39 \\
Processing unixcoder-base for CWE 119, identifier 5
97.56 & 53.24 & 55.22 & 51.39 & 1.16 & 37 & 2559 & 30 & 35 \\
Processing unixcoder-base for CWE 119, identifier 6
97.63 & 55.32 & 56.52 & 54.17 & 1.16 & 39 & 2559 & 30 & 33 \\
Processing unixcoder-base for CWE 119, identifier 7
97.56 & 52.55 & 55.38 & 50.00 & 1.12 & 36 & 2560 & 29 & 36 \\
Processing unixcoder-base for CWE 119, identifier 8
97.78 & 54.96 & 61.02 & 50.00 & 0.89 & 

In [9]:
directory = "pred_label"  # Replace with your actual path

cwes = ["119", "125", "787", "20", "476", "200", "416", "703", "190", "399"]
models = {"starcoder": "starcoder2-7b"}
# Initialised like in the other evaluation cells; this cell stores nothing in it
best_results = {}

# One aggregated ("all_") prediction/label pair is expected per model and CWE
for model_name in models.values():
    for cwe in cwes:
        pred_path = os.path.join(directory, f"all_{model_name}_{cwe}_predictions.pt")
        label_path = os.path.join(directory, f"all_{model_name}_{cwe}_labels.pt")

        # Pairs with a missing file are skipped without any message
        if not os.path.isfile(pred_path) or not os.path.isfile(label_path):
            continue

        print(f"Processing {model_name} for CWE {cwe}")

        # calmetrics prints the metrics row itself; the returned values are not used further here
        accuracy, f1, precision, recall, fpr, vds = calmetrics(pred_path, label_path)

Processing starcoder2-7b for CWE 125
96.33 & 15.19 & 8.59 & 65.81 & 3.52 & 102 & 29754 & 1086 & 53 \\
Processing starcoder2-7b for CWE 787
94.75 & 9.67 & 5.22 & 65.41 & 5.12 & 87 & 29282 & 1580 & 46 \\
Processing starcoder2-7b for CWE 119
99.60 & 0.00 & 0.00 & 0.00 & 0.00 & 0 & 30870 & 0 & 125 \\
Processing starcoder2-7b for CWE 20
98.99 & 8.21 & 6.70 & 10.61 & 0.63 & 14 & 30668 & 195 & 118 \\
Processing starcoder2-7b for CWE 416
97.87 & 8.08 & 4.65 & 30.85 & 1.93 & 29 & 30306 & 595 & 65 \\
Processing starcoder2-7b for CWE 190
98.53 & 16.79 & 9.91 & 54.76 & 1.35 & 46 & 30493 & 418 & 38 \\
Processing starcoder2-7b for CWE 476
99.75 & 0.00 & 0.00 & 0.00 & 0.00 & 0 & 30916 & 0 & 79 \\
Processing starcoder2-7b for CWE 200
97.36 & 6.63 & 3.62 & 39.19 & 2.50 & 29 & 30149 & 772 & 45 \\
Processing starcoder2-7b for CWE 79
98.37 & 17.65 & 9.76 & 91.53 & 1.61 & 54 & 30437 & 499 & 5 \\
Processing starcoder2-7b for CWE 703
99.82 & 0.00 & 0.00 & 0.00 & 0.00 & 0 & 30938 & 0 & 57 \\


In [4]:
directory = "pred_label"  # Replace with your actual path

# Although named `cwes`, these values are reported as "TOP <n>" below
cwes = ["2", "3", "4", "5", "6", "7", "8", "9", "10"]
models = {"starcoder": "starcoder2-7b"}
# Initialised like in the other evaluation cells; this cell stores nothing in it
best_results = {}

# One "TOP_..._best" prediction/label pair is expected per model and value
for model_name in models.values():
    for cwe in cwes:
        pred_path = os.path.join(directory, f"TOP_{model_name}_{cwe}_best_predictions.pt")
        label_path = os.path.join(directory, f"TOP_{model_name}_{cwe}_best_labels.pt")

        # Pairs with a missing file are skipped without any message
        if not os.path.isfile(pred_path) or not os.path.isfile(label_path):
            continue

        print(f"Processing {model_name} for TOP {cwe}")

        # calmetrics prints the metrics row itself; the returned values are not used further here
        accuracy, f1, precision, recall, fpr, vds = calmetrics(pred_path, label_path)

Processing starcoder2-7b for TOP 2
94.57 & 13.54 & 59.46 & 7.64 & 0.31 & 22 & 4868 & 15 & 266 \\
Processing starcoder2-7b for TOP 3
94.44 & 5.42 & 52.17 & 2.86 & 0.15 & 12 & 7106 & 11 & 408 \\
Processing starcoder2-7b for TOP 4
94.40 & 0.36 & 20.00 & 0.18 & 0.04 & 1 & 9228 & 4 & 544 \\
Processing starcoder2-7b for TOP 5
94.28 & 3.81 & 30.23 & 2.03 & 0.28 & 13 & 10792 & 30 & 626 \\
Processing starcoder2-7b for TOP 6
94.35 & 2.40 & 32.14 & 1.24 & 0.16 & 9 & 12225 & 19 & 714 \\
Processing starcoder2-7b for TOP 7
94.44 & 0.50 & 100.00 & 0.25 & 0.00 & 2 & 13587 & 0 & 800 \\
Processing starcoder2-7b for TOP 8
94.43 & 0.68 & 60.00 & 0.34 & 0.01 & 3 & 14837 & 2 & 873 \\
Processing starcoder2-7b for TOP 9
94.32 & 4.99 & 37.31 & 2.67 & 0.27 & 25 & 15797 & 42 & 910 \\
Processing starcoder2-7b for TOP 10
94.43 & 0.60 & 75.00 & 0.30 & 0.01 & 3 & 16792 & 1 & 989 \\


In [4]:
directory = "pred_label"  # Replace with your actual path

tops = ["2", "3", "4", "5", "6", "7", "8", "9", "10"]
#models = {"unixcoder": "unixcoder-base", "codebert": "codebert-base"}
models = {"unixcoder": "unixcoder-base"}
#models = {"codebert": "codebert-base"}
# Best F1 score and the identifier of the run that reached it, keyed by (model_name, top)
best_results = {}

# Evaluate every available run of each model/TOP pair
for model_name in models.values():
    for top in tops:
        best_f1 = 0  # Best F1 score seen so far for this model-TOP pair
        best_identifier = None  # Stays None when no run files exist for this pair

        for identifier in range(10):  # Assuming identifiers are in the range 0-9
            # Construct file paths
            print(model_name)
            prediction_file = f"top_{model_name}_{top}_{identifier}_predictions.pt"
            label_file = f"top_{model_name}_{top}_{identifier}_labels.pt"
            pred_path = os.path.join(directory, prediction_file)
            label_path = os.path.join(directory, label_file)

            # Runs without both files on disk are skipped
            if os.path.isfile(pred_path) and os.path.isfile(label_path):
                print(f"Processing {model_name} for TOP {top}, identifier {identifier}")

                # Calculate metrics (calmetrics also prints the table row)
                accuracy, f1, precision, recall, fpr, vds = calmetrics(pred_path, label_path)

                # The first evaluated run always becomes the current best, so a pair
                # whose runs all score F1 == 0 still ends up with a valid identifier.
                if best_identifier is None or f1 > best_f1:
                    best_f1 = f1
                    best_identifier = identifier

        # Store the best identifier and F1 score for this model and TOP
        best_results[(model_name, top)] = {"best_identifier": best_identifier, "best_f1": best_f1}

# Report the best run per model/TOP pair and print its metrics row once more
for (model_name, top), result in best_results.items():
    if result["best_identifier"] is None:
        # Nothing was evaluated for this pair, so there is no file to re-load
        print(f"No prediction/label files found for {model_name} with TOP {top}; skipping best-run report")
        continue
    print(f"Best identifier for {model_name} with TOP {top} is {result['best_identifier']} with F1 score {result['best_f1']:.4f}")
    prediction_file = f"top_{model_name}_{top}_{result['best_identifier']}_predictions.pt"
    label_file = f"top_{model_name}_{top}_{result['best_identifier']}_labels.pt"
    pred_path = os.path.join(directory, prediction_file)
    label_path = os.path.join(directory, label_file)
    calmetrics(pred_path, label_path)

unixcoder-base
Processing unixcoder-base for TOP 2, identifier 0
97.16 & 48.06 & 47.55 & 48.57 & 1.49 & 68 & 4952 & 75 & 72 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 1
97.54 & 33.51 & 62.75 & 22.86 & 0.38 & 32 & 5008 & 19 & 108 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 2
97.35 & 51.25 & 51.06 & 51.43 & 1.37 & 72 & 4958 & 69 & 68 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 3
97.39 & 44.90 & 52.38 & 39.29 & 0.99 & 55 & 4977 & 50 & 85 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 4
97.04 & 46.69 & 45.58 & 47.86 & 1.59 & 67 & 4947 & 80 & 73 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 5
97.04 & 47.06 & 45.64 & 48.57 & 1.61 & 68 & 4946 & 81 & 72 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 6
97.10 & 48.98 & 46.75 & 51.43 & 1.63 & 72 & 4945 & 82 & 68 \\
unixcoder-base
Processing unixcoder-base for TOP 2, identifier 7
97.46 & 49.81 & 53.72 & 46.43 & 1.11 & 65 & 4

In [5]:
directory = "pred_label"  # Replace with your actual path

tops = ["2", "3", "4", "5", "6", "7", "8", "9", "10"]
#models = {"unixcoder": "unixcoder-base", "codebert": "codebert-base"}
#models = {"unixcoder": "unixcoder-base"}
models = {"codebert": "codebert-base"}
# Best F1 score and the identifier of the run that reached it, keyed by (model_name, top)
best_results = {}

# Evaluate every available run of each model/TOP pair
for model_name in models.values():
    for top in tops:
        best_f1 = 0  # Best F1 score seen so far for this model-TOP pair
        best_identifier = None  # Stays None when no run files exist for this pair

        for identifier in range(10):  # Assuming identifiers are in the range 0-9
            # Construct file paths
            print(model_name)
            prediction_file = f"top_{model_name}_{top}_{identifier}_predictions.pt"
            label_file = f"top_{model_name}_{top}_{identifier}_labels.pt"
            pred_path = os.path.join(directory, prediction_file)
            label_path = os.path.join(directory, label_file)

            # Runs without both files on disk are skipped
            if os.path.isfile(pred_path) and os.path.isfile(label_path):
                print(f"Processing {model_name} for TOP {top}, identifier {identifier}")

                # Calculate metrics (calmetrics also prints the table row)
                accuracy, f1, precision, recall, fpr, vds = calmetrics(pred_path, label_path)

                # The first evaluated run always becomes the current best, so a pair
                # whose runs all score F1 == 0 still ends up with a valid identifier.
                if best_identifier is None or f1 > best_f1:
                    best_f1 = f1
                    best_identifier = identifier

        # Store the best identifier and F1 score for this model and TOP
        best_results[(model_name, top)] = {"best_identifier": best_identifier, "best_f1": best_f1}

# Report the best run per model/TOP pair and print its metrics row once more
for (model_name, top), result in best_results.items():
    if result["best_identifier"] is None:
        # Nothing was evaluated for this pair, so there is no file to re-load
        print(f"No prediction/label files found for {model_name} with TOP {top}; skipping best-run report")
        continue
    print(f"Best identifier for {model_name} with TOP {top} is {result['best_identifier']} with F1 score {result['best_f1']:.4f}")
    prediction_file = f"top_{model_name}_{top}_{result['best_identifier']}_predictions.pt"
    label_file = f"top_{model_name}_{top}_{result['best_identifier']}_labels.pt"
    pred_path = os.path.join(directory, prediction_file)
    label_path = os.path.join(directory, label_file)
    calmetrics(pred_path, label_path)

codebert-base
Processing codebert-base for TOP 2, identifier 0
97.58 & 41.86 & 60.00 & 32.14 & 0.60 & 45 & 4997 & 30 & 95 \\
codebert-base
Processing codebert-base for TOP 2, identifier 1
97.43 & 22.22 & 61.29 & 13.57 & 0.24 & 19 & 5015 & 12 & 121 \\
codebert-base
Processing codebert-base for TOP 2, identifier 2
97.52 & 46.22 & 56.12 & 39.29 & 0.86 & 55 & 4984 & 43 & 85 \\
codebert-base
Processing codebert-base for TOP 2, identifier 3
97.64 & 40.78 & 63.64 & 30.00 & 0.48 & 42 & 5003 & 24 & 98 \\
codebert-base
Processing codebert-base for TOP 2, identifier 4
97.06 & 46.85 & 45.89 & 47.86 & 1.57 & 67 & 4948 & 79 & 73 \\
codebert-base
Processing codebert-base for TOP 2, identifier 5
97.37 & 48.09 & 51.64 & 45.00 & 1.17 & 63 & 4968 & 59 & 77 \\
codebert-base
Processing codebert-base for TOP 2, identifier 6
97.19 & 49.12 & 48.28 & 50.00 & 1.49 & 70 & 4952 & 75 & 70 \\
codebert-base
Processing codebert-base for TOP 2, identifier 7
97.56 & 47.06 & 57.14 & 40.00 & 0.84 & 56 & 4985 & 42 & 84 \\