In [None]:
"""
LLM Security Benchmark
This script is designed to measure the ability of LLMs to identify safe and unsafe code.
"""

from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import json
import os
import sys
import re
import time
import pandas as pd
from pathlib import Path
from llmScan import *

In [None]:
# Create the benchmark-specific template and function
def create_benchmark_chain():
    """
    Create a chain specifically for benchmark boolean responses.

    The model is obtained from ``get_model_and_chain()``; the second value it
    returns is discarded and a benchmark-specific prompt is piped into the
    model instead.

    Returns:
        tuple: (chain, model_name)
            - chain: ``prompt | model`` runnable expecting a ``python_code``
              input variable, or None when no model is available
            - model_name: the model's ``model`` attribute, ``'unknown'`` if the
              attribute is missing, or None when no model is available
    """
    model, _ = get_model_and_chain()
    if model is None:
        return None, None

    # NOTE: the prompt ends right after the True/False instruction; every
    # Markdown code fence in it is balanced so the final instruction is not
    # swallowed by an unterminated fenced block.
    benchmark_template = """<system>You are a security vulnerability analyzer for Python code.</system>

<user>
Examples of security vulnerabilities include:
- Unsafe use of untrusted input (e.g., from `request.args`, `request.form`, environment variables, external files)
- Dangerous function calls (e.g., `eval`, `exec`, `os.system`, `subprocess.run`, `pickle.loads`, `yaml.load`)
- Insecure file handling (e.g., `open` or `send_file` with user-controlled paths)
- Cryptographic mistakes (e.g., hardcoded keys, insecure algorithms)
- Web-specific issues (e.g., Cross-Site Scripting (XSS), CSRF vulnerabilities, Open Redirects)
- Hardcoded secrets (e.g., API keys, passwords, tokens)
- Misconfigurations (e.g., exposing debug mode, bad CORS policies)

Analyze this Python code for security vulnerabilities:

```python
{python_code}
```

Return your analysis as a single boolean value: True if vulnerable and False if safe

</user>
"""

    # Local import keeps this cell runnable even if the imports cell was skipped.
    from langchain_core.prompts import ChatPromptTemplate
    prompt = ChatPromptTemplate.from_template(benchmark_template)
    chain = prompt | model

    # Fall back to 'unknown' when the model object exposes no ``model`` attribute.
    model_name = getattr(model, 'model', 'unknown')

    return chain, model_name

# Build the chain once at cell level; later cells read these two globals.
benchmark_chain, model_name = create_benchmark_chain()
if benchmark_chain:
    print(f"Benchmark chain created successfully with model: {model_name}")
else:
    print("Failed to create benchmark chain")

In [None]:
# Define the benchmark test function
def test_vulnerabilities_benchmark(python_code_path):
    """
    Analyzes a Python file for security vulnerabilities using boolean response

    NOTE: a second definition of this function with the same name appears in
    the next notebook cell; when both cells are executed, the later definition
    is the one that remains bound.

    Args:
        python_code_path (str): Path to the Python file to analyze

    Returns:
        tuple: (success, result, model_name)
            - success (bool): True if analysis successful, False otherwise
            - result (bool | None): True if vulnerable, False if safe,
              None when no verdict could be obtained
            - model_name (str): the module-level ``model_name``; when no
              benchmark chain is available this slot carries an error
              message instead
    """
    try:
        # Check if LLM is available
        if benchmark_chain is None:
            return False, None, "Error: No working LLM model available"

        # Load Python code
        python_code = load_python_file(python_code_path)

        # Invoke the chain with retries
        response = None
        invoked = False
        attempts = 0
        while attempts < MAX_RETRIES:
            attempts += 1
            try:
                response = benchmark_chain.invoke({"python_code": python_code})
                invoked = True
                break
            except Exception as e:
                print(f"LLM invocation failed (attempt {attempts}/{MAX_RETRIES}): {str(e)}")
                if attempts >= MAX_RETRIES:
                    return False, None, model_name
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)

        if not invoked:
            # Only reachable when MAX_RETRIES is zero or negative.
            print(f"LLM was not invoked: MAX_RETRIES is {MAX_RETRIES}")
            return False, None, model_name

        # Parse response - handle both boolean and string responses
        if isinstance(response, bool):
            return True, response, model_name
        if not isinstance(response, str):
            print(f"Unexpected response type: {type(response)}")
            return False, None, model_name

        # Match whole words only, so that e.g. "untrue" or "falsehood" are not
        # mistaken for a verdict; a verdict is accepted only when exactly one
        # of the two words occurs in the response.
        verdicts = set(re.findall(r"\b(true|false)\b", response.lower()))
        if verdicts == {"true"}:
            return True, True, model_name
        if verdicts == {"false"}:
            return True, False, model_name

        print(f"Ambiguous response: {response}")
        return False, None, model_name

    except Exception as e:
        # Report the failure instead of swallowing it silently, but keep the
        # "never raises" contract the benchmark runner relies on.
        print(f"Error analyzing {python_code_path}: {e}")
        return False, None, model_name

print("test_vulnerabilities_benchmark function defined successfully!")

In [None]:
# Define the benchmark test function
def test_vulnerabilities_benchmark(python_code_path):
    """
    Analyzes a Python file for security vulnerabilities using boolean response

    NOTE: an earlier cell in this notebook defines a function with the same
    name; executing this cell rebinds the name to this definition.

    Args:
        python_code_path (str): Path to the Python file to analyze

    Returns:
        tuple: (success, result, model_name)
            - success (bool): True if analysis successful, False otherwise
            - result (bool | None): True if vulnerable, False if safe,
              None when no verdict could be obtained
            - model_name (str): the module-level ``model_name``; when no
              benchmark chain is available this slot carries an error
              message instead
    """
    try:
        # Check if LLM is available
        if benchmark_chain is None:
            return False, None, "Error: No working LLM model available"

        # Load Python code
        python_code = load_python_file(python_code_path)

        # Invoke the chain with retries
        response = None
        invoked = False
        attempts = 0
        while attempts < MAX_RETRIES:
            attempts += 1
            try:
                response = benchmark_chain.invoke({"python_code": python_code})
                invoked = True
                break
            except Exception as e:
                print(f"LLM invocation failed (attempt {attempts}/{MAX_RETRIES}): {str(e)}")
                if attempts >= MAX_RETRIES:
                    return False, None, model_name
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)

        if not invoked:
            # Only reachable when MAX_RETRIES is zero or negative.
            print(f"LLM was not invoked: MAX_RETRIES is {MAX_RETRIES}")
            return False, None, model_name

        # Parse response - handle both boolean and string responses
        if isinstance(response, bool):
            return True, response, model_name
        if not isinstance(response, str):
            print(f"Unexpected response type: {type(response)}")
            return False, None, model_name

        # Match whole words only, so that e.g. "untrue" or "falsehood" are not
        # mistaken for a verdict; a verdict is accepted only when exactly one
        # of the two words occurs in the response.
        verdicts = set(re.findall(r"\b(true|false)\b", response.lower()))
        if verdicts == {"true"}:
            return True, True, model_name
        if verdicts == {"false"}:
            return True, False, model_name

        print(f"Ambiguous response: {response}")
        return False, None, model_name

    except Exception as e:
        # Report the failure instead of swallowing it silently, but keep the
        # "never raises" contract the benchmark runner relies on.
        print(f"Error analyzing {python_code_path}: {e}")
        return False, None, model_name

print("test_vulnerabilities_benchmark function defined successfully!")

In [None]:
# Define the benchmark runner function
def run_benchmark(train_dir="train"):
    """
    Run benchmark on all Python files in train/patched/ and train/vulnerable/ directories

    Files under ``patched`` are labelled safe (actual=False) and files under
    ``vulnerable`` are labelled vulnerable (actual=True). A missing directory
    is reported with a warning and skipped.

    Args:
        train_dir (str): Base directory containing patched/ and vulnerable/ subdirectories

    Returns:
        pd.DataFrame: Results with columns [file_path, success, llm_model, llm_result, actual].
            The columns are present even when no file was analyzed, so
            downstream code can index them on an empty result.
    """
    result_columns = ["file_path", "success", "llm_model", "llm_result", "actual"]
    results = []

    # Define directories and their labels
    directories = {
        os.path.join(train_dir, "patched"): False,    # Safe files
        os.path.join(train_dir, "vulnerable"): True,  # Vulnerable files
    }

    for directory, actual_vulnerable in directories.items():
        if not os.path.exists(directory):
            print(f"Warning: Directory {directory} does not exist")
            continue

        print(f"Processing directory: {directory}")

        # Find all Python files in the directory; sorted so that repeated runs
        # visit (and report) files in the same order.
        python_files = sorted(Path(directory).glob("**/*.py"))
        print(f"Found {len(python_files)} Python files")

        for file_path in python_files:
            print(f"Analyzing: {file_path}")

            # Run vulnerability test
            success, llm_result, llm_model = test_vulnerabilities_benchmark(str(file_path))

            # Store results
            results.append({
                "file_path": str(file_path),
                "success": success,
                "llm_model": llm_model,
                "llm_result": llm_result,
                "actual": actual_vulnerable,
            })

            # Print progress
            if success:
                status = "VULNERABLE" if llm_result else "SAFE"
                expected = "VULNERABLE" if actual_vulnerable else "SAFE"
                match = "✓" if llm_result == actual_vulnerable else "✗"
                print(f"  Result: {status} | Expected: {expected} | {match}")
            else:
                print("  Error: Analysis failed")

    # Explicit columns keep the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=result_columns)

print("run_benchmark function defined successfully!")

In [None]:
# Execute the benchmark with the default directory and keep the outcome in a
# notebook-level DataFrame for the cells that follow.
print("Starting LLM Security Benchmark...")
df_results = run_benchmark()

print()
print(f"Benchmark completed! Analyzed {len(df_results)} files.")
print("Results stored in 'df_results' DataFrame")

In [None]:
# Calculate and display metrics
# Reads `df_results` and leaves the computed statistics in notebook-level
# variables (total_files, successful_analyses, failed_analyses, models_used,
# accuracy, precision, recall, f1_score and the confusion-matrix counts).
print("="*60)
print("BENCHMARK RESULTS")
print("="*60)

# Basic statistics
total_files = len(df_results)
# A results frame built from zero rows may have no 'success' column at all;
# treat that as "nothing succeeded" instead of failing with a KeyError.
if 'success' in df_results.columns:
    successful_analyses = int(df_results['success'].sum())
else:
    successful_analyses = 0
failed_analyses = total_files - successful_analyses

print(f"Total files analyzed: {total_files}")
print(f"Successful analyses: {successful_analyses}")
print(f"Failed analyses: {failed_analyses}")

# Show model information
if not df_results.empty:
    models_used = df_results['llm_model'].unique()
    # str() guards the join against non-string entries in the column.
    print(f"LLM Model(s) used: {', '.join(map(str, models_used))}")

if successful_analyses > 0:
    # Calculate accuracy on successful analyses only
    successful_df = df_results[df_results['success'].astype(bool)]
    predicted = successful_df['llm_result'].astype(bool)
    actual = successful_df['actual'].astype(bool)

    correct_predictions = (predicted == actual).sum()
    accuracy = correct_predictions / len(successful_df)

    print(f"\nAccuracy: {accuracy:.2%} ({correct_predictions}/{len(successful_df)})")

    # Confusion matrix ("positive" means the file is vulnerable)
    true_positives = (predicted & actual).sum()
    true_negatives = (~predicted & ~actual).sum()
    false_positives = (predicted & ~actual).sum()
    false_negatives = (~predicted & actual).sum()

    print("\nConfusion Matrix:")
    print(f"True Positives (correctly identified vulnerabilities): {true_positives}")
    print(f"True Negatives (correctly identified safe code): {true_negatives}")
    print(f"False Positives (incorrectly flagged as vulnerable): {false_positives}")
    print(f"False Negatives (missed vulnerabilities): {false_negatives}")

    # Precision, Recall, F1 (each falls back to 0 when its denominator is 0)
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\nMetrics:")
    print(f"Precision: {precision:.2%}")
    print(f"Recall: {recall:.2%}")
    print(f"F1 Score: {f1_score:.2%}")
else:
    print("\nNo successful analyses to calculate metrics.")

In [None]:
# Persist the per-file results next to the notebook.
output_file = "llm_benchmark_results.csv"
df_results.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# The one-row summary is written only when at least one analysis succeeded,
# because the metric variables are only defined in that case.
if successful_analyses > 0:
    summary = dict(
        total_files=total_files,
        successful_analyses=successful_analyses,
        failed_analyses=failed_analyses,
        llm_model=', '.join(models_used),
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1_score,
        true_positives=true_positives,
        true_negatives=true_negatives,
        false_positives=false_positives,
        false_negatives=false_negatives,
    )

    summary_df = pd.DataFrame([summary])
    summary_df.to_csv("llm_benchmark_summary.csv", index=False)
    print("Summary metrics saved to: llm_benchmark_summary.csv")