In [None]:
"""
LLM Security Benchmark
This script is designed to measure the ability of LLMs to identify safe and unsafe code.
"""

from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
import json
import os
import sys
import re
import time
import pandas as pd
from pathlib import Path
from llmScan import *

In [None]:
# Create the benchmark-specific template and function
def create_benchmark_chain():
    """Build the prompt-to-model pipeline used for True/False benchmark verdicts.

    The model is the first value returned by ``get_model_and_chain()`` (not
    defined in this notebook; presumably supplied by the ``llmScan`` star
    import). The second value is ignored.

    Returns:
        tuple: ``(chain, model_name)`` where ``chain`` is the prompt piped into
        the model and ``model_name`` is the model's ``model`` attribute
        (``'unknown'`` when the attribute is absent). ``(None, None)`` when no
        model is available.
    """
    llm, _ = get_model_and_chain()
    if llm is None:
        return None, None

    # ``{python_code}`` is the only placeholder in the prompt text.
    benchmark_template = """<system>You are a security vulnerability analyzer for Python code.</system>

<user>
Examples of security vulnerabilities include:
- Unsafe use of untrusted input (e.g., from `request.args`, `request.form`, environment variables, external files)
- Dangerous function calls (e.g., `eval`, `exec`, `os.system`, `subprocess.run`, `pickle.loads`, `yaml.load`)
- Insecure file handling (e.g., `open` or `send_file` with user-controlled paths)
- Cryptographic mistakes (e.g., hardcoded keys, insecure algorithms)
- Web-specific issues (e.g., Cross-Site Scripting (XSS), CSRF vulnerabilities, Open Redirects)
- Hardcoded secrets (e.g., API keys, passwords, tokens)
- Misconfigurations (e.g., exposing debug mode, bad CORS policies)

Analyze this Python code for security vulnerabilities:

```python
{python_code}
```

Return your analysis as a single boolean value: True if vulnerable and False if safe
```

</user>
"""

    from langchain_core.prompts import ChatPromptTemplate
    pipeline = ChatPromptTemplate.from_template(benchmark_template) | llm

    return pipeline, getattr(llm, 'model', 'unknown')


# Module-level chain and model name shared by the benchmark and debug cells.
benchmark_chain, model_name = create_benchmark_chain()
if benchmark_chain:
    print(f"Benchmark chain created successfully with model: {model_name}")
else:
    print("Failed to create benchmark chain")

In [None]:
# Define the benchmark test function with timing
def test_vulnerabilities_benchmark(python_code_path):
    """
    Analyze one Python file with the benchmark chain and parse a boolean verdict.

    Relies on the module-level ``benchmark_chain`` and ``model_name`` plus
    ``load_python_file``, ``MAX_RETRIES`` and ``RETRY_DELAY`` (not defined in
    this notebook; presumably supplied by the ``llmScan`` star import).

    Args:
        python_code_path (str): Path to the Python file to analyze

    Returns:
        tuple: (success, result, model_name, scan_time)
            - success (bool): True if a verdict was parsed, False otherwise
            - result (bool | None): True if vulnerable, False if safe,
              None whenever success is False
            - model_name (str): Name of the LLM model used (an error message
              instead when no chain is available)
            - scan_time (float): Seconds elapsed since the call started,
              including any retry delays
    """
    start_time = time.time()

    try:
        # Check if LLM is available
        if benchmark_chain is None:
            scan_time = time.time() - start_time
            return False, None, "Error: No working LLM model available", scan_time

        # Load Python code
        python_code = load_python_file(python_code_path)

        # Invoke the chain with retries
        response = None
        got_response = False
        retries = 0
        while retries < MAX_RETRIES:
            try:
                response = benchmark_chain.invoke({"python_code": python_code})
                got_response = True
                break
            except Exception as e:
                retries += 1
                print(f"LLM invocation failed (attempt {retries}/{MAX_RETRIES}): {str(e)}")
                if retries >= MAX_RETRIES:
                    scan_time = time.time() - start_time
                    return False, None, model_name, scan_time
                print(f"Retrying in {RETRY_DELAY} seconds...")
                time.sleep(RETRY_DELAY)

        scan_time = time.time() - start_time

        # Only reachable when MAX_RETRIES <= 0: the loop body never ran, so
        # there is no response to parse. Report it instead of failing later.
        if not got_response:
            print(f"LLM was not invoked because MAX_RETRIES is {MAX_RETRIES}")
            return False, None, model_name, scan_time

        # Parse response - handle both boolean and string responses
        if isinstance(response, bool):
            return True, response, model_name, scan_time

        if not isinstance(response, str):
            print(f"Unexpected response type: {type(response)}")
            return False, None, model_name, scan_time

        response_lower = response.strip().lower()
        if response_lower in ("true", "false"):
            return True, response_lower == "true", model_name, scan_time

        # Fall back to substring detection; exactly one of the two words must
        # be present. NOTE(review): this is a plain substring test, so words
        # that merely contain "true"/"false" also count.
        has_true = "true" in response_lower
        has_false = "false" in response_lower
        if has_true != has_false:
            return True, has_true, model_name, scan_time

        print(f"Ambiguous response: {response}")
        return False, None, model_name, scan_time

    except Exception as e:
        # Best-effort boundary: one bad file must not abort the whole
        # benchmark, but the cause is reported rather than silently dropped.
        scan_time = time.time() - start_time
        print(f"Analysis failed for {python_code_path}: {e}")
        return False, None, model_name, scan_time

print("test_vulnerabilities_benchmark function defined successfully!")

In [None]:
# Define the benchmark runner function (updated with detailed file count reporting)
def run_benchmark(train_dir="train", max_files_per_dir=None, total_max_files=None,
                 include_patched=True, include_vulnerable=True, test_split=None):
    """
    Run the benchmark on Python files under ``train_dir`` with flexible selection options.

    Files are collected in sorted path order before any split or limit is
    applied, so the same arguments select the same files on every run
    regardless of the order in which the filesystem lists them.

    Args:
        train_dir (str): Base directory containing patched/ and vulnerable/ subdirectories
        max_files_per_dir (int, optional): Maximum number of files to process per directory.
            None or 0 means no per-directory limit
        total_max_files (int, optional): Maximum total number of files to process across
            all directories. None or 0 means no total limit
        include_patched (bool): Whether to include files from patched/ directory (default: True)
        include_vulnerable (bool): Whether to include files from vulnerable/ directory (default: True)
        test_split (float, optional): Fraction of files to use for testing (0.0-1.0).
            If None, use all files. An out-of-range value prints a warning and is ignored

    Returns:
        pd.DataFrame: Results with columns [file_path, success, llm_model, llm_result, actual,
        scan_time_seconds]. The frame has no columns at all when no file was analyzed.
    """
    results = []
    total_files_processed = 0

    # Track files by directory for reporting
    files_by_directory = {}

    # Map each selected directory to its ground-truth label (True = vulnerable)
    directories = {}
    if include_patched:
        directories[os.path.join(train_dir, "patched")] = False    # Safe files
    if include_vulnerable:
        directories[os.path.join(train_dir, "vulnerable")] = True   # Vulnerable files

    if not directories:
        print("Error: No directories selected. Set include_patched=True or include_vulnerable=True")
        return pd.DataFrame()

    print(f"Selected directories: {list(directories.keys())}")

    limit_reached = False
    for directory, actual_vulnerable in directories.items():
        if not os.path.exists(directory):
            print(f"Warning: Directory {directory} does not exist")
            continue

        # With the limit_reached flag below, this only triggers for a negative
        # total_max_files; kept so such a value still stops immediately.
        if total_max_files and total_files_processed >= total_max_files:
            print(f"Reached total file limit of {total_max_files}. Stopping.")
            break

        print(f"\nProcessing directory: {directory}")

        # glob order depends on the filesystem; sort so that the seeded
        # shuffle and the "first N" limits below are reproducible.
        python_files = sorted(Path(directory).glob("**/*.py"))
        original_count = len(python_files)
        print(f"Found {original_count} Python files")

        # Apply test split if specified
        if test_split is not None:
            if not 0.0 <= test_split <= 1.0:
                print(f"Warning: test_split must be between 0.0 and 1.0, got {test_split}")
            else:
                import random
                # Private generator: same seeded permutation as before without
                # reseeding the process-wide random state.
                random.Random(42).shuffle(python_files)
                split_size = int(len(python_files) * test_split)
                python_files = python_files[:split_size]
                print(f"Using test split of {test_split:.2%}: {len(python_files)} files selected from {original_count} available")

        # Apply per-directory limit if specified
        if max_files_per_dir:
            files_before_limit = len(python_files)
            python_files = python_files[:max_files_per_dir]
            if len(python_files) < files_before_limit:
                print(f"Limited to first {max_files_per_dir} files (was {files_before_limit})")

        # Apply total limit if specified (remaining is positive here because
        # the check at the top of the loop already handled the exhausted case)
        if total_max_files:
            remaining_files = total_max_files - total_files_processed
            files_before_total_limit = len(python_files)
            python_files = python_files[:remaining_files]
            if len(python_files) < files_before_total_limit:
                print(f"Limited to {len(python_files)} files due to total limit (was {files_before_total_limit})")

        # Store file count for this directory
        directory_name = "vulnerable" if actual_vulnerable else "patched"
        files_by_directory[directory_name] = len(python_files)

        print(f"Will analyze {len(python_files)} files from {directory_name} directory")

        for file_path in python_files:
            print(f"Analyzing: {file_path}")

            # Run vulnerability test with timing
            success, llm_result, llm_model, scan_time = test_vulnerabilities_benchmark(str(file_path))

            results.append({
                "file_path": str(file_path),
                "success": success,
                "llm_model": llm_model,
                "llm_result": llm_result,
                "actual": actual_vulnerable,
                "scan_time_seconds": round(scan_time, 3)  # Round to 3 decimal places
            })

            total_files_processed += 1

            # Print progress
            if success:
                status = "VULNERABLE" if llm_result else "SAFE"
                expected = "VULNERABLE" if actual_vulnerable else "SAFE"
                match = "✓" if llm_result == actual_vulnerable else "✗"
                print(f"  Result: {status} | Expected: {expected} | {match} | Time: {scan_time:.3f}s")
            else:
                print(f"  Error: Analysis failed | Time: {scan_time:.3f}s")

            # Check if we've reached the total file limit
            if total_max_files and total_files_processed >= total_max_files:
                print(f"Reached total file limit of {total_max_files}. Stopping.")
                limit_reached = True
                break

        # Stop here so the limit message is not printed a second time when the
        # outer loop moves on to the next directory.
        if limit_reached:
            break

    # Print final summary of files processed by directory
    print("\n" + "=" * 50)
    print("FILES PROCESSED SUMMARY")
    print("=" * 50)
    for dir_name, count in files_by_directory.items():
        print(f"{dir_name.capitalize()} files: {count}")
    print(f"Total files processed: {total_files_processed}")

    return pd.DataFrame(results)

print("run_benchmark function defined successfully!")

In [None]:
# Run the benchmark.
# Exactly one `df_results = run_benchmark(...)` call below should be active;
# the commented-out calls are alternative configurations. Later cells read the
# module-level `df_results` this cell produces.
print("Starting LLM Security Benchmark...")

# Basic usage (default settings, no limits):
# df_results = run_benchmark()                           # All files from both directories

# Directory selection:
# df_results = run_benchmark(include_patched=True, include_vulnerable=False)   # Only safe files
# df_results = run_benchmark(include_patched=False, include_vulnerable=True)   # Only vulnerable files

# File limits:
df_results = run_benchmark(max_files_per_dir=5)        # Max 5 files per directory
# df_results = run_benchmark(total_max_files=10)         # Max 10 files total

# Test split:
# df_results = run_benchmark(test_split=0.1)             # Use 10% of files for testing
# df_results = run_benchmark(test_split=0.5)             # Use 50% of files for testing

# Combined options:
# df_results = run_benchmark(include_vulnerable=True, include_patched=False, 
#                           test_split=0.2, max_files_per_dir=10)

print(f"\nBenchmark completed! Analyzed {len(df_results)} files.")
print("Results stored in 'df_results' DataFrame")

In [None]:
# Calculate and display metrics, including scans that failed to produce a verdict.
# Reads the module-level `df_results`; the names assigned here (counts, timing
# statistics, confusion-matrix cells, precision/recall/F1) are reused by the
# cell that saves the summary CSV.
print("="*60)
print("BENCHMARK RESULTS")
print("="*60)

# run_benchmark() returns a DataFrame without any columns when nothing was
# analyzed, so every column access must be guarded by this flag.
has_results = not df_results.empty

# Basic statistics
total_files = len(df_results)
successful_analyses = df_results['success'].sum() if has_results else 0
failed_analyses = total_files - successful_analyses

print(f"Total files analyzed: {total_files}")
print(f"Successful analyses: {successful_analyses}")
print(f"Failed analyses: {failed_analyses}")

if has_results:
    # Partition once; both halves are needed by several sections below.
    successful_df = df_results[df_results['success'] == True]
    failed_df = df_results[df_results['success'] == False]

    # Ground-truth totals and per-class failure counts. Computed up front so
    # the failure analysis still works when every scan failed.
    total_vulnerable = (df_results['actual'] == True).sum()
    total_safe = (df_results['actual'] == False).sum()
    failed_vulnerable = (failed_df['actual'] == True).sum()
    failed_safe = (failed_df['actual'] == False).sum()

    # Show model information
    models_used = df_results['llm_model'].unique()
    print(f"LLM Model(s) used: {', '.join(models_used)}")

    # Timing statistics
    total_time = df_results['scan_time_seconds'].sum()
    avg_time = df_results['scan_time_seconds'].mean()
    min_time = df_results['scan_time_seconds'].min()
    max_time = df_results['scan_time_seconds'].max()

    print("\nTiming Statistics:")
    print(f"Total scan time: {total_time:.3f} seconds")
    print(f"Average scan time: {avg_time:.3f} seconds")
    print(f"Fastest scan: {min_time:.3f} seconds")
    print(f"Slowest scan: {max_time:.3f} seconds")

    if successful_analyses > 0:
        avg_successful_time = successful_df['scan_time_seconds'].mean()
        print(f"Average time for successful scans: {avg_successful_time:.3f} seconds")

    if failed_analyses > 0:
        avg_failed_time = failed_df['scan_time_seconds'].mean()
        print(f"Average time for failed scans: {avg_failed_time:.3f} seconds")

# Calculate metrics including failed scans
if total_files > 0:
    # Overall success rate
    success_rate = successful_analyses / total_files
    print(f"\nOverall Success Rate: {success_rate:.2%} ({successful_analyses}/{total_files})")

    if successful_analyses > 0:
        # Calculate accuracy on successful analyses only
        correct_predictions = (successful_df['llm_result'] == successful_df['actual']).sum()
        accuracy_on_successful = correct_predictions / len(successful_df)

        print(f"Accuracy on successful scans: {accuracy_on_successful:.2%} ({correct_predictions}/{len(successful_df)})")

        # Calculate overall accuracy (treating failed scans as incorrect)
        overall_accuracy = correct_predictions / total_files
        print(f"Overall accuracy (failed scans counted as incorrect): {overall_accuracy:.2%} ({correct_predictions}/{total_files})")

        # Confusion matrix for successful scans. Explicit == True / == False
        # comparisons are kept because 'llm_result' also holds None for failed
        # scans in the full frame, so it is not a plain boolean column.
        true_positives = ((successful_df['llm_result'] == True) & (successful_df['actual'] == True)).sum()
        true_negatives = ((successful_df['llm_result'] == False) & (successful_df['actual'] == False)).sum()
        false_positives = ((successful_df['llm_result'] == True) & (successful_df['actual'] == False)).sum()
        false_negatives = ((successful_df['llm_result'] == False) & (successful_df['actual'] == True)).sum()

        print("\nConfusion Matrix (Successful Scans Only):")
        print(f"True Positives (correctly identified vulnerabilities): {true_positives}")
        print(f"True Negatives (correctly identified safe code): {true_negatives}")
        print(f"False Positives (incorrectly flagged as vulnerable): {false_positives}")
        print(f"False Negatives (missed vulnerabilities): {false_negatives}")

        # Extended confusion matrix including failed scans
        print("\nExtended Confusion Matrix (Including Failed Scans):")
        print(f"True Positives: {true_positives}")
        print(f"True Negatives: {true_negatives}")
        print(f"False Positives: {false_positives}")
        print(f"False Negatives: {false_negatives}")
        print(f"Failed on Vulnerable Files: {failed_vulnerable}")
        print(f"Failed on Safe Files: {failed_safe}")
        print(f"Total Failed: {failed_analyses}")

        # Precision, Recall, F1 (on successful scans only); 0 when undefined
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        print("\nMetrics (Successful Scans Only):")
        print(f"Precision: {precision:.2%}")
        print(f"Recall: {recall:.2%}")
        print(f"F1 Score: {f1_score:.2%}")

        # Adjusted recall: failed vulnerable files count as missed detections,
        # so the denominator is every vulnerable file, not just scanned ones.
        adjusted_recall = true_positives / total_vulnerable if total_vulnerable > 0 else 0

        # Adjusted precision: failed scans produce no prediction, so they
        # cannot enter the precision denominator; the value is unchanged.
        adjusted_precision = precision

        # Adjusted F1
        adjusted_f1 = 2 * (adjusted_precision * adjusted_recall) / (adjusted_precision + adjusted_recall) if (adjusted_precision + adjusted_recall) > 0 else 0

        print("\nAdjusted Metrics (Failed Scans Impact Recall):")
        print(f"Adjusted Precision: {adjusted_precision:.2%} (same as above)")
        print(f"Adjusted Recall: {adjusted_recall:.2%} (failed vulnerable files counted as missed)")
        print(f"Adjusted F1 Score: {adjusted_f1:.2%}")

    else:
        print("\nNo successful analyses to calculate detailed metrics.")

    # Failure analysis by file type
    if failed_analyses > 0:
        print("\nFailure Analysis:")
        print(f"Failed vulnerable file scans: {failed_vulnerable}")
        print(f"Failed safe file scans: {failed_safe}")
        print(f"Failure rate on vulnerable files: {failed_vulnerable/total_vulnerable:.2%}" if total_vulnerable > 0 else "No vulnerable files")
        print(f"Failure rate on safe files: {failed_safe/total_safe:.2%}" if total_safe > 0 else "No safe files")

else:
    print("\nNo files analyzed.")

In [None]:
# Persist the per-file results, then a one-row summary of the metrics that the
# metrics cell left in module-level variables.
output_file = "llm_benchmark_results.csv"
df_results.to_csv(output_file, index=False)
print(f"Results saved to: {output_file}")

# The summary needs values that only exist when at least one scan succeeded.
if not (successful_analyses > 0):
    print("No successful analyses to save summary metrics.")
else:
    # Keyword order fixes the column order of the summary CSV.
    summary = dict(
        total_files=total_files,
        successful_analyses=successful_analyses,
        failed_analyses=failed_analyses,
        llm_model=', '.join(models_used),
        accuracy_on_successful=accuracy_on_successful,
        overall_accuracy=overall_accuracy,
        precision=precision,
        recall=recall,
        f1_score=f1_score,
        adjusted_precision=adjusted_precision,
        adjusted_recall=adjusted_recall,
        adjusted_f1=adjusted_f1,
        true_positives=true_positives,
        true_negatives=true_negatives,
        false_positives=false_positives,
        false_negatives=false_negatives,
        failed_vulnerable=failed_vulnerable,
        failed_safe=failed_safe,
        total_scan_time_seconds=total_time,
        average_scan_time_seconds=avg_time,
        min_scan_time_seconds=min_time,
        max_scan_time_seconds=max_time,
    )

    summary_df = pd.DataFrame([summary])
    summary_df.to_csv("llm_benchmark_summary.csv", index=False)
    print("Summary metrics saved to: llm_benchmark_summary.csv")

In [None]:
# Debug LLM Input/Output
def debug_llm_example(file_path=None):
    """
    Show what the LLM receives and returns for one file, for debugging purposes.

    Prints the file content, a locally rendered copy of the prompt, the raw
    response of the module-level ``benchmark_chain`` and how that response
    would be parsed into a boolean. The expected label is inferred from the
    path: any path containing "vulnerable" (case-insensitive) is expected to
    be vulnerable.

    Args:
        file_path (str, optional): Path to a specific file to analyze. If None,
            uses the first file (in sorted path order) found under
            train/patched, then train/vulnerable.

    Returns:
        None. All output is printed.
    """
    print("="*60)
    print("LLM INPUT/OUTPUT DEBUG EXAMPLE")
    print("="*60)

    # Find a file to analyze if none provided
    if file_path is None:
        for directory in ["train/patched", "train/vulnerable"]:
            if os.path.exists(directory):
                # Sorted so the chosen example does not depend on the order in
                # which the filesystem lists entries.
                python_files = sorted(Path(directory).glob("**/*.py"))
                if python_files:
                    file_path = str(python_files[0])
                    break

        if file_path is None:
            print("No Python files found in train/patched or train/vulnerable directories")
            return

    # Single source of truth for the expected label, used both for the
    # "File type" line and for the final comparison.
    expected_result = "vulnerable" in file_path.lower()

    print(f"Analyzing file: {file_path}")
    print(f"File type: {'VULNERABLE' if expected_result else 'SAFE'}")
    print("-" * 60)

    # Only the load itself is reported as a file-loading error.
    try:
        python_code = load_python_file(file_path)
    except Exception as e:
        print(f"FILE LOADING ERROR: {str(e)}")
        return

    print("PYTHON CODE CONTENT:")
    print("-" * 30)
    print(python_code)
    print("-" * 30)

    # Copy of the template used by create_benchmark_chain(); the two must be
    # kept in sync by hand.
    benchmark_template = """<system>You are a security vulnerability analyzer for Python code.</system>

<user>
Examples of security vulnerabilities include:
- Unsafe use of untrusted input (e.g., from `request.args`, `request.form`, environment variables, external files)
- Dangerous function calls (e.g., `eval`, `exec`, `os.system`, `subprocess.run`, `pickle.loads`, `yaml.load`)
- Insecure file handling (e.g., `open` or `send_file` with user-controlled paths)
- Cryptographic mistakes (e.g., hardcoded keys, insecure algorithms)
- Web-specific issues (e.g., Cross-Site Scripting (XSS), CSRF vulnerabilities, Open Redirects)
- Hardcoded secrets (e.g., API keys, passwords, tokens)
- Misconfigurations (e.g., exposing debug mode, bad CORS policies)

Analyze this Python code for security vulnerabilities:

```python
{python_code}
```

Return your analysis as a single boolean value: True if vulnerable and False if safe
```

</user>
"""

    # Rendered locally for display only; the chain formats its own prompt.
    # NOTE(review): the text the model actually receives may differ in
    # wrapping from this rendering - confirm against the chain if it matters.
    full_prompt = benchmark_template.format(python_code=python_code)

    print("FULL PROMPT SENT TO LLM:")
    print("-" * 30)
    print(full_prompt)
    print("-" * 30)

    # Get LLM response
    if benchmark_chain is None:
        print("ERROR: No LLM chain available")
        return

    print("Sending to LLM...")
    start_time = time.time()

    try:
        response = benchmark_chain.invoke({"python_code": python_code})
    except Exception as e:
        print(f"LLM ERROR: {str(e)}")
        return
    scan_time = time.time() - start_time

    print(f"LLM RESPONSE (took {scan_time:.3f}s):")
    print("-" * 30)
    print(f"Raw response: {repr(response)}")
    print(f"Response type: {type(response)}")
    print(f"Response content: {response}")
    print("-" * 30)

    # Show how the response gets parsed (same rules as the benchmark function:
    # exact "true"/"false" first, then an unambiguous substring match).
    if isinstance(response, bool):
        parsed_result, reason = response, "direct boolean"
    elif isinstance(response, str):
        response_lower = response.strip().lower()
        has_true = "true" in response_lower
        has_false = "false" in response_lower
        if response_lower == "true":
            parsed_result, reason = True, "string 'true'"
        elif response_lower == "false":
            parsed_result, reason = False, "string 'false'"
        elif has_true and not has_false:
            parsed_result, reason = True, "contains 'true'"
        elif has_false and not has_true:
            parsed_result, reason = False, "contains 'false'"
        else:
            parsed_result, reason = None, "ambiguous response"
    else:
        parsed_result, reason = None, "unexpected type"
    print(f"PARSED RESULT: {parsed_result} ({reason})")

    # Show expected vs actual
    print(f"EXPECTED RESULT: {expected_result}")

    if parsed_result is not None:
        match = "✓ CORRECT" if parsed_result == expected_result else "✗ INCORRECT"
        print(f"MATCH: {match}")
    else:
        print("MATCH: ✗ FAILED TO PARSE")

# Run the debug example
print("Running LLM debug example...")
debug_llm_example()