In [1]:
# Let's look at an actual gold patch
from datasets import load_dataset

# Fetch the SWE-bench Lite test split and pick its first entry to inspect
dataset = load_dataset('princeton-nlp/SWE-bench_Lite', split='test')
instance = dataset[0]  # index 0 -> first instance of the split

# Identify the instance
print(f"Instance: {instance['instance_id']}")
print(f"Repository: {instance['repo']}")

# Short preview of the issue text
problem_text = instance['problem_statement']
print(f"\nProblem Statement (first 200 chars):")
print(problem_text[:200] + "...")

# Preview of the gold patch, followed by its full length
gold_patch = instance['patch']
print(f"\nGold Patch (first 500 chars):")
print(gold_patch[:500] + "...")
print(f"\nThis patch has {len(gold_patch)} total characters")

  from .autonotebook import tqdm as notebook_tqdm


Instance: astropy__astropy-12907
Repository: astropy/astropy

Problem Statement (first 200 chars):
Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
Consider the following model:

```python
from astropy.modeling import models as m
from astropy.mo...

Gold Patch (first 500 chars):
diff --git a/astropy/modeling/separable.py b/astropy/modeling/separable.py
--- a/astropy/modeling/separable.py
+++ b/astropy/modeling/separable.py
@@ -242,7 +242,7 @@ def _cstack(left, right):
         cright = _coord_matrix(right, 'right', noutp)
     else:
         cright = np.zeros((noutp, right.shape[1]))
-        cright[-right.shape[0]:, -right.shape[1]:] = 1
+        cright[-right.shape[0]:, -right.shape[1]:] = right
 
     return np.hstack([cleft, cright])
 
...

This patch has 470 total characters


In [2]:
# First, let's check what instances are available in SWE-bench_Lite
from datasets import load_dataset

# Load SWE-bench_Lite dataset
dataset = load_dataset('princeton-nlp/SWE-bench_Lite', split='test')

# Imported here so this cell still runs on its own
from itertools import islice

# How many leading instances to list
PREVIEW_COUNT = 10

print(f"Total instances in SWE-bench_Lite: {len(dataset)}")
print(f"\nFirst {PREVIEW_COUNT} instances:")
# islice stops the iteration after PREVIEW_COUNT rows, so the remaining rows of
# the split are never visited just to be skipped.
for position, instance in enumerate(islice(dataset, PREVIEW_COUNT), start=1):
    print(f"{position}. {instance['instance_id']}")
    print(f"   Repo: {instance['repo']}")
    print(f"   Problem: {instance['problem_statement'][:80]}...")
    print()

Total instances in SWE-bench_Lite: 300

First 10 instances:
1. astropy__astropy-12907
   Repo: astropy/astropy
   Problem: Modeling's `separability_matrix` does not compute separability correctly for nes...

2. astropy__astropy-14182
   Repo: astropy/astropy
   Problem: Please support header rows in RestructuredText output
### Description

It woul...

3. astropy__astropy-14365
   Repo: astropy/astropy
   Problem: ascii.qdp Table format assumes QDP commands are upper case
### Description

asci...

4. astropy__astropy-14995
   Repo: astropy/astropy
   Problem: In v5.3, NDDataRef mask propagation fails when one of the operand does not have ...

5. astropy__astropy-6938
   Repo: astropy/astropy
   Problem: Possible bug in io.fits related to D exponents
I came across the following code ...

6. astropy__astropy-7746
   Repo: astropy/astropy
   Problem: Issue when passing empty lists/arrays to WCS transformations
The following shoul...

7. django__django-10914
   Repo: django/django
   Proble

## Example 1: Run evaluation with gold predictions (actual fixes) for 1 instance

In [None]:
import subprocess
import json
from pathlib import Path

# Run evaluation for a single instance using gold predictions
# Gold predictions are the actual human-written fixes
def run_single_instance_gold(instance_id, modal=False):
    """Run the SWE-bench evaluation harness on the gold patch of one instance.

    Starts ``swebench.harness.run_evaluation`` as a child process with
    ``--predictions_path gold`` (the human-written fix), waits for it to
    finish, and prints the last 500 characters of its stdout on success or of
    its stderr on failure.

    Args:
        instance_id: Instance identifier, e.g. ``"astropy__astropy-12907"``.
            It is also embedded in the run id (``notebook_test_<instance_id>``).
        modal: When true, ``--modal true`` is appended to the command line.

    Returns:
        The ``subprocess.CompletedProcess`` of the harness run, with stdout
        and stderr captured as text.
    """
    import sys  # local import so this cell does not depend on another cell for it

    cmd = [
        # Use the interpreter running this notebook rather than whatever
        # "python" resolves to on PATH, which may be a different environment
        # without swebench installed.
        sys.executable, "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        "--predictions_path", "gold",
        "--instance_ids", instance_id,
        "--run_id", f"notebook_test_{instance_id}",
        "--max_workers", "1",
        "--timeout", "300"
    ]

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Success!")
        print(result.stdout[-500:])  # Last 500 chars
    else:
        print("Error!")
        print(result.stderr[-500:])

    return result

# Example: Run one instance locally (WARNING: requires Docker)
# Uncomment to run locally:
# result = run_single_instance_gold("astropy__astropy-12907", modal=False)

# Example: Run one instance on Modal (recommended)
# Uncomment to run on Modal:
# result = run_single_instance_gold("astropy__astropy-12907", modal=True)

## Example 2: Run evaluation for multiple instances (2-3 examples)

In [None]:
# Run evaluation for multiple instances
def run_multiple_instances_gold(instance_ids, modal=False):
    """Run the SWE-bench evaluation harness on the gold patches of several instances.

    Starts ``swebench.harness.run_evaluation`` as a child process with
    ``--predictions_path gold`` and waits for it to finish. On success it
    prints those of the last 20 stdout lines that mention ``resolved``,
    ``completed`` or ``Resolution``; if none match, it prints the last 500
    characters of stdout instead. On failure it prints the last 500 characters
    of stderr.

    Args:
        instance_ids: List of instance identifiers; each becomes a separate
            value after ``--instance_ids``. The list length is embedded in the
            run id (``notebook_test_multiple_<count>``).
        modal: When true, ``--modal true`` is appended to the command line.

    Returns:
        The ``subprocess.CompletedProcess`` of the harness run, with stdout
        and stderr captured as text.
    """
    import sys  # local import so this cell does not depend on another cell for it

    cmd = [
        # Use the interpreter running this notebook rather than whatever
        # "python" resolves to on PATH.
        sys.executable, "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        "--predictions_path", "gold",
        "--instance_ids", *instance_ids,  # Unpack the list
        "--run_id", f"notebook_test_multiple_{len(instance_ids)}",
        "--max_workers", "4",
        "--timeout", "600"
    ]

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running {len(instance_ids)} instances...")
    print(f"Command: {' '.join(cmd[:10])}...")  # Show first part of command

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Success!")
        # Pick the summary-looking lines out of the tail of the output
        keywords = ('resolved', 'completed', 'Resolution')
        tail_lines = result.stdout.split('\n')[-20:]  # Last 20 lines
        summary_lines = [
            line for line in tail_lines
            if any(keyword in line for keyword in keywords)
        ]
        if summary_lines:
            for line in summary_lines:
                print(line)
        else:
            # Nothing recognisable: show the raw tail rather than printing nothing
            print(result.stdout[-500:])
    else:
        print("Error!")
        print(result.stderr[-500:])

    return result

# Example: Run 3 instances on Modal
# Instance IDs passed to the multi-instance example
three_instances = [
    "astropy__astropy-12907",
    "django__django-11099",
    "matplotlib__matplotlib-23913",
]

# Uncomment to run:
# result = run_multiple_instances_gold(three_instances, modal=True)

## Example 3: Create and run custom predictions

In [None]:
# Create a custom predictions file
def create_custom_predictions(instance_ids, patches, output_file="custom_predictions.jsonl"):
    """
    Create a predictions file with custom patches

    One JSON object is written per line (JSONL), each with the keys
    ``instance_id``, ``model_name_or_path`` (always ``"custom-notebook-test"``)
    and ``model_patch``.

    Args:
        instance_ids: List of instance IDs
        patches: List of patch strings (diff format), matched to
            ``instance_ids`` by position. ``None`` (for the whole list or for
            a single entry) and missing trailing entries are written as an
            empty patch ``""``.
        output_file: Where to save the predictions (overwritten if it exists)

    Returns:
        ``output_file``, unchanged.
    """
    # Treat "no patches supplied" like an empty list so every instance gets ""
    patches = patches if patches is not None else []
    predictions = []

    for i, instance_id in enumerate(instance_ids):
        patch = patches[i] if i < len(patches) else ""
        prediction = {
            "instance_id": instance_id,
            "model_name_or_path": "custom-notebook-test",
            # A None entry means "empty patch"; write "" rather than JSON null
            "model_patch": patch if patch is not None else ""
        }
        predictions.append(prediction)

    # Write as JSONL
    with open(output_file, 'w', encoding='utf-8') as f:
        for pred in predictions:
            f.write(json.dumps(pred) + '\n')

    print(f"Created predictions file: {output_file}")
    print(f"Contains {len(predictions)} predictions")
    return output_file

# Example: Create empty patches (will fail) for testing
test_instances = [
    "astropy__astropy-12907",
    "django__django-11099",
]
# One empty patch per instance - these will fail
empty_patches = ["" for _ in test_instances]

# Create the predictions file
# predictions_file = create_custom_predictions(test_instances, empty_patches)

# Run evaluation with custom predictions
def run_custom_predictions(predictions_file, instance_ids=None, modal=False):
    """Run the SWE-bench evaluation harness on a custom predictions file.

    Starts ``swebench.harness.run_evaluation`` as a child process under the
    fixed run id ``notebook_custom_test``, waits for it to finish, and prints
    the last 500 characters of its stdout on success or of its stderr on
    failure.

    Args:
        predictions_file: Path to the predictions file (string or path-like).
        instance_ids: Optional list of instance IDs; when non-empty they are
            passed after ``--instance_ids``, otherwise the flag is omitted.
        modal: When true, ``--modal true`` is appended to the command line.

    Returns:
        The ``subprocess.CompletedProcess`` of the harness run, with stdout
        and stderr captured as text.
    """
    import sys  # local import so this cell does not depend on another cell for it

    cmd = [
        # Use the interpreter running this notebook rather than whatever
        # "python" resolves to on PATH.
        sys.executable, "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        # str() so a pathlib.Path is accepted the same way as a plain string
        "--predictions_path", str(predictions_file),
        "--run_id", "notebook_custom_test",
        "--max_workers", "2",
        "--timeout", "300"
    ]

    if instance_ids:
        cmd.extend(["--instance_ids", *instance_ids])

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running custom predictions from {predictions_file}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Evaluation completed!")
        print(result.stdout[-500:])
    else:
        print("Error during evaluation!")
        print(result.stderr[-500:])

    return result

# Uncomment to create and run:
# predictions_file = create_custom_predictions(test_instances, empty_patches)
# result = run_custom_predictions(predictions_file, modal=True)

## Example 4: Check evaluation results

In [None]:
# Check evaluation results after running
import os
from pathlib import Path

def _count_test_outcomes(tests_status):
    """Return ``(passed, total)`` for a report's ``tests_status`` mapping.

    Two value shapes are handled:
    - a plain status value per test, counted as passed when it equals ``'PASSED'``;
    - a nested mapping with ``'success'`` / ``'failure'`` lists per category,
      where every listed test counts once and ``'success'`` entries count as passed.
      NOTE(review): the nested shape is assumed to be what the SWE-bench harness
      writes - confirm against a real report.json.
    """
    passed = 0
    total = 0
    for outcome in tests_status.values():
        if isinstance(outcome, dict):
            succeeded = len(outcome.get('success') or [])
            failed = len(outcome.get('failure') or [])
            passed += succeeded
            total += succeeded + failed
        else:
            total += 1
            if outcome == 'PASSED':
                passed += 1
    return passed, total


def check_evaluation_results(run_id, logs_root="./run_evaluation_logs"):
    """Check the results of an evaluation run

    Looks for every ``report.json`` below ``<logs_root>/<run_id>`` and prints,
    per instance found in each report, its ``resolved`` value and - when the
    entry has a ``tests_status`` field - how many tests passed.

    Args:
        run_id: Run identifier used when the evaluation was started.
        logs_root: Directory that contains one sub-directory per run id.
            NOTE(review): some harness versions may write logs elsewhere
            (e.g. ``logs/run_evaluation``) - confirm for the installed version
            and pass the matching path.

    Returns:
        None. All findings are printed.
    """
    results_dir = Path(logs_root) / run_id

    if not results_dir.exists():
        print(f"No results found for run_id: {run_id}")
        return

    print(f"Results directory: {results_dir}")

    # Look for report files (sorted so the listing order is stable)
    for report_file in sorted(results_dir.rglob("report.json")):
        print(f"\nFound report: {report_file}")
        try:
            with open(report_file, encoding="utf-8") as f:
                report = json.load(f)
        except (OSError, json.JSONDecodeError) as exc:
            # One unreadable report should not hide the remaining ones
            print(f"  Could not read report: {exc}")
            continue

        for instance_id, result in report.items():
            print(f"  Instance: {instance_id}")
            print(f"  Resolved: {result.get('resolved', 'N/A')}")
            if 'tests_status' in result:
                passed, total = _count_test_outcomes(result['tests_status'])
                print(f"  Tests: {passed}/{total} passed")

# Example usage (after running an evaluation):
# check_evaluation_results("notebook_test_astropy__astropy-12907")