In [12]:
# Let's look at an actual gold patch
from datasets import load_dataset


def _preview(text, limit):
    """Return at most `limit` characters of `text`, adding "..." only when text was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


# Load the test split and use its first instance as the worked example
dataset = load_dataset('princeton-nlp/SWE-bench_Lite', split='test')
instance = dataset[0]  # First instance

print(f"Instance: {instance['instance_id']}")
print(f"Repository: {instance['repo']}")
print("\nProblem Statement (first 200 chars):")
print(_preview(instance['problem_statement'], 200))
print("\nGold Patch (first 500 chars):")
# A patch shorter than the limit is printed whole, without a misleading trailing "..."
print(_preview(instance['patch'], 500))
print(f"\nThis patch has {len(instance['patch'])} total characters")

Instance: astropy__astropy-12907
Repository: astropy/astropy

Problem Statement (first 200 chars):
Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
Consider the following model:

```python
from astropy.modeling import models as m
from astropy.mo...

Gold Patch (first 500 chars):
diff --git a/astropy/modeling/separable.py b/astropy/modeling/separable.py
--- a/astropy/modeling/separable.py
+++ b/astropy/modeling/separable.py
@@ -242,7 +242,7 @@ def _cstack(left, right):
         cright = _coord_matrix(right, 'right', noutp)
     else:
         cright = np.zeros((noutp, right.shape[1]))
-        cright[-right.shape[0]:, -right.shape[1]:] = 1
+        cright[-right.shape[0]:, -right.shape[1]:] = right
 
     return np.hstack([cleft, cright])
 
...

This patch has 470 total characters


## Repository Analysis - What repos are in SWE-bench_Lite?

In [15]:
# Analyze repositories in SWE-bench_Lite
from collections import Counter
import pandas as pd

# Tally how many instances each repository contributes
all_repos = [entry['repo'] for entry in dataset]
repo_counts = Counter(all_repos)

# Headline numbers
print(f"Total instances in SWE-bench_Lite: {len(dataset)}")
print(f"Number of unique repositories: {len(repo_counts)}\n")

# One record per repository, most frequent first, each with its GitHub URL
repo_data = [
    {
        'Repository': name,
        'Instances': n_instances,
        'GitHub Link': f"https://github.com/{name.replace('__', '/')}",
    }
    for name, n_instances in repo_counts.most_common()
]

df = pd.DataFrame(repo_data)

# Plain-text breakdown, one line per repository
print("Repository breakdown:")
print("-" * 50)
for record in repo_data:
    print(f"{record['Repository']:<30} {record['Instances']:>3} bugs  →  {record['GitHub Link']}")

# Extremes and the mean; df is ordered by descending count, so first/last rows are max/min
busiest = df.iloc[0]
quietest = df.iloc[-1]
print(f"\nMost bugs: {busiest['Repository']} ({busiest['Instances']} bugs)")
print(f"Least bugs: {quietest['Repository']} ({quietest['Instances']} bugs)")
print(f"Average bugs per repo: {df['Instances'].mean():.1f}")

Total instances in SWE-bench_Lite: 300
Number of unique repositories: 12

Repository breakdown:
--------------------------------------------------
django/django                  114 bugs  →  https://github.com/django/django
sympy/sympy                     77 bugs  →  https://github.com/sympy/sympy
matplotlib/matplotlib           23 bugs  →  https://github.com/matplotlib/matplotlib
scikit-learn/scikit-learn       23 bugs  →  https://github.com/scikit-learn/scikit-learn
pytest-dev/pytest               17 bugs  →  https://github.com/pytest-dev/pytest
sphinx-doc/sphinx               16 bugs  →  https://github.com/sphinx-doc/sphinx
astropy/astropy                  6 bugs  →  https://github.com/astropy/astropy
psf/requests                     6 bugs  →  https://github.com/psf/requests
pylint-dev/pylint                6 bugs  →  https://github.com/pylint-dev/pylint
pydata/xarray                    5 bugs  →  https://github.com/pydata/xarray
mwaskom/seaborn                  4 bugs  →  https://

In [None]:
# Create a clickable table with GitHub links
from IPython.display import HTML, display

# Repo descriptions (these are well-known Python projects).
# Keys use the "owner__repo" spelling that prefixes SWE-bench instance IDs.
descriptions = {
    'django__django': 'The web framework for perfectionists with deadlines',
    'sympy__sympy': 'A Python library for symbolic mathematics',
    'scikit-learn__scikit-learn': 'Machine learning in Python',
    'matplotlib__matplotlib': 'Comprehensive library for creating static, animated, and interactive visualizations',
    'requests__requests': 'A simple, yet elegant HTTP library',
    'flask__flask': 'A lightweight WSGI web application framework',
    'pytest-dev__pytest': 'The pytest framework makes it easy to write small tests',
    'pandas-dev__pandas': 'Flexible and powerful data analysis/manipulation library',
    'pydata__xarray': 'N-D labeled arrays and datasets in Python',
    'pylint-dev__pylint': 'A static code analyser for Python',
    'psf__requests': 'Python HTTP Requests for Humans',
    'sphinx-doc__sphinx': 'Python documentation generator',
    'pallets__flask': 'The Python micro framework for building web applications',
    'astropy__astropy': 'Astronomy and astrophysics library for Python',
    'mwaskom__seaborn': 'Statistical data visualization'
}

# Header row first; one <tr> per repository is appended below
html_rows = [
    "<tr style='background-color: #f2f2f2;'>"
    "<th style='padding: 8px; text-align: left;'>Repository</th>"
    "<th style='padding: 8px; text-align: center;'>Bugs</th>"
    "<th style='padding: 8px; text-align: left;'>GitHub</th>"
    "<th style='padding: 8px; text-align: left;'>Description</th>"
    "</tr>"
]

for _, row in df.iterrows():
    # df['Repository'] holds "owner/repo" (the dataset's `repo` field) while
    # `descriptions` is keyed "owner__repo". Convert before the lookup, otherwise
    # every row falls through to the generic default.
    desc = descriptions.get(row['Repository'].replace('/', '__'), 'Python library')
    html_rows.append(
        "<tr>"
        f"<td style='padding: 8px;'><b>{row['Repository']}</b></td>"
        f"<td style='padding: 8px; text-align: center;'>{row['Instances']}</td>"
        f"<td style='padding: 8px;'><a href='{row['GitHub Link']}' target='_blank'>View on GitHub →</a></td>"
        f"<td style='padding: 8px; color: #666;'>{desc}</td>"
        "</tr>"
    )

html_table = (
    "<table style='width:100%; border-collapse: collapse;'>"
    + "".join(html_rows)
    + "</table>"
)

# Display the table
display(HTML(html_table))

# Also save as DataFrame for further analysis
repo_df = df

In [None]:
# Find instances from a specific repository
def find_instances_by_repo(repo_name):
    """Collect and summarise dataset instances whose repo name contains `repo_name`.

    Matching is a case-insensitive substring test against each instance's
    'repo' field in the module-level `dataset`. Prints the first five matches
    (ID, creation date, first 100 characters of the problem statement) and
    returns the full list of match dicts.
    """
    matches = [
        {
            'instance_id': record['instance_id'],
            'created_at': record['created_at'],
            'problem': record['problem_statement'][:100] + '...',
        }
        for record in dataset
        if repo_name.lower() in record['repo'].lower()
    ]

    if not matches:
        print(f"No instances found for '{repo_name}'")
        return matches

    print(f"Found {len(matches)} instances for '{repo_name}':\n")
    # Only the first five are printed; the rest are summarised as a count
    for position, match in enumerate(matches[:5], start=1):
        print(f"{position}. {match['instance_id']}")
        print(f"   Date: {match['created_at']}")
        print(f"   Issue: {match['problem']}\n")

    hidden = len(matches) - 5
    if hidden > 0:
        print(f"... and {hidden} more")

    return matches

# Example: Find all Django bugs
# django_bugs = find_instances_by_repo('django')

# Example: Find all matplotlib bugs  
# matplotlib_bugs = find_instances_by_repo('matplotlib')

In [2]:
# First, let's check what instances are available in SWE-bench_Lite
from datasets import load_dataset

# Load SWE-bench_Lite dataset
dataset = load_dataset('princeton-nlp/SWE-bench_Lite', split='test')

# Show first 10 instances
print(f"Total instances in SWE-bench_Lite: {len(dataset)}")
print("\nFirst 10 instances:")
for position, instance in enumerate(dataset, start=1):
    if position > 10:
        # Stop here instead of iterating the remaining rows only to skip them
        break
    print(f"{position}. {instance['instance_id']}")
    print(f"   Repo: {instance['repo']}")
    print(f"   Problem: {instance['problem_statement'][:80]}...")
    print()

Total instances in SWE-bench_Lite: 300

First 10 instances:
1. astropy__astropy-12907
   Repo: astropy/astropy
   Problem: Modeling's `separability_matrix` does not compute separability correctly for nes...

2. astropy__astropy-14182
   Repo: astropy/astropy
   Problem: Please support header rows in RestructuredText output
### Description

It woul...

3. astropy__astropy-14365
   Repo: astropy/astropy
   Problem: ascii.qdp Table format assumes QDP commands are upper case
### Description

asci...

4. astropy__astropy-14995
   Repo: astropy/astropy
   Problem: In v5.3, NDDataRef mask propagation fails when one of the operand does not have ...

5. astropy__astropy-6938
   Repo: astropy/astropy
   Problem: Possible bug in io.fits related to D exponents
I came across the following code ...

6. astropy__astropy-7746
   Repo: astropy/astropy
   Problem: Issue when passing empty lists/arrays to WCS transformations
The following shoul...

7. django__django-10914
   Repo: django/django
   Proble

## Example 1: Run evaluation with gold predictions (actual fixes) for 1 instance

In [None]:
import subprocess
import json
from pathlib import Path

# Run evaluation for a single instance using gold predictions
# Gold predictions are the actual human-written fixes
def run_single_instance_gold(instance_id, modal=False):
    """Evaluate the gold (reference) patch for a single SWE-bench_Lite instance.

    Runs ``swebench.harness.run_evaluation`` as a subprocess with
    ``--predictions_path gold``, one worker and a 300 second timeout, then
    prints the tail of its output.

    Args:
        instance_id: Instance to evaluate, e.g. "astropy__astropy-12907".
            Also used to build the run ID ``notebook_test_<instance_id>``.
        modal: When true, ``--modal true`` is appended to the command.

    Returns:
        The ``subprocess.CompletedProcess`` from the run (stdout/stderr captured as text).
    """
    import sys  # local import so this cell does not depend on another cell's imports

    cmd = [
        # Use the interpreter running this kernel. A bare "python" is resolved via PATH
        # and may belong to an environment where swebench is not installed.
        sys.executable or "python", "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        "--predictions_path", "gold",
        "--instance_ids", instance_id,
        "--run_id", f"notebook_test_{instance_id}",
        "--max_workers", "1",
        "--timeout", "300"
    ]

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Success!")
        print(result.stdout[-500:])  # Last 500 chars
    else:
        print("Error!")
        # Fall back to stdout when stderr is empty so a failure never prints nothing
        print((result.stderr or result.stdout)[-500:])

    return result

# Example: Run one instance locally (WARNING: requires Docker)
# Uncomment to run locally:
# result = run_single_instance_gold("astropy__astropy-12907", modal=False)

# Example: Run one instance on Modal (recommended)
# Uncomment to run on Modal:
# result = run_single_instance_gold("astropy__astropy-12907", modal=True)

## Example 2: Run evaluation for multiple instances (2-3 examples)

In [None]:
# Run evaluation for multiple instances
def run_multiple_instances_gold(instance_ids, modal=False):
    """Evaluate the gold (reference) patches for several SWE-bench_Lite instances.

    Runs ``swebench.harness.run_evaluation`` as a subprocess with
    ``--predictions_path gold``, four workers and a 600 second timeout.

    Args:
        instance_ids: Iterable of instance IDs; all are passed after ``--instance_ids``.
            The run ID is ``notebook_test_multiple_<number of IDs>``.
        modal: When true, ``--modal true`` is appended to the command.

    Returns:
        The ``subprocess.CompletedProcess`` from the run (stdout/stderr captured as text).
    """
    import sys  # local import so this cell does not depend on another cell's imports

    instance_ids = list(instance_ids)  # accept any iterable; len() is needed below

    cmd = [
        # Use the interpreter running this kernel. A bare "python" is resolved via PATH
        # and may belong to an environment where swebench is not installed.
        sys.executable or "python", "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        "--predictions_path", "gold",
        "--instance_ids", *instance_ids,  # Unpack the list
        "--run_id", f"notebook_test_multiple_{len(instance_ids)}",
        "--max_workers", "4",
        "--timeout", "600"
    ]

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running {len(instance_ids)} instances...")
    print(f"Command: {' '.join(cmd[:10])}...")  # Show first part of command

    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Success!")
        # Surface summary-looking lines from the last 20 lines of output
        keywords = ('resolved', 'completed', 'Resolution')
        summary_lines = [
            line
            for line in result.stdout.split('\n')[-20:]
            if any(keyword in line for keyword in keywords)
        ]
        if summary_lines:
            print('\n'.join(summary_lines))
        else:
            # Nothing matched: show the raw tail instead of printing nothing
            print(result.stdout[-500:])
    else:
        print("Error!")
        # Fall back to stdout when stderr is empty so a failure never prints nothing
        print((result.stderr or result.stdout)[-500:])

    return result

# Example: three instances (one each from astropy, Django and matplotlib)
# to evaluate together, e.g. on Modal
three_instances = [
    "astropy__astropy-12907",        # astropy/astropy
    "django__django-11099",          # django/django
    "matplotlib__matplotlib-23913",  # matplotlib/matplotlib
]

# Uncomment to run:
# result = run_multiple_instances_gold(three_instances, modal=True)

## Example 3: Create and run custom predictions

In [None]:
# Create a custom predictions file
def create_custom_predictions(instance_ids, patches, output_file="custom_predictions.jsonl"):
    """
    Create a JSONL predictions file with custom patches.

    One JSON object is written per instance, with the keys "instance_id",
    "model_name_or_path" (always "custom-notebook-test") and "model_patch".

    Args:
        instance_ids: List of instance IDs
        patches: List of patch strings (diff format), matched to `instance_ids`
            by position. May be None, shorter than `instance_ids`, or contain
            None entries; every missing patch is written as an empty string.
        output_file: Where to save the predictions (overwritten if it exists)

    Returns:
        `output_file`, unchanged, so it can be passed straight to an evaluation run.
    """
    # The contract allows None; previously len(None) raised TypeError here
    patches = [] if patches is None else list(patches)

    predictions = []
    for index, instance_id in enumerate(instance_ids):
        patch = patches[index] if index < len(patches) else None
        predictions.append({
            "instance_id": instance_id,
            "model_name_or_path": "custom-notebook-test",
            # Normalise a missing patch to "" so the field is always a string
            "model_patch": "" if patch is None else patch,
        })

    # Write as JSONL: one prediction per line
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(pred) + '\n' for pred in predictions)

    print(f"Created predictions file: {output_file}")
    print(f"Contains {len(predictions)} predictions")
    return output_file

# Example: pair two instances with empty patches for a smoke test (these will fail)
test_instances = [
    "astropy__astropy-12907",
    "django__django-11099",
]
empty_patches = [""] * len(test_instances)  # One empty patch per instance

# Create the predictions file
# predictions_file = create_custom_predictions(test_instances, empty_patches)

# Run evaluation with custom predictions
def run_custom_predictions(predictions_file, instance_ids=None, modal=False):
    """Evaluate a custom predictions file against SWE-bench_Lite.

    Runs ``swebench.harness.run_evaluation`` as a subprocess with run ID
    "notebook_custom_test", two workers and a 300 second timeout, then prints
    the tail of its output.

    Args:
        predictions_file: Path (str or path-like) passed as ``--predictions_path``.
        instance_ids: Optional list of instance IDs; when given and non-empty they
            are passed after ``--instance_ids``, otherwise the flag is omitted.
        modal: When true, ``--modal true`` is appended to the command.

    Returns:
        The ``subprocess.CompletedProcess`` from the run (stdout/stderr captured as text).
    """
    import sys  # local import so this cell does not depend on another cell's imports

    cmd = [
        # Use the interpreter running this kernel. A bare "python" is resolved via PATH
        # and may belong to an environment where swebench is not installed.
        sys.executable or "python", "-m", "swebench.harness.run_evaluation",
        "--dataset_name", "princeton-nlp/SWE-bench_Lite",
        "--predictions_path", str(predictions_file),  # str() so Path objects work too
        "--run_id", "notebook_custom_test",
        "--max_workers", "2",
        "--timeout", "300"
    ]

    if instance_ids:
        cmd.extend(["--instance_ids", *instance_ids])

    if modal:
        cmd.extend(["--modal", "true"])

    print(f"Running custom predictions from {predictions_file}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode == 0:
        print("Evaluation completed!")
        print(result.stdout[-500:])
    else:
        print("Error during evaluation!")
        # Fall back to stdout when stderr is empty so a failure never prints nothing
        print((result.stderr or result.stdout)[-500:])

    return result

# Uncomment to create and run:
# predictions_file = create_custom_predictions(test_instances, empty_patches)
# result = run_custom_predictions(predictions_file, modal=True)

## Example 4: Check evaluation results

In [None]:
# Check evaluation results after running
import os
from pathlib import Path

def _count_passed_tests(tests_status):
    """Return ``(passed, total)`` for one report entry's ``tests_status`` mapping."""
    passed = total = 0
    for status in tests_status.values():
        if isinstance(status, dict):
            # Nested layout: each category maps to {"success": [...], "failure": [...]}.
            # NOTE(review): this appears to be what the SWE-bench harness writes — confirm.
            succeeded = len(status.get('success') or [])
            failed = len(status.get('failure') or [])
            passed += succeeded
            total += succeeded + failed
        else:
            # Flat layout: test name -> status string
            total += 1
            if status == 'PASSED':
                passed += 1
    return passed, total


def check_evaluation_results(run_id, logs_root=None):
    """Print a summary of every report.json found for an evaluation run.

    Args:
        run_id: Run identifier; reports are searched recursively under
            ``<logs_root>/<run_id>``.
        logs_root: Directory that contains the per-run folders. When None,
            "./run_evaluation_logs" is tried first, then "./logs/run_evaluation".
            NOTE(review): recent SWE-bench versions appear to write under the
            latter — confirm for the installed version.

    Returns:
        None. Results are printed; a message is printed when no run directory exists.
    """
    if logs_root is None:
        candidate_roots = [Path("./run_evaluation_logs"), Path("./logs/run_evaluation")]
    else:
        candidate_roots = [Path(logs_root)]

    results_dir = next(
        (root / run_id for root in candidate_roots if (root / run_id).exists()),
        None,
    )
    if results_dir is None:
        print(f"No results found for run_id: {run_id}")
        return

    print(f"Results directory: {results_dir}")

    # Look for report files anywhere below the run directory
    for report_file in results_dir.rglob("report.json"):
        print(f"\nFound report: {report_file}")
        with open(report_file, encoding='utf-8') as f:
            report = json.load(f)
        for instance_id, result in report.items():
            print(f"  Instance: {instance_id}")
            print(f"  Resolved: {result.get('resolved', 'N/A')}")
            if 'tests_status' in result:
                passed, total = _count_passed_tests(result['tests_status'])
                print(f"  Tests: {passed}/{total} passed")

# Example usage (after running an evaluation):
# check_evaluation_results("notebook_test_astropy__astropy-12907")