In [None]:
# SWE-Bench-CL Baseline Implementation
# For Neural Networks and Deep Learning class at Columbia University

!pip install transformers datasets sentence-transformers scikit-learn numpy pandas tqdm matplotlib anthropic

import json
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import anthropic  # For Claude API access# Data Loading Functions


In [None]:
def load_swe_bench_cl(json_path):
    """Load and parse the SWE-Bench-CL dataset.

    Args:
        json_path: Path to the SWE-Bench-CL JSON file.

    Returns:
        The parsed JSON document, returned unchanged.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # The summary below is informational only, so a file without a
    # "metadata" block should still load instead of raising KeyError.
    metadata = {}
    sequences = []
    if isinstance(data, dict):
        metadata = data.get('metadata') or {}
        sequences = data.get('sequences') or []

    version = metadata.get('version', 'unknown')
    if 'num_sequences' in metadata:
        num_sequences = metadata['num_sequences']
    else:
        num_sequences = len(sequences)
    if 'total_tasks' in metadata:
        total_tasks = metadata['total_tasks']
    else:
        # Fall back to counting tasks only when the metadata does not say.
        total_tasks = sum(len(seq.get('tasks', [])) for seq in sequences)

    print(f"Loaded SWE-Bench-CL v{version}")
    print(f"Contains {num_sequences} sequences with {total_tasks} total tasks")

    return data

def get_sequence_by_repo(data, repo_name):
    """Return the first sequence whose "repository" equals repo_name.

    Sequences are scanned in the order they appear in data["sequences"];
    None is returned when no sequence matches.
    """
    matching = (
        candidate
        for candidate in data["sequences"]
        if candidate["repository"] == repo_name
    )
    return next(matching, None)

def get_task_by_id(data, task_id):
    """Return the first task whose "instance_id" equals task_id.

    Every sequence in data["sequences"] is searched in order, and within a
    sequence its "tasks" are searched in order. Returns None when no task
    carries that ID.
    """
    hits = (
        candidate
        for seq in data["sequences"]
        for candidate in seq["tasks"]
        if candidate["instance_id"] == task_id
    )
    return next(hits, None)

# Add a file upload widget
from google.colab import files
uploaded = files.upload()  # Upload SWE-Bench-CL.json here

# `uploaded` is keyed by file name. If nothing was uploaded (for example the
# dialog was dismissed), fail with a clear message instead of the bare
# StopIteration that next(iter(...)) would raise on an empty mapping.
if not uploaded:
    raise ValueError("No file was uploaded; upload SWE-Bench-CL.json and re-run this cell.")

# Load data
# Only the first uploaded file is used. Its name is opened as a path, so this
# relies on the upload being saved into the current working directory.
data_path = next(iter(uploaded.keys()))
data = load_swe_bench_cl(data_path)

# Display available repositories
repositories = [seq["repository"] for seq in data["sequences"]]
print(f"Available repositories: {repositories}")

In [None]:
# Claude API Configuration
# Read the key from the ANTHROPIC_API_KEY environment variable so a real
# credential never has to be saved inside the notebook. The placeholder is
# kept as the fallback so the cell still runs unchanged when the variable is
# not set (replace it, or better, export the variable before starting).
CLAUDE_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "your_api_key_here")
client = anthropic.Anthropic(api_key=CLAUDE_API_KEY)

def format_prompt(task):
    """Build the zero-shot prompt text for one SWE-Bench-CL task.

    Reads task['problem_statement'] (required), task['repository']
    (optional, defaults to an empty string) and task['relevant_files']
    (optional list of dicts with 'path' and 'content'). The leading
    indentation inside the template is part of the returned string.
    """
    header = f"""
    I need your help solving a software engineering task. Here are the details:
    
    REPOSITORY: {task.get('repository', '')}
    PROBLEM: {task['problem_statement']}
    
    For context, the following files may be relevant:
    """

    # One section per relevant file, in the order given by the task.
    file_sections = [
        f"\nFILE: {entry['path']}\nCONTENT:\n{entry['content']}\n"
        for entry in task.get('relevant_files', [])
    ]

    closing = "\nPlease provide a solution to this problem. Your solution should include the code changes needed."

    return header + "".join(file_sections) + closing

def evaluate_zero_shot(task, model="claude-3-opus-20240229"):
    """Evaluate a single task using Claude in zero-shot mode.

    The prompt is built with ``format_prompt`` and sent through the
    module-level ``client``.

    Args:
        task: Task dict. ``task["instance_id"]`` is read here; the prompt
            fields are read by ``format_prompt``.
        model: Model identifier passed to ``client.messages.create``.

    Returns:
        A dict that always contains ``task_id``, ``success``, ``solution``,
        ``tokens_used`` and ``model``. When the request fails, ``success`` is
        False, ``solution`` is ``""``, ``tokens_used`` is 0 and an extra
        ``error`` key holds the exception message.
    """
    prompt = format_prompt(task)

    try:
        response = client.messages.create(
            model=model,
            max_tokens=4000,
            system="You are a skilled software engineer tasked with fixing bugs and implementing features in code. Provide clear, correct solutions.",
            messages=[{"role": "user", "content": prompt}]
        )

        solution = response.content[0].text

        # In a real implementation, you would validate the solution here
        # For this example, we'll just record the response

        return {
            "task_id": task["instance_id"],
            # NOTE: True only means the API call returned a response; the
            # solution itself is not validated here.
            "success": True,
            "solution": solution,
            "tokens_used": response.usage.input_tokens + response.usage.output_tokens,
            "model": model
        }

    except Exception as e:
        # Deliberately best-effort: report the failure and keep going so one
        # bad request does not abort a whole evaluation run.
        print(f"Error evaluating task {task['instance_id']}: {str(e)}")
        return {
            "task_id": task["instance_id"],
            "success": False,
            # Keep the same keys as the success result so callers that read
            # result["solution"] / result["tokens_used"] do not hit KeyError.
            "solution": "",
            "tokens_used": 0,
            "error": str(e),
            "model": model
        }

# Example usage
# Runs one zero-shot evaluation on the first task of the first sequence.
# evaluate_zero_shot sends a request through `client`, so executing this cell
# performs an API call. `sample_task` and `result` are reused by later cells.
sample_task = get_task_by_id(data, data["sequences"][0]["tasks"][0]["instance_id"])
result = evaluate_zero_shot(sample_task)
print(f"Task evaluation complete: Success = {result['success']}")

In [None]:
class ToolTracker:
    """Track and analyze tool usage patterns in model solutions.

    Detection is a simple case-insensitive substring match on the response
    text, so a keyword also matches inside longer words (for example "git"
    inside "digit"). Counts accumulate across calls; analysing the same
    task_id twice increments the counters twice and overwrites that task's
    entry in ``task_tool_map``.
    """

    # Keywords whose presence in the lower-cased response marks a tool
    # category as used.
    _TOOL_KEYWORDS = {
        "code_search": ("search", "find"),
        "code_modification": ("change", "modify", "edit"),
        "code_execution": ("run", "execute", "test"),
        "debugging": ("debug", "error"),
        "documentation": ("documentation", "docs"),
        "version_control": ("git", "commit", "branch"),
    }

    def __init__(self):
        self.tool_types = [
            "code_search", "code_modification", "code_execution",
            "debugging", "documentation", "version_control"
        ]
        self.tool_usage = {tool: 0 for tool in self.tool_types}
        self.task_tool_map = {}  # Maps task_id to tools used

    def analyze_response(self, task_id, response_text):
        """Analyze a model response to identify tool usage.

        Args:
            task_id: Key under which the detected tools are recorded.
            response_text: Response text to scan. None or an empty string is
                treated as "no tools used".

        Returns:
            The set of tool-type names detected in the response.
        """
        # Lower-case once instead of once per keyword; tolerate a missing
        # response (e.g. from a failed evaluation).
        text = (response_text or "").lower()

        # Simple keyword-based detection
        tools_used = {
            tool
            for tool in self.tool_types
            if any(keyword in text for keyword in self._TOOL_KEYWORDS.get(tool, ()))
        }

        for tool in tools_used:
            self.tool_usage[tool] += 1

        self.task_tool_map[task_id] = tools_used
        return tools_used

    def visualize_usage(self):
        """Visualize tool usage distribution as a bar chart of the counters."""
        plt.figure(figsize=(10, 6))
        plt.bar(self.tool_types, [self.tool_usage[tool] for tool in self.tool_types])
        plt.title("Tool Usage Distribution")
        plt.xlabel("Tool Type")
        plt.ylabel("Usage Count")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Example usage
tool_tracker = ToolTracker()
# A failed evaluation result may carry "error" but no "solution"; fall back to
# empty text instead of raising KeyError.
tools_used = tool_tracker.analyze_response(result["task_id"], result.get("solution", ""))
print(f"Tools used: {tools_used}")

In [None]:
class ContextAugmenter:
    """Augment prompts with relevant context from previous tasks.

    Previously seen tasks are stored in ``self.memory`` together with an
    embedding of their text; for a new task, the most similar stored tasks
    (by cosine similarity of embeddings) are rendered into the prompt.
    """

    def __init__(self, model_name="flax-sentence-embeddings/st-codesearch-t5-base"):
        """Load the embedding model and start with an empty memory.

        Args:
            model_name: Model identifier passed to ``SentenceTransformer``.
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.memory = []  # One dict per remembered task (see add_to_memory)

    def add_to_memory(self, task, solution=None):
        """Add a task to memory with its embedding.

        Args:
            task: Task dict; ``instance_id`` and ``problem_statement`` are
                required, ``modified_files`` and ``repository`` are optional.
            solution: Solution text to store. When None, ``task["solution"]``
                is used if present; otherwise an empty string is stored.
        """
        if solution is None and "solution" in task:
            solution = task["solution"]

        # The embedded text is the problem statement followed by the solution.
        task_text = f"{task['problem_statement']} {solution or ''}"
        embedding = self.embedding_model.encode(task_text)

        self.memory.append({
            "embedding": embedding,
            "task_id": task["instance_id"],
            "problem": task["problem_statement"],
            "solution": solution or "",
            "files_modified": task.get("modified_files", []),
            "repository": task.get("repository", "")
        })

    def get_relevant_context(self, current_task, top_k=3):
        """Find most relevant previous tasks for context augmentation.

        Args:
            current_task: Task dict; only ``problem_statement`` is read.
            top_k: Maximum number of remembered tasks to include.

        Returns:
            A text block listing up to ``top_k`` remembered tasks, most
            similar first, or ``""`` when the memory is empty or ``top_k``
            is not positive.
        """
        # Nothing to retrieve: avoid emitting a header with no tasks under it.
        if not self.memory or top_k <= 0:
            return ""

        current_embedding = self.embedding_model.encode(current_task["problem_statement"])

        # Score every remembered task in a single batched call rather than
        # one cosine_similarity call per item.
        # assumes each stored embedding is a 1-D vector of equal length — TODO confirm
        memory_matrix = np.vstack([item["embedding"] for item in self.memory])
        scores = cosine_similarity([current_embedding], memory_matrix)[0]

        # Stable sort on the negated scores: descending similarity, with ties
        # kept in insertion order.
        ranked = np.argsort(-scores, kind="stable")[:top_k]

        # Format relevant context from top_k similar tasks
        context = "Here are some similar tasks that have been solved before:\n\n"
        for idx in ranked:
            memory_item = self.memory[idx]
            context += f"TASK ID: {memory_item['task_id']}\n"
            context += f"SIMILARITY: {scores[idx]:.2f}\n"
            context += f"PROBLEM: {memory_item['problem']}\n"
            context += f"SOLUTION: {memory_item['solution']}\n\n"

        return context

    def format_augmented_prompt(self, task, top_k=3):
        """Create a prompt augmented with relevant context.

        Args:
            task: Task dict; ``problem_statement`` is required, ``repository``
                and ``relevant_files`` are optional.
            top_k: Forwarded to ``get_relevant_context``.

        Returns:
            The prompt string. The leading indentation inside the template is
            part of the returned text.
        """
        relevant_context = self.get_relevant_context(task, top_k)

        prompt = f"""
        I need your help solving a software engineering task. Here are the details:
        
        REPOSITORY: {task.get('repository', '')}
        PROBLEM: {task['problem_statement']}
        
        {relevant_context}
        
        For context, the following files may be relevant:
        """

        # Add file context if available
        for file in task.get('relevant_files', []):
            prompt += f"\nFILE: {file['path']}\nCONTENT:\n{file['content']}\n"

        prompt += "\nPlease provide a solution to this problem. Your solution should include the code changes needed."

        return prompt

# Example usage
augmenter = ContextAugmenter()
# A failed evaluation result may carry "error" but no "solution"; store empty
# solution text in that case instead of raising KeyError.
augmenter.add_to_memory(sample_task, result.get("solution", ""))

In [None]:
def run_experiments(data, repo_filter=None, sample_size=None):
    """Run zero-shot and context-augmented evaluation over SWE-Bench-CL sequences.

    Args:
        data: Parsed dataset; ``data["sequences"]`` is iterated.
        repo_filter: When truthy, only sequences whose "repository" equals
            this value are evaluated; otherwise every sequence is.
        sample_size: Forwarded to ``run_sequence_evaluation``.

    Returns:
        A dict with three per-repository maps: "zero_shot" and "augmented"
        hold the raw evaluation results, "metrics" holds both metric dicts
        plus the "improvement" in success rate (augmented minus zero-shot).

    NOTE(review): ``run_sequence_evaluation`` and ``calculate_metrics`` are
    expected to be defined elsewhere in the notebook before this is called.
    """
    selected = [
        seq
        for seq in data["sequences"]
        if not repo_filter or seq["repository"] == repo_filter
    ]

    results = {"zero_shot": {}, "augmented": {}, "metrics": {}}

    for sequence in selected:
        repo = sequence["repository"]
        print(f"\nEvaluating repository: {repo}")

        # Baseline pass: no augmenter.
        print("Running zero-shot evaluation...")
        baseline_runs = run_sequence_evaluation(sequence, sample_size=sample_size)
        results["zero_shot"][repo] = baseline_runs

        # Augmented pass: a fresh ContextAugmenter per repository.
        print("Running augmented evaluation...")
        augmented_runs = run_sequence_evaluation(
            sequence, ContextAugmenter(), sample_size=sample_size
        )
        results["augmented"][repo] = augmented_runs

        baseline_metrics = calculate_metrics(baseline_runs)
        augmented_metrics = calculate_metrics(augmented_runs)
        gain = augmented_metrics["success_rate"] - baseline_metrics["success_rate"]

        results["metrics"][repo] = {
            "zero_shot": baseline_metrics,
            "augmented": augmented_metrics,
            "improvement": gain,
        }

        print(f"Zero-shot success rate: {baseline_metrics['success_rate']:.2f}")
        print(f"Augmented success rate: {augmented_metrics['success_rate']:.2f}")
        print(f"Improvement: {gain:.2f}")

    return results

# Full experiment runner (commented out to avoid accidental execution)
# experiment_results = run_experiments(data, repo_filter="django/django", sample_size=5)

# Save results
def save_results(results, output_path="swe_bench_cl_results.json"):
    """Save experiment results to a JSON file.

    NumPy arrays and NumPy scalars (e.g. ``np.float32``, ``np.int64``,
    ``np.bool_``) are converted to plain Python values, and sets are written
    as sorted lists.

    Args:
        results: JSON-serializable structure, optionally containing the
            NumPy values / sets described above.
        output_path: Destination file path.

    Raises:
        TypeError: If ``results`` contains a value that cannot be serialized.
            In that case no file is written.
        OSError: If the destination cannot be written.
    """
    class NumpyEncoder(json.JSONEncoder):
        """JSON encoder that also understands NumPy values and sets."""

        def default(self, obj):
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            if isinstance(obj, np.generic):
                # NumPy scalar -> the equivalent built-in Python scalar.
                return obj.item()
            if isinstance(obj, (set, frozenset)):
                # Sets have no JSON form; sort for a deterministic file.
                return sorted(obj, key=str)
            return super().default(obj)

    # Serialize first so an unserializable value cannot leave a truncated
    # file behind.
    payload = json.dumps(results, cls=NumpyEncoder, indent=2)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(payload)

    print(f"Results saved to {output_path}")

# Visualization functions
def visualize_results(results):
    """Plot zero-shot vs. augmented success rate for every repository.

    Reads ``results["metrics"][repo]["zero_shot"]["success_rate"]`` and the
    matching ``"augmented"`` entry for each repository and draws them as a
    grouped bar chart (one pair of bars per repository).
    """
    per_repo = results["metrics"]
    repos = list(per_repo.keys())
    zero_shot_rates = [per_repo[name]["zero_shot"]["success_rate"] for name in repos]
    augmented_rates = [per_repo[name]["augmented"]["success_rate"] for name in repos]

    bar_width = 0.35
    half_width = bar_width / 2
    positions = np.arange(len(repos))

    fig, ax = plt.subplots(figsize=(12, 6))

    # Each repository gets two bars, centred on its tick position.
    ax.bar(positions - half_width, zero_shot_rates, bar_width, label='Zero-shot')
    ax.bar(positions + half_width, augmented_rates, bar_width, label='Augmented')

    ax.set_ylabel('Success Rate')
    ax.set_title('Performance Comparison by Repository')
    ax.set_xticks(positions)
    ax.set_xticklabels(repos, rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.show()