In [None]:
# Make Google Drive reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the dataset file from the mounted drive
data_path = '/content/drive/MyDrive/SWE-Bench-CL.json'
data = load_swe_bench_cl(data_path)

# Summarise the dataset metadata, one field per output line
print(
    f"Available repositories: {data['metadata']['repositories']}",
    f"Total tasks: {data['metadata']['total_tasks']}",
    f"Number of sequences: {data['metadata']['num_sequences']}",
    sep="\n",
)

In [None]:
# Load CodeLlama model
# Single source of truth for the checkpoint id, so the tokenizer and the
# weights cannot accidentally be loaded from two different models.
CODELLAMA_CHECKPOINT = "codellama/CodeLlama-7b-hf"

print("Loading CodeLlama model...")
tokenizer = AutoTokenizer.from_pretrained(CODELLAMA_CHECKPOINT)
# NOTE(review): recent transformers releases prefer
# quantization_config=BitsAndBytesConfig(load_in_8bit=True) over the bare
# load_in_8bit kwarg — confirm against the transformers version in use.
model = AutoModelForCausalLM.from_pretrained(
    CODELLAMA_CHECKPOINT,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    load_in_8bit=True
)
print("Model loaded successfully!")

# Test the model with a sample task: first task of the first sequence
first_sequence = data['sequences'][0]
first_task = first_sequence['tasks'][0]
prompt = format_prompt(first_task)
print(f"\nExample prompt:\n{prompt}")

# Generate a solution
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        # max_length bounds prompt tokens + generated tokens together.
        max_length=4096,
        # Sampling must be switched on explicitly; without do_sample=True
        # generate() decodes greedily and ignores temperature / top_p.
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
        num_return_sequences=1
    )
# Drop the echoed prompt tokens so only the newly generated continuation is decoded.
solution = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
print(f"\nGenerated solution:\n{solution}")

In [None]:
def run_small_experiment(sequence, num_tasks=5):
    """Evaluate the leading tasks of one sequence in zero-shot and augmented mode.

    Args:
        sequence: Mapping with a 'tasks' list; each task carries
            task['metadata']['instance_id'].
        num_tasks: How many tasks from the start of the sequence to evaluate.

    Returns:
        A list with one dict per evaluated task, holding the keys
        "task_id", "zero_shot" and "augmented" (the latter two are whatever
        evaluate_zero_shot returned for the plain and the augmented prompt).

    Side effects:
        Prints a progress line per task and feeds every zero-shot response
        to the module-level `tool_tracker`.
    """
    memory = ContextAugmenter()
    outcomes = []

    for current in sequence['tasks'][:num_tasks]:
        print(f"\nEvaluating task {current['metadata']['instance_id']}")

        # Baseline run with the default prompt.
        baseline = evaluate_zero_shot(current)

        # Record the baseline response for the tool-usage statistics.
        tool_tracker.analyze_response(baseline["task_id"], baseline["solution"])

        # Second run with a memory-augmented prompt.
        # NOTE(review): the current task's own zero-shot solution is stored
        # before the augmented prompt is built — confirm that this ordering
        # (rather than adding it after the augmented run) is intended.
        memory.add_to_memory(current, baseline["solution"])
        prompt_with_context = memory.format_augmented_prompt(current)
        with_context = evaluate_zero_shot(current, prompt=prompt_with_context)

        record = {
            "task_id": current["metadata"]["instance_id"],
            "zero_shot": baseline,
            "augmented": with_context,
        }
        outcomes.append(record)

    return outcomes

# Run experiment on first sequence
first_sequence = data['sequences'][0]
experiment_results = run_small_experiment(first_sequence)


def _success_rate(results, mode):
    """Return the fraction of entries in `results` whose `mode` run succeeded.

    Args:
        results: List of per-task dicts as produced by run_small_experiment.
        mode: Key of the run to inspect ("zero_shot" or "augmented").

    Returns:
        A float in [0, 1], or NaN when `results` is empty: the rate is
        undefined there and dividing by len(results) would raise
        ZeroDivisionError.
    """
    if not results:
        return float("nan")
    return sum(1 for r in results if r[mode]["success"]) / len(results)


# Calculate metrics
zero_shot_success = _success_rate(experiment_results, "zero_shot")
augmented_success = _success_rate(experiment_results, "augmented")

print(f"\nExperiment Results:")
print(f"Zero-shot success rate: {zero_shot_success:.2f}")
print(f"Augmented success rate: {augmented_success:.2f}")
print(f"Improvement: {augmented_success - zero_shot_success:.2f}")

# Visualize tool usage
tool_tracker.visualize_usage()

In [None]:
# Save experiment results
import json

def save_experiment_results(results, filename="swe_bench_cl_experiment_results.json"):
    """Write `results` to `filename` as indented JSON and print the path.

    Args:
        results: JSON-serializable object (here: the list of per-task result dicts).
        filename: Destination path; an existing file is overwritten.

    Raises:
        TypeError, ValueError: If `results` cannot be serialized. This is
            raised before the destination file is opened, so an existing
            file is left untouched.
        OSError: If the file cannot be opened or written.
    """
    # Serialize first. Streaming json.dump() into an already-truncated file
    # would leave a corrupt, partially written file (and destroy any earlier
    # results) as soon as one entry turns out not to be JSON-serializable.
    payload = json.dumps(results, indent=2)
    with open(filename, 'w', encoding="utf-8") as f:
        f.write(payload)
    print(f"Results saved to {filename}")

# Persist the results of the run above under the default file name
save_experiment_results(results=experiment_results)

# Visualize results
def plot_experiment_results(results):
    """Draw per-task success of both evaluation modes as two lines on one chart.

    Args:
        results: List of per-task dicts; each entry must provide
            entry["zero_shot"]["success"] and entry["augmented"]["success"].

    The x axis is the task's position in `results`; the y value is 1 for a
    truthy "success" and 0 otherwise. The figure is shown via plt.show().
    """
    def as_flags(mode):
        # Collapse every "success" value of the given mode to 1 or 0.
        return [int(bool(entry[mode]["success"])) for entry in results]

    curves = (
        ("Zero-shot", as_flags("zero_shot")),
        ("Augmented", as_flags("augmented")),
    )

    plt.figure(figsize=(10, 6))
    for legend_label, flags in curves:
        plt.plot(flags, label=legend_label)

    plt.xlabel("Task Number")
    plt.ylabel("Success (1) / Failure (0)")
    plt.title("Performance Comparison")
    plt.legend()
    plt.grid(True)
    plt.show()

# Chart zero-shot vs. augmented success for the run above
plot_experiment_results(results=experiment_results)