# Focused Learning: Simulating Different Review Treatments

## Learning Objectives
1. Understand the three review treatments: **MCR**, **ACR**, and **CCR**
2. Implement realistic simulations of each treatment using **LangGraph**
3. Model reviewer behavior under different conditions
4. Analyze treatment effects on review outcomes

## Paper Context
**Section Reference**: Section II-B (Code Review Treatments) and Section II-C (Experimental Setup)

**Treatment Definitions**:
- **MCR (Manual Code Review)**: Classic review without automation
- **ACR (Automated Code Review)**: ChatGPT-generated review as starting point  
- **CCR (Comprehensive Code Review)**: "Perfect" review identifying all injected issues

**Key Quote**:
> "The third treatment simulates an ideal, hypothetical scenario where the automated code review is able to identify all quality issues in a given code."

## 1. Setup and Core Components

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Any, Optional, Set, Tuple
from dataclasses import dataclass, field
from enum import Enum
import random
import time
from datetime import datetime, timedelta
import json
import networkx as nx
from collections import defaultdict

# LangGraph components
from typing import TypedDict
from langgraph.graph import StateGraph, END

# Set random seeds
np.random.seed(42)
random.seed(42)

# Visualization settings
plt.style.use('seaborn-v0_8-notebook')
colors = sns.color_palette("husl", 8)

In [None]:
# Define core data structures based on paper

class Treatment(Enum):
    """The three code-review treatments simulated in this notebook.

    Member names are the short codes used as lookup keys elsewhere in the
    file (e.g. ``Treatment['MCR']``); member values are the display names.
    """
    MCR = "Manual Code Review"  # classic review without automation
    ACR = "Automated Code Review"  # ChatGPT-generated review as starting point
    CCR = "Comprehensive Code Review"  # "perfect" review identifying all injected issues

@dataclass
class InjectedIssue:
    """Represents an issue injected into code (from Table I)"""
    id: str  # unique identifier; ProgramSimulator builds it as "<program>-issue-<n>"
    description: str  # human-readable summary of the problem
    file: str  # name of the file the issue was injected into
    line: int  # line number of the issue within `file`
    type: str  # evolvability or functional
    severity: str  # low, medium, high
    example_code: Optional[str] = None  # optional snippet illustrating the issue

@dataclass
class ReviewAction:
    """Represents an action taken during review"""
    timestamp: datetime  # wall-clock time at which the action was logged
    action_type: str  # open_file, read_code, write_comment, verify_issue, etc.
    file: Optional[str] = None  # file the action applies to, if any
    line_range: Optional[Tuple[int, int]] = None  # (first, last) line touched, if any
    duration_seconds: float = 0  # simulated time spent on the action
    details: Dict[str, Any] = field(default_factory=dict)  # free-form extra data per action

@dataclass
class ReviewSession:
    """Complete review session data"""
    treatment: Treatment  # which review treatment the session ran under
    reviewer_id: str  # identifier of the (simulated) reviewer
    program: str  # name of the program under review
    start_time: datetime  # when the session began
    end_time: Optional[datetime] = None  # None until the session is finished
    actions: List[ReviewAction] = field(default_factory=list)  # chronological action log
    initial_comments: List[Dict] = field(default_factory=list)  # comments given to the reviewer up front
    final_comments: List[Dict] = field(default_factory=list)  # comments in the submitted review
    confidence_score: Optional[int] = None  # 1-5 scale
    issues_found: Set[str] = field(default_factory=set)  # presumably InjectedIssue ids -- TODO confirm

## 2. Simulating Code Programs with Issues

In [None]:
class ProgramSimulator:
    """Simulates programs with injected issues based on paper's approach.

    Attributes:
        issue_type_probs: Probability of each issue type
            (78% evolvability, 22% functional).
        programs: For every known program name, its lines of code (``loc``)
            and the number of issues to inject (``issues``), from Table I.
    """

    def __init__(self):
        # Issue distribution from paper: 78% evolvability, 22% functional
        self.issue_type_probs = {'evolvability': 0.78, 'functional': 0.22}

        # Programs from Table I
        self.programs = {
            'maze-generator': {'loc': 75, 'issues': 2},
            'number-conversion': {'loc': 81, 'issues': 2},
            'stopwatch': {'loc': 258, 'issues': 4},
            'tic-tac-toe': {'loc': 121, 'issues': 7},
            'todo-list': {'loc': 198, 'issues': 3},
            'word-utils': {'loc': 426, 'issues': 7}
        }

    def create_program(self, name: str) -> Dict[str, Any]:
        """Create a program with injected issues.

        Args:
            name: One of the keys of ``self.programs``.

        Returns:
            A dict with the keys ``name``, ``loc``, ``files``,
            ``injected_issues`` (list of ``InjectedIssue``) and
            ``code_complexity``.

        Raises:
            ValueError: If ``name`` is not a known program.
        """
        if name not in self.programs:
            raise ValueError(f"Unknown program: {name}")

        prog_info = self.programs[name]
        type_names = list(self.issue_type_probs)
        type_probs = list(self.issue_type_probs.values())
        issues = []

        # Generate issues based on paper's distribution
        for i in range(prog_info['issues']):
            # np.random.choice returns NumPy scalars; convert to plain Python
            # values so the dataclass fields match their annotations and
            # stay JSON-serialisable.
            issue_type = str(np.random.choice(type_names, p=type_probs))

            # Severity distribution (estimated from paper)
            if issue_type == 'functional':
                severity = str(np.random.choice(['medium', 'high'], p=[0.4, 0.6]))
            else:
                severity = str(np.random.choice(
                    ['low', 'medium', 'high'], p=[0.5, 0.35, 0.15]
                ))

            issue = InjectedIssue(
                id=f"{name}-issue-{i+1}",
                description=self._generate_issue_description(issue_type, severity),
                file=f"{name}.py",
                # randint's upper bound is exclusive; +1 makes the last line
                # of the program a possible location.
                line=int(np.random.randint(1, prog_info['loc'] + 1)),
                type=issue_type,
                severity=severity
            )
            issues.append(issue)

        return {
            'name': name,
            'loc': prog_info['loc'],
            'files': [f"{name}.py", f"test_{name}.py"],
            'injected_issues': issues,
            'code_complexity': self._estimate_complexity(prog_info['loc'])
        }

    def _generate_issue_description(self, issue_type: str, severity: str) -> str:
        """Pick a random description template for the given type and severity.

        Falls back to ``"Generic issue"`` when no template exists for the
        combination (e.g. a low-severity functional issue).
        """
        templates = {
            'evolvability': {
                'low': [
                    "Missing documentation for method",
                    "Variable name could be more descriptive",
                    "Code duplication could be refactored"
                ],
                'medium': [
                    "Method is too long and complex",
                    "Poor separation of concerns",
                    "String concatenation in loop (performance)"
                ],
                'high': [
                    "Architectural issue: tight coupling",
                    "Major performance bottleneck",
                    "Critical maintainability issue"
                ]
            },
            'functional': {
                'medium': [
                    "Missing input validation",
                    "Edge case not handled",
                    "Potential null pointer exception"
                ],
                'high': [
                    "Logic error in algorithm",
                    "Incorrect calculation result",
                    "Race condition in concurrent code"
                ]
            }
        }

        options = templates.get(issue_type, {}).get(severity, ["Generic issue"])
        # str() turns the np.str_ returned by NumPy into a plain str.
        return str(np.random.choice(options))

    def _estimate_complexity(self, loc: int) -> str:
        """Estimate program complexity based on LOC.

        Returns ``'low'`` below 100 lines, ``'medium'`` below 300 lines and
        ``'high'`` otherwise.
        """
        if loc < 100:
            return 'low'
        if loc < 300:
            return 'medium'
        return 'high'

# Build the simulator once and materialise a single sample program to inspect
program_sim = ProgramSimulator()
example_program = program_sim.create_program('number-conversion')

# Header section of the report, emitted in one write
print("\n".join([
    f"Created program: {example_program['name']}",
    f"Lines of code: {example_program['loc']}",
    f"Injected issues: {len(example_program['injected_issues'])}",
    "\nIssue breakdown:",
]))

# One bullet per injected issue
for issue in example_program['injected_issues']:
    print(f"  - {issue.severity.upper()}: {issue.description} (line {issue.line})")

## 3. Treatment-Specific Review Generators

In [None]:
class TreatmentSimulator:
    """Simulates different review treatments based on paper findings.

    Every ``generate_*_review`` method takes a program dict (keys read:
    ``'injected_issues'``, ``'files'``, ``'loc'``) and returns a list of
    comment dicts with the keys ``id``, ``issue_id``, ``text``, ``file``,
    ``line``, ``severity`` and ``author``. ``issue_id`` is ``None`` for
    comments that do not correspond to an injected issue.

    Attributes:
        acr_detection_rate: Share of injected issues the automated review finds.
        acr_keep_rate: Share of automated suggestions reviewers keep.
        mcr_detection_rate: Share of injected issues a manual review finds.
    """

    def __init__(self):
        # Key statistics from paper
        self.acr_detection_rate = 0.42  # ACR finds 42% of injected issues
        self.acr_keep_rate = 0.89       # 89% of ACR suggestions kept
        self.mcr_detection_rate = 0.50  # MCR finds 50% of injected issues

    def generate_mcr_review(self, program: Dict[str, Any]) -> List[Dict]:
        """Generate manual code review (no automation)."""
        found_issues = self._sample_found_issues(
            program['injected_issues'], self.mcr_detection_rate
        )

        # Human reviewers write more concise comments
        comments = [
            self._issue_comment(
                'mcr', issue, self._humanize_comment(issue.description), 'human'
            )
            for issue in found_issues
        ]

        # Add some additional non-injected issues
        if np.random.random() < 0.3:  # 30% chance
            comments.append(self._extra_comment(
                f"mcr-extra-{len(comments)+1}",
                "Consider adding unit tests for this method",
                program,
                'human',
            ))

        return comments

    def generate_acr_review(self, program: Dict[str, Any]) -> List[Dict]:
        """Generate automated code review (ChatGPT-style)."""
        injected_issues = program['injected_issues']

        # ACR finds ~42% of injected issues, but is biased towards
        # low-severity ones; anything that is not low/medium gets the
        # smallest weight.
        severity_weight = {'low': 3.0, 'medium': 1.5}
        weights = [severity_weight.get(issue.severity, 0.5) for issue in injected_issues]

        found_issues = self._sample_found_issues(
            injected_issues, self.acr_detection_rate, weights
        )

        # LLM comments are more verbose
        comments = [
            self._issue_comment(
                'acr', issue, self._llm_style_comment(issue.description), 'llm'
            )
            for issue in found_issues
        ]

        # ACR tends to add more low-severity suggestions
        extra_suggestions = [
            "Consider using more descriptive variable names for clarity",
            "This method could benefit from additional documentation",
            "The logic is sound, though it might benefit from comments",
            "Consider extracting this logic into a separate method"
        ]

        n_extra = np.random.poisson(3)  # Average 3 extra suggestions
        for i, text in enumerate(extra_suggestions[:n_extra]):
            comments.append(
                self._extra_comment(f"acr-extra-{i+1}", text, program, 'llm')
            )

        return comments

    def generate_ccr_review(self, program: Dict[str, Any]) -> List[Dict]:
        """Generate comprehensive review (all injected issues found)."""
        # CCR finds ALL injected issues, rephrased to seem natural
        return [
            self._issue_comment(
                'ccr',
                issue,
                self._rephrase_for_ccr(issue.description, issue.severity),
                'comprehensive',
            )
            for issue in program['injected_issues']
        ]

    def _sample_detection_count(self, n_issues: int, rate: float) -> int:
        """Return how many of ``n_issues`` are detected at the given rate.

        Plain ``int(n_issues * rate)`` always rounds down, which makes small
        programs systematically under-detect (e.g. 2 issues at 42% would
        never yield a hit). The fractional part is therefore rounded up with
        matching probability, so the expected count equals
        ``n_issues * rate``.
        """
        target = n_issues * rate
        count = int(target)
        if np.random.random() < target - count:
            count += 1
        return min(count, n_issues)

    def _sample_found_issues(self, issues: List[Any], rate: float,
                             weights: Optional[List[float]] = None) -> List[Any]:
        """Pick the detected issues without replacement.

        Args:
            issues: Candidate issues.
            rate: Detection rate in ``[0, 1]``.
            weights: Optional relative (unnormalised) selection weights, one
                per issue.

        Returns:
            The selected issues (possibly empty).
        """
        n_found = self._sample_detection_count(len(issues), rate)
        if n_found == 0:
            # Also covers an empty issue list, for which NumPy would reject
            # the (empty) probability vector.
            return []

        probabilities = None
        if weights is not None:
            probabilities = np.asarray(weights, dtype=float)
            probabilities = probabilities / probabilities.sum()

        # Sample indices rather than the objects themselves so NumPy never
        # has to coerce arbitrary objects into an array.
        picked = np.random.choice(
            len(issues), size=n_found, replace=False, p=probabilities
        )
        return [issues[i] for i in picked]

    def _random_line(self, loc: int) -> int:
        """Return a random line number in ``1..loc`` (inclusive)."""
        # randint's upper bound is exclusive; max() keeps loc < 1 from raising.
        return int(np.random.randint(1, max(1, loc) + 1))

    def _issue_comment(self, prefix: str, issue: Any, text: str,
                       author: str) -> Dict[str, Any]:
        """Build the comment dict for a detected injected issue."""
        return {
            'id': f"{prefix}-{issue.id}",
            'issue_id': issue.id,
            'text': text,
            'file': issue.file,
            'line': issue.line,
            'severity': issue.severity,
            'author': author
        }

    def _extra_comment(self, comment_id: str, text: str,
                       program: Dict[str, Any], author: str) -> Dict[str, Any]:
        """Build a low-severity comment unrelated to any injected issue.

        The comment is attached to the program's first file at a random line.
        """
        return {
            'id': comment_id,
            'issue_id': None,
            'text': text,
            'file': program['files'][0],
            'line': self._random_line(program['loc']),
            'severity': 'low',
            'author': author
        }

    def _humanize_comment(self, text: str) -> str:
        """Make comment sound more human"""
        return text  # Simplified for demo

    def _llm_style_comment(self, text: str) -> str:
        """Make comment sound like LLM output"""
        prefixes = [
            "I noticed that ",
            "It appears that ",
            "Consider addressing: ",
            "There's an opportunity to improve: "
        ]
        # str() converts NumPy's np.str_ into a plain str.
        return str(np.random.choice(prefixes)) + text.lower()

    def _rephrase_for_ccr(self, text: str, severity: str) -> str:
        """Rephrase for comprehensive review"""
        severity_prefix = {
            'low': "Minor issue: ",
            'medium': "Important: ",
            'high': "Critical issue: "
        }
        return severity_prefix.get(severity, "") + text

# One simulator instance produces the sample review for every treatment
treatment_sim = TreatmentSimulator()

# Generated in MCR -> ACR -> CCR order
mcr_comments = treatment_sim.generate_mcr_review(example_program)
acr_comments = treatment_sim.generate_acr_review(example_program)
ccr_comments = treatment_sim.generate_ccr_review(example_program)

# Report how many comments each treatment produced
print("Generated reviews:")
for label, generated in (
    ("MCR", mcr_comments),
    ("ACR", acr_comments),
    ("CCR", ccr_comments),
):
    print(f"  {label}: {len(generated)} comments")

## 4. LangGraph-based Review Process Simulation

In [None]:
# Define state for LangGraph
class ReviewState(TypedDict):
    """State for the review process workflow"""
    treatment: str  # treatment code; used as a key for Treatment[...] and time_params
    program: Dict[str, Any]  # program dict; keys read here: 'name', 'loc', 'files', 'injected_issues'
    initial_comments: List[Dict]  # automated comments handed to the reviewer (empty for MCR)
    current_file: Optional[str]  # file opened most recently; None before the first open
    reviewed_lines: Set[Tuple[str, int]]  # (file, line) pairs the reviewer has looked at
    final_comments: List[Dict]  # comments that make up the submitted review
    actions: List[ReviewAction]  # chronological log of simulated actions
    time_elapsed: float  # accumulated simulated time in seconds
    confidence: Optional[int]  # 1-5 score, set when the review is finalized
    stage: str  # coarse progress marker ('pending', 'initialized', 'completed')

class ReviewProcessSimulator:
    """Simulates the complete review process using LangGraph.

    The workflow runs ``initialize -> generate_initial -> open_file ->
    review_code -> process_comments`` and then either loops back to
    ``open_file`` or moves on to ``finalize``.

    Args:
        recursion_limit: Maximum number of graph steps per run. Every pass
            through the review loop costs three steps, so LangGraph's default
            limit is reached after only a handful of passes; a larger limit
            lets the loop end through its own stop conditions instead.
    """

    def __init__(self, recursion_limit: int = 500):
        self.treatment_sim = TreatmentSimulator()
        self.recursion_limit = recursion_limit

        # Time parameters from paper (in seconds)
        self.time_params = {
            'MCR': {'mean': 42*60, 'std': 10*60},
            'ACR': {'mean': 56*60, 'std': 15*60},
            'CCR': {'mean': 57*60, 'std': 12*60}
        }

        self.graph = self._build_graph()

    def _build_graph(self) -> Any:
        """Build and compile the review process workflow.

        Returns:
            The compiled graph (the object ``invoke`` is called on).
        """
        workflow = StateGraph(ReviewState)

        # Add nodes
        workflow.add_node("initialize", self._initialize_review)
        workflow.add_node("generate_initial", self._generate_initial_review)
        workflow.add_node("open_file", self._open_file)
        workflow.add_node("review_code", self._review_code)
        workflow.add_node("process_comments", self._process_comments)
        workflow.add_node("finalize", self._finalize_review)

        # Add edges
        workflow.add_edge("initialize", "generate_initial")
        workflow.add_edge("generate_initial", "open_file")
        workflow.add_edge("open_file", "review_code")
        workflow.add_edge("review_code", "process_comments")

        # Conditional edge: continue reviewing or finalize
        workflow.add_conditional_edges(
            "process_comments",
            self._should_continue,
            {
                "continue": "open_file",
                "finish": "finalize"
            }
        )

        workflow.add_edge("finalize", END)

        # Set entry point
        workflow.set_entry_point("initialize")

        return workflow.compile()

    def _log_action(self, state: ReviewState, action: ReviewAction) -> None:
        """Append ``action`` to the log and add its duration to the clock."""
        state['actions'].append(action)
        state['time_elapsed'] += action.duration_seconds

    def _initialize_review(self, state: ReviewState) -> ReviewState:
        """Reset the per-session fields and log the start of the review."""
        state['reviewed_lines'] = set()
        state['final_comments'] = []
        state['actions'] = []
        state['time_elapsed'] = 0
        state['stage'] = 'initialized'

        self._log_action(state, ReviewAction(
            timestamp=datetime.now(),
            action_type='start_review',
            details={'treatment': state['treatment'], 'program': state['program']['name']}
        ))

        return state

    def _generate_initial_review(self, state: ReviewState) -> ReviewState:
        """Generate initial automated review if applicable.

        MCR starts without comments; ACR and CCR start from the matching
        ``TreatmentSimulator`` output.
        """
        treatment = Treatment[state['treatment']]

        if treatment == Treatment.MCR:
            state['initial_comments'] = []
        elif treatment == Treatment.ACR:
            state['initial_comments'] = self.treatment_sim.generate_acr_review(state['program'])
        else:  # CCR
            state['initial_comments'] = self.treatment_sim.generate_ccr_review(state['program'])

        self._log_action(state, ReviewAction(
            timestamp=datetime.now(),
            action_type='generate_initial_review',
            duration_seconds=np.random.uniform(1, 3),
            details={'num_comments': len(state['initial_comments'])}
        ))

        return state

    def _open_file(self, state: ReviewState) -> ReviewState:
        """Simulate opening a file for review."""
        # Choose file to review
        if state['initial_comments'] and np.random.random() < 0.8:
            # 80% chance to follow automated suggestions. Sorting gives a
            # stable candidate order, so seeded runs stay reproducible
            # regardless of string hash randomisation.
            candidates = sorted({c['file'] for c in state['initial_comments']})
        else:
            # Random file selection
            candidates = state['program']['files']

        # str() converts NumPy's np.str_ into a plain str.
        state['current_file'] = str(np.random.choice(candidates))

        self._log_action(state, ReviewAction(
            timestamp=datetime.now(),
            action_type='open_file',
            file=state['current_file'],
            duration_seconds=np.random.uniform(2, 5)
        ))

        return state

    def _review_code(self, state: ReviewState) -> ReviewState:
        """Simulate reviewing code in current file."""
        treatment = Treatment[state['treatment']]
        current_file = state['current_file']
        loc = state['program']['loc']

        # Different behavior based on treatment
        if treatment == Treatment.MCR:
            # Manual review: systematic exploration
            suggested_lines = []
            extra_lines = np.random.randint(20, 50)
            review_time = np.random.normal(60, 20)  # 1 minute average
        else:
            # Automated-assisted: focused on suggested lines
            suggested_lines = [
                int(c['line']) for c in state['initial_comments']
                if c['file'] == current_file
            ]
            extra_lines = np.random.randint(5, 15)
            review_time = np.random.normal(45, 15)  # Slightly faster

        lines_to_review = len(suggested_lines) + int(extra_lines)
        # A normal draw can be zero or negative; a review step always takes
        # some time.
        review_time = max(1.0, float(review_time))

        # Mark lines as reviewed: the suggested lines themselves first, then
        # randomly explored ones (upper bound is exclusive, hence loc + 1).
        for line in suggested_lines:
            state['reviewed_lines'].add((current_file, line))
        for _ in range(int(extra_lines)):
            line = int(np.random.randint(1, loc + 1))
            state['reviewed_lines'].add((current_file, line))

        self._log_action(state, ReviewAction(
            timestamp=datetime.now(),
            action_type='review_code',
            file=current_file,
            duration_seconds=review_time,
            details={'lines_reviewed': lines_to_review}
        ))

        return state

    def _process_comments(self, state: ReviewState) -> ReviewState:
        """Process and finalize comments.

        This node runs once per pass of the review loop. Manual comments
        (MCR) and the keep/drop decision on automated comments (ACR/CCR) are
        produced on the first pass only, and no comment id is ever added
        twice, so looping over several files does not duplicate comments.
        Writing time is charged for comments added in the current pass only.
        """
        treatment = Treatment[state['treatment']]
        final_comments = state['final_comments']
        seen_ids = {c['id'] for c in final_comments}
        count_before = len(final_comments)
        # 'stage' is switched to 'reviewing' at the end of the first pass.
        first_pass = state['stage'] != 'reviewing'

        def add_comment(comment: Dict) -> None:
            if comment['id'] not in seen_ids:
                seen_ids.add(comment['id'])
                final_comments.append(comment)

        if treatment == Treatment.MCR:
            if first_pass:
                # Generate manual comments
                for comment in self.treatment_sim.generate_mcr_review(state['program']):
                    add_comment(comment)
        else:
            if first_pass:
                # Process initial comments (ACR/CCR): 89% keep rate for
                # automated comments, decided once per comment.
                keep_rate = self.treatment_sim.acr_keep_rate
                for comment in state['initial_comments']:
                    if np.random.random() < keep_rate:
                        add_comment(comment)

            # Small chance to add new issues
            if np.random.random() < 0.1:
                # Find an issue neither suggested initially nor already reported
                reported_ids = {c['issue_id'] for c in state['initial_comments'] if c['issue_id']}
                reported_ids |= {c['issue_id'] for c in final_comments if c['issue_id']}
                remaining = [
                    issue for issue in state['program']['injected_issues']
                    if issue.id not in reported_ids
                ]
                if remaining:
                    new_issue = remaining[int(np.random.randint(len(remaining)))]
                    add_comment({
                        'id': f"manual-{new_issue.id}",
                        'issue_id': new_issue.id,
                        'text': f"Additionally found: {new_issue.description}",
                        'file': new_issue.file,
                        'line': new_issue.line,
                        'severity': new_issue.severity,
                        'author': 'human'
                    })

        state['stage'] = 'reviewing'

        new_comments = len(final_comments) - count_before
        self._log_action(state, ReviewAction(
            timestamp=datetime.now(),
            action_type='write_comments',
            duration_seconds=new_comments * np.random.uniform(20, 40),
            details={
                'num_comments': len(final_comments),
                'new_comments': new_comments
            }
        ))

        return state

    def _should_continue(self, state: ReviewState) -> str:
        """Decide whether to continue reviewing.

        Returns:
            ``"finish"`` once 80% of the treatment's mean time budget is used
            or every program file has reviewed lines, else ``"continue"``.
        """
        # Check time budget
        time_budget = self.time_params[state['treatment']]['mean']
        if state['time_elapsed'] > time_budget * 0.8:
            return "finish"

        # Check if all files reviewed
        reviewed_files = {f for f, _ in state['reviewed_lines']}
        if len(reviewed_files) >= len(state['program']['files']):
            return "finish"

        return "continue"

    def _finalize_review(self, state: ReviewState) -> ReviewState:
        """Finalize the review session."""
        # Calculate confidence (no significant difference between treatments).
        # round() instead of int() keeps the mean near 3.6; truncation would
        # shift every draw downwards.
        raw_confidence = np.random.normal(3.6, 0.8)
        state['confidence'] = max(1, min(5, int(round(raw_confidence))))

        # Add the wrap-up time first so the logged total covers the whole
        # session, including this step.
        wrap_up_seconds = np.random.uniform(30, 60)
        state['time_elapsed'] += wrap_up_seconds

        state['actions'].append(ReviewAction(
            timestamp=datetime.now(),
            action_type='finalize_review',
            duration_seconds=wrap_up_seconds,
            details={
                'total_comments': len(state['final_comments']),
                'confidence': state['confidence'],
                'total_time': state['time_elapsed']
            }
        ))

        state['stage'] = 'completed'

        return state

    def run_review(self, treatment: str, program: Dict[str, Any]) -> ReviewState:
        """Run a complete review simulation.

        Args:
            treatment: Treatment code, one of ``'MCR'``, ``'ACR'``, ``'CCR'``.
            program: Program dict as produced by
                ``ProgramSimulator.create_program``.

        Returns:
            The final workflow state.
        """
        initial_state = ReviewState(
            treatment=treatment,
            program=program,
            initial_comments=[],
            current_file=None,
            reviewed_lines=set(),
            final_comments=[],
            actions=[],
            time_elapsed=0,
            confidence=None,
            stage='pending'
        )

        return self.graph.invoke(
            initial_state,
            config={"recursion_limit": self.recursion_limit}
        )

# Initialize simulator
# Constructing the simulator also builds and compiles the LangGraph workflow.
process_sim = ReviewProcessSimulator()
print("Review Process Simulator initialized with LangGraph!")

## 5. Running Complete Treatment Simulations

In [None]:
# Simulate one review per treatment on the same example program
results = {}

for treatment in ('MCR', 'ACR', 'CCR'):
    print(f"\nRunning {treatment} simulation...")

    # Execute the workflow and keep the final state under its treatment code
    result = results[treatment] = process_sim.run_review(treatment, example_program)

    # Per-run summary
    files_touched = {path for path, _ in result['reviewed_lines']}
    print(f"  Time: {result['time_elapsed']/60:.1f} minutes")
    print(f"  Comments: {len(result['final_comments'])}")
    print(f"  Confidence: {result['confidence']}/5")
    print(f"  Files reviewed: {len(files_touched)}")
    print(f"  Lines reviewed: {len(result['reviewed_lines'])}")

print("\nAll simulations completed!")

## 6. Analyzing Treatment Differences

In [None]:
def analyze_treatment_differences(results: Dict[str, ReviewState]):
    """Summarise and plot how the treatments differ.

    Builds one metrics row per treatment (time, comment count, detection
    rate, coverage, confidence and time split), draws a 2x2 figure from it
    and returns the metrics as a DataFrame.
    """

    rows = []

    for name, run in results.items():
        # Injected issues that made it into the final review vs. all of them
        detected = {c['issue_id'] for c in run['final_comments'] if c['issue_id']}
        injected = {issue.id for issue in run['program']['injected_issues']}

        # Seconds spent per action type
        seconds_per_action = defaultdict(float)
        for step in run['actions']:
            seconds_per_action[step.action_type] += step.duration_seconds

        rows.append({
            'Treatment': name,
            'Total Time (min)': run['time_elapsed'] / 60,
            'Comments': len(run['final_comments']),
            'Injected Issues Found': len(detected),
            'Detection Rate': len(detected) / len(injected) * 100,
            'Lines Reviewed': len(run['reviewed_lines']),
            'Confidence': run['confidence'],
            'Time Reading (%)': seconds_per_action['review_code'] / run['time_elapsed'] * 100,
            'Time Writing (%)': seconds_per_action['write_comments'] / run['time_elapsed'] * 100
        })

    df = pd.DataFrame(rows)

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    (time_ax, rate_ax), (scatter_ax, split_ax) = axes

    # Panel 1: total time, with the paper's averages as reference lines
    df.plot(x='Treatment', y='Total Time (min)', kind='bar', ax=time_ax, color=colors[:3])
    time_ax.set_title('Review Time by Treatment')
    time_ax.set_ylabel('Time (minutes)')
    time_ax.legend().remove()
    for minutes, line_color, label in (
        (42, 'red', 'MCR avg (paper)'),
        (56, 'blue', 'ACR avg (paper)'),
    ):
        time_ax.axhline(y=minutes, color=line_color, linestyle='--', alpha=0.5, label=label)
    time_ax.legend()

    # Panel 2: share of injected issues detected
    df.plot(x='Treatment', y='Detection Rate', kind='bar', ax=rate_ax, color=colors[3:6])
    rate_ax.set_title('Issue Detection Rate')
    rate_ax.set_ylabel('Detection Rate (%)')
    rate_ax.set_ylim(0, 110)
    rate_ax.legend().remove()

    # Panel 3: number of comments against lines reviewed, labelled per treatment
    scatter_ax.scatter(df['Comments'], df['Lines Reviewed'], s=100, c=colors[:3])
    for label, n_comments, n_lines in zip(df['Treatment'], df['Comments'], df['Lines Reviewed']):
        scatter_ax.annotate(label, (n_comments, n_lines),
                            xytext=(5, 5), textcoords='offset points')
    scatter_ax.set_xlabel('Number of Comments')
    scatter_ax.set_ylabel('Lines Reviewed')
    scatter_ax.set_title('Review Verbosity vs Coverage')

    # Panel 4: reading vs writing share of the total time
    split_columns = ['Treatment', 'Time Reading (%)', 'Time Writing (%)']
    time_data = df[split_columns].set_index('Treatment')
    time_data.plot(kind='bar', stacked=True, ax=split_ax, color=['skyblue', 'lightcoral'])
    split_ax.set_title('Time Allocation by Treatment')
    split_ax.set_ylabel('Percentage of Total Time')
    split_ax.legend(title='Activity')

    plt.tight_layout()
    plt.show()

    return df

# Analyze differences
# Shows the comparison figure and keeps the per-treatment metrics table.
analysis_df = analyze_treatment_differences(results)
print("\nTreatment Analysis Summary:")
# Values are rounded to two decimals for display only; analysis_df is unchanged.
print(analysis_df.round(2).to_string(index=False))

## 7. Behavioral Pattern Analysis

In [None]:
def analyze_behavioral_patterns(results: Dict[str, ReviewState]):
    """Analyze reviewer behavior patterns across treatments.

    Draws one column per entry in ``results``: the top row shows the action
    timeline (one horizontal bar per logged action, positioned by cumulative
    simulated time), the bottom row a heatmap of reviewed lines per file in
    buckets of 10 lines.

    Args:
        results: Final workflow state per treatment code. Any number of
            treatments is supported; nothing is drawn for an empty dict.
    """
    from matplotlib.patches import Patch

    if not results:
        return

    n_treatments = len(results)
    # squeeze=False keeps `axes` two-dimensional even for a single treatment.
    fig, axes = plt.subplots(2, n_treatments, figsize=(6 * n_treatments, 10),
                             squeeze=False)

    # Color mapping for actions
    action_colors = {
        'start_review': 'green',
        'generate_initial_review': 'yellow',
        'open_file': 'blue',
        'review_code': 'orange',
        'write_comments': 'red',
        'finalize_review': 'purple'
    }

    for idx, (treatment, state) in enumerate(results.items()):
        # Create timeline data: each action starts where the previous ended
        timeline_data = []
        cumulative_time = 0

        for action in state['actions']:
            timeline_data.append({
                'time': cumulative_time / 60,  # Convert to minutes
                'action': action.action_type,
                'duration': action.duration_seconds / 60
            })
            cumulative_time += action.duration_seconds

        # Plot timeline
        ax = axes[0, idx]

        for y_pos, item in enumerate(timeline_data):
            ax.barh(y_pos, item['duration'], left=item['time'],
                    color=action_colors.get(item['action'], 'gray'),
                    alpha=0.7, height=0.8)

        ax.set_xlabel('Time (minutes)')
        ax.set_ylabel('Action Sequence')
        ax.set_title(f'{treatment} Review Timeline')
        # max(..., 1) avoids a zero-height axis when no action was logged.
        ax.set_ylim(-0.5, max(len(timeline_data), 1) - 0.5)

        # Create legend for first plot
        if idx == 0:
            legend_elements = [
                Patch(facecolor=color, label=action_name.replace('_', ' ').title())
                for action_name, color in action_colors.items()
            ]
            ax.legend(handles=legend_elements, loc='upper left',
                      bbox_to_anchor=(1.05, 1), fontsize=8)

        # Plot coverage heatmap
        ax = axes[1, idx]
        files = state['program']['files']

        # Create coverage matrix: one row per file, one column per 10 lines
        max_line = state['program']['loc']
        coverage_matrix = np.zeros((len(files), max_line // 10 + 1))

        for file, line in state['reviewed_lines']:
            file_idx = files.index(file)
            line_bucket = line // 10
            if line_bucket < coverage_matrix.shape[1]:
                coverage_matrix[file_idx, line_bucket] += 1

        # Plot heatmap
        im = ax.imshow(coverage_matrix, cmap='YlOrRd', aspect='auto')
        ax.set_xlabel('Code Sections (10 lines each)')
        ax.set_ylabel('Files')
        ax.set_title(f'{treatment} Coverage Heatmap')
        ax.set_yticks(range(len(files)))
        ax.set_yticklabels([f.replace('.py', '') for f in files])

        # Add colorbar for last plot
        if idx == n_treatments - 1:
            cbar = plt.colorbar(im, ax=ax)
            cbar.set_label('Review Intensity')

    plt.tight_layout()
    plt.show()

# Analyze behavior patterns
# Draws the timeline and coverage-heatmap figure for the runs stored in `results`.
analyze_behavioral_patterns(results)

## 8. Comment Quality Analysis

In [None]:
def analyze_comment_quality(results: Dict[str, ReviewState]):
    """Analyze the quality characteristics of comments across treatments.

    Every comment in each state's ``final_comments`` is scored with simple
    text heuristics (word count, presence of an action keyword, mention of
    "line", mention of code), and a 2x2 figure is drawn: comment-length
    histogram, severity counts, quality-metric percentages and the share of
    comments that are tied to an injected issue.

    Args:
        results: Mapping of treatment label to review state. Each state must
            expose ``'final_comments'``, a list of dicts with the keys
            ``'text'``, ``'severity'``, ``'author'`` and ``'issue_id'``.
            The histogram and composition panels only plot the labels
            ``'MCR'``, ``'ACR'`` and ``'CCR'``.

    Returns:
        A DataFrame with one row per comment and the columns ``Treatment``,
        ``Severity``, ``Word Count``, ``Actionable``, ``Has Line Ref``,
        ``Has Code Ref``, ``Author`` and ``Is Injected``. When there are no
        comments at all, an empty frame with those columns is returned and
        no figure is drawn.
    """
    # NOTE: these are plain substring checks, so e.g. "prefix" counts as
    # containing "fix" and "inline" as containing "line".
    action_keywords = ['should', 'must', 'need', 'consider', 'fix', 'change', 'update']
    columns = ['Treatment', 'Severity', 'Word Count', 'Actionable',
               'Has Line Ref', 'Has Code Ref', 'Author', 'Is Injected']

    comment_analysis = []

    for treatment, state in results.items():
        for comment in state['final_comments']:
            text = comment['text']
            text_lower = text.lower()

            comment_analysis.append({
                'Treatment': treatment,
                'Severity': comment['severity'],
                'Word Count': len(text.split()),
                # Actionable: the comment tells the author to do something
                'Actionable': any(keyword in text_lower for keyword in action_keywords),
                # Specificity: the comment points at a line or at code
                'Has Line Ref': 'line' in text_lower,
                'Has Code Ref': '`' in text or 'code' in text_lower,
                'Author': comment['author'],
                'Is Injected': comment['issue_id'] is not None
            })

    # Passing the column list keeps the schema stable even with zero rows
    comment_df = pd.DataFrame(comment_analysis, columns=columns)

    if comment_df.empty:
        # Nothing to plot; the column lookups below would be meaningless
        print("No review comments to analyze.")
        return comment_df

    # Visualizations
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Word count distribution
    ax = axes[0, 0]
    for treatment in ['MCR', 'ACR', 'CCR']:
        data = comment_df[comment_df['Treatment'] == treatment]['Word Count']
        ax.hist(data, alpha=0.6, label=treatment, bins=15)
    ax.set_xlabel('Word Count')
    ax.set_ylabel('Frequency')
    ax.set_title('Comment Length Distribution')
    ax.legend()

    # 2. Severity distribution
    ax = axes[0, 1]
    severity_pivot = comment_df.pivot_table(
        index='Treatment',
        columns='Severity',
        values='Word Count',
        aggfunc='count',
        fill_value=0
    )
    # pivot_table sorts its columns alphabetically (high, low, medium), so
    # colours must be attached by severity name rather than by position.
    severity_colors = {'low': 'lightgreen', 'medium': 'orange', 'high': 'red'}
    severity_order = list(severity_colors)

    def severity_rank(label):
        key = str(label).lower()
        # Unknown severities sort after the known ones
        return severity_order.index(key) if key in severity_order else len(severity_order)

    ordered_columns = sorted(severity_pivot.columns, key=severity_rank)
    severity_pivot = severity_pivot[ordered_columns]
    bar_colors = [severity_colors.get(str(label).lower(), 'gray')
                  for label in ordered_columns]
    severity_pivot.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title('Severity Distribution by Treatment')
    ax.set_ylabel('Number of Comments')

    # 3. Quality metrics
    ax = axes[1, 0]
    quality_metrics = comment_df.groupby('Treatment').agg({
        'Actionable': 'mean',
        'Has Line Ref': 'mean',
        'Has Code Ref': 'mean'
    }) * 100  # Convert to percentage

    quality_metrics.plot(kind='bar', ax=ax)
    ax.set_title('Comment Quality Metrics')
    ax.set_ylabel('Percentage (%)')
    ax.legend(title='Metric')

    # 4. Detection accuracy
    ax = axes[1, 1]
    detection_data = []
    for treatment in ['MCR', 'ACR', 'CCR']:
        treatment_comments = comment_df[comment_df['Treatment'] == treatment]
        injected_found = treatment_comments['Is Injected'].sum()
        total_comments = len(treatment_comments)
        # Precision = share of comments that map to an injected issue
        precision = injected_found / total_comments if total_comments > 0 else 0

        detection_data.append({
            'Treatment': treatment,
            'Precision': precision,
            'Total Comments': total_comments,
            'Injected Found': injected_found
        })

    detection_df = pd.DataFrame(detection_data)

    x = range(len(detection_df))
    width = 0.35

    bars1 = ax.bar([i - width/2 for i in x], detection_df['Injected Found'],
                   width, label='Injected Issues', color='darkgreen')
    bars2 = ax.bar([i + width/2 for i in x],
                   detection_df['Total Comments'] - detection_df['Injected Found'],
                   width, label='Other Comments', color='lightgray')

    ax.set_xlabel('Treatment')
    ax.set_ylabel('Number of Comments')
    ax.set_title('Comment Composition')
    ax.set_xticks(x)
    ax.set_xticklabels(detection_df['Treatment'])
    ax.legend()

    # Add precision values on top. The bars sit side by side (not stacked),
    # so the label goes just above the taller of the two.
    for i, (bar1, bar2, precision) in enumerate(zip(bars1, bars2, detection_df['Precision'])):
        height = max(bar1.get_height(), bar2.get_height())
        ax.text(i, height + 0.5, f'{precision:.0%}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return comment_df

# Build the per-comment quality table (the call also renders the figure)
comment_analysis_df = analyze_comment_quality(results)

# Per-treatment roll-up: comment length (mean/std), share of actionable
# comments, and how many comments map to an injected issue out of the total
print("\nComment Quality Summary:")
summary = comment_analysis_df.groupby('Treatment').agg(
    {
        'Word Count': ['mean', 'std'],
        'Actionable': 'mean',
        'Is Injected': ['sum', 'count'],
    }
)
summary = summary.round(2)
print(summary)

## 9. Simulating Multiple Reviewers

In [None]:
def simulate_multiple_reviewers(n_reviewers: int = 10, program_name: str = 'number-conversion'):
    """Simulate multiple reviewers to observe variability.

    Creates the named program once via ``program_sim.create_program``, runs
    ``process_sim.run_review`` ``n_reviewers`` times for each of the MCR, ACR
    and CCR treatments, plots the spread of the resulting metrics in a 2x2
    figure and prints a mean ± std summary per treatment.

    Args:
        n_reviewers: Number of simulated reviews per treatment (at least 1).
        program_name: Name passed to ``program_sim.create_program``.

    Returns:
        Dict mapping each treatment label to a DataFrame with one row per
        simulated reviewer and the columns ``reviewer_id``, ``time_minutes``,
        ``num_comments``, ``issues_found``, ``detection_rate``,
        ``confidence`` and ``lines_reviewed``.

    Raises:
        ValueError: If ``n_reviewers`` is smaller than 1.
    """
    if n_reviewers < 1:
        # Fail early instead of hitting a KeyError on empty frames later
        raise ValueError("n_reviewers must be at least 1")

    treatments = ['MCR', 'ACR', 'CCR']

    # Create program
    program = program_sim.create_program(program_name)

    # Store results for each treatment
    multi_results = {treatment: [] for treatment in treatments}

    print(f"Simulating {n_reviewers} reviewers for each treatment...")

    for treatment in treatments:
        for i in range(n_reviewers):
            # Run review
            result = process_sim.run_review(treatment, program)

            # Distinct injected issues referenced by at least one comment
            found_issue_ids = {c['issue_id'] for c in result['final_comments'] if c['issue_id']}
            n_injected = len(program['injected_issues'])

            metrics = {
                'reviewer_id': i + 1,
                # presumably time_elapsed is in seconds — confirm against run_review
                'time_minutes': result['time_elapsed'] / 60,
                'num_comments': len(result['final_comments']),
                'issues_found': len(found_issue_ids),
                # A program without injected issues has nothing to detect
                'detection_rate': len(found_issue_ids) / n_injected if n_injected else 0.0,
                'confidence': result['confidence'],
                'lines_reviewed': len(result['reviewed_lines'])
            }

            multi_results[treatment].append(metrics)

    # Create DataFrames
    dfs = {treatment: pd.DataFrame(rows) for treatment, rows in multi_results.items()}

    # Visualize variability
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Time variability
    ax = axes[0, 0]
    time_data = [dfs[t]['time_minutes'].values for t in treatments]
    # Tick labels are set separately: the boxplot `labels` keyword is
    # deprecated in recent matplotlib releases.
    bp = ax.boxplot(time_data, patch_artist=True)
    ax.set_xticklabels(treatments)
    for patch, color in zip(bp['boxes'], colors[:3]):
        patch.set_facecolor(color)
    ax.set_ylabel('Time (minutes)')
    ax.set_title('Review Time Variability')

    # Add paper means. Boxes are drawn at x = 1, 2, 3, so the markers are
    # placed in data coordinates, centred on each box.
    paper_means = [42, 56, 57]
    for position, mean in enumerate(paper_means, start=1):
        ax.hlines(y=mean, xmin=position - 0.3, xmax=position + 0.3,
                  colors='red', linestyles='--', linewidth=2)

    # 2. Detection rate variability
    ax = axes[0, 1]
    detection_data = [dfs[t]['detection_rate'].values * 100 for t in treatments]
    bp = ax.boxplot(detection_data, patch_artist=True)
    ax.set_xticklabels(treatments)
    for patch, color in zip(bp['boxes'], colors[3:6]):
        patch.set_facecolor(color)
    ax.set_ylabel('Detection Rate (%)')
    ax.set_title('Issue Detection Variability')
    ax.set_ylim(0, 110)

    # 3. Scatter: Time vs Detection
    ax = axes[1, 0]
    for treatment, color in zip(treatments, colors[:3]):
        df = dfs[treatment]
        ax.scatter(df['time_minutes'], df['detection_rate'] * 100,
                  alpha=0.6, s=100, label=treatment, color=color)
    ax.set_xlabel('Time (minutes)')
    ax.set_ylabel('Detection Rate (%)')
    ax.set_title('Time-Efficiency Trade-off')
    ax.legend()

    # 4. Confidence distribution
    ax = axes[1, 1]
    # Rows: confidence values; columns: treatments; cells: share of reviewers
    confidence_data = pd.DataFrame({
        treatment: dfs[treatment]['confidence'].value_counts(normalize=True).sort_index()
        for treatment in treatments
    }).fillna(0)

    confidence_data.T.plot(kind='bar', ax=ax, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd'])
    ax.set_xlabel('Treatment')
    ax.set_ylabel('Proportion')
    ax.set_title('Confidence Score Distribution')
    ax.legend(title='Confidence', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    # Statistical summary
    print("\nStatistical Summary:")
    for treatment in treatments:
        df = dfs[treatment]
        print(f"\n{treatment}:")
        print(f"  Time: {df['time_minutes'].mean():.1f} ± {df['time_minutes'].std():.1f} min")
        print(f"  Detection: {df['detection_rate'].mean()*100:.1f} ± {df['detection_rate'].std()*100:.1f}%")
        print(f"  Comments: {df['num_comments'].mean():.1f} ± {df['num_comments'].std():.1f}")
        print(f"  Confidence: {df['confidence'].mean():.2f} ± {df['confidence'].std():.2f}")

    return dfs

# Run multi-reviewer simulation
multi_reviewer_results = simulate_multiple_reviewers(n_reviewers=20)

## 10. Key Insights and Implementation Guidelines

In [None]:
# Takeaways from the simulation, grouped by theme. Insertion order is the
# order in which the categories are printed below.
insights = {
    "Treatment Characteristics": [
        "MCR: High variability in coverage, balanced severity detection",
        "ACR: Biased towards low-severity issues, influences reviewer focus",
        "CCR: Perfect recall but doesn't improve reviewer confidence",
        "All treatments show similar confidence levels (~3.6/5)",
    ],
    "Behavioral Differences": [
        "MCR reviewers explore code more systematically",
        "ACR/CCR reviewers focus on suggested locations (anchoring bias)",
        "89% of automated suggestions are kept in final review",
        "Limited exploration beyond automated suggestions in ACR/CCR",
    ],
    "Time and Efficiency": [
        "No time savings with automated assistance (MCR: 42min, ACR: 56min)",
        "Time spent verifying automated comments offsets exploration savings",
        "CCR shows diminishing returns despite perfect issue identification",
        "High inter-reviewer variability in all treatments",
    ],
    "Implementation with LangGraph": [
        "State management captures complete review process",
        "Conditional edges model decision points effectively",
        "Action tracking enables detailed behavioral analysis",
        "Flexible framework for testing new treatments",
    ],
    "Practical Recommendations": [
        "Consider hybrid approaches that combine MCR exploration with ACR assistance",
        "Focus automation on high-severity issue detection",
        "Design interfaces that encourage exploration beyond suggested issues",
        "Track behavioral metrics to identify and mitigate biases",
        "Use A/B testing to evaluate new treatment variations",
    ],
}

# Report header: blank line, rule, title, rule
print(
    "\n" + "="*80,
    "KEY INSIGHTS: Simulating Different Review Treatments",
    "="*80,
    sep="\n",
)

# One bulleted section per category
for category, items in insights.items():
    print(f"\n{category}:")
    for item in items:
        print(f"  • {item}")

# Closing rule followed by the concluding paragraph
print(
    "\n" + "="*80,
    "\nConclusion:",
    "This simulation framework demonstrates how different code review treatments",
    "affect reviewer behavior and outcomes. The key finding that automated",
    "assistance doesn't save time but changes focus patterns has important",
    "implications for tool design and deployment strategies.",
    sep="\n",
)