# Focused Learning: Experience-Aware Loss Functions (ELF) Deep Dive

## Learning Objective
Master the mathematical foundations and implementation details of Experience-Aware Loss Functions (ELF), understanding how reviewer experience can be embedded into neural model training through weighted loss functions.

## Paper Reference
- **Section 3.3**: Experience-Aware Loss Functions (Pages 8-9)
- **Equation (3)**: L_RCG = ω * Σ(-log P(w_t|c,w_<t))
- **Figure 2**: Overview of Experimental Design

## Why ELF is Complex and Important
1. **Novel Integration**: First method to directly incorporate software engineering metrics into neural loss functions
2. **Dynamic Weighting**: Continuous ownership values replace discrete thresholds
3. **Multi-strategy Design**: Four different weighting strategies for different experience types
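4. **Gradient Impact**: The weight scales each sample's gradient magnitude, which shifts the batch-level update direction toward experienced reviewers' samples and affects model convergence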

## 1. Mathematical Foundation of ELF

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from typing import List, Tuple, Dict

# Set style for better visualizations
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

### 1.1 Standard Cross-Entropy Loss vs ELF

Let's first understand how ELF modifies the standard loss function.

In [None]:
class StandardLoss(nn.Module):
    """Unweighted token-level cross-entropy for sequence generation.

    Serves as the baseline that the experience-weighted loss is compared to.
    """

    def __init__(self):
        super().__init__()
        # Keep per-token losses so the reduction is explicit in ``forward``.
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, logits, targets):
        """Return the mean of -log P(w_t|c,w_<t) over every token in the batch.

        Args:
            logits: Float tensor of shape (batch_size, seq_len, vocab_size).
            targets: Integer class indices of shape (batch_size, seq_len).

        Returns:
            Scalar tensor: per-token cross-entropy averaged over sequence
            and batch.
        """
        batch_size, seq_len, vocab_size = logits.shape

        # ``reshape`` instead of ``view``: ``view`` raises on non-contiguous
        # tensors (e.g. the result of a transpose/permute), ``reshape`` copies
        # only when it has to.
        token_losses = self.ce_loss(
            logits.reshape(-1, vocab_size),
            targets.reshape(-1),
        )

        # Average over sequence and batch
        return token_losses.reshape(batch_size, seq_len).mean()

class ExperienceAwareLoss(nn.Module):
    """ELF: cross-entropy scaled per sample by a reviewer-experience weight."""

    def __init__(self):
        super().__init__()
        # Per-token losses are needed so each sample can be weighted separately.
        self.ce_loss = nn.CrossEntropyLoss(reduction='none')

    def forward(self, logits, targets, weights):
        """Return the batch mean of ω_i * mean_t(-log P(w_t|c,w_<t)).

        Args:
            logits: Float tensor of shape (batch_size, seq_len, vocab_size).
            targets: Integer class indices of shape (batch_size, seq_len).
            weights: One experience weight ω per sample; a tensor or sequence
                holding exactly ``batch_size`` values.

        Returns:
            Scalar tensor: each sample's mean token loss multiplied by its
            weight, averaged over the batch.

        Raises:
            ValueError: If the number of weights differs from ``batch_size``.
        """
        batch_size, seq_len, vocab_size = logits.shape

        # ``reshape`` (not ``view``) so non-contiguous inputs are accepted.
        token_losses = self.ce_loss(
            logits.reshape(-1, vocab_size),
            targets.reshape(-1),
        ).reshape(batch_size, seq_len)

        # Match dtype/device of the losses so CPU-built weights also work
        # when the model runs on another device.
        sample_weights = torch.as_tensor(
            weights, dtype=token_losses.dtype, device=token_losses.device
        ).reshape(-1)
        if sample_weights.numel() != batch_size:
            raise ValueError(
                f"expected {batch_size} weights, got {sample_weights.numel()}"
            )

        # Apply experience weights to each sample (vectorized over the batch)
        return (sample_weights * token_losses.mean(dim=1)).mean()

# Demonstrate the difference between the unweighted and the ELF loss on one
# random batch (no seed is set, so the printed numbers vary between runs).
batch_size, seq_len, vocab_size = 4, 10, 100
logits = torch.randn(batch_size, seq_len, vocab_size)
targets = torch.randint(0, vocab_size, (batch_size, seq_len))

# Experience weights: two high-experience samples followed by two
# low-experience samples.
weights = torch.tensor([7.39, 7.39, 2.72, 2.72])  # ≈ e^(1+1.0) vs e^(1+0.0)

standard_loss = StandardLoss()
elf_loss = ExperienceAwareLoss()

loss_standard = standard_loss(logits, targets)
loss_elf = elf_loss(logits, targets, weights)

# The ratio shows how much the weighting inflates the loss value overall.
print(f"Standard Loss: {loss_standard:.4f}")
print(f"ELF Loss: {loss_elf:.4f}")
print(f"Ratio: {loss_elf/loss_standard:.2f}x")

### 1.2 The Four ELF Weighting Strategies

Deep dive into each weighting strategy and their mathematical formulations.

In [None]:
class ELFWeightCalculator:
    """Calculate weights for all four ELF strategies.

    Every strategy maps ownership values to ``e^(1 + ownership)``. The methods
    accept plain floats and, because they only use NumPy element-wise
    operations, NumPy arrays of matching/broadcastable shape as well.
    """

    @staticmethod
    def weight_aco(aco: float) -> float:
        """ω_aco = e^(1+aco) - Authoring experience only"""
        return np.exp(1 + aco)

    @staticmethod
    def weight_rso(rso: float) -> float:
        """ω_rso = e^(1+rso) - Reviewing experience only"""
        return np.exp(1 + rso)

    @staticmethod
    def weight_avg(aco: float, rso: float) -> float:
        """ω_avg = e^(1+(rso+aco)/2) - Average of both experiences"""
        return np.exp(1 + (rso + aco) / 2)

    @staticmethod
    def weight_max(aco: float, rso: float) -> float:
        """ω_max = e^(1+max(rso,aco)) - Maximum experience"""
        # np.maximum instead of builtin max so array inputs work like they do
        # for the other three strategies (builtin max raises on arrays).
        return np.exp(1 + np.maximum(rso, aco))

# Visualize weight functions
ownership_range = np.linspace(0, 0.5, 100)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot 1: ACO weight function
ax1 = axes[0, 0]
weights_aco = [ELFWeightCalculator.weight_aco(x) for x in ownership_range]
ax1.plot(ownership_range, weights_aco, 'b-', linewidth=2)
ax1.set_title('ω_aco = e^(1+aco)', fontsize=14)
ax1.set_xlabel('ACO (Authoring Code Ownership)')
ax1.set_ylabel('Weight')
ax1.grid(True, alpha=0.3)

# Plot 2: RSO weight function
ax2 = axes[0, 1]
weights_rso = [ELFWeightCalculator.weight_rso(x) for x in ownership_range]
ax2.plot(ownership_range, weights_rso, 'g-', linewidth=2)
ax2.set_title('ω_rso = e^(1+rso)', fontsize=14)
ax2.set_xlabel('RSO (Review-Specific Ownership)')
ax2.set_ylabel('Weight')
ax2.grid(True, alpha=0.3)

# Plot 3: 3D surface for AVG strategy
# plt.subplots already created a 2D axes in this slot; remove it first,
# otherwise its frame and ticks stay visible underneath the 3D axes.
axes[1, 0].remove()
ax3 = fig.add_subplot(223, projection='3d')
aco_grid, rso_grid = np.meshgrid(ownership_range, ownership_range)
# weight_avg is pure NumPy arithmetic, so it evaluates the whole grid at once
# (rows follow RSO, columns follow ACO, matching meshgrid's layout).
weights_avg = ELFWeightCalculator.weight_avg(aco_grid, rso_grid)
surf = ax3.plot_surface(aco_grid, rso_grid, weights_avg, cmap='viridis', alpha=0.8)
ax3.set_title('ω_avg = e^(1+(rso+aco)/2)', fontsize=14)
ax3.set_xlabel('ACO')
ax3.set_ylabel('RSO')
ax3.set_zlabel('Weight')

# Plot 4: Comparison of all strategies with ACO held fixed while RSO varies
ax4 = axes[1, 1]
aco_fixed = 0.2
rso_values = ownership_range
ax4.plot(rso_values, [ELFWeightCalculator.weight_aco(aco_fixed) for _ in rso_values],
         'b-', label=f'ACO only (ACO={aco_fixed})', linewidth=2)
ax4.plot(rso_values, [ELFWeightCalculator.weight_rso(r) for r in rso_values],
         'g-', label='RSO only', linewidth=2)
ax4.plot(rso_values, [ELFWeightCalculator.weight_avg(aco_fixed, r) for r in rso_values],
         'r-', label='AVG', linewidth=2)
ax4.plot(rso_values, [ELFWeightCalculator.weight_max(aco_fixed, r) for r in rso_values],
         'm-', label='MAX', linewidth=2)
ax4.set_title('Strategy Comparison (ACO=0.2)', fontsize=14)
ax4.set_xlabel('RSO')
ax4.set_ylabel('Weight')
ax4.legend()
ax4.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Show example weights for different reviewer profiles
print("\nExample Weights for Different Reviewer Profiles:")
profiles = [
    {"name": "Expert Reviewer", "aco": 0.35, "rso": 0.45},
    {"name": "Mid-level Reviewer", "aco": 0.10, "rso": 0.20},
    {"name": "New Reviewer", "aco": 0.02, "rso": 0.05},
]

for profile in profiles:
    aco, rso = profile['aco'], profile['rso']
    print(f"\n{profile['name']} (ACO={aco:.2f}, RSO={rso:.2f}):")
    print(f"  ω_aco = {ELFWeightCalculator.weight_aco(aco):.3f}")
    print(f"  ω_rso = {ELFWeightCalculator.weight_rso(rso):.3f}")
    print(f"  ω_avg = {ELFWeightCalculator.weight_avg(aco, rso):.3f}")
    print(f"  ω_max = {ELFWeightCalculator.weight_max(aco, rso):.3f}")

## 2. Impact on Gradient Updates

Understanding how ELF affects the learning process through gradient modification.

In [None]:
class GradientAnalyzer:
    """Analyze how ELF affects gradient updates on a small MLP classifier."""

    def __init__(self, input_dim: int = 10, hidden_dim: int = 20, output_dim: int = 10):
        # Remember the dimensions so the demo data can match the model.
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def compute_gradients(self, x, y, weight=1.0):
        """Return all parameter gradients of ``weight * CE(model(x), y)``.

        Args:
            x: Input batch fed to the model.
            y: Target class indices for ``x``.
            weight: Scalar multiplier applied to the loss before backprop.

        Returns:
            1-D tensor with every parameter gradient flattened and concatenated
            in ``model.parameters()`` order.
        """
        self.model.zero_grad()
        weighted_loss = weight * F.cross_entropy(self.model(x), y)
        weighted_loss.backward()

        # Extract gradients (cloned so later backward passes cannot alter them)
        return torch.cat([
            param.grad.clone().flatten()
            for param in self.model.parameters()
            if param.grad is not None
        ])

    def _sample_batch(self):
        """Create one random input and a valid target for the configured model."""
        x = torch.randn(1, self.input_dim)
        # Class 5 as in the default setup, clamped for models with fewer classes.
        y = torch.tensor([min(5, self.output_dim - 1)])
        return x, y

    def visualize_gradient_impact(self):
        """Plot gradient magnitude, direction and effective learning rate per weight."""
        # Create sample data sized to the model (previously fixed at 10 inputs
        # and class 5, which broke for non-default dimensions).
        x, y = self._sample_batch()

        # Different reviewer profiles
        profiles = [
            {"name": "Low Exp (ω=2.72)", "weight": 2.72, "color": "blue"},
            {"name": "Mid Exp (ω=4.48)", "weight": 4.48, "color": "green"},
            {"name": "High Exp (ω=7.39)", "weight": 7.39, "color": "red"}
        ]

        plt.figure(figsize=(15, 5))

        # Plot 1: Gradient magnitudes
        plt.subplot(1, 3, 1)
        for profile in profiles:
            grads = self.compute_gradients(x, y, profile["weight"])
            grad_magnitude = torch.norm(grads).item()
            plt.bar(profile["name"], grad_magnitude, color=profile["color"], alpha=0.7)
        plt.title("Gradient Magnitude by Experience Level", fontsize=14)
        plt.ylabel("||∇L||")

        # Plot 2: Gradient direction comparison
        plt.subplot(1, 3, 2)
        base_grads = self.compute_gradients(x, y, 1.0)
        # The baseline direction does not depend on the profile; normalize once.
        base_norm = base_grads / torch.norm(base_grads)
        for profile in profiles:
            grads = self.compute_gradients(x, y, profile["weight"])
            # Normalize for direction comparison
            grads_norm = grads / torch.norm(grads)
            cosine_sim = torch.dot(grads_norm, base_norm).item()
            plt.bar(profile["name"], cosine_sim, color=profile["color"], alpha=0.7)
        plt.title("Gradient Direction Similarity to Baseline", fontsize=14)
        plt.ylabel("Cosine Similarity")
        plt.ylim(0.98, 1.01)

        # Plot 3: Learning rate effect (a loss weight acts like an LR multiplier)
        plt.subplot(1, 3, 3)
        learning_rates = np.logspace(-4, -1, 50)
        for profile in profiles:
            effective_lr = learning_rates * profile["weight"]
            plt.plot(learning_rates, effective_lr, color=profile["color"],
                     label=profile["name"], linewidth=2)
        plt.xscale('log')
        plt.yscale('log')
        plt.xlabel("Base Learning Rate")
        plt.ylabel("Effective Learning Rate")
        plt.title("Effective Learning Rate by Experience", fontsize=14)
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

# Build the analyzer with its default dimensions and render the three panels
analyzer = GradientAnalyzer()
analyzer.visualize_gradient_impact()

# Summarize the closed-form effect of the weight on the gradient
print("\n".join([
    "\nMathematical Impact of ELF on Gradients:",
    "\nFor standard loss: ∇θ L = ∇θ Σ(-log P(w_t|c,w_<t))",
    "For ELF loss: ∇θ L_ELF = ω * ∇θ Σ(-log P(w_t|c,w_<t))",
    "\nThe weight ω directly scales the gradient magnitude:",
    "- High experience (ω=7.39): 7.39x larger gradient updates",
    "- Low experience (ω=2.72): 2.72x larger gradient updates",
    "- Ratio: High/Low = 2.72x more influence for experienced reviewers",
]))

## 3. Implementing ELF in Practice

Complete implementation with batch processing and optimization considerations.

In [None]:
class ELFTrainer:
    """Complete ELF training implementation.

    Wraps a model with an Adam optimizer and applies a per-sample
    experience weight to the cross-entropy loss on every step.
    """

    def __init__(self, model, strategy="aco", granularity="package"):
        """
        Args:
            model: Module whose parameters are optimized.
            strategy: "aco", "rso" or "avg"; any other value selects the
                max-of-both strategy.
            granularity: "repository" or "subsystem"; any other value selects
                package-level metrics.
        """
        self.model = model
        self.strategy = strategy
        self.granularity = granularity
        self.optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
        self.loss_history = []

    def calculate_batch_weights(self, batch_metrics):
        """Calculate ELF weights for a batch of samples.

        Args:
            batch_metrics: Iterable of dicts holding ``aco_<level>`` and
                ``rso_<level>`` entries, where ``<level>`` is ``repo``,
                ``sys`` or ``pkg`` depending on the configured granularity.

        Returns:
            float32 tensor with one weight ``e^(1 + ownership)`` per sample.
        """
        # Extract ownership values based on granularity (package is the fallback)
        suffix = {"repository": "repo", "subsystem": "sys"}.get(self.granularity, "pkg")
        aco_key, rso_key = f"aco_{suffix}", f"rso_{suffix}"

        weights = []
        for metrics in batch_metrics:
            aco, rso = metrics[aco_key], metrics[rso_key]

            # Apply strategy (max is the fallback)
            if self.strategy == "aco":
                ownership = aco
            elif self.strategy == "rso":
                ownership = rso
            elif self.strategy == "avg":
                ownership = (aco + rso) / 2
            else:
                ownership = max(aco, rso)

            weights.append(np.exp(1 + ownership))

        return torch.tensor(weights, dtype=torch.float32)

    def train_step(self, batch_data, batch_targets, batch_metrics):
        """Run a single optimization step with the ELF loss.

        Args:
            batch_data: Model input batch.
            batch_targets: Target class indices, one per sample.
            batch_metrics: Per-sample ownership metrics
                (see ``calculate_batch_weights``).

        Returns:
            Dict with the step's loss and weight statistics; the same dict is
            appended to ``self.loss_history``.
        """
        self.optimizer.zero_grad()

        # Forward pass
        outputs = self.model(batch_data)

        # Calculate ELF loss; weights are built on CPU, so move them to
        # wherever the model produced its losses before multiplying.
        ce_loss = F.cross_entropy(outputs, batch_targets, reduction='none')
        weights = self.calculate_batch_weights(batch_metrics).to(ce_loss.device)
        loss = (weights * ce_loss).mean()

        # Backward pass
        loss.backward()
        self.optimizer.step()

        # Log statistics; the sample std is undefined (NaN) for one element,
        # so report 0.0 for single-sample batches.
        weight_std = weights.std().item() if weights.numel() > 1 else 0.0
        stats = {
            'loss': loss.item(),
            'avg_weight': weights.mean().item(),
            'weight_std': weight_std,
            'max_weight': weights.max().item(),
            'min_weight': weights.min().item()
        }

        self.loss_history.append(stats)
        return stats

# Demonstrate training simulation
# (low, high) bounds of the uniform draw for every mock metric, per experience
# tier. Key order matters: it fixes the order of the random draws.
_MOCK_TIER_RANGES = {
    "high": {
        'aco_repo': (0.2, 0.3),
        'aco_sys': (0.25, 0.35),
        'aco_pkg': (0.3, 0.4),
        'rso_repo': (0.3, 0.4),
        'rso_sys': (0.35, 0.45),
        'rso_pkg': (0.4, 0.5),
    },
    "medium": {
        'aco_repo': (0.05, 0.15),
        'aco_sys': (0.08, 0.18),
        'aco_pkg': (0.1, 0.2),
        'rso_repo': (0.1, 0.2),
        'rso_sys': (0.15, 0.25),
        'rso_pkg': (0.2, 0.3),
    },
    "low": {
        'aco_repo': (0.01, 0.05),
        'aco_sys': (0.02, 0.06),
        'aco_pkg': (0.03, 0.08),
        'rso_repo': (0.02, 0.08),
        'rso_sys': (0.03, 0.1),
        'rso_pkg': (0.05, 0.15),
    },
}


def _mock_batch_metrics(batch_size):
    """Draw mock ownership metrics: first third high, next third medium, rest low."""
    batch_metrics = []
    for i in range(batch_size):
        if i < batch_size // 3:
            tier = "high"
        elif i < 2 * batch_size // 3:
            tier = "medium"
        else:
            tier = "low"
        batch_metrics.append({
            key: np.random.uniform(low, high)
            for key, (low, high) in _MOCK_TIER_RANGES[tier].items()
        })
    return batch_metrics


def simulate_elf_training():
    """Simulate ELF training for all four strategies and plot/print the results.

    Every strategy trains its own copy of the same freshly initialized model
    on identical random batches, so the resulting curves are comparable.
    """
    import copy

    # Simple model for demonstration
    model = nn.Sequential(
        nn.Linear(50, 100),
        nn.ReLU(),
        nn.Linear(100, 10)
    )

    # Create trainers for different strategies. Each gets a deep copy: sharing
    # one model instance would let all four optimizers update the same
    # parameters, mixing the strategies together.
    strategies = ["aco", "rso", "avg", "max"]
    trainers = {s: ELFTrainer(copy.deepcopy(model), strategy=s) for s in strategies}

    # Simulate training
    n_steps = 100
    batch_size = 32

    for _ in range(n_steps):
        # Generate mock batch
        batch_data = torch.randn(batch_size, 50)
        batch_targets = torch.randint(0, 10, (batch_size,))

        # Generate mock metrics (mixed experience levels)
        batch_metrics = _mock_batch_metrics(batch_size)

        # Train each strategy on the same batch
        for trainer in trainers.values():
            trainer.train_step(batch_data, batch_targets, batch_metrics)

    # Visualize training dynamics
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    for idx, (strategy, trainer) in enumerate(trainers.items()):
        ax = axes[idx // 2, idx % 2]

        # Extract history
        losses = [s['loss'] for s in trainer.loss_history]
        avg_weights = np.array([s['avg_weight'] for s in trainer.loss_history])
        weight_stds = np.array([s['weight_std'] for s in trainer.loss_history])

        # Plot loss and weight evolution on twin y-axes
        ax2 = ax.twinx()

        line1 = ax.plot(losses, 'b-', label='Loss', alpha=0.7)
        line2 = ax2.plot(avg_weights, 'r-', label='Avg Weight', alpha=0.7)

        # Add weight std as shaded area
        ax2.fill_between(range(len(avg_weights)),
                         avg_weights - weight_stds,
                         avg_weights + weight_stds,
                         alpha=0.2, color='red')

        ax.set_xlabel('Training Step')
        ax.set_ylabel('Loss', color='b')
        ax2.set_ylabel('Weight', color='r')
        ax.set_title(f'Strategy: {strategy.upper()}', fontsize=14)

        # Combine legends from both axes into one box
        lines = line1 + line2
        labels = [line.get_label() for line in lines]
        ax.legend(lines, labels, loc='upper right')

        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\nTraining Summary (Final 10 steps):")
    for strategy, trainer in trainers.items():
        recent_stats = trainer.loss_history[-10:]
        avg_loss = np.mean([s['loss'] for s in recent_stats])
        avg_weight = np.mean([s['avg_weight'] for s in recent_stats])
        weight_range = np.mean([s['max_weight'] - s['min_weight'] for s in recent_stats])

        print(f"\n{strategy.upper()} Strategy:")
        print(f"  Avg Loss: {avg_loss:.4f}")
        print(f"  Avg Weight: {avg_weight:.3f}")
        print(f"  Weight Range: {weight_range:.3f}")

# Run simulation: trains for all four strategies, shows the figure and prints
# the summary of the final 10 steps.
simulate_elf_training()

## 4. Advanced Considerations and Optimizations

In [None]:
class AdvancedELF:
    """Advanced ELF implementation with optimizations"""

    def __init__(self):
        self.weight_cache = {}  # Cache computed weights
        self.gradient_clipper = nn.utils.clip_grad_norm_

    def adaptive_weight_scaling(self, weights: torch.Tensor, epoch: int, max_epochs: int):
        """Gradually increase weight influence during training.

        Weights are interpolated from 1.0 (no weighting) at epoch 0 to their
        full value once ``epoch`` reaches 30% of ``max_epochs``.

        Args:
            weights: Raw ELF weights.
            epoch: Current training epoch.
            max_epochs: Total number of epochs; non-positive values skip the
                warm-up and return the weights unchanged.

        Returns:
            Tensor of scaled weights with the same shape as ``weights``.
        """
        warmup_epochs = max_epochs * 0.3
        if warmup_epochs <= 0:
            # No warm-up window to interpolate over (avoids dividing by zero).
            scale_factor = 1.0
        else:
            # Start with less aggressive weighting, gradually increase
            scale_factor = min(1.0, epoch / warmup_epochs)

        # Scale weights towards 1.0 (no weighting) initially
        return 1.0 + (weights - 1.0) * scale_factor

    def weight_regularization(self, weights: torch.Tensor, lambda_reg: float = 0.01):
        """Return ``lambda_reg`` times the variance of the weights in the batch.

        The term penalizes spread around the batch mean, i.e. weights that are
        far from the average in either direction.
        """
        return lambda_reg * torch.mean(torch.square(weights - weights.mean()))

    def batch_weight_normalization(self, weights: torch.Tensor):
        """Normalize weights within batch to maintain stable gradients.

        Z-scores the weights and squashes them with tanh into (0.5, 2.0).
        A batch without spread (single element or all-equal weights) maps
        to 1.25 for every element.
        """
        # The sample std of fewer than two values is NaN; treat it as zero
        # spread so a single-sample batch yields 1.25 instead of NaN.
        spread = weights.std() if weights.numel() > 1 else weights.new_zeros(())
        # Z-score normalization then rescale
        normalized = (weights - weights.mean()) / (spread + 1e-8)
        # Rescale to positive range (0.5, 2.0)
        return 1.25 + 0.75 * torch.tanh(normalized)

    def compute_weight_statistics(self, weights: List[float], strategy: str):
        """Compute summary statistics of a weight distribution.

        Args:
            weights: Weight values to summarize.
            strategy: Name of the strategy that produced the weights; accepted
                for labeling by callers but not used in the computation.

        Returns:
            Dict with mean, std, min, max, quartiles (p25/p50/p75), skewness
            and excess kurtosis.
        """
        weights_array = np.array(weights)

        return {
            'mean': np.mean(weights_array),
            'std': np.std(weights_array),
            'min': np.min(weights_array),
            'max': np.max(weights_array),
            'p25': np.percentile(weights_array, 25),
            'p50': np.percentile(weights_array, 50),
            'p75': np.percentile(weights_array, 75),
            'skewness': self._skewness(weights_array),
            'kurtosis': self._kurtosis(weights_array)
        }

    def _standardized_moment(self, x, order):
        """Mean of z-scores raised to ``order``; None when x has zero spread."""
        std = np.std(x)
        if std == 0:
            return None
        return np.mean(((x - np.mean(x)) / std) ** order)

    def _skewness(self, x):
        """Calculate skewness of distribution (0.0 for constant data)."""
        moment = self._standardized_moment(x, 3)
        return 0.0 if moment is None else moment

    def _kurtosis(self, x):
        """Calculate excess kurtosis of distribution (0.0 for constant data)."""
        moment = self._standardized_moment(x, 4)
        return 0.0 if moment is None else moment - 3

# Demonstrate advanced techniques
advanced_elf = AdvancedELF()

# Generate synthetic ownership values for different experience distributions.
# The draws are unseeded, and the normal/gamma components are unbounded, so
# individual values may fall outside the nominal [0, 0.5] ownership range.
n_samples = 1000
distributions = {
    "Uniform": np.random.uniform(0.0, 0.5, n_samples),
    "Bimodal": np.concatenate([np.random.normal(0.05, 0.02, n_samples//2),
                               np.random.normal(0.35, 0.05, n_samples//2)]),
    "Skewed": np.random.gamma(2, 0.05, n_samples),
    # Mixture: 70% gamma, 20% normal, 10% uniform
    "Real-world": np.concatenate([np.random.gamma(2, 0.02, int(n_samples*0.7)),
                                 np.random.normal(0.25, 0.05, int(n_samples*0.2)),
                                 np.random.uniform(0.35, 0.45, int(n_samples*0.1))])
}

# Visualize weight distributions and their effects:
# top row = ownership histograms, bottom row = resulting weight histograms
fig, axes = plt.subplots(2, 4, figsize=(20, 10))

for idx, (dist_name, ownership_values) in enumerate(distributions.items()):
    # Calculate weights for ACO strategy: ω = e^(1 + ownership)
    weights = [np.exp(1 + x) for x in ownership_values]
    
    # Plot ownership distribution
    ax1 = axes[0, idx]
    ax1.hist(ownership_values, bins=30, alpha=0.7, color='blue', edgecolor='black')
    ax1.set_title(f'{dist_name} Ownership Distribution', fontsize=12)
    ax1.set_xlabel('Ownership Value')
    ax1.set_ylabel('Count')
    
    # Plot weight distribution
    ax2 = axes[1, idx]
    ax2.hist(weights, bins=30, alpha=0.7, color='red', edgecolor='black')
    ax2.set_title(f'Resulting Weight Distribution', fontsize=12)
    ax2.set_xlabel('Weight (ω)')
    ax2.set_ylabel('Count')
    
    # Add statistics (mean, std, skewness) as a text box in axes coordinates
    stats = advanced_elf.compute_weight_statistics(weights, "aco")
    stats_text = f"μ={stats['mean']:.2f}\nσ={stats['std']:.2f}\nskew={stats['skewness']:.2f}"
    ax2.text(0.7, 0.9, stats_text, transform=ax2.transAxes, 
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5),
             verticalalignment='top')

plt.tight_layout()
plt.show()

# Demonstrate weight normalization techniques
print("\nWeight Normalization Examples:")
test_weights = torch.tensor([2.72, 2.72, 4.48, 7.39, 7.39, 12.18])  # Mix of experience levels
print(f"Original weights: {test_weights.numpy()}")
print(f"Batch normalized: {advanced_elf.batch_weight_normalization(test_weights).numpy()}")

# Show adaptive scaling over epochs: with max_epochs=100 the scale factor
# reaches 1.0 at about epoch 30, so later epochs print the unscaled weights
print("\nAdaptive Weight Scaling:")
epochs = [0, 10, 20, 30, 50, 100]
for epoch in epochs:
    scaled = advanced_elf.adaptive_weight_scaling(test_weights, epoch, 100)
    print(f"Epoch {epoch:3d}: {scaled.numpy()}")

## 5. Practical Implementation Challenges and Solutions

In [None]:
class ELFChallenges:
    """Common challenges and solutions when implementing ELF.

    Each ``handle_*`` method prints a short description of the challenge and
    returns an example helper (function or class) that addresses it.
    """

    @staticmethod
    def handle_missing_metrics(calculate_weight=None):
        """Handle cases where reviewer metrics are missing.

        Args:
            calculate_weight: Optional callable mapping a metrics object to a
                weight. When omitted, the weight is derived with the AVG
                strategy from the metrics' ``'aco'`` and ``'rso'`` entries.

        Returns:
            ``get_weight_safe(metrics, default_aco=0.05, default_rso=0.10)``,
            which falls back to the AVG weight of the defaults when
            ``metrics`` is None.
        """
        print("Challenge 1: Missing Reviewer Metrics")
        print("Solutions:")
        print("1. Use default weight of 1.0 (no weighting)")
        print("2. Impute based on repository average")
        print("3. Use minimum observed weight")
        print("4. Skip sample in training\n")

        # Example implementation
        def get_weight_safe(metrics, default_aco=0.05, default_rso=0.10):
            if metrics is None:
                # Use conservative defaults
                return np.exp(1 + (default_aco + default_rso) / 2)
            if calculate_weight is not None:
                return calculate_weight(metrics)
            # Previously this called an undefined ``calculate_weight`` and
            # raised NameError. Assumes ``metrics`` is a mapping with 'aco' /
            # 'rso' keys - TODO confirm against your data schema, or pass
            # ``calculate_weight`` explicitly. Missing keys use the defaults.
            aco = metrics.get('aco', default_aco)
            rso = metrics.get('rso', default_rso)
            return np.exp(1 + (aco + rso) / 2)

        return get_weight_safe

    @staticmethod
    def handle_extreme_weights():
        """Handle extremely high or low weights.

        Returns:
            ``clip_weights(weights, min_w=0.5, max_w=10.0)``, which clamps a
            tensor of weights into ``[min_w, max_w]``.
        """
        print("Challenge 2: Extreme Weight Values")
        print("Solutions:")
        print("1. Clip weights to reasonable range [0.5, 10.0]")
        print("2. Use log-scale transformation")
        print("3. Apply weight decay regularization\n")

        def clip_weights(weights, min_w=0.5, max_w=10.0):
            return torch.clamp(weights, min=min_w, max=max_w)

        return clip_weights

    @staticmethod
    def handle_imbalanced_experience():
        """Handle datasets with imbalanced experience distributions.

        Returns:
            The ``StratifiedBatchSampler`` class, a skeleton whose grouping
            logic is intentionally left unimplemented.
        """
        print("Challenge 3: Imbalanced Experience Distribution")
        print("Solutions:")
        print("1. Stratified sampling by experience level")
        print("2. Experience-aware batch construction")
        print("3. Focal loss adaptation\n")

        class StratifiedBatchSampler:
            def __init__(self, dataset, batch_size):
                self.dataset = dataset
                self.batch_size = batch_size
                self._stratify_by_experience()

            def _stratify_by_experience(self):
                # Group samples by experience level
                self.high_exp = []
                self.mid_exp = []
                self.low_exp = []

                # Stratified sampling logic here (placeholder: the three
                # groups stay empty until this is implemented)
                pass

        return StratifiedBatchSampler

    @staticmethod
    def visualize_challenges():
        """Visualize common challenges using illustrative, synthetic numbers."""
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        # Challenge 1: Missing metrics distribution
        ax1 = axes[0]
        missing_rates = [0.05, 0.15, 0.10, 0.08, 0.12]  # Per repository
        ax1.bar(range(len(missing_rates)), missing_rates, color='coral')
        ax1.set_title("Missing Metrics by Repository")
        ax1.set_xlabel("Repository ID")
        ax1.set_ylabel("Missing Rate")
        ax1.axhline(y=0.1, color='red', linestyle='--', label='10% threshold')
        ax1.legend()

        # Challenge 2: Weight distribution with outliers
        ax2 = axes[1]
        normal_weights = np.random.lognormal(1.5, 0.5, 950)
        outlier_weights = np.random.uniform(15, 25, 50)
        all_weights = np.concatenate([normal_weights, outlier_weights])
        ax2.hist(all_weights, bins=50, color='skyblue', edgecolor='black')
        ax2.axvline(x=10, color='red', linestyle='--', label='Clipping threshold')
        ax2.set_title("Weight Distribution with Outliers")
        ax2.set_xlabel("Weight Value")
        ax2.set_ylabel("Count")
        ax2.set_xlim(0, 30)
        ax2.legend()

        # Challenge 3: Experience imbalance
        ax3 = axes[2]
        exp_distribution = [700, 200, 100]  # Low, Mid, High
        ax3.pie(exp_distribution, labels=['Low Exp', 'Mid Exp', 'High Exp'],
                autopct='%1.1f%%', colors=['lightblue', 'lightgreen', 'lightcoral'])
        ax3.set_title("Experience Distribution Imbalance")

        plt.tight_layout()
        plt.show()

# Walk through the challenges and their suggested solutions
challenges = ELFChallenges()

# Print the description of each challenge in turn
print("=== ELF Implementation Challenges ===")
print()
for describe_challenge in (
    challenges.handle_missing_metrics,
    challenges.handle_extreme_weights,
    challenges.handle_imbalanced_experience,
):
    describe_challenge()

# Render the three illustrative charts
challenges.visualize_challenges()

# Numbered list of best practices
print("\n=== ELF Implementation Best Practices ===")
for rank, practice in enumerate(
    (
        "Always validate ownership metrics before training",
        "Monitor weight distribution during training",
        "Use gradient clipping with high weights",
        "Consider curriculum learning: start with uniform weights",
        "Log weight statistics for debugging",
        "Implement checkpointing for model recovery",
        "Use mixed precision training for efficiency",
    ),
    start=1,
):
    print(f"{rank}. {practice}")

## 6. Summary and Key Takeaways

### Core Concepts Mastered
1. **ELF Formula**: L_RCG = ω * Σ(-log P(w_t|c,w_<t))
2. **Four Strategies**: ACO, RSO, AVG, MAX - each capturing different aspects of experience
3. **Exponential Weighting**: e^(1+ownership) creates strong differentiation
4. **Gradient Scaling**: Weights directly scale gradient magnitudes

### Implementation Insights
1. **Cache weights** for efficiency in large-scale training
2. **Normalize weights** within batches to prevent instability
3. **Clip extreme values** to maintain training stability
4. **Monitor weight distribution** throughout training

### Research Extensions
1. **Adaptive strategies**: Dynamically select strategy based on data
2. **Multi-granularity fusion**: Combine multiple granularity levels
3. **Temporal dynamics**: Consider how ownership changes over time
4. **Cross-project transfer**: Study how well weights learned on one project transfer to another

In [None]:
# Final implementation template for researchers
class ELFResearchTemplate:
    """Skeleton to fill in when applying ELF to your own research setup."""

    def __init__(self, config):
        # Option name -> value used when the config does not provide it.
        fallbacks = (
            ('strategy', 'aco'),
            ('granularity', 'package'),
            ('clip_range', (0.5, 10.0)),
            ('normalize_batch', False),
            ('adaptive_scaling', True),
        )
        for option, fallback in fallbacks:
            setattr(self, option, config.get(option, fallback))

    def compute_elf_loss(self, model_output, targets, reviewer_metrics, epoch=0, max_epochs=100):
        """
        Placeholder for the full ELF loss computation with all optimizations.

        Args:
            model_output: Model predictions [batch_size, seq_len, vocab_size]
            targets: Ground truth [batch_size, seq_len]
            reviewer_metrics: List of ReviewerMetrics objects
            epoch: Current training epoch
            max_epochs: Total training epochs

        Returns:
            Intended contract once implemented:
            loss: Weighted loss scalar
            metrics: Dictionary of training metrics
            The unimplemented template returns None.
        """
        # Your implementation here
        return None

    def analyze_results(self, generated_comments, ground_truth, reviewer_metrics):
        """Placeholder for analyzing ELF model outputs; returns None for now."""
        # Your analysis here
        return None

# Closing banner followed by the suggested follow-up work
print("\n".join((
    "ELF Deep Dive Complete!",
    "\nNext Steps:",
    "1. Implement ELF with your own code review dataset",
    "2. Experiment with different strategies and granularities",
    "3. Analyze how weights affect generated comment quality",
    "4. Consider hybrid approaches combining multiple strategies",
)))