# CodeRAG Focused Learning 3: Bigraph Mapping and Code Anchor Selection

**Mục tiêu**: Hiểu sâu về Bigraph Mapping mechanism và Code Anchor selection strategy

**Paper Reference**: Section 3.3 - Bigraph Mapping

---

## 🎯 Khái niệm cốt lõi

### Từ Paper (Section 3.3):
> *"After acquiring the requirement graph and DS-code graph, we map the selected sub-requirement nodes and semantically similar requirement nodes of the target requirement into code nodes in DS-code graph."*

> *"The code nodes of sub-requirement are typically invoked by the target code. The code nodes of semantically similar requirements usually have similar functionalities to the target code."*

### Đặc điểm phức tạp:
1. **Bigraph (two-graph) Mapping**: Map requirement nodes của requirement graph sang code nodes tương ứng trong DS-code graph
2. **Multi-type Code Anchors**: Sub-requirement codes, similar codes, local codes
3. **Intelligent Selection**: Không phải tất cả mapped codes đều hữu ích
4. **Context-aware Filtering**: Filter based on target requirement context
5. **Confidence Scoring**: Đánh giá relevance của each anchor

---

## 🔧 Environment Setup

In [None]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional, Set, Any
from dataclasses import dataclass, field
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import itertools

# For embeddings and similarity
from langchain.embeddings import OpenAIEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# For LLM-based analysis
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# Set environment
# NOTE(review): this assignment unconditionally replaces any OPENAI_API_KEY
# already present in the process environment with the placeholder string
# below; substitute a real key here before running cells that call the API.
os.environ['OPENAI_API_KEY'] = 'your-openai-api-key'

# Plot styling used by the visualisations in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("Set1")

## 📚 Lý thuyết sâu: Bigraph Mapping Strategy

### From Paper Section 3.3:
> *"CodeRAG introduces code nodes of the file where the target code locals in because the local file contents are usually related to the target code."*

### Advanced Mapping Features:
1. **Multi-level Mapping**: Direct mapping + contextual expansion
2. **Relevance Scoring**: Đánh giá mức độ liên quan của mapped codes
3. **Anchor Classification**: Phân loại anchors theo importance
4. **Dynamic Filtering**: Lọc anchors based on target context

In [None]:
@dataclass
class CodeAnchor:
    """A code node selected as an anchor for a target requirement.

    Instances are created by the mapping strategies of
    ``AdvancedBigraphMapper`` and later scored by ``_score_anchors``.
    Only ``code_node_id`` and ``anchor_type`` are required; every other
    field has a neutral default.
    """
    # Identifier of the node in the DS-code graph.
    code_node_id: str
    # One of: 'sub_requirement', 'similar_requirement', 'local_file', 'context_expanded'
    anchor_type: str
    # Requirement the anchor was derived from (None when not recorded).
    source_requirement_id: Optional[str] = None
    # Final ranking score written by _score_anchors (capped at 1.0).
    relevance_score: float = 0.0
    # Copy of mapping_confidence written by _score_anchors.
    confidence_score: float = 0.0

    # Relationship metadata
    # How the anchor was found: 'direct', 'semantic', 'contextual' or 'file_locality'.
    mapping_method: str = "direct"
    # Confidence assigned by the mapping strategy that produced the anchor.
    mapping_confidence: float = 0.0

    # Usage context
    # NOTE(review): neither field below is written by any code visible in
    # this part of the file; they keep their defaults here.
    usage_frequency: int = 0
    co_occurrence_score: float = 0.0

    # Quality indicators
    # True for anchors found through an exact id match in _direct_name_mapping.
    is_primary_anchor: bool = False
    # Set to 1 for anchors produced by contextual (one-hop) expansion, else 0.
    dependency_depth: int = 0
    
@dataclass
class MappingResult:
    """Outcome of mapping one target requirement to code anchors.

    Filled in by ``AdvancedBigraphMapper.map_requirement_to_anchors``.
    """
    # Requirement the mapping was computed for.
    target_requirement_id: str
    # Anchors that survived scoring and selection.
    anchors: List[CodeAnchor] = field(default_factory=list)
    # Counts and averages produced by _compute_mapping_statistics.
    mapping_statistics: Dict[str, Any] = field(default_factory=dict)
    # 'coverage', 'diversity', 'confidence' and 'relevance' from _compute_quality_metrics.
    quality_metrics: Dict[str, float] = field(default_factory=dict)
    
class AdvancedBigraphMapper:
    """Advanced Bigraph Mapper với intelligent anchor selection"""
    
    def __init__(self, requirement_graph, ds_code_graph, llm_model="gpt-3.5-turbo"):
        """Bind the mapper to its two graphs and prepare strategies, weights and cache.

        Args:
            requirement_graph: Requirement graph; the mapper reads its
                ``graph`` and ``nodes`` attributes.
            ds_code_graph: DS-code graph; the mapper reads its ``nodes``
                attribute and calls ``get_one_hop_neighbors`` on it.
            llm_model: Model name passed to ``ChatOpenAI``.
        """
        self.req_graph, self.code_graph = requirement_graph, ds_code_graph
        self.llm = ChatOpenAI(model=llm_model, temperature=0)

        # Strategies are tried in this order for every requirement.
        self.mapping_strategies = dict(
            direct_name_match=self._direct_name_mapping,
            semantic_similarity=self._semantic_similarity_mapping,
            contextual_expansion=self._contextual_expansion_mapping,
            file_locality=self._file_locality_mapping,
        )

        # Multiplier applied to an anchor's mapping confidence, per anchor type.
        self.anchor_weights = dict(
            sub_requirement=0.9,
            similar_requirement=0.7,
            local_file=0.6,
            context_expanded=0.4,
        )

        # Finished mapping results, keyed by "<requirement>_<max_anchors>_<min_relevance>".
        self.mapping_cache = {}
        
    def map_requirement_to_anchors(self, target_requirement_id: str, 
                                  max_anchors: int = 10,
                                  min_relevance: float = 0.3) -> MappingResult:
        """Complete mapping from requirement to code anchors"""
        
        # Check cache
        cache_key = f"{target_requirement_id}_{max_anchors}_{min_relevance}"
        if cache_key in self.mapping_cache:
            return self.mapping_cache[cache_key]
        
        print(f"Mapping requirement: {target_requirement_id}")
        
        result = MappingResult(target_requirement_id=target_requirement_id)
        
        # Step 1: Find related requirements
        related_reqs = self._find_related_requirements(target_requirement_id)
        print(f"Found {len(related_reqs['sub_requirements'])} sub-requirements, "
              f"{len(related_reqs['similar_requirements'])} similar requirements")
        
        # Step 2: Map requirements to code nodes using multiple strategies
        all_anchors = []
        
        # Map sub-requirements
        for req_id in related_reqs['sub_requirements']:
            anchors = self._map_single_requirement(req_id, 'sub_requirement')
            all_anchors.extend(anchors)
        
        # Map similar requirements
        for req_id in related_reqs['similar_requirements']:
            anchors = self._map_single_requirement(req_id, 'similar_requirement')
            all_anchors.extend(anchors)
        
        # Add local file anchors
        local_anchors = self._get_local_file_anchors(target_requirement_id)
        all_anchors.extend(local_anchors)
        
        # Step 3: Score and rank anchors
        scored_anchors = self._score_anchors(all_anchors, target_requirement_id)
        
        # Step 4: Filter and select top anchors
        selected_anchors = self._select_top_anchors(
            scored_anchors, max_anchors, min_relevance
        )
        
        result.anchors = selected_anchors
        result.mapping_statistics = self._compute_mapping_statistics(selected_anchors)
        result.quality_metrics = self._compute_quality_metrics(result)
        
        # Cache result
        self.mapping_cache[cache_key] = result
        
        print(f"Selected {len(selected_anchors)} code anchors")
        return result
    
    def _find_related_requirements(self, target_requirement_id: str) -> Dict[str, List[str]]:
        """Find sub and similar requirements"""
        related = {
            'sub_requirements': [],
            'similar_requirements': []
        }
        
        if target_requirement_id not in self.req_graph.graph:
            return related
        
        # Find sub-requirements (children)
        for successor in self.req_graph.graph.successors(target_requirement_id):
            edge_data = self.req_graph.graph.get_edge_data(target_requirement_id, successor)
            if edge_data and edge_data.get('relation_type') == 'parent-child':
                related['sub_requirements'].append(successor)
        
        # Find similar requirements
        for successor in self.req_graph.graph.successors(target_requirement_id):
            edge_data = self.req_graph.graph.get_edge_data(target_requirement_id, successor)
            if edge_data and edge_data.get('relation_type') == 'similarity':
                related['similar_requirements'].append(successor)
        
        # Also check incoming edges for bidirectional relationships
        for predecessor in self.req_graph.graph.predecessors(target_requirement_id):
            edge_data = self.req_graph.graph.get_edge_data(predecessor, target_requirement_id)
            if edge_data and edge_data.get('relation_type') == 'similarity':
                if predecessor not in related['similar_requirements']:
                    related['similar_requirements'].append(predecessor)
        
        return related
    
    def _map_single_requirement(self, req_id: str, anchor_type: str) -> List[CodeAnchor]:
        """Map single requirement to code nodes using multiple strategies"""
        anchors = []
        
        if req_id not in self.req_graph.nodes:
            return anchors
        
        req_node = self.req_graph.nodes[req_id]
        
        # Try each mapping strategy
        for strategy_name, strategy_func in self.mapping_strategies.items():
            try:
                strategy_anchors = strategy_func(req_node, anchor_type)
                anchors.extend(strategy_anchors)
            except Exception as e:
                print(f"Strategy {strategy_name} failed for {req_id}: {e}")
        
        # Remove duplicates
        unique_anchors = self._deduplicate_anchors(anchors)
        
        return unique_anchors
    
    def _direct_name_mapping(self, req_node, anchor_type: str) -> List[CodeAnchor]:
        """Direct mapping based on function/class names"""
        anchors = []
        
        # Try exact name match
        target_code_id = f"function:{req_node.file_path}:{req_node.function_name}"
        
        if target_code_id in self.code_graph.nodes:
            anchor = CodeAnchor(
                code_node_id=target_code_id,
                anchor_type=anchor_type,
                source_requirement_id=req_node.id,
                mapping_method="direct",
                mapping_confidence=0.95,
                is_primary_anchor=True
            )
            anchors.append(anchor)
        
        # Try pattern matching for similar names
        for code_id, code_node in self.code_graph.nodes.items():
            if (code_node.node_type in ['Function', 'Method'] and
                code_node.file_path == req_node.file_path and
                self._is_name_similar(req_node.function_name, code_node.name)):
                
                confidence = self._compute_name_similarity(req_node.function_name, code_node.name)
                
                anchor = CodeAnchor(
                    code_node_id=code_id,
                    anchor_type=anchor_type,
                    source_requirement_id=req_node.id,
                    mapping_method="direct",
                    mapping_confidence=confidence
                )
                anchors.append(anchor)
        
        return anchors
    
    def _semantic_similarity_mapping(self, req_node, anchor_type: str) -> List[CodeAnchor]:
        """Mapping based on semantic similarity"""
        anchors = []
        
        # Find semantically similar code nodes
        for code_id, code_node in self.code_graph.nodes.items():
            if code_node.node_type in ['Function', 'Method']:
                similarity = self._compute_semantic_similarity(req_node, code_node)
                
                if similarity >= 0.6:  # Threshold for semantic similarity
                    anchor = CodeAnchor(
                        code_node_id=code_id,
                        anchor_type=anchor_type,
                        source_requirement_id=req_node.id,
                        mapping_method="semantic",
                        mapping_confidence=similarity
                    )
                    anchors.append(anchor)
        
        return anchors
    
    def _contextual_expansion_mapping(self, req_node, anchor_type: str) -> List[CodeAnchor]:
        """Expand mapping based on code context and dependencies"""
        anchors = []
        
        # Find directly mapped nodes first
        direct_anchors = self._direct_name_mapping(req_node, anchor_type)
        
        # Expand to related nodes
        for direct_anchor in direct_anchors:
            related_nodes = self.code_graph.get_one_hop_neighbors(
                direct_anchor.code_node_id, 
                edge_types=['call', 'similarity']
            )
            
            for related_node_id, edge_data in related_nodes:
                confidence = edge_data.get('confidence', 0.5) * 0.7  # Reduced confidence for expansion
                
                anchor = CodeAnchor(
                    code_node_id=related_node_id,
                    anchor_type='context_expanded',
                    source_requirement_id=req_node.id,
                    mapping_method="contextual",
                    mapping_confidence=confidence,
                    dependency_depth=1
                )
                anchors.append(anchor)
        
        return anchors
    
    def _file_locality_mapping(self, req_node, anchor_type: str) -> List[CodeAnchor]:
        """Map based on file locality (as mentioned in paper)"""
        anchors = []
        
        # Find all code nodes in the same file
        for code_id, code_node in self.code_graph.nodes.items():
            if (code_node.file_path == req_node.file_path and 
                code_node.node_type in ['Function', 'Method', 'Class']):
                
                # Calculate file locality score
                locality_score = self._compute_file_locality_score(req_node, code_node)
                
                if locality_score >= 0.3:
                    anchor = CodeAnchor(
                        code_node_id=code_id,
                        anchor_type='local_file',
                        source_requirement_id=req_node.id,
                        mapping_method="file_locality",
                        mapping_confidence=locality_score
                    )
                    anchors.append(anchor)
        
        return anchors
    
    def _get_local_file_anchors(self, target_requirement_id: str) -> List[CodeAnchor]:
        """Get local file anchors as mentioned in paper"""
        anchors = []
        
        if target_requirement_id not in self.req_graph.nodes:
            return anchors
        
        target_req = self.req_graph.nodes[target_requirement_id]
        target_file = target_req.file_path
        
        # Add all relevant nodes from the target file
        for code_id, code_node in self.code_graph.nodes.items():
            if (code_node.file_path == target_file and 
                code_node.node_type in ['Function', 'Method'] and
                code_node.name != target_req.function_name):  # Exclude target itself
                
                anchor = CodeAnchor(
                    code_node_id=code_id,
                    anchor_type='local_file',
                    mapping_method="file_locality",
                    mapping_confidence=0.6  # Default confidence for local files
                )
                anchors.append(anchor)
        
        return anchors
    
    def _score_anchors(self, anchors: List[CodeAnchor], target_requirement_id: str) -> List[CodeAnchor]:
        """Score anchors based on multiple factors"""
        
        for anchor in anchors:
            # Base score from mapping confidence
            base_score = anchor.mapping_confidence * self.anchor_weights.get(anchor.anchor_type, 0.5)
            
            # Adjust based on anchor type priority
            type_bonus = {
                'sub_requirement': 0.2,
                'similar_requirement': 0.1,
                'local_file': 0.05,
                'context_expanded': 0.0
            }.get(anchor.anchor_type, 0.0)
            
            # Primary anchor bonus
            primary_bonus = 0.1 if anchor.is_primary_anchor else 0.0
            
            # Mapping method quality
            method_bonus = {
                'direct': 0.15,
                'semantic': 0.1,
                'contextual': 0.05,
                'file_locality': 0.02
            }.get(anchor.mapping_method, 0.0)
            
            # Compute final scores
            anchor.relevance_score = min(1.0, base_score + type_bonus + primary_bonus + method_bonus)
            anchor.confidence_score = anchor.mapping_confidence
        
        return anchors
    
    def _select_top_anchors(self, anchors: List[CodeAnchor], 
                          max_anchors: int, min_relevance: float) -> List[CodeAnchor]:
        """Select top anchors based on relevance and diversity"""
        
        # Filter by minimum relevance
        filtered = [a for a in anchors if a.relevance_score >= min_relevance]
        
        # Sort by relevance score (descending)
        sorted_anchors = sorted(filtered, key=lambda x: x.relevance_score, reverse=True)
        
        # Ensure diversity in anchor types
        selected = []
        type_counts = defaultdict(int)
        max_per_type = max(2, max_anchors // 3)  # At least 2 per type, or 1/3 of total
        
        for anchor in sorted_anchors:
            if (len(selected) < max_anchors and 
                type_counts[anchor.anchor_type] < max_per_type):
                selected.append(anchor)
                type_counts[anchor.anchor_type] += 1
        
        # Fill remaining slots with best available
        remaining_slots = max_anchors - len(selected)
        remaining_anchors = [a for a in sorted_anchors if a not in selected]
        
        selected.extend(remaining_anchors[:remaining_slots])
        
        return selected
    
    # Helper methods
    def _is_name_similar(self, name1: str, name2: str) -> bool:
        """Check if two names are similar"""
        # Simple similarity check
        return (name1.lower() in name2.lower() or 
                name2.lower() in name1.lower() or
                abs(len(name1) - len(name2)) <= 2)
    
    def _compute_name_similarity(self, name1: str, name2: str) -> float:
        """Compute similarity between two names"""
        from difflib import SequenceMatcher
        return SequenceMatcher(None, name1.lower(), name2.lower()).ratio()
    
    def _compute_semantic_similarity(self, req_node, code_node) -> float:
        """Compute semantic similarity between requirement and code"""
        # Use embeddings if available, otherwise fall back to simple text similarity
        try:
            if hasattr(req_node, 'embedding') and hasattr(code_node, 'embedding'):
                if req_node.embedding is not None and code_node.embedding is not None:
                    return cosine_similarity(
                        req_node.embedding.reshape(1, -1),
                        code_node.embedding.reshape(1, -1)
                    )[0][0]
        except:
            pass
        
        # Fallback to simple text similarity
        from difflib import SequenceMatcher
        return SequenceMatcher(None, req_node.description.lower(), 
                              code_node.source_code.lower()).ratio()
    
    def _compute_file_locality_score(self, req_node, code_node) -> float:
        """Compute file locality score"""
        # Same file gets base score
        base_score = 0.5
        
        # Bonus for same naming patterns
        if req_node.function_name and code_node.name:
            name_similarity = self._compute_name_similarity(req_node.function_name, code_node.name)
            base_score += name_similarity * 0.3
        
        # Bonus for proximity in file (line numbers)
        if (hasattr(req_node, 'line_start') and hasattr(code_node, 'line_start') and
            req_node.line_start and code_node.line_start):
            line_distance = abs(req_node.line_start - code_node.line_start)
            proximity_bonus = max(0, 0.2 - line_distance / 1000)  # Closer = higher score
            base_score += proximity_bonus
        
        return min(1.0, base_score)
    
    def _deduplicate_anchors(self, anchors: List[CodeAnchor]) -> List[CodeAnchor]:
        """Remove duplicate anchors, keeping the best one"""
        anchor_map = {}
        
        for anchor in anchors:
            key = anchor.code_node_id
            
            if key not in anchor_map or anchor.mapping_confidence > anchor_map[key].mapping_confidence:
                anchor_map[key] = anchor
        
        return list(anchor_map.values())
    
    def _compute_mapping_statistics(self, anchors: List[CodeAnchor]) -> Dict[str, Any]:
        """Compute mapping statistics"""
        stats = {
            'total_anchors': len(anchors),
            'by_type': defaultdict(int),
            'by_method': defaultdict(int),
            'avg_relevance': 0.0,
            'avg_confidence': 0.0,
            'primary_anchors': 0
        }
        
        if not anchors:
            return stats
        
        relevance_scores = []
        confidence_scores = []
        
        for anchor in anchors:
            stats['by_type'][anchor.anchor_type] += 1
            stats['by_method'][anchor.mapping_method] += 1
            relevance_scores.append(anchor.relevance_score)
            confidence_scores.append(anchor.confidence_score)
            
            if anchor.is_primary_anchor:
                stats['primary_anchors'] += 1
        
        stats['avg_relevance'] = np.mean(relevance_scores)
        stats['avg_confidence'] = np.mean(confidence_scores)
        
        return dict(stats)
    
    def _compute_quality_metrics(self, result: MappingResult) -> Dict[str, float]:
        """Compute quality metrics for mapping result"""
        metrics = {
            'coverage': 0.0,  # How well different types are covered
            'diversity': 0.0,  # Diversity of anchor types
            'confidence': 0.0,  # Overall confidence
            'relevance': 0.0  # Overall relevance
        }
        
        if not result.anchors:
            return metrics
        
        # Confidence and relevance
        metrics['confidence'] = np.mean([a.confidence_score for a in result.anchors])
        metrics['relevance'] = np.mean([a.relevance_score for a in result.anchors])
        
        # Diversity (Shannon entropy of anchor types)
        type_counts = Counter(a.anchor_type for a in result.anchors)
        total = len(result.anchors)
        entropy = -sum((count/total) * np.log2(count/total) for count in type_counts.values())
        max_entropy = np.log2(len(type_counts)) if type_counts else 1
        metrics['diversity'] = entropy / max_entropy if max_entropy > 0 else 0
        
        # Coverage (how many different types are represented)
        expected_types = {'sub_requirement', 'similar_requirement', 'local_file'}
        actual_types = set(a.anchor_type for a in result.anchors)
        metrics['coverage'] = len(actual_types & expected_types) / len(expected_types)
        
        return metrics

# Confirmation printed once the definitions in this cell have been executed
print("Advanced Bigraph Mapper components defined successfully!")

## 🧠 Deep Dive: Intelligent Anchor Selection

### Key Innovation từ paper:
> *"CodeRAG can map sub-requirement and similar requirement nodes of the target requirement into their corresponding code nodes and successfully find supportive codes."*

### Advanced Selection Features:
1. **Multi-strategy Mapping**: Kết hợp multiple mapping approaches
2. **Quality-based Filtering**: Filter anchors theo relevance và confidence
3. **Diversity Enforcement**: Đảm bảo diverse anchor types
4. **Context-aware Scoring**: Adjust scores based on context

In [None]:
# Mock requirement and code graphs for testing
class MockRequirementGraph:
    """Small in-memory requirement graph used to exercise the mapper.

    Attributes:
        graph: ``networkx.DiGraph`` whose edges carry ``relation_type``
            ('parent-child' or 'similarity') and ``confidence``.
        nodes: Requirement id -> requirement node object.
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        self.nodes = {}
        self._create_mock_data()

    def _create_mock_data(self):
        """Populate the graph with five requirements and their relations."""
        from dataclasses import dataclass

        @dataclass
        class MockReqNode:
            id: str
            description: str
            file_path: str
            function_name: str
            line_start: int = 1

        # (id, description, file, function, first line)
        requirement_rows = (
            ("req1", "Validate user input data", "utils.py", "validate_input", 10),
            ("req2", "Clean and sanitize user data", "utils.py", "clean_data", 20),
            ("req3", "Hash password securely", "auth.py", "hash_password", 15),
            ("req4", "Verify password hash", "auth.py", "verify_password", 25),
            ("req5", "Process user authentication", "auth.py", "authenticate_user", 35),
        )
        for row in requirement_rows:
            requirement = MockReqNode(*row)
            self.nodes[requirement.id] = requirement
            self.graph.add_node(requirement.id)

        # (source, target, relation)
        relation_rows = (
            ("req5", "req3", "parent-child"),  # authenticate_user uses hash_password
            ("req5", "req4", "parent-child"),  # authenticate_user uses verify_password
            ("req5", "req1", "parent-child"),  # authenticate_user uses validate_input
            ("req1", "req2", "similarity"),    # validate and clean are similar
            ("req3", "req4", "similarity"),    # hash and verify are similar
        )
        self.graph.add_edges_from(
            (source, target, {'relation_type': relation, 'confidence': 0.8})
            for source, target, relation in relation_rows
        )

class MockDSCodeGraph:
    """Small in-memory DS-code graph used to exercise the mapper.

    Attributes:
        graph: ``networkx.DiGraph`` whose edges carry ``edge_type`` and
            ``confidence``.
        nodes: Code node id -> code node object.
    """

    def __init__(self):
        self.graph = nx.DiGraph()
        self.nodes = {}
        self._create_mock_data()

    def _create_mock_data(self):
        """Populate the graph with seven functions and two call edges."""
        from dataclasses import dataclass

        @dataclass
        class MockCodeNode:
            id: str
            node_type: str
            name: str
            file_path: str
            source_code: str
            line_start: int = 1

        # (id, type, name, file, source, first line)
        function_rows = (
            ("func:utils.py:validate_input", "Function", "validate_input", "utils.py",
             "def validate_input(data): return data is not None", 10),
            ("func:utils.py:clean_data", "Function", "clean_data", "utils.py",
             "def clean_data(data): return [x for x in data if x]", 20),
            ("func:utils.py:format_data", "Function", "format_data", "utils.py",
             "def format_data(data): return str(data).strip()", 30),
            ("func:auth.py:hash_password", "Function", "hash_password", "auth.py",
             "def hash_password(pwd): import hashlib; return hashlib.sha256(pwd.encode()).hexdigest()", 15),
            ("func:auth.py:verify_password", "Function", "verify_password", "auth.py",
             "def verify_password(pwd, hash_val): return hash_password(pwd) == hash_val", 25),
            ("func:auth.py:authenticate_user", "Function", "authenticate_user", "auth.py",
             "def authenticate_user(user, pwd): return verify_password(pwd, user.password_hash)", 35),
            ("func:auth.py:create_session", "Function", "create_session", "auth.py",
             "def create_session(user): return {'user_id': user.id, 'timestamp': time.now()}", 45),
        )
        for row in function_rows:
            code_node = MockCodeNode(*row)
            self.nodes[code_node.id] = code_node
            self.graph.add_node(code_node.id)

        # (caller, callee, edge type)
        call_rows = (
            ("func:auth.py:authenticate_user", "func:auth.py:verify_password", "call"),
            ("func:auth.py:verify_password", "func:auth.py:hash_password", "call"),
        )
        for caller, callee, kind in call_rows:
            self.graph.add_edge(caller, callee, edge_type=kind, confidence=0.9)

    def get_one_hop_neighbors(self, node_id: str, edge_types=None):
        """Return ``(successor_id, edge_data)`` pairs for the outgoing edges of a node.

        Args:
            node_id: Node whose successors are listed.
            edge_types: Optional collection of accepted ``edge_type`` values;
                ``None`` accepts every edge.
        """
        result = []
        for target in self.graph.successors(node_id):
            attrs = self.graph.get_edge_data(node_id, target)
            if edge_types is not None and attrs.get('edge_type') not in edge_types:
                continue
            result.append((target, attrs))
        return result

# Build the two mock graphs and the mapper that connects them
mock_req_graph = MockRequirementGraph()
mock_code_graph = MockDSCodeGraph()
mapper = AdvancedBigraphMapper(mock_req_graph, mock_code_graph)

# Map the authenticate_user requirement (req5) to code anchors
print("Testing Advanced Bigraph Mapping...")
mapping_result = mapper.map_requirement_to_anchors(
    "req5",
    max_anchors=8,
    min_relevance=0.2,
)

# Summary
print(f"\nMapping Result Summary:")
print(f"Target requirement: {mapping_result.target_requirement_id}")
print(f"Total anchors: {len(mapping_result.anchors)}")

print(f"\nAnchors by type:")
anchors_by_type = mapping_result.mapping_statistics['by_type']
for anchor_type, count in anchors_by_type.items():
    print(f"  {anchor_type}: {count}")

print(f"\nQuality metrics:")
for metric, value in mapping_result.quality_metrics.items():
    print(f"  {metric}: {value:.3f}")

# First five anchors in the order returned by the mapper
print(f"\nTop 5 anchors:")
for rank, anchor in enumerate(mapping_result.anchors[:5], start=1):
    code_node = mock_code_graph.nodes[anchor.code_node_id]
    print(f"  {rank}. {code_node.name} ({anchor.anchor_type}) - Relevance: {anchor.relevance_score:.3f}")

## 📊 Anchor Quality Analysis và Optimization

### Comprehensive analysis của anchor selection quality:

In [None]:
class AnchorQualityAnalyzer:
    """Summarise and visualise the quality of selected code anchors.

    The analyzer consumes mapping results whose ``anchors`` attribute is a
    sequence of objects exposing ``relevance_score``, ``confidence_score``,
    ``anchor_type`` and ``mapping_method``.
    """

    def __init__(self, mapper: "AdvancedBigraphMapper"):
        """Store the mapper; the analysis methods below do not call it."""
        self.mapper = mapper

    def analyze_mapping_quality(self, mapping_results: List["MappingResult"]) -> Dict[str, Any]:
        """Aggregate quality statistics across several mapping results.

        Args:
            mapping_results: Results to analyse. May be empty, and individual
                results may carry no anchors.

        Returns:
            A dict with the keys ``overall_metrics``, ``anchor_type_analysis``,
            ``mapping_method_analysis``, ``quality_distribution`` and
            ``recommendations``. When ``mapping_results`` is empty, every
            section is returned empty.
        """
        analysis = {
            'overall_metrics': {},
            'anchor_type_analysis': {},
            'mapping_method_analysis': {},
            'quality_distribution': {},
            'recommendations': []
        }

        if not mapping_results:
            return analysis

        # Flatten the anchors of every result into one list.
        all_anchors = [anchor for result in mapping_results for anchor in result.anchors]

        relevance_scores = [a.relevance_score for a in all_anchors]
        confidence_scores = [a.confidence_score for a in all_anchors]

        # np.mean/np.std of an empty list yield NaN with a warning, hence the guards.
        analysis['overall_metrics'] = {
            'total_mappings': len(mapping_results),
            'total_anchors': len(all_anchors),
            'avg_anchors_per_mapping': len(all_anchors) / len(mapping_results),
            'avg_relevance': np.mean(relevance_scores) if relevance_scores else 0,
            'avg_confidence': np.mean(confidence_scores) if confidence_scores else 0,
            'relevance_std': np.std(relevance_scores) if relevance_scores else 0,
            'confidence_std': np.std(confidence_scores) if confidence_scores else 0
        }

        # Per anchor type: count plus mean relevance/confidence.
        anchors_by_type = defaultdict(list)
        for anchor in all_anchors:
            anchors_by_type[anchor.anchor_type].append(anchor)

        analysis['anchor_type_analysis'] = {
            anchor_type: {
                'count': len(group),
                'avg_relevance': np.mean([a.relevance_score for a in group]),
                'avg_confidence': np.mean([a.confidence_score for a in group]),
            }
            for anchor_type, group in anchors_by_type.items()
        }

        # Per mapping method: count, means, and the share of anchors whose
        # relevance reaches the 0.6 "successful mapping" threshold.
        anchors_by_method = defaultdict(list)
        for anchor in all_anchors:
            anchors_by_method[anchor.mapping_method].append(anchor)

        analysis['mapping_method_analysis'] = {
            method: {
                'count': len(group),
                'avg_relevance': sum(a.relevance_score for a in group) / len(group),
                'avg_confidence': sum(a.confidence_score for a in group) / len(group),
                'success_rate': sum(1 for a in group if a.relevance_score >= 0.6) / len(group),
            }
            for method, group in anchors_by_method.items()
        }

        # Bucket anchors by relevance: >= 0.8 high, [0.5, 0.8) medium, < 0.5 low.
        quality_ranges = {
            'high_quality': sum(1 for a in all_anchors if a.relevance_score >= 0.8),
            'medium_quality': sum(1 for a in all_anchors if 0.5 <= a.relevance_score < 0.8),
            'low_quality': sum(1 for a in all_anchors if a.relevance_score < 0.5)
        }

        total = len(all_anchors)
        analysis['quality_distribution'] = {
            k: {'count': v, 'percentage': v / total * 100 if total > 0 else 0}
            for k, v in quality_ranges.items()
        }

        analysis['recommendations'] = self._generate_recommendations(analysis)

        return analysis

    def _generate_recommendations(self, analysis: Dict) -> List[str]:
        """Derive tuning hints from a populated analysis dict.

        Args:
            analysis: Dict as built by ``analyze_mapping_quality`` with the
                metric sections already filled in.

        Returns:
            Human-readable recommendation strings (possibly empty).
        """
        recommendations = []

        overall = analysis['overall_metrics']
        type_analysis = analysis['anchor_type_analysis']
        method_analysis = analysis['mapping_method_analysis']
        quality_dist = analysis['quality_distribution']

        # Overall quality recommendations
        if overall['avg_relevance'] < 0.6:
            recommendations.append("Overall relevance is low - consider tuning anchor weights or thresholds")

        if overall['relevance_std'] > 0.3:
            recommendations.append("High variance in relevance scores - review scoring consistency")

        # Type-specific recommendations
        for anchor_type, metrics in type_analysis.items():
            if metrics['avg_relevance'] < 0.5:
                recommendations.append(f"Improve {anchor_type} anchor quality - current avg: {metrics['avg_relevance']:.3f}")

        # Method-specific recommendations. With no anchors there are no
        # methods to compare, and max()/min() would raise on an empty sequence.
        if method_analysis:
            best_method = max(method_analysis.items(), key=lambda x: x[1]['success_rate'])
            worst_method = min(method_analysis.items(), key=lambda x: x[1]['success_rate'])

            if best_method[1]['success_rate'] - worst_method[1]['success_rate'] > 0.3:
                recommendations.append(f"Consider emphasizing {best_method[0]} method over {worst_method[0]}")

        # Quality distribution recommendations
        high_quality_pct = quality_dist['high_quality']['percentage']
        if high_quality_pct < 30:
            recommendations.append(f"Only {high_quality_pct:.1f}% high-quality anchors - review selection criteria")

        low_quality_pct = quality_dist['low_quality']['percentage']
        if low_quality_pct > 40:
            recommendations.append(f"{low_quality_pct:.1f}% low-quality anchors - increase minimum relevance threshold")

        return recommendations

    def visualize_anchor_analysis(self, analysis: Dict, figsize=(16, 12)):
        """Plot a 2x2 dashboard for ``analysis`` and print a text report.

        Args:
            analysis: Dict as returned by ``analyze_mapping_quality``. Empty
                sections are skipped in both the plots and the report.
            figsize: Size of the matplotlib figure.

        Returns:
            The ``analysis`` dict, unchanged.
        """
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=figsize)

        # 1. Anchor type performance
        type_analysis = analysis['anchor_type_analysis']
        if type_analysis:
            types = list(type_analysis.keys())
            relevance_scores = [type_analysis[t]['avg_relevance'] for t in types]
            confidence_scores = [type_analysis[t]['avg_confidence'] for t in types]

            x = np.arange(len(types))
            width = 0.35

            ax1.bar(x - width/2, relevance_scores, width, label='Relevance', alpha=0.8)
            ax1.bar(x + width/2, confidence_scores, width, label='Confidence', alpha=0.8)
            ax1.set_xlabel('Anchor Type')
            ax1.set_ylabel('Score')
            ax1.set_title('Performance by Anchor Type')
            ax1.set_xticks(x)
            ax1.set_xticklabels(types, rotation=45, ha='right')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

        # 2. Mapping method success rates
        method_analysis = analysis['mapping_method_analysis']
        if method_analysis:
            methods = list(method_analysis.keys())
            success_rates = [method_analysis[m]['success_rate'] for m in methods]
            counts = [method_analysis[m]['count'] for m in methods]

            # Color by success rate
            colors = ['green' if sr >= 0.7 else 'orange' if sr >= 0.5 else 'red' for sr in success_rates]

            bars = ax2.bar(methods, success_rates, color=colors, alpha=0.8)
            ax2.set_xlabel('Mapping Method')
            ax2.set_ylabel('Success Rate')
            ax2.set_title('Success Rate by Mapping Method')
            ax2.tick_params(axis='x', rotation=45)

            # Add count labels
            for bar, count in zip(bars, counts):
                ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                        f'n={count}', ha='center', va='bottom', fontsize=8)

        # 3. Quality distribution
        quality_dist = analysis['quality_distribution']
        if quality_dist:
            labels = list(quality_dist.keys())
            sizes = [quality_dist[label]['count'] for label in labels]
            colors = ['green', 'orange', 'red']

            # pie() normalises by the total, so an all-zero distribution
            # cannot be drawn; show a placeholder instead.
            if sum(sizes) > 0:
                ax3.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
            else:
                ax3.text(0.5, 0.5, 'No anchors', ha='center', va='center', transform=ax3.transAxes)
                ax3.axis('off')
            ax3.set_title('Quality Distribution')

        # 4. Overall metrics summary
        overall = analysis['overall_metrics']
        if overall:
            metrics = ['Avg Relevance', 'Avg Confidence', 'Anchors per Mapping']
            raw_values = [
                overall['avg_relevance'],
                overall['avg_confidence'],
                overall['avg_anchors_per_mapping']
            ]
            # Anchors-per-mapping is divided by 10 so it shares the axis
            # with the relevance/confidence scores.
            values = [raw_values[0], raw_values[1], raw_values[2] / 10]

            colors = ['green' if v >= 0.7 else 'orange' if v >= 0.5 else 'red' for v in values]
            bars = ax4.bar(metrics, values, color=colors, alpha=0.8)
            ax4.set_ylabel('Normalized Score')
            ax4.set_title('Overall Quality Metrics')
            ax4.tick_params(axis='x', rotation=45)

            # Label each bar with its un-normalised value. bar.get_x() is a
            # number, so the raw values are paired with the bars directly.
            for bar, display_value in zip(bars, raw_values):
                ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                        f'{display_value:.2f}', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

        # Print detailed analysis
        print("\n" + "="*70)
        print("ANCHOR QUALITY ANALYSIS REPORT")
        print("="*70)

        print(f"\n📊 Overall Metrics:")
        for metric, value in overall.items():
            print(f"• {metric.replace('_', ' ').title()}: {value:.3f}")

        print(f"\n🎯 Quality Distribution:")
        for quality, data in quality_dist.items():
            print(f"• {quality.replace('_', ' ').title()}: {data['count']} ({data['percentage']:.1f}%)")

        print(f"\n💡 Recommendations:")
        for i, rec in enumerate(analysis['recommendations'], 1):
            print(f"{i}. {rec}")

        return analysis

# Map several requirements and collect the results
test_requirements = ["req1", "req3", "req5"]
mapping_results = []

print("\nTesting multiple requirement mappings...")
for req_id in test_requirements:
    # Requirements missing from the mock requirement graph are skipped.
    if req_id not in mock_req_graph.nodes:
        continue
    result = mapper.map_requirement_to_anchors(req_id, max_anchors=6, min_relevance=0.1)
    mapping_results.append(result)
    print(f"Mapped {req_id}: {len(result.anchors)} anchors")

# Aggregate the collected mappings and render the quality report
analyzer = AnchorQualityAnalyzer(mapper)
quality_analysis = analyzer.analyze_mapping_quality(mapping_results)
analyzer.visualize_anchor_analysis(quality_analysis)

## 🧪 Comprehensive Testing với Mock Data

### Test scenarios để validate bigraph mapping correctness:

In [None]:
def create_bigraph_mapping_test_scenarios():
    """Build the named scenarios used to exercise the bigraph mapper.

    Returns:
        A dict keyed by scenario name. Every scenario has ``description`` and
        ``target_req``; most add ``expected_anchors`` (anchor type -> expected
        code-node names) and ``min_expected_total``, while the threshold
        scenario carries a ``config`` override and an ``expected_behavior``
        note instead.
    """
    scenarios = {}

    # req3 = hash_password
    scenarios['direct_mapping_test'] = dict(
        description='Test direct name-based mapping',
        target_req='req3',
        expected_anchors=dict(
            direct=['hash_password'],
            similar=['verify_password'],  # similar functionality
            local=['authenticate_user', 'create_session'],  # same file
        ),
        min_expected_total=3,
    )

    # req5 = authenticate_user (has sub-requirements)
    scenarios['sub_requirement_mapping_test'] = dict(
        description='Test sub-requirement mapping',
        target_req='req5',
        expected_anchors=dict(
            sub_requirement=['hash_password', 'verify_password', 'validate_input'],
            local=['create_session'],
            context_expanded=[],  # may include related functions
        ),
        min_expected_total=4,
    )

    # req1 = validate_input (has a similar requirement)
    scenarios['similarity_mapping_test'] = dict(
        description='Test similarity-based mapping',
        target_req='req1',
        expected_anchors=dict(
            direct=['validate_input'],
            similar=['clean_data'],  # similar requirement
            local=['format_data'],  # same file
        ),
        min_expected_total=2,
    )

    # req2 = clean_data
    scenarios['quality_threshold_test'] = dict(
        description='Test quality thresholding',
        target_req='req2',
        config=dict(min_relevance=0.7, max_anchors=3),
        expected_behavior='Only high-quality anchors should be selected',
    )

    return scenarios

def run_bigraph_mapping_test(scenario_name: str, scenario_data: Dict, mapper: "AdvancedBigraphMapper") -> Dict:
    """Run one mapping scenario and score it against its expectations.

    Args:
        scenario_name: Label used in the printed report and in the result.
        scenario_data: Scenario dict. ``description`` and ``target_req`` are
            required; ``config`` (``max_anchors`` / ``min_relevance``
            overrides), ``expected_anchors`` and ``min_expected_total`` are
            optional.
        mapper: Object providing ``map_requirement_to_anchors`` and a
            ``code_graph.nodes`` lookup from code-node id to a node with a
            ``name`` attribute.

    Returns:
        A dict with ``scenario``, ``target_req``, ``total_anchors``,
        ``anchors_by_type``, ``quality_metrics``, ``passed_checks``,
        ``failed_checks`` and ``success_score`` (the passed/total check ratio).
    """
    print(f"\n🧪 Testing: {scenario_name}")
    print(f"Description: {scenario_data['description']}")

    target_req = scenario_data['target_req']
    config = scenario_data.get('config', {})

    # Run mapping
    result = mapper.map_requirement_to_anchors(
        target_requirement_id=target_req,
        max_anchors=config.get('max_anchors', 8),
        min_relevance=config.get('min_relevance', 0.2)
    )

    test_result = {
        'scenario': scenario_name,
        'target_req': target_req,
        'total_anchors': len(result.anchors),
        'anchors_by_type': defaultdict(list),
        'quality_metrics': result.quality_metrics,
        'passed_checks': [],
        'failed_checks': [],
        'success_score': 0.0
    }

    # Organize anchors by type, resolving each anchor to its code-node name
    for anchor in result.anchors:
        code_node = mapper.code_graph.nodes[anchor.code_node_id]
        test_result['anchors_by_type'][anchor.anchor_type].append(code_node.name)

    print(f"\nResults:")
    print(f"• Total anchors: {test_result['total_anchors']}")
    for anchor_type, names in test_result['anchors_by_type'].items():
        print(f"• {anchor_type}: {names}")

    # Check each expected anchor type; at least 70% of the expected names
    # must be present for the check to pass.
    if 'expected_anchors' in scenario_data:
        expected = scenario_data['expected_anchors']

        for exp_type, exp_names in expected.items():
            # .get() avoids creating an empty entry in the defaultdict.
            actual_names = test_result['anchors_by_type'].get(exp_type, [])

            found_expected = [name for name in exp_names if name in actual_names]

            if len(found_expected) >= len(exp_names) * 0.7:  # 70% threshold
                test_result['passed_checks'].append(f"{exp_type}: Found {len(found_expected)}/{len(exp_names)} expected")
            else:
                test_result['failed_checks'].append(f"{exp_type}: Only found {len(found_expected)}/{len(exp_names)} expected")

    # Check minimum total anchors
    if 'min_expected_total' in scenario_data:
        min_expected = scenario_data['min_expected_total']
        if test_result['total_anchors'] >= min_expected:
            test_result['passed_checks'].append(f"Minimum anchors: {test_result['total_anchors']} >= {min_expected}")
        else:
            test_result['failed_checks'].append(f"Minimum anchors: {test_result['total_anchors']} < {min_expected}")

    # Check quality metrics. A missing metric counts as 0 and is reported as
    # a failed check rather than raising KeyError while formatting.
    quality = result.quality_metrics
    relevance = quality.get('relevance', 0)
    if relevance >= 0.5:
        test_result['passed_checks'].append(f"Good relevance: {relevance:.3f}")
    else:
        test_result['failed_checks'].append(f"Low relevance: {relevance:.3f}")

    diversity = quality.get('diversity', 0)
    if diversity >= 0.3:
        test_result['passed_checks'].append(f"Good diversity: {diversity:.3f}")
    else:
        test_result['failed_checks'].append(f"Low diversity: {diversity:.3f}")

    # Calculate success score
    total_checks = len(test_result['passed_checks']) + len(test_result['failed_checks'])
    if total_checks > 0:
        test_result['success_score'] = len(test_result['passed_checks']) / total_checks

    # Print results
    print(f"\n✅ Passed checks: {len(test_result['passed_checks'])}")
    for check in test_result['passed_checks']:
        print(f"  • {check}")

    if test_result['failed_checks']:
        print(f"\n❌ Failed checks: {len(test_result['failed_checks'])}")
        for check in test_result['failed_checks']:
            print(f"  • {check}")

    print(f"\n🎯 Success score: {test_result['success_score']:.1%}")

    return test_result

def run_comprehensive_bigraph_tests() -> List[Dict]:
    """Run every bigraph mapping scenario, then print and plot a summary.

    Scenarios come from ``create_bigraph_mapping_test_scenarios`` and are run
    with ``run_bigraph_mapping_test`` against the module-level ``mapper``.

    Returns:
        One result dict per scenario, in scenario order.
    """
    scenarios = create_bigraph_mapping_test_scenarios()
    test_results = [
        run_bigraph_mapping_test(scenario_name, scenario_data, mapper)
        for scenario_name, scenario_data in scenarios.items()
    ]

    # Overall summary
    print("\n" + "="*70)
    print("BIGRAPH MAPPING TEST SUMMARY")
    print("="*70)

    total_success = sum(r['success_score'] for r in test_results)
    avg_success = total_success / len(test_results) if test_results else 0

    print(f"\n📊 Overall Results:")
    print(f"• Total test scenarios: {len(test_results)}")
    print(f"• Average success rate: {avg_success:.1%}")

    print(f"\n📋 Individual Results:")
    for result in test_results:
        status = "✅" if result['success_score'] >= 0.8 else "⚠️" if result['success_score'] >= 0.6 else "❌"
        print(f"{status} {result['scenario']}: {result['success_score']:.1%} ({result['total_anchors']} anchors)")

    # Visualization. plt.subplots() creates its own figure, so no separate
    # plt.figure() call is made (it would leave an extra, empty figure).
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # Success rates by scenario
    scenario_names = [r['scenario'].replace('_test', '').replace('_', ' ').title() for r in test_results]
    success_scores = [r['success_score'] for r in test_results]

    colors = ['green' if score >= 0.8 else 'orange' if score >= 0.6 else 'red' for score in success_scores]
    bars1 = ax1.bar(scenario_names, success_scores, color=colors, alpha=0.8)
    ax1.set_title('Test Success Rates by Scenario')
    ax1.set_ylabel('Success Rate')
    ax1.set_ylim(0, 1)
    ax1.tick_params(axis='x', rotation=45)

    # Add percentage labels
    for bar, score in zip(bars1, success_scores):
        ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'{score:.1%}', ha='center', va='bottom')

    # Anchor counts by scenario
    anchor_counts = [r['total_anchors'] for r in test_results]
    bars2 = ax2.bar(scenario_names, anchor_counts, alpha=0.8, color='skyblue')
    ax2.set_title('Total Anchors by Scenario')
    ax2.set_ylabel('Number of Anchors')
    ax2.tick_params(axis='x', rotation=45)

    # Add count labels
    for bar, count in zip(bars2, anchor_counts):
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                str(count), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return test_results

# Execute every scenario and keep the per-scenario results
test_results = run_comprehensive_bigraph_tests()

# Closing banner with the takeaways of this notebook
banner = "=" * 70
key_learnings = [
    "1. Multi-strategy mapping improves anchor discovery",
    "2. Quality scoring helps select relevant anchors",
    "3. Diversity enforcement prevents anchor bias",
    "4. Context-aware filtering improves precision",
    "5. Comprehensive testing validates mapping correctness",
    "6. Local file anchors provide important context",
]

print("\n" + banner)
print("BIGRAPH MAPPING FOCUSED LEARNING COMPLETE")
print(banner)
print("Key Learnings:")
for learning in key_learnings:
    print(learning)
print(banner)