In [22]:
# 02_character_context.ipynb

# Phase 2: Character Context Extraction & Evidence Construction
# Builds directly from Phase 1 outputs

# Phase banner: one print call, one argument per output line.
print(
    "=" * 80,
    "PHASE 2: CHARACTER CONTEXT EXTRACTION & EVIDENCE CONSTRUCTION",
    "=" * 80,
    "Goal: Isolate narrative evidence relevant to character development",
    "      while preserving causal and emotional context.",
    "=" * 80,
    sep="\n",
)

# ============================================
# STEP 1: Recreate Phase 1 Environment
# ============================================

print("\n1. Recreating Phase 1 environment...")

# First, install Pathway if needed
!pip install pathway pandas numpy -q

import pathway as pw
import pandas as pd
import re
import numpy as np
from typing import List, Dict, Tuple, Optional
from dataclasses import dataclass
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Define the same paths as Phase 1.
# The project root defaults to the original Phase 1 location; set the
# KDSH_PROJECT_ROOT environment variable to run the notebook against a
# checkout that lives somewhere else.
import os

PROJECT_ROOT = Path(os.environ.get("KDSH_PROJECT_ROOT", "/root/DataDivas_KDSH_2026"))
DATA_DIR = PROJECT_ROOT / "Data"
BOOKS_DIR = DATA_DIR / "Books"

print(f"   Project root: {PROJECT_ROOT}")
print(f"   Data directory: {DATA_DIR}")
print(f"   Books directory: {BOOKS_DIR}")

# ============================================
# STEP 2: Recreate Phase 1 Tables EXACTLY
# ============================================

print("\n2. Recreating Phase 1 tables...")

# Recreate the exact same schema from Phase 1
class TrainSchema(pw.Schema):
    """Column layout used when reading train.csv and test.csv."""
    # Row identifier; exposed as `sample_id` in joined_table.
    uid: int
    # Joined against the book title to attach the novel text.
    book_name: str
    # Character name; exposed as `character_name` in joined_table.
    char: str
    # Not referenced by the code in this cell.
    caption: str
    # Backstory text; exposed as `backstory` in joined_table.
    content: str
    # Read as a string.
    # NOTE(review): test.csv is read with this same schema — confirm it has a `label` column.
    label: str

class BooksSchema(pw.Schema):
    """Column layout of the books table; mirrors the dict built by load_book_content."""
    # File stem of the book's .txt file; this is the join key against `book_name`.
    title: str
    # Entire file content.
    full_text: str
    # Path the book was read from, as a string.
    file_path: str
    # len() of the full text.
    char_count: int
    # Number of whitespace-separated tokens in the full text.
    word_count: int

# Load training data (same as Phase 1)
# Both CSV files live directly under DATA_DIR and are read with the same
# schema and the same "static" mode.
train_csv_path, test_csv_path = (
    str(DATA_DIR / csv_name) for csv_name in ("train.csv", "test.csv")
)

train_table = pw.io.csv.read(train_csv_path, schema=TrainSchema, mode="static")

test_table = pw.io.csv.read(test_csv_path, schema=TrainSchema, mode="static")

# Load books (same as Phase 1)
def load_book_content(file_path: str) -> dict:
    """Read one UTF-8 book file and describe it as a flat record.

    Args:
        file_path: Location of the text file.

    Returns:
        A dict with the file stem (``title``), the whole text (``full_text``),
        the path exactly as it was passed in (``file_path``), the text length
        in characters (``char_count``) and the number of whitespace-separated
        tokens (``word_count``).
    """
    book_path = Path(file_path)
    text = book_path.read_text(encoding='utf-8')
    token_total = len(text.split())
    return {
        "title": book_path.stem,
        "full_text": text,
        "file_path": file_path,
        "char_count": len(text),
        "word_count": token_total
    }

# Read every .txt file directly under BOOKS_DIR into one record per book.
# NOTE(review): glob() order is not sorted here, so the "Loaded:" lines may
# appear in a different order from run to run.
book_data_list = []
for book_file in BOOKS_DIR.glob("*.txt"):
    book_info = load_book_content(str(book_file))
    book_data_list.append(book_info)
    print(f"   Loaded: {book_info['title']} - {book_info['word_count']:,} words")

# One DataFrame row per book, then the same data as a Pathway table.
# NOTE(review): if no book files are found the DataFrame has no columns —
# confirm how table_from_pandas behaves in that case.
books_df = pd.DataFrame(book_data_list)
books_table = pw.debug.table_from_pandas(
    books_df,
    schema=BooksSchema
)

# Create joined_table (EXACTLY as in Phase 1)
# Each training row is matched to its novel by comparing `book_name` with the
# book's `title` (the .txt file stem), then columns are renamed for later steps.
# NOTE(review): training rows whose book_name has no matching file stem are
# presumably absent from the result — compare the row counts printed below.
joined_table = train_table.join(
    books_table,
    pw.this.book_name == books_table.title
).select(
    sample_id=pw.this.uid,
    character_name=pw.this.char,
    backstory=pw.this.content,
    label=pw.this.label,
    novel_title=pw.this.book_name,
    full_text=books_table.full_text,
    char_count=books_table.char_count,
    word_count=books_table.word_count
)

print("\n3. Phase 1 tables recreated successfully:")
# Materialise the Pathway tables as DataFrames for inspection; joined_df also
# drives the profile building further down.
train_df = pw.debug.table_to_pandas(train_table)
books_pd = pw.debug.table_to_pandas(books_table)
joined_df = pw.debug.table_to_pandas(joined_table)

# The bullet character was previously stored as mis-decoded bytes and printed
# as garbage; it is a plain "•" now.
print(f"   • train_table: {len(train_df)} rows")
print(f"   • books_table: {len(books_pd)} rows")
print(f"   • joined_table: {len(joined_df)} rows")

# Display sample of joined data
print("\n   Sample from joined_table:")
print(f"   Characters: {joined_df['character_name'].unique().tolist()}")
print(f"   Novels: {joined_df['novel_title'].unique().tolist()}")
print(f"   Labels: {joined_df['label'].value_counts().to_dict()}")

# ============================================
# STEP 3: Character Profile Creation
# ============================================

print("\n4. Creating character profiles...")

@dataclass
class CharacterProfile:
    """Store character identification information"""
    # Character name exactly as it appears in the training data.
    primary_name: str
    # Alternative spellings used to find the character in the novel text.
    name_variants: List[str]
    # Pronoun set inferred for the character.
    pronouns: List[str]
    # Title phrases such as "the captain" (at most five are produced).
    titles: List[str]
    # Novel this character belongs to.
    novel_title: str
    # Full text of that novel.
    novel_text: str
    # Distinct backstories seen for this character in the training data.
    backstories: List[str]
    # Label of each backstory, position-aligned with `backstories`.
    # Labels are stored as strings: the profile builder applies str() to them.
    labels: List[str]

class CharacterIdentifier:
    """Handle character name ambiguity and variant identification"""
    
    @staticmethod
    def extract_name_variants(character_name: str) -> List[str]:
        """Extract possible name variants from a character name"""
        variants = set()
        
        # Add full name
        clean_name = character_name.strip()
        variants.add(clean_name)
        
        # Split by common separators
        parts = re.split(r'[\s,\.\-]+', clean_name)
        
        # Add first name only
        if len(parts) > 0 and len(parts[0]) > 1:
            variants.add(parts[0])
            # Add common abbreviations
            if len(parts[0]) > 2:
                variants.add(parts[0][0] + ".")  # E. for Edmond
            
        # Add last name only if exists
        if len(parts) > 1 and len(parts[-1]) > 1:
            variants.add(parts[-1])
            
        # Add common French/English variants for known characters
        name_lower = clean_name.lower()
        if "edmond" in name_lower or "dant√®s" in name_lower:
            variants.update(["Edmond", "Dant√®s", "Dantes", "Count", "Monte Cristo"])
        if "paganel" in name_lower:
            variants.update(["Paganel", "Jacques", "Professor"])
            
        return [v for v in variants if v and len(v) > 1]
    
    @staticmethod
    def infer_pronouns(character_name: str, novel_text: str) -> List[str]:
        """Infer pronouns from context"""
        if not novel_text or len(novel_text) < 1000:
            # Default based on name patterns
            if any(female in character_name.lower() for female in ['mary', 'anne', 'elizabeth', 'sophie']):
                return ['she', 'her', 'hers', 'herself']
            else:
                return ['he', 'him', 'his', 'himself']
        
        # Sample text for analysis
        sample_size = min(5000, len(novel_text))
        samples = [
            novel_text[:sample_size],
            novel_text[-sample_size:] if len(novel_text) > sample_size else ""
        ]
        
        male_indicators = {' he ', ' him ', ' his ', ' himself ', ' mr ', ' monsieur ', ' sir '}
        female_indicators = {' she ', ' her ', ' hers ', ' herself ', ' mrs ', ' madame ', ' lady ', ' miss '}
        
        male_score = 0
        female_score = 0
        
        for sample in samples:
            if not sample:
                continue
            sample_lower = ' ' + sample.lower() + ' '
            male_score += sum(sample_lower.count(' ' + indicator + ' ') for indicator in male_indicators)
            female_score += sum(sample_lower.count(' ' + indicator + ' ') for indicator in female_indicators)
        
        # Check for gendered titles near character mentions
        first_name = character_name.split()[0].lower() if ' ' in character_name else character_name.lower()
        for i in range(0, min(10000, len(novel_text)), 1000):
            snippet = novel_text[i:i+1000].lower()
            if first_name in snippet:
                if ' mr ' in snippet or ' monsieur ' in snippet or ' sir ' in snippet:
                    male_score += 2
                if ' mrs ' in snippet or ' madame ' in snippet or ' lady ' in snippet:
                    female_score += 2
        
        if female_score > male_score:
            return ['she', 'her', 'hers', 'herself']
        else:
            return ['he', 'him', 'his', 'himself']
    
    @staticmethod
    def infer_titles(character_name: str, novel_text: str) -> List[str]:
        """Infer titles from context"""
        if not novel_text:
            return []
        
        titles = set()
        first_name = character_name.split()[0] if ' ' in character_name else character_name
        
        # Look for patterns in first 20000 characters
        search_text = novel_text[:20000].lower()
        
        # Pattern: "the [title] [name]"
        patterns = [
            rf'the ([a-z]+) {re.escape(first_name.lower())}',
            rf'([A-Z][a-z]+) {re.escape(first_name)}',
        ]
        
        for pattern in patterns:
            matches = re.finditer(pattern, search_text, re.IGNORECASE)
            for match in matches:
                title = match.group(1).lower()
                # Filter out common words that aren't titles
                if len(title) > 3 and title not in {'said', 'asked', 'replied', 'exclaimed'}:
                    titles.add(f"the {title}")
        
        # Add common titles based on character type
        name_lower = character_name.lower()
        if any(word in name_lower for word in ['count', 'lord', 'duke', 'earl']):
            titles.add("the count")
        if 'professor' in name_lower or 'doctor' in name_lower:
            titles.add("the professor")
            titles.add("the doctor")
        if 'captain' in name_lower:
            titles.add("the captain")
            
        return list(titles)[:5]

# Build comprehensive character profiles
# One profile per character name; repeated rows for the same character only
# contribute additional (distinct) backstories and their labels.
# NOTE(review): profiles are keyed by name alone, so two characters with the
# same name in different novels would share one profile — confirm that cannot
# happen in this dataset.
character_profiles = {}
for _, row in joined_df.iterrows():
    char_name = row['character_name']
    novel_title = row['novel_title']
    novel_text = row['full_text']
    backstory = row['backstory']
    # Labels are strings in the schema; str() only normalises the value to a
    # plain Python str.
    label = str(row['label'])

    if char_name not in character_profiles:
        identifier = CharacterIdentifier()
        name_variants = identifier.extract_name_variants(char_name)
        pronouns = identifier.infer_pronouns(char_name, novel_text)
        titles = identifier.infer_titles(char_name, novel_text)

        profile = CharacterProfile(
            primary_name=char_name,
            name_variants=name_variants,
            pronouns=pronouns,
            titles=titles,
            novel_title=novel_title,
            novel_text=novel_text,
            backstories=[backstory],
            labels=[label]
        )
        character_profiles[char_name] = profile
    else:
        # Add additional backstory if this character appears multiple times
        if backstory not in character_profiles[char_name].backstories:
            character_profiles[char_name].backstories.append(backstory)
            character_profiles[char_name].labels.append(label)

print(f"   Created profiles for {len(character_profiles)} characters")
# The bullet used to be stored as mis-decoded bytes and printed as garbage.
for char, profile in list(character_profiles.items())[:5]:  # Show first 5
    print(f"   • {char}: {len(profile.name_variants)} variants, {len(profile.backstories)} backstories")

# ============================================
# STEP 4: Advanced Context Extraction
# ============================================

@dataclass
class ExtractedEvidence:
    """Store extracted evidence with metadata"""
    # Built from the character name, the passage offsets and a hash of the
    # passage's first 50 characters.
    evidence_id: str
    # Primary name of the character the passage was extracted for.
    character_name: str
    # Novel the passage was taken from.
    novel_title: str
    # The extracted text, i.e. novel_text[passage_start:passage_end].
    passage_text: str
    # Character offsets of the passage within the novel text.
    passage_start: int
    passage_end: int
    # Passage length in characters (passage_end - passage_start).
    context_window: int
    # One of 'action', 'dialogue', 'internal', 'negative'.
    evidence_type: str
    # Causal / emotional marker words found in the passage.
    causal_markers: List[str]
    emotional_markers: List[str]
    # passage_start divided by the length of the novel text (0.0 = beginning).
    temporal_position: float
    # Whether the mention group behind the passage contains a pronoun / a name mention.
    has_pronoun: bool
    has_direct_name: bool
    # 'subject' or 'mentioned'.
    character_role: str
    # Heuristic score, capped at 1.0.
    narrative_importance: float
    # Heuristic score, capped at 1.0; 0.5 when the character has no backstories.
    backstory_relevance: float

class ContextExtractor:
    """Intelligent context extraction with backstory awareness"""
    
    def __init__(self, config: Dict = None):
        self.config = config or {}
        self.min_window = self.config.get('min_window', 150)
        self.max_window = self.config.get('max_window', 1200)
        self.target_window = self.config.get('target_window', 500)
        
        # Comprehensive marker sets
        self.causal_markers = {
            'because', 'since', 'therefore', 'thus', 'so', 'hence', 'consequently',
            'as a result', 'due to', 'owing to', 'for this reason', 'accordingly',
            'then', 'ergo', 'wherefore'
        }
        
        self.emotional_markers = {
            'fear', 'afraid', 'terrified', 'scared', 'frightened', 'dread',
            'love', 'adore', 'cherish', 'treasure', 'passion', 'affection',
            'hate', 'despise', 'loathe', 'abhor', 'detest', 'resent',
            'anger', 'fury', 'rage', 'wrath', 'ire', 'outrage',
            'joy', 'happiness', 'delight', 'elation', 'ecstasy', 'bliss',
            'sadness', 'sorrow', 'grief', 'melancholy', 'despair', 'misery',
            'hope', 'optimism', 'expectation', 'anticipation',
            'regret', 'remorse', 'guilt', 'shame', 'contrition',
            'surprise', 'amazement', 'astonishment', 'wonder'
        }
        
        self.action_verbs = {
            'ran', 'walked', 'jumped', 'fled', 'chased', 'fought', 'attacked',
            'spoke', 'said', 'shouted', 'whispered', 'asked', 'replied', 'answered',
            'looked', 'saw', 'watched', 'observed', 'noticed', 'glanced',
            'thought', 'considered', 'decided', 'chose', 'determined', 'resolved',
            'felt', 'experienced', 'sensed', 'perceived', 'realized',
            'gave', 'took', 'received', 'accepted', 'refused', 'rejected',
            'created', 'built', 'destroyed', 'saved', 'killed', 'helped',
            'traveled', 'went', 'came', 'arrived', 'departed', 'left'
        }
        
        self.negative_markers = {
            'refused', 'avoided', 'rejected', 'denied', 'declined',
            'would not', 'could not', 'did not', 'never', 'no', 'not',
            'failed', 'unable', 'incapable', 'impossible', 'unwilling',
            'refrained', 'abstained', 'withheld', 'resisted'
        }
        
        self.internal_markers = {
            'thought', 'wondered', 'considered', 'pondered', 'mused',
            'felt', 'believed', 'knew', 'understood', 'realized',
            'remembered', 'recalled', 'recollected', 'reflected',
            'wished', 'hoped', 'dreamed', 'imagined', 'feared'
        }
    
    def find_character_mentions(self, text: str, profile: CharacterProfile) -> List[Tuple[int, int, str, float]]:
        """Find all mentions with position, type, and confidence"""
        mentions = []
        
        # Look for name variants with word boundaries
        for variant in profile.name_variants:
            if len(variant) < 2:
                continue
                
            # Escape special regex characters
            escaped = re.escape(variant)
            pattern = re.compile(rf'\b{escaped}\b', re.IGNORECASE)
            
            for match in pattern.finditer(text):
                confidence = 1.0 if variant == profile.primary_name else 0.8
                # Higher confidence if capitalized (likely proper noun)
                if match.group()[0].isupper():
                    confidence += 0.1
                mentions.append((match.start(), match.end(), 'name', confidence))
        
        # Look for pronouns with context awareness
        if len(text) > 0:
            for pronoun in profile.pronouns:
                pattern = re.compile(rf'\b{pronoun}\b', re.IGNORECASE)
                for match in pattern.finditer(text):
                    # Analyze context window
                    context_start = max(0, match.start() - 100)
                    context_end = min(len(text), match.end() + 100)
                    context = text[context_start:context_end].lower()
                    
                    # Check for other character mentions
                    other_character_count = 0
                    for other_char, other_profile in character_profiles.items():
                        if other_char != profile.primary_name:
                            for other_variant in other_profile.name_variants[:3]:
                                if other_variant.lower() in context:
                                    other_character_count += 1
                                    break
                    
                    confidence = max(0.3, 0.7 - (other_character_count * 0.15))
                    mentions.append((match.start(), match.end(), 'pronoun', confidence))
        
        # Look for titles
        for title in profile.titles:
            pattern = re.compile(rf'\b{re.escape(title)}\b', re.IGNORECASE)
            for match in pattern.finditer(text):
                mentions.append((match.start(), match.end(), 'title', 0.6))
        
        # Sort and deduplicate
        mentions.sort(key=lambda x: x[0])
        
        filtered = []
        current_end = -50  # Allow small overlap
        
        for mention in mentions:
            start, end, mtype, confidence = mention
            if start >= current_end:
                filtered.append(mention)
                current_end = end
            else:
                # Overlap - keep higher confidence
                prev_start, prev_end, prev_type, prev_conf = filtered[-1]
                if confidence > prev_conf:
                    filtered[-1] = mention
                    current_end = end
        
        return filtered
    
    def calculate_backstory_relevance(self, passage: str, profile: CharacterProfile) -> float:
        """Calculate how relevant passage is to known backstories"""
        if not profile.backstories:
            return 0.5  # Neutral if no backstory
        
        passage_lower = passage.lower()
        relevance_score = 0.0
        
        # Check for backstory keywords in passage
        backstory_keywords = set()
        for backstory in profile.backstories:
            # Extract meaningful words from backstory
            words = re.findall(r'\b[a-zA-Z]{4,}\b', backstory.lower())
            backstory_keywords.update(words)
        
        # Count keyword matches
        for keyword in backstory_keywords:
            if keyword in passage_lower:
                relevance_score += 0.1
        
        # Check for thematic consistency
        themes = {
            'revenge': ['revenge', 'vengeance', 'retribution'],
            'love': ['love', 'affection', 'devotion'],
            'betrayal': ['betray', 'treachery', 'deception'],
            'redemption': ['redemption', 'forgiveness', 'salvation'],
            'ambition': ['ambition', 'goal', 'purpose']
        }
        
        for theme, keywords in themes.items():
            for keyword in keywords:
                if keyword in passage_lower:
                    relevance_score += 0.05
        
        return min(1.0, relevance_score)
    
    def extract_passage_with_context(self, text: str, mention_positions: List[Tuple[int, int]], 
                                    profile: CharacterProfile) -> Optional[ExtractedEvidence]:
        """Extract a meaningful passage around mentions"""
        if not mention_positions:
            return None
        
        # Calculate center of mentions
        positions = [pos for start, end, _, _ in mention_positions for pos in (start, end)]
        if not positions:
            return None
            
        center = int(np.mean(positions))
        
        # Initial window
        start = max(0, center - self.min_window // 2)
        end = min(len(text), center + self.min_window // 2)
        
        # Expand to natural boundaries
        start = self.find_previous_boundary(text, start)
        end = self.find_next_boundary(text, end)
        
        # Ensure we don't exceed max window
        if end - start > self.max_window:
            # Center the window
            start = max(0, center - self.max_window // 2)
            end = min(len(text), center + self.max_window // 2)
            start = self.find_previous_boundary(text, start)
            end = self.find_next_boundary(text, end)
        
        passage = text[start:end]
        if len(passage) < 100:  # Too short to be meaningful
            return None
        
        # Analyze passage
        analysis = self.analyze_passage(passage, profile)
        
        # Skip if low quality
        if analysis['narrative_importance'] < 0.1:
            return None
        
        # Create evidence ID
        evidence_id = f"{profile.primary_name}_{start}_{end}_{hash(passage[:50]) & 0xffffffff}"
        
        return ExtractedEvidence(
            evidence_id=evidence_id,
            character_name=profile.primary_name,
            novel_title=profile.novel_title,
            passage_text=passage,
            passage_start=start,
            passage_end=end,
            context_window=end - start,
            evidence_type=analysis['evidence_type'],
            causal_markers=analysis['causal_markers'],
            emotional_markers=analysis['emotional_markers'],
            temporal_position=float(start) / len(text) if len(text) > 0 else 0.0,
            has_pronoun=any('pronoun' in m[2] for m in mention_positions),
            has_direct_name=any('name' in m[2] for m in mention_positions),
            character_role=analysis['character_role'],
            narrative_importance=analysis['narrative_importance'],
            backstory_relevance=self.calculate_backstory_relevance(passage, profile)
        )
    
    def find_previous_boundary(self, text: str, position: int) -> int:
        """Find previous sentence or paragraph boundary"""
        # Look for paragraph boundary first
        for i in range(position, max(0, position - 500), -1):
            if i > 1 and text[i-2:i] == '\n\n':
                return max(0, i-2)
        
        # Look for sentence boundary
        for i in range(position, max(0, position - 300), -1):
            if i > 0 and text[i-1] in '.!?' and (i == len(text) or text[i] in ' \n\t'):
                return max(0, i)
        
        return max(0, position - 200)
    
    def find_next_boundary(self, text: str, position: int) -> int:
        """Find next sentence or paragraph boundary"""
        # Look for paragraph boundary first
        for i in range(position, min(len(text), position + 500)):
            if i + 2 <= len(text) and text[i:i+2] == '\n\n':
                return min(len(text), i+2)
        
        # Look for sentence boundary
        for i in range(position, min(len(text), position + 300)):
            if text[i] in '.!?' and (i+1 >= len(text) or text[i+1] in ' \n\t'):
                return min(len(text), i+1)
        
        return min(len(text), position + 200)
    
    def analyze_passage(self, passage: str, profile: CharacterProfile) -> Dict:
        """Analyze passage content comprehensively"""
        passage_lower = passage.lower()
        char_name_lower = profile.primary_name.lower()
        
        # Count markers
        action_count = sum(1 for verb in self.action_verbs if f' {verb} ' in passage_lower)
        dialogue_count = passage.count('"') + passage.count("'")
        internal_count = sum(passage_lower.count(f' {marker} ') for marker in self.internal_markers)
        negative_count = sum(1 for marker in self.negative_markers if f' {marker} ' in passage_lower)
        
        # Find causal markers
        causal_found = [marker for marker in self.causal_markers if marker in passage_lower]
        
        # Find emotional markers
        emotional_found = [marker for marker in self.emotional_markers if marker in passage_lower]
        
        # Determine character role (subject vs object/mentioned)
        char_role = 'mentioned'
        sentences = re.split(r'[.!?]+', passage)
        for sentence in sentences[:3]:  # Check first few sentences
            sentence_lower = sentence.lower()
            # Check if character name is in sentence
            name_in_sentence = any(variant.lower() in sentence_lower for variant in profile.name_variants[:3])
            
            if name_in_sentence:
                # Simple grammar check: name followed by verb
                words = sentence.strip().split()
                for i, word in enumerate(words):
                    word_lower = word.lower()
                    # Check if this word contains a character name variant
                    for variant in profile.name_variants[:3]:
                        if variant.lower() in word_lower:
                            # Check next word for verb
                            if i < len(words) - 1:
                                next_word = words[i+1].lower()
                                if any(verb in next_word for verb in self.action_verbs):
                                    char_role = 'subject'
                                    break
                    if char_role == 'subject':
                        break
            if char_role == 'subject':
                break
        
        # Calculate narrative importance score
        importance = 0.0
        
        # Base importance
        importance += min(action_count * 0.05, 0.2)
        importance += min(dialogue_count * 0.02, 0.1)
        importance += min(internal_count * 0.1, 0.3)
        importance += min(len(causal_found) * 0.15, 0.3)
        importance += min(len(emotional_found) * 0.1, 0.2)
        
        # Bonus for character as subject
        if char_role == 'subject':
            importance += 0.15
        
        # Bonus for negative evidence (rare but important)
        if negative_count > 0:
            importance += 0.1
        
        importance = min(importance, 1.0)
        
        # Determine evidence type
        type_scores = {
            'action': action_count,
            'dialogue': dialogue_count * 3,  # Weight dialogue more
            'internal': internal_count * 2,  # Weight internal more
            'negative': negative_count * 5   # Weight negative evidence highest
        }
        
        evidence_type = max(type_scores.items(), key=lambda x: x[1])[0]
        
        return {
            'evidence_type': evidence_type,
            'causal_markers': causal_found,
            'emotional_markers': emotional_found,
            'character_role': char_role,
            'narrative_importance': float(importance),
            'action_count': int(action_count),
            'dialogue_count': int(dialogue_count),
            'internal_count': int(internal_count),
            'negative_count': int(negative_count)
        }
    
    def extract_evidence_for_profile(self, profile: CharacterProfile) -> List[ExtractedEvidence]:
        """Extract all evidence for a character profile"""
        if not profile.novel_text or len(profile.novel_text) < 1000:
            return []
        
        evidence_list = []
        text = profile.novel_text
        
        # Find all mentions
        mentions = self.find_character_mentions(text, profile)
        
        if not mentions:
            return []
        
        # Group nearby mentions
        groups = []
        current_group = []
        group_threshold = 300  # characters
        
        for mention in mentions:
            if not current_group:
                current_group.append(mention)
            else:
                last_end = current_group[-1][1]
                if mention[0] - last_end < group_threshold:
                    current_group.append(mention)
                else:
                    if current_group:
                        groups.append(current_group)
                    current_group = [mention]
        
        if current_group:
            groups.append(current_group)
        
        # Extract passages for each group
        for group in groups[:50]:  # Limit to 50 groups per character
            evidence = self.extract_passage_with_context(text, group, profile)
            if evidence:
                evidence_list.append(evidence)
        
        return evidence_list

print("\n5. Extracting character evidence...")

# Configure and run extractor (window sizes are in characters)
extractor_config = {
    'min_window': 200,
    'max_window': 1500,
    'target_window': 600
}
extractor = ContextExtractor(extractor_config)

# Extract evidence for all characters
all_evidence = []
for char_name, profile in character_profiles.items():
    evidence = extractor.extract_evidence_for_profile(profile)
    all_evidence.extend(evidence)
    # The bullet used to be stored as mis-decoded bytes and printed as garbage.
    print(f"   • {char_name}: {len(evidence)} evidence passages")

# ============================================
# STEP 5: Create Pathway Evidence Table
# ============================================

print("\n6. Creating Pathway evidence table...")

# Column order of the evidence frame. Passing it to the DataFrame constructor
# guarantees that the columns exist even when no evidence was extracted; without
# it an empty result produced a frame with no columns and the statistics step
# failed with a KeyError on 'character_name'.
evidence_columns = [
    'evidence_id', 'character_name', 'novel_title', 'passage_text',
    'passage_start', 'passage_end', 'context_window', 'evidence_type',
    'causal_markers', 'emotional_markers', 'temporal_position',
    'has_pronoun', 'has_direct_name', 'character_role',
    'narrative_importance', 'backstory_relevance', 'passage_length',
]

# Convert to DataFrame: one flat, plainly-typed record per evidence passage
evidence_data = []
for ev in all_evidence:
    evidence_data.append({
        'evidence_id': ev.evidence_id,
        'character_name': ev.character_name,
        'novel_title': ev.novel_title,
        'passage_text': ev.passage_text[:10000],  # Limit text length
        'passage_start': int(ev.passage_start),
        'passage_end': int(ev.passage_end),
        'context_window': int(ev.context_window),
        'evidence_type': ev.evidence_type,
        # Marker lists are flattened to '|'-separated strings, at most 10 each
        'causal_markers': '|'.join(ev.causal_markers[:10]),
        'emotional_markers': '|'.join(ev.emotional_markers[:10]),
        'temporal_position': float(ev.temporal_position),
        'has_pronoun': bool(ev.has_pronoun),
        'has_direct_name': bool(ev.has_direct_name),
        'character_role': ev.character_role,
        'narrative_importance': float(ev.narrative_importance),
        'backstory_relevance': float(ev.backstory_relevance),
        # Length of the full passage, before the 10000-character cut above
        'passage_length': int(len(ev.passage_text))
    })

evidence_df = pd.DataFrame(evidence_data, columns=evidence_columns)

# Create Pathway schema
class EvidenceSchema(pw.Schema):
    """Column layout of the evidence table; one column per key of the evidence records."""
    evidence_id: str
    character_name: str
    novel_title: str
    # Passage text, cut to at most 10000 characters.
    passage_text: str
    passage_start: int
    passage_end: int
    context_window: int
    evidence_type: str
    # Marker lists joined with '|' (at most 10 markers each).
    causal_markers: str
    emotional_markers: str
    temporal_position: float
    has_pronoun: bool
    has_direct_name: bool
    character_role: str
    narrative_importance: float
    backstory_relevance: float
    # Length of the full passage, measured before the 10000-character cut.
    passage_length: int

# Create Pathway table
# The evidence DataFrame is turned into a Pathway table so that it can be
# joined with joined_table further down.
character_evidence_table = pw.debug.table_from_pandas(evidence_df, schema=EvidenceSchema)

print(f"   Evidence table created with {len(evidence_df)} rows")

# ============================================
# STEP 6: Create Statistics Table
# ============================================

print("\n7. Creating statistics table...")

# Calculate statistics: one summary row per character, in order of first appearance
stats_data = []
for char_name, char_ev in evidence_df.groupby('character_name', sort=False):
    lengths = char_ev['passage_length']
    positions = char_ev['temporal_position']
    type_counts = char_ev['evidence_type'].value_counts()

    stats = {
        'character_name': char_name,
        'total_passages': int(len(char_ev)),
        'avg_passage_length': float(lengths.mean()),
        'total_chars_extracted': int(lengths.sum()),
        # Number of passages of each evidence type
        'action_count': int(type_counts.get('action', 0)),
        'dialogue_count': int(type_counts.get('dialogue', 0)),
        'internal_count': int(type_counts.get('internal', 0)),
        'negative_count': int(type_counts.get('negative', 0)),
        # Passages with at least one marker (non-empty joined string)
        'with_causal': int((char_ev['causal_markers'] != '').sum()),
        'with_emotional': int((char_ev['emotional_markers'] != '').sum()),
        # Share of the novel spanned between the earliest and latest passage
        'temporal_range': float(positions.max() - positions.min()),
        'avg_importance': float(char_ev['narrative_importance'].mean()),
        'avg_backstory_relevance': float(char_ev['backstory_relevance'].mean()),
        'pronoun_ratio': float(char_ev['has_pronoun'].mean()),
        'direct_name_ratio': float(char_ev['has_direct_name'].mean()),
        'subject_ratio': float((char_ev['character_role'] == 'subject').mean())
    }
    stats_data.append(stats)

stats_df = pd.DataFrame(stats_data)

# Pathway schema for the per-character statistics table. Field names and types
# mirror the keys of the `stats` records that are collected into stats_df.
class StatsSchema(pw.Schema):
    character_name: str             # character the row summarizes
    total_passages: int             # number of evidence rows for the character
    avg_passage_length: float       # mean of passage_length
    total_chars_extracted: int      # sum of passage_length
    action_count: int               # rows with evidence_type == 'action'
    dialogue_count: int             # rows with evidence_type == 'dialogue'
    internal_count: int             # rows with evidence_type == 'internal'
    negative_count: int             # rows with evidence_type == 'negative'
    with_causal: int                # rows whose causal_markers is not ''
    with_emotional: int             # rows whose emotional_markers is not ''
    temporal_range: float           # max minus min of temporal_position
    avg_importance: float           # mean of narrative_importance
    avg_backstory_relevance: float  # mean of backstory_relevance
    pronoun_ratio: float            # mean of has_pronoun
    direct_name_ratio: float        # mean of has_direct_name
    subject_ratio: float            # fraction of rows with character_role == 'subject'

# Wrap the per-character statistics DataFrame in a Pathway table typed by StatsSchema.
character_statistics_table = pw.debug.table_from_pandas(stats_df, schema=StatsSchema)

# ============================================
# STEP 7: Create Joined Evidence-Backstory Table
# ============================================

print("\n8. Creating joined evidence-backstory table...")

# Pair every backstory row with every evidence passage extracted for the same
# character. The match is on character_name alone, so the result holds one row
# per (backstory, passage) combination for that character.
# NOTE(review): novel_title is not part of the join key - confirm that
# character names never repeat across novels.
evidence_with_backstory = joined_table.join(
    character_evidence_table,
    pw.left.character_name == pw.right.character_name,
).select(
    # Columns taken from the backstory side (joined_table)
    sample_id=pw.left.sample_id,
    character_name=pw.left.character_name,
    backstory=pw.left.backstory,
    label=pw.left.label,
    novel_title=pw.left.novel_title,
    # Columns taken from the evidence side; passage_text is exposed as evidence_text
    evidence_id=pw.right.evidence_id,
    evidence_text=pw.right.passage_text,
    evidence_type=pw.right.evidence_type,
    causal_markers=pw.right.causal_markers,
    emotional_markers=pw.right.emotional_markers,
    temporal_position=pw.right.temporal_position,
    character_role=pw.right.character_role,
    narrative_importance=pw.right.narrative_importance,
    backstory_relevance=pw.right.backstory_relevance,
)

# Materialize the join as a pandas DataFrame so it can be inspected and saved
joined_evidence_df = pw.debug.table_to_pandas(evidence_with_backstory)
print(f"   Joined table created with {len(joined_evidence_df)} rows")

# ============================================
# STEP 8: Save Outputs and Generate Report
# ============================================

print("\n9. Saving outputs and generating report...")

# Phase 2 artifacts are written under the project root; an already existing
# directory is reused rather than treated as an error.
output_dir = PROJECT_ROOT.joinpath("phase2_output")
output_dir.mkdir(exist_ok=True)

# Convert numpy types to Python native types for JSON serialization
def convert_to_serializable(obj):
    """Recursively replace numpy values with JSON-friendly Python builtins.

    Args:
        obj: Any value. numpy scalars (bool, integer, floating), numpy arrays,
            dicts, lists and tuples are converted; every other value is
            returned unchanged.

    Returns:
        The converted value: numpy bool/integer/floating scalars become
        ``bool``/``int``/``float``, arrays become (nested) lists, and dicts,
        lists and tuples are rebuilt with their contents converted. Tuples
        stay tuples. numpy scalar dict keys are converted as well, because
        ``json.dump`` rejects keys such as ``np.int64``.
    """
    # np.bool_ is neither np.integer nor a Python bool, so it needs its own branch
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields builtins for numeric arrays; recursing also covers
        # object arrays and 0-d arrays (where tolist() returns a scalar)
        return convert_to_serializable(obj.tolist())
    if isinstance(obj, dict):
        return {
            (convert_to_serializable(key) if isinstance(key, np.generic) else key):
                convert_to_serializable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [convert_to_serializable(item) for item in obj]
    if isinstance(obj, tuple):
        # json serializes tuples as arrays but does not convert numpy items inside them
        return tuple(convert_to_serializable(item) for item in obj)
    return obj

# Save DataFrames
# Each frame is written without its index under a fixed file name in output_dir.
for frame, csv_name in (
    (evidence_df, "character_evidence.csv"),
    (stats_df, "character_statistics.csv"),
    (joined_evidence_df, "evidence_with_backstory.csv"),
):
    frame.to_csv(output_dir / csv_name, index=False)

# Generate quality report
total_evidence = evidence_df.shape[0]
total_characters = evidence_df['character_name'].nunique()

# Corpus-level totals and averages; the per-character average falls back to 0
# when no character is present.
report_summary = {
    'total_evidence_passages': int(total_evidence),
    'unique_characters': int(total_characters),
    'avg_passages_per_character': (
        float(total_evidence / total_characters) if total_characters > 0 else float(0)
    ),
    'total_chars_extracted': int(evidence_df['passage_length'].sum()),
    'avg_passage_length': float(evidence_df['passage_length'].mean()),
}

# Fractions of passages meeting each criterion, plus mean scores
report_quality = {
    'causal_context_present': float(evidence_df['causal_markers'].ne('').mean()),
    'emotional_context_present': float(evidence_df['emotional_markers'].ne('').mean()),
    'character_as_subject': float(evidence_df['character_role'].eq('subject').mean()),
    'avg_narrative_importance': float(evidence_df['narrative_importance'].mean()),
    'avg_backstory_relevance': float(evidence_df['backstory_relevance'].mean()),
}

# Number of evidence passages per character name
report_coverage = {
    char: int((evidence_df['character_name'] == char).sum())
    for char in evidence_df['character_name'].unique()
}

quality_report = {
    'phase': 'Phase 2 - Character Context Extraction',
    'timestamp': pd.Timestamp.now().isoformat(),
    'summary': report_summary,
    'evidence_quality': report_quality,
    'evidence_type_distribution': convert_to_serializable(
        evidence_df['evidence_type'].value_counts().to_dict()
    ),
    'extraction_config': extractor_config,
    'character_coverage': convert_to_serializable(report_coverage),
    'acknowledged_limitations': [
        "Pronoun resolution without full coreference",
        "Limited handling of name ambiguity across characters",
        "Simplified grammatical role detection",
        "Fixed context window sizes",
        "Basic backstory relevance scoring"
    ],
    'success_metrics_achieved': [
        "Temporal ordering preserved via position tracking",
        "Causal markers explicitly extracted",
        "Emotional context captured",
        "Negative evidence specifically identified",
        "Character actions prioritized",
        "Backstory relevance calculated",
        "Comprehensive metadata for Phase 3"
    ]
}

# Save report
# The whole report is passed through convert_to_serializable once more so that
# numpy values nested anywhere (e.g. inside extractor_config) are converted.
with open(output_dir / "quality_report.json", 'w') as f:
    json.dump(convert_to_serializable(quality_report), f, indent=2)

# ============================================
# STEP 9: Final Summary
# ============================================

# Console recap of the run. Section headers carry emoji and list items use a
# bullet character, so this source must be kept UTF-8 encoded.

print("\n" + "=" * 80)
print("PHASE 2 COMPLETE - SUCCESSFULLY BUILT FROM PHASE 1")
print("=" * 80)

print("\n🎯 RESULTS SUMMARY:")
print(f"   Total evidence passages: {total_evidence}")
print(f"   Unique characters covered: {total_characters}")
# Skip the average when there are no characters to avoid dividing by zero
if total_characters > 0:
    print(f"   Average passages per character: {total_evidence/total_characters:.1f}")

# The report stores these values as fractions/means; they are shown as percentages
print("\n📊 EVIDENCE QUALITY:")
print(f"   Passages with causal context: {quality_report['evidence_quality']['causal_context_present']*100:.1f}%")
print(f"   Passages with emotional context: {quality_report['evidence_quality']['emotional_context_present']*100:.1f}%")
print(f"   Character as active subject: {quality_report['evidence_quality']['character_as_subject']*100:.1f}%")
print(f"   Average narrative importance: {quality_report['evidence_quality']['avg_narrative_importance']*100:.1f}%")
print(f"   Average backstory relevance: {quality_report['evidence_quality']['avg_backstory_relevance']*100:.1f}%")

# Only these four evidence types are listed, in this fixed order; a type with
# no passages is absent from the distribution and is skipped.
print("\n📁 EVIDENCE TYPE DISTRIBUTION:")
evidence_dist = quality_report['evidence_type_distribution']
for ev_type in ['action', 'dialogue', 'internal', 'negative']:
    if ev_type in evidence_dist:
        count = evidence_dist[ev_type]
        percentage = (count / total_evidence) * 100
        print(f"   • {ev_type.title()}: {count} ({percentage:.1f}%)")

print(f"\n💾 OUTPUTS SAVED TO: {output_dir}")
print(f"   1. character_evidence.csv - {total_evidence} evidence passages")
print("   2. character_statistics.csv - Statistics per character")
print("   3. evidence_with_backstory.csv - Joined with Phase 1 backstories")
print("   4. quality_report.json - Comprehensive quality metrics")

print("\n🔗 PATHWAY TABLES CREATED:")
print("   1. character_evidence_table - All extracted evidence")
print("   2. character_statistics_table - Character-level statistics")
print("   3. evidence_with_backstory - Joined with original backstories")

print("\n🚀 READY FOR PHASE 3:")
print("   The evidence is now structured for contradiction detection.")
print("   Each passage has causal/emotional context, narrative importance,")
print("   backstory relevance, and temporal positioning.")

print("\n" + "=" * 80)
print("Character evidence successfully extracted with causal and emotional context preserved.")
print("Phase 3 can now analyze backstory consistency using this rich evidence set.")
print("=" * 80)

# Display sample evidence
if len(evidence_df) > 0:
    print("\n📝 SAMPLE EVIDENCE (first 3 passages):")
    print("-" * 80)
    for i, row in evidence_df.head(3).iterrows():
        print(f"\nCharacter: {row['character_name']}")
        print(f"Type: {row['evidence_type'].upper()} | Importance: {row['narrative_importance']:.2f}")
        # Empty marker strings are shown as the word 'None'
        print(f"Causal markers: {row['causal_markers'] if row['causal_markers'] else 'None'}")
        print(f"Emotional markers: {row['emotional_markers'] if row['emotional_markers'] else 'None'}")
        # Only the first 200 characters of the passage are previewed
        print(f"Preview: {row['passage_text'][:200]}...")
        print("-" * 80)

PHASE 2: CHARACTER CONTEXT EXTRACTION & EVIDENCE CONSTRUCTION
Goal: Isolate narrative evidence relevant to character development
      while preserving causal and emotional context.

1. Recreating Phase 1 environment...
   Project root: /root/DataDivas_KDSH_2026
   Data directory: /root/DataDivas_KDSH_2026/Data
   Books directory: /root/DataDivas_KDSH_2026/Data/Books

2. Recreating Phase 1 tables...
   Loaded: The Count of Monte Cristo - 464,020 words
   Loaded: In search of the castaways - 138,830 words


[2026-01-08T16:26:08]:INFO:Preparing Pathway computation
[2026-01-08T16:26:08]:INFO:Enter read_snapshot method with reader PosixLike
[2026-01-08T16:26:08]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 0 entries (1 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:08]:INFO:subscribe-0: Done writing 0 entries, time 1767889568374. Current batch writes took: 0 ms. All writes so far took: 0 ms.
[2026-01-08T16:26:08]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 80 entries (2 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:08]:INFO:subscribe-0: Done writing 80 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms.
[2026-01-08T16:26:08]:INFO:Preparing Pathway computation



3. Phase 1 tables recreated successfully:


[2026-01-08T16:26:08]:INFO:subscribe-0: Done writing 2 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms.
[2026-01-08T16:26:08]:INFO:Preparing Pathway computation
[2026-01-08T16:26:08]:INFO:Enter read_snapshot method with reader PosixLike
[2026-01-08T16:26:09]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 0 entries (1 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:09]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 80 entries (1 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:09]:INFO:subscribe-0: Done writing 31 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms.


   • train_table: 80 rows
   • books_table: 2 rows
   • joined_table: 31 rows

   Sample from joined_table:
   Characters: ['Faria', 'Noirtier']
   Novels: ['The Count of Monte Cristo']
   Labels: {'contradict': 16, 'consistent': 15}

4. Creating character profiles...
   Created profiles for 2 characters
   • Faria: 2 variants, 15 backstories
   • Noirtier: 2 variants, 16 backstories

5. Extracting character evidence...
   • Faria: 46 evidence passages
   • Noirtier: 45 evidence passages

6. Creating Pathway evidence table...
   Evidence table created with 91 rows

7. Creating statistics table...

8. Creating joined evidence-backstory table...


[2026-01-08T16:26:12]:INFO:Preparing Pathway computation
[2026-01-08T16:26:12]:INFO:Enter read_snapshot method with reader PosixLike
[2026-01-08T16:26:12]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 0 entries (1 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:12]:INFO:FileSystem(/root/DataDivas_KDSH_2026/Data/train.csv): 80 entries (2 minibatch(es)) have been sent to the engine
[2026-01-08T16:26:13]:INFO:subscribe-0: Done writing 1410 entries, closing data sink. Current batch writes took: 0 ms. All writes so far took: 0 ms.


   Joined table created with 1410 rows

9. Saving outputs and generating report...

PHASE 2 COMPLETE - SUCCESSFULLY BUILT FROM PHASE 1

🎯 RESULTS SUMMARY:
   Total evidence passages: 91
   Unique characters covered: 2
   Average passages per character: 45.5

📊 EVIDENCE QUALITY:
   Passages with causal context: 64.8%
   Passages with emotional context: 48.4%
   Character as active subject: 0.0%
   Average narrative importance: 32.4%
   Average backstory relevance: 55.0%

📁 EVIDENCE TYPE DISTRIBUTION:
   • Action: 38 (41.8%)
   • Internal: 4 (4.4%)
   • Negative: 49 (53.8%)

💾 OUTPUTS SAVED TO: /root/DataDivas_KDSH_2026/phase2_output
   1. character_evidence.csv - 91 evidence passages
   2. character_statistics.csv - Statistics per character
   3. evidence_with_backstory.csv - Joined with Phase 1 backstories
   4. quality_report.json - Comprehensive quality metrics

🔗 PATHWAY TABLES CREATED:
   1. character_evidence_table - All extracted evidence
   2. character_stat