# Cross-Border Tax Workflow Testing (Phase 3 Enhanced)

This notebook tests the complete science pipeline with all Phase 3 enhancements.

**Features Tested:**
- ✅ Phase 2: LLM-based tag assignment & question selection
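- ✅ Phase 3: Multi-fact extraction, smart module skipping, corrections (explanations, clarifications, follow-ups, and verification are controlled by the feature flags shown below but do not yet have a dedicated scenario in this notebook)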

**Owner:** Science Team  
**Last Updated:** 2025-09-30

## 1. Setup & Imports

In [None]:
import sys
import os
from pathlib import Path
import json
from datetime import datetime

# Put the current working directory at the front of sys.path, once.
# NOTE(review): presumably the notebook is launched from the backend directory
# so that the `science.*` imports in the next cell resolve — confirm.
backend_path = Path.cwd()
_backend_dir = str(backend_path)
if sys.path.count(_backend_dir) == 0:
    sys.path.insert(0, _backend_dir)

print(f"✓ Backend path: {backend_path}")
print("✓ Python path configured")

In [None]:
# Load environment variables
from dotenv import load_dotenv
load_dotenv(override=True)

# Import science modules
from science.agents.workflow import TaxConsultationWorkflow
from science.agents.state import create_initial_state
from science.config import science_config

print("✓ Science modules imported successfully")

# Active provider and the model configured for it.
print("\n📋 Current Configuration:")
print(f"  AI Provider: {science_config.AI_MODEL_PROVIDER}")
if science_config.AI_MODEL_PROVIDER == 'openai':
    _active_model = science_config.OPENAI_MODEL
else:
    # Any provider other than 'openai' reports the Gemini model setting.
    _active_model = science_config.GEMINI_MODEL
print(f"  Model: {_active_model}")

# (section header, ((display label, science_config attribute), ...)) in print order.
_flag_sections = (
    ("\n🔧 Phase 2 Features:", (
        ("LLM Tag Assignment", "USE_LLM_TAG_ASSIGNMENT"),
        ("LLM Question Selection", "USE_LLM_QUESTION_SELECTION"),
    )),
    ("\n🚀 Phase 3 Features:", (
        ("Multi-Fact Extraction", "USE_MULTI_FACT_EXTRACTION"),
        ("Smart Module Skipping", "USE_SMART_MODULE_SKIPPING"),
        ("Explanation Generation", "USE_EXPLANATION_GENERATION"),
        ("Auto-Clarification", "USE_AUTO_CLARIFICATION"),
        ("Adaptive Follow-ups", "USE_ADAPTIVE_FOLLOWUPS"),
        ("Verification Phase", "USE_VERIFICATION_PHASE"),
        ("Progressive Assignment", "USE_PROGRESSIVE_ASSIGNMENT"),
        ("Context Correction", "USE_CONTEXT_CORRECTION"),
    )),
)

for _section_title, _section_flags in _flag_sections:
    print(_section_title)
    for _flag_label, _flag_attr in _section_flags:
        print(f"  {_flag_label}: {getattr(science_config, _flag_attr)}")

## 2. Helper Functions

In [None]:
def print_separator(char="=", length=80):
    """Print a horizontal rule: `char` repeated `length` times."""
    rule = char * length
    print(rule)

def print_message(role: str, content: str, quick_replies: list = None):
    """Print one conversation turn framed by separator rules.

    Output is the upper-cased role followed by a colon, the content, and —
    only when `quick_replies` is non-empty — the replies joined with ", ".
    """
    print_separator()
    print(f"{role.upper()}:", content, sep="\n")
    if quick_replies:
        reply_options = ', '.join(quick_replies)
        print(f"\nQuick Replies: {reply_options}")
    print_separator()

def print_state_summary(state: dict):
    """Print a comprehensive summary of a consultation state.

    Always printed: basic info, assigned tags, module progress and
    transition status. Printed only when present and non-empty: extracted
    facts (last five), skipped modules, corrections, and tags needing
    verification.

    Every field is read defensively: a missing key or a ``None`` value is
    rendered as a placeholder or an empty collection instead of raising.

    Args:
        state: State mapping returned by the workflow's start/continue calls.
    """
    rule = "=" * 80

    def _clip(value, limit=60):
        """Return `value` as text, adding '...' only when it was cut to `limit` chars."""
        text = 'N/A' if value is None else str(value)
        return text if len(text) <= limit else f"{text[:limit]}..."

    def _items(key):
        """Return state[key], treating a missing or None value as an empty list."""
        return state.get(key) or []

    print("\n" + rule)
    print("STATE SUMMARY")
    print(rule)

    # Basic info
    print(f"Phase: {state.get('current_phase', 'N/A')}")
    print(f"Current Module: {state.get('current_module', 'None')}")
    print(f"Conversation Turns: {len(_items('messages'))}")
    print(f"Session ID: {state.get('session_id', 'N/A')}")

    # Tags
    assigned_tags = _items('assigned_tags')
    tag_confidence = state.get('tag_confidence') or {}
    tag_reasoning = state.get('tag_assignment_reasoning') or {}
    print(f"\n📌 Assigned Tags ({len(assigned_tags)}):")
    if assigned_tags:
        for tag in assigned_tags:
            reasoning = tag_reasoning.get(tag) or {}
            print(f"  • {tag}")
            print(f"    - Confidence: {tag_confidence.get(tag, 'unknown')}")
            print(f"    - Method: {reasoning.get('method', 'llm_analysis')}")
    else:
        print("  (none)")

    # Phase 3: Extracted facts
    extracted_facts = _items('extracted_facts')
    if extracted_facts:
        print(f"\n💡 Extracted Facts ({len(extracted_facts)}):")
        for fact in extracted_facts[-5:]:  # Last 5
            print(f"  • {fact.get('fact', 'N/A')}")
            print(f"    - Confidence: {fact.get('confidence', 'N/A')}")
            print(f"    - Evidence: {_clip(fact.get('evidence'))}")

    # Phase 3: Skipped modules
    skipped_modules = _items('skipped_modules')
    if skipped_modules:
        print(f"\n⏭️  Skipped Modules: {', '.join(map(str, skipped_modules))}")

    # Phase 3: Corrections
    corrections = _items('corrections_made')
    if corrections:
        print(f"\n✏️  Corrections Made: {len(corrections)}")
        for corr in corrections:
            print(f"  • Turn {corr.get('conversation_turn', 'N/A')}: {_clip(corr.get('reasoning'))}")

    # Phase 3: Verification
    needs_verification = _items('verification_needed')
    if needs_verification:
        print(f"\n🔍 Tags Needing Verification: {len(needs_verification)}")
        for v in needs_verification:
            print(f"  • {v.get('tag', 'N/A')} ({v.get('confidence', 'N/A')})")

    # Module progress
    completed = ', '.join(map(str, _items('completed_modules'))) or 'None'
    print(f"\n📚 Completed Modules: {completed}")
    print(f"Questions Asked: {len(_items('asked_question_ids'))}")
    print(f"Questions Skipped: {len(_items('skipped_question_ids'))}")

    # Transition status
    print(f"\n⚡ Ready to Transition: {state.get('should_transition', False)}")
    if state.get('transition_reason'):
        print(f"Transition Reason: {state['transition_reason']}")

    print(rule + "\n")

def print_forms_analysis(state: dict):
    """Print the forms-analysis portion of a consultation state.

    Shows the estimated complexity (upper-cased), each required form with
    its jurisdiction, priority, due date and description, then numbered
    recommendations and next steps.

    A missing key or a ``None`` value is rendered as 'N/A' or an empty list
    instead of raising.

    Args:
        state: State mapping returned by the workflow calls.
    """
    rule = "=" * 80

    def _items(key):
        """Return state[key], treating a missing or None value as an empty list."""
        return state.get(key) or []

    print("\n" + rule)
    print("FORMS ANALYSIS RESULTS")
    print(rule)

    # `or` also covers a key that is present but holds None.
    complexity = state.get('estimated_complexity') or 'N/A'
    print(f"\nComplexity: {str(complexity).upper()}")

    required_forms = _items('required_forms')
    print(f"\n📄 Required Forms ({len(required_forms)}):")
    for form in required_forms:
        print(f"\n  {form.get('form', 'N/A')} - {form.get('jurisdiction', 'N/A')}")
        print(f"    Priority: {form.get('priority', 'N/A')}")
        print(f"    Due: {form.get('due_date', 'N/A')}")
        print(f"    Description: {form.get('description', 'N/A')}")

    print("\n💡 Recommendations:")
    for i, rec in enumerate(_items('recommendations'), 1):
        print(f"  {i}. {rec}")

    print("\n📝 Next Steps:")
    for i, step in enumerate(_items('next_steps'), 1):
        print(f"  {i}. {step}")

    print(rule + "\n")

# Confirmation that this cell ran and the helper functions above are defined.
print("✓ Helper functions loaded")

## 3. Test Scenario 1: Multi-Fact Extraction

Tests Phase 3 multi-fact extraction by providing a complex initial response.

In [None]:
# Scenario banner
_banner = "#" * 80
print(f"\n{_banner}")
print("# TEST SCENARIO 1: MULTI-FACT EXTRACTION")
print(f"{_banner}\n")

# Fresh workflow and a timestamped session id for this scenario
workflow = TaxConsultationWorkflow()
session_id = f"test_multifact_{datetime.now():%Y%m%d_%H%M%S}"

# One opening message that packs several distinct facts together
initial_message = """Hi, I'm a US citizen who moved to Canada last year for a job at a tech company. 
I still have my 401k account from my previous US employer, and I'm renting out a condo I own in Seattle. 
I also opened an RRSP account here in Canada for retirement savings."""

print("🧪 Testing multi-fact extraction from complex initial response...\n")
print_message("user", initial_message)

# Start the consultation through the synchronous entry point
result = workflow.start_consultation_sync(initial_message, session_id=session_id)

# Show the assistant's reply, then the resulting state
print_message("assistant", result['assistant_response'], result.get('quick_replies', []))
print_state_summary(result)

# Manual checklist to compare against the state summary printed above
for _line in (
    "\n📊 EXPECTED BEHAVIOR:",
    "System should extract multiple facts from single response:",
    "  ✓ US citizenship → us_person_worldwide_filing",
    "  ✓ Moved to Canada → cross_border_residency, residency_change_dual_status",
    "  ✓ 401k → cross_border_retirement_plans",
    "  ✓ Seattle rental → us_person_us_rental",
    "  ✓ RRSP → tfsa_resp_us_person (potentially)",
    "\n📌 CHECK: Did system assign 4-5 tags from this single response?",
):
    print(_line)

## 4. Test Scenario 2: Smart Module Skipping

Tests Phase 3 smart module skipping when the user clearly indicates they have no business.

In [None]:
# Scenario banner
_banner = "#" * 80
print(f"\n{_banner}")
print("# TEST SCENARIO 2: SMART MODULE SKIPPING")
print(f"{_banner}\n")

# Fresh workflow and a timestamped session id for this scenario
workflow2 = TaxConsultationWorkflow()
session_id2 = f"test_skip_{datetime.now():%Y%m%d_%H%M%S}"

# Turn 1: user states plainly that they are an employee with no business
message1 = "I'm a W-2 employee at a Canadian company. No business ownership, just regular employment."
print_message("user", message1)
result2 = workflow2.start_consultation_sync(message1, session_id=session_id2)
print_message("assistant", result2['assistant_response'], result2.get('quick_replies', []))
print_state_summary(result2)

# Turn 2: continue the same session
message2 = "Yes, I'm a Canadian resident"
print_message("user", message2)
result2 = workflow2.continue_consultation_sync(session_id2, message2)
print_message("assistant", result2['assistant_response'], result2.get('quick_replies', []))
print_state_summary(result2)

# Manual checklist to compare against the state summaries printed above
for _line in (
    "\n📊 EXPECTED BEHAVIOR:",
    "System should:",
    "  ✓ Detect 'W-2 employee' and 'No business ownership'",
    "  ✓ Mark 'business_entities' module as skipped",
    "  ✓ Never ask business-related questions",
    "\n📌 CHECK: Is 'business_entities' in Skipped Modules list above?",
):
    print(_line)

## 5. Test Scenario 3: Context Correction

Tests Phase 3 correction handling when the user corrects a previous answer.

In [None]:
# Scenario banner
_banner = "#" * 80
print(f"\n{_banner}")
print("# TEST SCENARIO 3: CONTEXT CORRECTION")
print(f"{_banner}\n")

# Fresh workflow and a timestamped session id for this scenario
workflow3 = TaxConsultationWorkflow()
session_id3 = f"test_correct_{datetime.now():%Y%m%d_%H%M%S}"

# Turn 1: opening statement
msg1 = "I'm a Canadian resident"
print_message("user", msg1)
result3 = workflow3.start_consultation_sync(msg1, session_id=session_id3)
print_message("assistant", result3['assistant_response'], result3.get('quick_replies', []))

# Turn 2: user denies having a business (contradicted by the correction below)
msg2 = "No, I don't have any businesses"
print_message("user", msg2)
result3 = workflow3.continue_consultation_sync(session_id3, msg2)
print_message("assistant", result3['assistant_response'], result3.get('quick_replies', []))

print("\n⏳ A few turns later...\n")

# Turn 3: user takes back the earlier "no businesses" answer
correction = """Actually, wait - I forgot to mention I do have a small LLC for freelance consulting work. 
It's not very active, so I didn't think of it as a 'business' initially."""

print_message("user", correction)
result3 = workflow3.continue_consultation_sync(session_id3, correction)
print_message("assistant", result3['assistant_response'], result3.get('quick_replies', []))
print_state_summary(result3)

# Manual checklist to compare against the state summary printed above
for _line in (
    "\n📊 EXPECTED BEHAVIOR:",
    "System should:",
    "  ✓ Detect correction keywords ('Actually, wait')",
    "  ✓ Add business-related tags (business_entity_foreign_ownership, etc.)",
    "  ✓ Re-enable business_entities module",
    "  ✓ Log correction in corrections_made[]",
    "\n📌 CHECK: Are there entries in 'Corrections Made' section above?",
):
    print(_line)

## 6. Test Scenario 4: Full Conversation to Forms Analysis

Complete conversation flow through to forms analysis phase.

In [None]:
# Scenario banner
_banner = "#" * 80
print(f"\n{_banner}")
print("# TEST SCENARIO 4: FULL CONVERSATION FLOW")
print(f"{_banner}\n")

# Fresh workflow and a timestamped session id for this scenario
workflow4 = TaxConsultationWorkflow()
session_id4 = f"test_full_{datetime.now():%Y%m%d_%H%M%S}"

# Scripted user side of the conversation, in order
messages = [
    "I'm a US citizen living in Canada",
    "Yes, I moved here last year",
    "I work as a W-2 employee",
    "I have bank accounts in both countries",
    "Yes, I own a rental property in the US",
    "I have an RRSP in Canada",
    "No other investments",
]

# Turn 1 opens the session
print_message("user", messages[0])
result4 = workflow4.start_consultation_sync(messages[0], session_id=session_id4)
print_message("assistant", result4['assistant_response'], result4.get('quick_replies', []))
print_state_summary(result4)

# Remaining turns continue the session; stop as soon as forms analysis is reached
for turn_number, user_text in enumerate(messages[1:], start=2):
    print(f"\n--- Turn {turn_number} ---\n")
    print_message("user", user_text)
    result4 = workflow4.continue_consultation_sync(session_id4, user_text)
    print_message("assistant", result4['assistant_response'], result4.get('quick_replies', []))
    print_state_summary(result4)

    if result4['current_phase'] != 'forms_analysis':
        continue

    print("\n🎉 TRANSITIONED TO FORMS ANALYSIS!")
    print_forms_analysis(result4)
    break

# Scripted turns ran out without a transition: force one so the scenario
# always ends with a forms analysis
if result4['current_phase'] != 'forms_analysis':
    print("\n⏭️  Forcing transition to forms analysis...")
    result4 = workflow4.force_transition_to_forms_analysis_sync(session_id4)
    print_message("assistant", result4['assistant_response'])
    print_forms_analysis(result4)

## 7. Interactive Testing

Run your own custom conversation.

In [None]:
print("\n" + "#"*80)
print("# INTERACTIVE TESTING")
print("#"*80 + "\n")

# Create interactive workflow
interactive_workflow = TaxConsultationWorkflow()
interactive_session = "test_interactive_" + datetime.now().strftime("%Y%m%d_%H%M%S")

print("Type your messages below.")
print("Commands: 'quit' to exit, 'state' to see state summary, 'force' to force forms analysis\n")

# A single loop handles the first and all later inputs, so commands are
# recognised uniformly and are never forwarded to the workflow as user text.
_session_started = False
_prompt = "Your message: "

while True:
    user_input = input(_prompt)
    _prompt = "\nYour message: "

    # Match commands on a trimmed, lower-cased copy; the raw text is what
    # gets sent to the workflow.
    command = user_input.strip().lower()

    if not command:
        # Blank input: nothing to send.
        continue

    if command == 'quit':
        if _session_started:
            print("Ending session...")
        break

    if command in ('state', 'force') and not _session_started:
        # No state exists before the first message has been sent.
        print(f"'{command}' is only available after the first message has been sent.")
        continue

    if command == 'state':
        print_state_summary(result)
        continue

    if command == 'force':
        result = interactive_workflow.force_transition_to_forms_analysis_sync(interactive_session)
        print_message("assistant", result['assistant_response'])
        print_forms_analysis(result)
        break

    if _session_started:
        result = interactive_workflow.continue_consultation_sync(interactive_session, user_input)
    else:
        result = interactive_workflow.start_consultation_sync(user_input, session_id=interactive_session)
        _session_started = True
    print_message("assistant", result['assistant_response'], result.get('quick_replies', []))

    if result['current_phase'] == 'forms_analysis':
        print("\n🎉 Conversation complete! Forms analysis generated.")
        print_forms_analysis(result)
        break

## 8. Feature Flag Testing

Compare behavior with Phase 3 features on vs off.

In [None]:
print("\n" + "#"*80)
print("# FEATURE FLAG TESTING: PHASE 2 vs PHASE 3")
print("#"*80 + "\n")

# Short key -> science_config attribute name for every Phase 3 flag.
_phase3_flag_attrs = {
    'multi_fact': 'USE_MULTI_FACT_EXTRACTION',
    'smart_skip': 'USE_SMART_MODULE_SKIPPING',
    'explanation': 'USE_EXPLANATION_GENERATION',
    'clarification': 'USE_AUTO_CLARIFICATION',
    'followups': 'USE_ADAPTIVE_FOLLOWUPS',
    'verification': 'USE_VERIFICATION_PHASE',
    'progressive': 'USE_PROGRESSIVE_ASSIGNMENT',
    'correction': 'USE_CONTEXT_CORRECTION',
}

# Save original config
original_flags = {
    key: getattr(science_config, attr)
    for key, attr in _phase3_flag_attrs.items()
}

# Test message
test_msg = "I'm a US citizen living in Canada with rental property in Seattle and an RRSP account"

print("🧪 Test Message:")
print(f"  '{test_msg}'\n")

# Test with Phase 3 ON
# NOTE(review): this run uses whatever the flags currently are; it does not
# force them to True. Confirm they are enabled in the environment.
print("\n" + "="*80)
print("PHASE 3 ENABLED")
print("="*80 + "\n")

workflow_p3 = TaxConsultationWorkflow()
session_p3 = "test_p3on_" + datetime.now().strftime("%Y%m%d_%H%M%S")
result_p3 = workflow_p3.start_consultation_sync(test_msg, session_id=session_p3)

print(f"Tags assigned: {len(result_p3['assigned_tags'])}")
print(f"Tags: {result_p3['assigned_tags']}")
print(f"Extracted facts: {len(result_p3.get('extracted_facts', []))}")

# science_config is shared by every cell in this kernel, so the flags must be
# restored even if the Phase 2 run raises — hence try/finally.
try:
    # Disable Phase 3
    # NOTE(review): flags are flipped before the workflow is constructed, as
    # in the original; whether the workflow reads them at construction or per
    # call is not visible here.
    for _attr in _phase3_flag_attrs.values():
        setattr(science_config, _attr, False)

    # Test with Phase 3 OFF
    print("\n" + "="*80)
    print("PHASE 3 DISABLED (Phase 2 only)")
    print("="*80 + "\n")

    workflow_p2 = TaxConsultationWorkflow()
    session_p2 = "test_p3off_" + datetime.now().strftime("%Y%m%d_%H%M%S")
    result_p2 = workflow_p2.start_consultation_sync(test_msg, session_id=session_p2)

    print(f"Tags assigned: {len(result_p2['assigned_tags'])}")
    print(f"Tags: {result_p2['assigned_tags']}")
    print(f"Extracted facts: {len(result_p2.get('extracted_facts', []))}")

    # Comparison
    print("\n" + "="*80)
    print("COMPARISON")
    print("="*80 + "\n")
    print(f"Phase 3: {len(result_p3['assigned_tags'])} tags assigned")
    print(f"Phase 2: {len(result_p2['assigned_tags'])} tags assigned")
    print(f"\nDifference: {len(result_p3['assigned_tags']) - len(result_p2['assigned_tags']):+d} tags")
    print("\n📊 Phase 3 multi-fact extraction should assign MORE tags from same response.")
finally:
    # Restore config
    for _key, _attr in _phase3_flag_attrs.items():
        setattr(science_config, _attr, original_flags[_key])

    print("\n✓ Configuration restored")

## 9. Export Session Data

Export session for debugging or analysis.

In [None]:
# Session to export — any session_id created above works, as long as the
# state is fetched from the workflow object that owns it.
export_session_id = session_id4  # From full conversation test

# Look up the stored state for that session
session_state = workflow4.get_session_state(export_session_id)

if not session_state:
    print(f"Session {export_session_id} not found")
else:
    # Timestamped file name, written to the current working directory
    output_file = f"session_export_{datetime.now():%Y%m%d_%H%M%S}.json"

    # default=str stringifies any value json cannot serialise natively
    with open(output_file, 'w', encoding='utf-8') as export_fh:
        json.dump(session_state, export_fh, indent=2, default=str)

    print(f"✓ Session exported to: {output_file}")
    print("\nSession Info:")
    print(f"  Phase: {session_state['current_phase']}")
    print(f"  Messages: {len(session_state['messages'])}")
    print(f"  Tags: {len(session_state['assigned_tags'])}")
    print(f"  Forms: {len(session_state.get('required_forms', []))}")

---

## Summary

**Features Tested:**
- ✅ Multi-fact extraction from complex responses
- ✅ Smart module skipping based on user situation  
- ✅ Context correction handling
- ✅ Full conversation flow to forms analysis
- ✅ Interactive testing capability
- ✅ Configuration comparison (Phase 2 vs Phase 3)
- ✅ Session data export

**Next Steps:**
1. Test with real user scenarios
2. Validate forms analysis accuracy
3. Monitor LLM costs
4. Collect user feedback
5. Test clarification and verification flows
6. Test adaptive follow-up generation

**Tips:**
- Modify `science/config.py` to toggle feature flags
- Edit `science/agents/prompts.py` to adjust LLM prompts
- Update `tax_team/knowledge_base/*.md` and re-run the parser
- Use `print_state_summary()` to inspect internal state

Happy testing! 🚀