In [None]:
# DMAS Long-Context Memory Evaluation Notebook
# This notebook evaluates the accuracy and cost of long-context vector vs graph memory
# in distributed LLM-based multi-agent systems

import requests
import json
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import time
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

# Configuration: base URLs of the three services this notebook talks to.
LOCOMO_URL = "http://localhost:8002"
COORDINATOR_URL = "http://localhost:8003"
MEMORY_URL = "http://localhost:8005"

# Banner: echo the configured endpoints and the start time of this run.
print(
    "üìä DMAS Long-Context Memory Evaluation",
    "=" * 50,
    f"Locomo URL: {LOCOMO_URL}",
    f"Coordinator URL: {COORDINATOR_URL}",
    f"Memory URL: {MEMORY_URL}",
    f"Started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "=" * 50,
    sep="\n",
)

In [None]:
# Utility Functions for API Communication

def check_service_health(url: str, service_name: str) -> bool:
    """Probe ``{url}/health`` and report whether the service answered HTTP 200.

    A one-line status message naming ``service_name`` is printed in every
    case. Returns ``True`` only for a 200 response; any other status code, or
    any exception raised while making the request, yields ``False``.
    """
    try:
        status = requests.get(f"{url}/health", timeout=5).status_code
    except Exception as e:
        print(f"‚ùå {service_name} is not reachable: {e}")
        return False

    if status != 200:
        print(f"‚ùå {service_name} returned status {status}")
        return False

    print(f"‚úÖ {service_name} is healthy")
    return True

def load_conversations_from_locomo(conv_index: Optional[int] = None) -> Dict[str, Any]:
    """Fetch conversations from the Locomo service.

    With ``conv_index`` given, the single conversation at that index is
    requested; otherwise the whole collection is requested. Returns the
    decoded JSON body, or ``{"error": <message>}`` (after printing the
    message) when anything goes wrong.
    """
    try:
        endpoint = f"{LOCOMO_URL}/conversations"
        if conv_index is not None:
            endpoint = f"{endpoint}/index/{conv_index}"

        reply = requests.get(endpoint, timeout=30)
        reply.raise_for_status()
        payload = reply.json()
    except Exception as e:
        print(f"Error loading conversations: {e}")
        payload = {"error": str(e)}
    return payload

def load_questions_from_locomo(conv_index: Optional[int] = None) -> Dict[str, Any]:
    """Load questions from the Locomo service.

    Args:
        conv_index: Index of a single conversation. When ``None``, questions
            are gathered for every conversation returned by
            ``load_conversations_from_locomo()``.

    Returns:
        For a single conversation, the decoded JSON body of
        ``/conversations/index/{conv_index}/questions``. For all
        conversations, ``{"questions": [...]}`` aggregated over every
        conversation that has a ``sample_id``. On failure (including a failed
        conversation listing), ``{"error": <message>}``.
    """
    try:
        if conv_index is not None:
            response = requests.get(
                f"{LOCOMO_URL}/conversations/index/{conv_index}/questions", timeout=30
            )
            response.raise_for_status()
            return response.json()

        # Get all conversations first, then extract questions per sample.
        conversations_data = load_conversations_from_locomo()

        # A failed listing must surface as an error; otherwise callers would
        # see an apparently valid but empty question set.
        if "error" in conversations_data:
            return {"error": conversations_data["error"]}

        all_questions = []
        for conv in conversations_data.get("conversations", []):
            sample_id = conv.get("sample_id")
            if not sample_id:
                continue

            questions_response = requests.get(
                f"{LOCOMO_URL}/conversations/{sample_id}/questions", timeout=30
            )
            if questions_response.status_code != 200:
                # Best effort: keep going, but make the gap visible.
                print(
                    f"Skipping questions for {sample_id}: "
                    f"HTTP {questions_response.status_code}"
                )
                continue

            all_questions.extend(questions_response.json().get("questions", []))

        return {"questions": all_questions}
    except Exception as e:
        print(f"Error loading questions: {e}")
        return {"error": str(e)}

def ask_question_via_coordinator(question: str) -> Dict[str, Any]:
    """POST ``question`` to the coordinator's ``/ask`` endpoint.

    Returns the decoded JSON reply. If the request, the status check or the
    JSON decoding raises, the message is printed and ``{"error": <message>}``
    is returned instead.
    """
    try:
        payload = {"question": question}
        reply = requests.post(f"{COORDINATOR_URL}/ask", json=payload, timeout=60)
        reply.raise_for_status()
        outcome = reply.json()
    except Exception as e:
        print(f"Error asking question: {e}")
        outcome = {"error": str(e)}
    return outcome

# Check all services: probe each endpoint once and remember the outcome per service.
print("üîç Checking service health...")
services_healthy = {
    key: check_service_health(base_url, label)
    for key, base_url, label in (
        ("locomo", LOCOMO_URL, "Locomo"),
        ("coordinator", COORDINATOR_URL, "Coordinator"),
        ("memory", MEMORY_URL, "Memory"),
    )
}

print(
    "‚úÖ All services are healthy!"
    if all(services_healthy.values())
    else "‚ö†Ô∏è Some services are not healthy. Please check docker-compose status."
)

In [None]:
# Load Conversations from Locomo
print("üìö Loading conversations from Locomo...")

# Fetch the whole collection in one request; `conversations` always ends up
# bound to a list (empty on any failure) for the cells that follow.
conversations_data = load_conversations_from_locomo()
if "error" in conversations_data:
    print(f"‚ùå Failed to load conversations: {conversations_data['error']}")
    conversations = []
else:
    print(f"‚úÖ Loaded {conversations_data.get('total', 0)} conversations")

    if "conversations" not in conversations_data:
        conversations = []
        print("‚ö†Ô∏è No conversations found in response")
    else:
        # One summary row per conversation; sessions are counted by their
        # "session_"-prefixed keys.
        conv_summary = [
            {
                "index": i,
                "sample_id": conv.get("sample_id", "N/A"),
                "speaker_a": conv.get("speaker_a", "N/A"),
                "speaker_b": conv.get("speaker_b", "N/A"),
                "sessions_count": sum(
                    1 for k in conv.get("sessions", {}).keys() if k.startswith("session_")
                ),
            }
            for i, conv in enumerate(conversations_data["conversations"])
        ]

        conv_df = pd.DataFrame(conv_summary)
        print("\nüìã Conversation Summary:")
        print(conv_df.to_string(index=False))

        # Keep the raw conversation records for later cells.
        conversations = conversations_data["conversations"]


In [None]:
# Load Questions from Locomo
print("‚ùì Loading questions from Locomo...")


def _preview_text(value, limit):
    """Return ``value`` as display text, cut to ``limit`` characters plus "...".

    ``None`` (a missing or null field) is shown as "N/A". Non-string values
    are converted with ``str`` first, so a numeric answer cannot break the
    ``len``/slice used for truncation.
    """
    if value is None:
        return "N/A"
    text = str(value)
    return text[:limit] + "..." if len(text) > limit else text


# Load all questions; `questions` always ends up bound to a list.
questions_data = load_questions_from_locomo()
if "error" not in questions_data:
    questions = questions_data.get("questions", [])
    print(f"‚úÖ Loaded {len(questions)} questions")

    if questions:
        # Display question summary (first 10 questions only).
        question_summary = [
            {
                "index": i,
                "sample_id": q.get("sample_id", "N/A"),
                "question": _preview_text(q.get("question"), 50),
                "answer": _preview_text(q.get("answer"), 30),
                "category": q.get("category", "N/A"),
            }
            for i, q in enumerate(questions[:10])
        ]

        questions_df = pd.DataFrame(question_summary)
        print("\n‚ùì Questions Summary (first 10):")
        print(questions_df.to_string(index=False))

        # Analyze question categories (records without a category are ignored).
        categories = [q.get("category") for q in questions if q.get("category")]
        if categories:
            category_counts = pd.Series(categories).value_counts()
            print("\nüìä Question Categories:")
            print(category_counts.to_string())
    else:
        print("‚ö†Ô∏è No questions found")
else:
    print(f"‚ùå Failed to load questions: {questions_data['error']}")
    questions = []


In [None]:
# F1 Textual Similarity Evaluation
print("üéØ Implementing F1 Textual Similarity Evaluation...")

# Load the sentence-embedding model used for semantic similarity. `model`
# stays None when loading fails, which the helpers below treat as
# "similarity unavailable".
model = None
try:
    model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as load_error:
    print(f"‚ùå Error loading sentence transformer: {load_error}")
else:
    print("‚úÖ Sentence transformer model loaded")

def calculate_textual_similarity(text1: str, text2: str) -> float:
    """Return the cosine similarity between the embeddings of two texts.

    Yields ``0.0`` when the embedding model is unavailable, when either text
    is empty, or when encoding/comparison raises (the error is printed).
    """
    if not (model and text1 and text2):
        return 0.0

    try:
        vectors = model.encode([text1, text2])
        score_matrix = cosine_similarity([vectors[0]], [vectors[1]])
        return float(score_matrix[0][0])
    except Exception as e:
        print(f"Error calculating similarity: {e}")
        return 0.0

def evaluate_answer_similarity(
    predicted_answer: str,
    ground_truth_answer: str,
    threshold: float = 0.7,
) -> Dict[str, Any]:
    """Score a predicted answer against the ground truth.

    Args:
        predicted_answer: Answer produced by the system under test.
        ground_truth_answer: Reference answer.
        threshold: Minimum similarity for the pair to count as similar.

    Returns:
        A dict that always has the same four keys:
        ``similarity`` (semantic similarity from
        ``calculate_textual_similarity``), ``f1_score`` (currently the same
        value, used as a simplified proxy rather than a token-level F1),
        ``is_similar`` (``similarity >= threshold``) and ``threshold``.
        When either answer is empty, the similarity is ``0.0`` and
        ``is_similar`` is ``False``.
    """
    if not predicted_answer or not ground_truth_answer:
        # Same key set as the normal path, so tabulating results never yields
        # a missing `is_similar`/`threshold` column.
        return {
            "similarity": 0.0,
            "f1_score": 0.0,
            "is_similar": False,
            "threshold": threshold,
        }

    semantic_similarity = calculate_textual_similarity(predicted_answer, ground_truth_answer)

    return {
        "similarity": semantic_similarity,
        # Simplified approach: semantic similarity stands in for F1.
        "f1_score": semantic_similarity,
        "is_similar": semantic_similarity >= threshold,
        "threshold": threshold,
    }

def run_evaluation_on_questions(
    questions: List[Dict],
    max_questions: int = 5,
    delay: float = 1.0,
) -> List[Dict]:
    """Ask the coordinator a subset of questions and score each answer.

    Args:
        questions: Question records; each is read for its ``"question"`` and
            ``"answer"`` keys.
        max_questions: Only the first ``max_questions`` records are used.
        delay: Seconds to pause after each scored question (rate limiting).
            Values ``<= 0`` disable the pause.

    Returns:
        One dict per scored question with the keys ``question_index``,
        ``question``, ``ground_truth``, ``predicted_answer`` and
        ``response_time``, plus everything returned by
        ``evaluate_answer_similarity``. Records lacking a question or an
        answer, and questions for which the coordinator reported an error,
        are skipped.
    """
    if not questions:
        print("‚ö†Ô∏è No questions available for evaluation")
        return []

    print(f"üîÑ Running evaluation on {min(max_questions, len(questions))} questions...")

    results = []
    for i, question_data in enumerate(questions[:max_questions]):
        question = question_data.get("question", "")

        # Reference answers may not be strings (e.g. a bare number); convert
        # them so they can be embedded and compared like any other answer.
        raw_truth = question_data.get("answer")
        ground_truth = "" if raw_truth is None else str(raw_truth)

        if not question or not ground_truth:
            print(f"‚ö†Ô∏è Skipping question {i}: missing question or answer")
            continue

        print(f"üìù Question {i+1}: {question[:50]}...")

        # Ask the question via coordinator and time the round trip.
        start_time = time.time()
        response = ask_question_via_coordinator(question)
        end_time = time.time()

        if "error" in response:
            print(f"‚ùå Error asking question {i+1}: {response['error']}")
            continue

        # A missing/null answer becomes "", which scores as similarity 0.0.
        raw_prediction = response.get("answer")
        predicted_answer = "" if raw_prediction is None else str(raw_prediction)

        # Evaluate similarity
        evaluation = evaluate_answer_similarity(predicted_answer, ground_truth)

        results.append({
            "question_index": i,
            "question": question,
            "ground_truth": ground_truth,
            "predicted_answer": predicted_answer,
            "response_time": end_time - start_time,
            **evaluation,
        })

        print(f"‚úÖ Question {i+1} completed - Similarity: {evaluation['similarity']:.3f}")
        if delay > 0:
            time.sleep(delay)  # Rate limiting

    return results

# Run evaluation if we have questions. `eval_df` is always bound afterwards:
# one row per scored question, or an empty frame when nothing could be scored.
if not (questions and model):
    print("‚ö†Ô∏è Cannot run evaluation - missing questions or model")
    eval_df = pd.DataFrame()
else:
    evaluation_results = run_evaluation_on_questions(questions, max_questions=3)

    if not evaluation_results:
        print("‚ö†Ô∏è No evaluation results generated")
        eval_df = pd.DataFrame()
    else:
        print("\nüìä Evaluation Results Summary:")
        similarities = [r["similarity"] for r in evaluation_results]
        response_times = [r["response_time"] for r in evaluation_results]

        print(
            f"Average Similarity: {np.mean(similarities):.3f}",
            f"Average Response Time: {np.mean(response_times):.2f}s",
            f"Max Similarity: {np.max(similarities):.3f}",
            f"Min Similarity: {np.min(similarities):.3f}",
            sep="\n",
        )

        # Store results for visualization
        eval_df = pd.DataFrame(evaluation_results)


In [None]:
# Visualization and Analysis
# Draws a 2x2 summary figure from `eval_df` (one row per scored question) and
# then prints a compact per-question table. Everything is skipped when
# `eval_df` is empty.
print("üìà Creating visualizations and analysis...")

if not eval_df.empty:
    # Set up the plotting style
    plt.style.use('default')
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('DMAS Long-Context Memory Evaluation Results', fontsize=16, fontweight='bold')
    
    # 1. Similarity Distribution
    # Histogram of the similarity scores with the mean marked as a dashed line.
    axes[0, 0].hist(eval_df['similarity'], bins=10, alpha=0.7, color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Similarity Score Distribution')
    axes[0, 0].set_xlabel('Similarity Score')
    axes[0, 0].set_ylabel('Frequency')
    axes[0, 0].axvline(eval_df['similarity'].mean(), color='red', linestyle='--', 
                      label=f'Mean: {eval_df["similarity"].mean():.3f}')
    axes[0, 0].legend()
    
    # 2. Response Time Analysis
    # The x positions are row positions in `eval_df`, not the original
    # `question_index` values (skipped questions leave gaps in the latter).
    axes[0, 1].bar(range(len(eval_df)), eval_df['response_time'], color='lightcoral', alpha=0.7)
    axes[0, 1].set_title('Response Time per Question')
    axes[0, 1].set_xlabel('Question Index')
    axes[0, 1].set_ylabel('Response Time (seconds)')
    axes[0, 1].axhline(eval_df['response_time'].mean(), color='red', linestyle='--',
                      label=f'Mean: {eval_df["response_time"].mean():.2f}s')
    axes[0, 1].legend()
    
    # 3. Similarity vs Response Time Scatter
    axes[1, 0].scatter(eval_df['response_time'], eval_df['similarity'], 
                      alpha=0.7, s=100, color='green')
    axes[1, 0].set_title('Similarity vs Response Time')
    axes[1, 0].set_xlabel('Response Time (seconds)')
    axes[1, 0].set_ylabel('Similarity Score')
    
    # Add correlation coefficient
    # NOTE(review): with very few rows the correlation is presumably NaN and
    # would be rendered as "nan" in the annotation — confirm.
    corr = eval_df['response_time'].corr(eval_df['similarity'])
    axes[1, 0].text(0.05, 0.95, f'Correlation: {corr:.3f}', 
                    transform=axes[1, 0].transAxes, fontsize=10,
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.8))
    
    # 4. Performance Summary
    # Text-only panel: the axes are hidden and the statistics are drawn as a
    # boxed block. `is_similar` is the per-question threshold flag produced by
    # evaluate_answer_similarity.
    axes[1, 1].axis('off')
    summary_text = f"""
    üìä Performance Summary
    
    Total Questions Evaluated: {len(eval_df)}
    
    Average Similarity: {eval_df['similarity'].mean():.3f}
    Max Similarity: {eval_df['similarity'].max():.3f}
    Min Similarity: {eval_df['similarity'].min():.3f}
    
    Average Response Time: {eval_df['response_time'].mean():.2f}s
    Max Response Time: {eval_df['response_time'].max():.2f}s
    Min Response Time: {eval_df['response_time'].min():.2f}s
    
    Questions Above Threshold: {sum(eval_df['is_similar'])}/{len(eval_df)}
    Success Rate: {sum(eval_df['is_similar'])/len(eval_df)*100:.1f}%
    """
    
    axes[1, 1].text(0.1, 0.9, summary_text, transform=axes[1, 1].transAxes,
                    fontsize=11, verticalalignment='top',
                    bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8))
    
    plt.tight_layout()
    plt.show()
    
    # Detailed results table
    print("\nüìã Detailed Evaluation Results:")
    display_cols = ['question_index', 'similarity', 'response_time', 'is_similar']
    print(eval_df[display_cols].to_string(index=False))
    
else:
    print("‚ö†Ô∏è No evaluation data available for visualization")


In [None]:
# Unit Testing for Memory System
print("üß™ Implementing Unit Tests for Memory System...")

import unittest
from unittest.mock import Mock, patch
import asyncio

class MemorySystemTests(unittest.TestCase):
    """Unit tests for the memory system"""
    
    def setUp(self):
        """Set up test fixtures"""
        self.test_conversation_data = {
            "sample_id": "test_conv_001",
            "speaker_a": "Alice",
            "speaker_b": "Bob",
            "sessions": {
                "session_1": [
                    {"speaker": "Alice", "text": "Hello Bob, how are you?", "dia_id": "1"},
                    {"speaker": "Bob", "text": "I'm doing well, thanks!", "dia_id": "2"}
                ]
            },
            "session_datetimes": {
                "session_1_date_time": "10:00 AM on 1 January, 2024"
            }
        }
        
        self.test_questions = [
            {
                "question": "How is Bob doing?",
                "answer": "Bob is doing well",
                "category": "emotional_state"
            },
            {
                "question": "Who spoke first?",
                "answer": "Alice spoke first",
                "category": "speaker_identification"
            }
        ]
    
    def test_conversation_data_structure(self):
        """Test that conversation data has required structure"""
        self.assertIn("sample_id", self.test_conversation_data)
        self.assertIn("sessions", self.test_conversation_data)
        self.assertIn("session_datetimes", self.test_conversation_data)
        
        # Check session structure
        sessions = self.test_conversation_data["sessions"]
        self.assertIsInstance(sessions, dict)
        self.assertIn("session_1", sessions)
        
        # Check session content
        session_1 = sessions["session_1"]
        self.assertIsInstance(session_1, list)
        self.assertEqual(len(session_1), 2)
        
        # Check turn structure
        turn = session_1[0]
        self.assertIn("speaker", turn)
        self.assertIn("text", turn)
        self.assertIn("dia_id", turn)
    
    def test_question_data_structure(self):
        """Test that question data has required structure"""
        for question in self.test_questions:
            self.assertIn("question", question)
            self.assertIn("answer", question)
            self.assertIn("category", question)
            
            # Check that fields are not empty
            self.assertTrue(question["question"].strip())
            self.assertTrue(question["answer"].strip())
    
    def test_similarity_calculation(self):
        """Test similarity calculation function"""
        # Test with identical texts
        similarity = calculate_textual_similarity("Hello world", "Hello world")
        self.assertAlmostEqual(similarity, 1.0, places=2)
        
        # Test with completely different texts
        similarity = calculate_textual_similarity("Hello world", "xyz abc def")
        self.assertLess(similarity, 0.5)
        
        # Test with empty texts
        similarity = calculate_textual_similarity("", "Hello world")
        self.assertEqual(similarity, 0.0)
    
    def test_evaluation_metrics(self):
        """Test evaluation metrics calculation"""
        # Test with identical answers
        evaluation = evaluate_answer_similarity("Bob is doing well", "Bob is doing well")
        self.assertGreater(evaluation["similarity"], 0.9)
        self.assertTrue(evaluation["is_similar"])
        
        # Test with different answers
        evaluation = evaluate_answer_similarity("Bob is doing well", "Alice is happy")
        self.assertLess(evaluation["similarity"], 0.8)
    
    @patch('requests.post')
    def test_api_error_handling(self, mock_post):
        """Test API error handling"""
        # Mock API error response
        mock_response = Mock()
        mock_response.status_code = 500
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError("Server Error")
        mock_post.return_value = mock_response
        
        # Test that error is handled gracefully
        result = ask_question_via_coordinator("Test question")
        self.assertIn("error", result)
    
    def test_data_validation(self):
        """Test data validation functions"""
        # Test valid conversation data
        self.assertTrue(self._is_valid_conversation_data(self.test_conversation_data))
        
        # Test invalid conversation data
        invalid_data = {"sample_id": "test"}
        self.assertFalse(self._is_valid_conversation_data(invalid_data))
    
    def _is_valid_conversation_data(self, data):
        """Helper function to validate conversation data"""
        required_fields = ["sample_id", "sessions", "session_datetimes"]
        return all(field in data for field in required_fields)

def run_memory_tests():
    """Run all memory system tests"""
    print("üß™ Running Memory System Unit Tests...")
    
    # Create test suite
    suite = unittest.TestLoader().loadTestsFromTestCase(MemorySystemTests)
    
    # Run tests
    runner = unittest.TextTestRunner(verbosity=2)
    result = runner.run(suite)
    
    # Print summary
    print(f"\nüìä Test Results Summary:")
    print(f"Tests run: {result.testsRun}")
    print(f"Failures: {len(result.failures)}")
    print(f"Errors: {len(result.errors)}")
    print(f"Success rate: {((result.testsRun - len(result.failures) - len(result.errors)) / result.testsRun * 100):.1f}%")
    
    if result.failures:
        print("\n‚ùå Test Failures:")
        for test, traceback in result.failures:
            print(f"- {test}: {traceback}")
    
    if result.errors:
        print("\n‚ùå Test Errors:")
        for test, traceback in result.errors:
            print(f"- {test}: {traceback}")
    
    return result.wasSuccessful()

# Run the tests and keep the overall outcome for the final report.
test_success = run_memory_tests()

print(
    "‚úÖ All memory system tests passed!"
    if test_success
    else "‚ö†Ô∏è Some memory system tests failed. Check the output above for details."
)


In [None]:
# Export Results and Generate Report
print("üìÑ Generating comprehensive evaluation report...")

def generate_evaluation_report():
    """Generate a comprehensive evaluation report.

    Collects the notebook-level state (service URLs, health checks, loaded
    conversations/questions, evaluation frame, unit-test outcome) into a
    single dict.

    Returns:
        A dict with the keys ``timestamp``, ``system_info``, ``data_summary``,
        ``evaluation_results`` (empty when ``eval_df`` is empty) and
        ``test_results``.
    """
    # `test_success` is a notebook-level (global) variable, so it has to be
    # looked up in globals(); locals() here only sees this function's own
    # names and would always report the tests as not passed.
    unit_tests_passed = bool(globals().get("test_success", False))

    report = {
        "timestamp": datetime.now().isoformat(),
        "system_info": {
            "locomo_url": LOCOMO_URL,
            "coordinator_url": COORDINATOR_URL,
            "memory_url": MEMORY_URL
        },
        "data_summary": {
            "total_conversations": len(conversations) if conversations else 0,
            "total_questions": len(questions) if questions else 0,
            "services_healthy": services_healthy
        },
        "evaluation_results": {},
        "test_results": {
            "unit_tests_passed": unit_tests_passed
        }
    }

    # Add evaluation results if available. Aggregates are converted to plain
    # floats before being stored in the report.
    if not eval_df.empty:
        report["evaluation_results"] = {
            "total_evaluated": len(eval_df),
            "average_similarity": float(eval_df['similarity'].mean()),
            "average_response_time": float(eval_df['response_time'].mean()),
            "max_similarity": float(eval_df['similarity'].max()),
            "min_similarity": float(eval_df['similarity'].min()),
            "success_rate": float(sum(eval_df['is_similar']) / len(eval_df) * 100),
            "detailed_results": eval_df.to_dict('records')
        }

    return report

def save_results_to_files():
    """Write the evaluation report and its supporting tables to disk.

    All files share one timestamp suffix and are written to the current
    working directory: the JSON report always, the evaluation CSV only when
    ``eval_df`` has rows, and the conversation summary CSV only when
    ``conversations`` is non-empty. Returns the report dict.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Generate report
    report = generate_evaluation_report()

    # JSON report
    json_filename = f"dmas_evaluation_report_{timestamp}.json"
    with open(json_filename, 'w') as report_file:
        json.dump(report, report_file, indent=2)
    print(f"‚úÖ JSON report saved: {json_filename}")

    # Per-question evaluation results
    if not eval_df.empty:
        csv_filename = f"dmas_evaluation_results_{timestamp}.csv"
        eval_df.to_csv(csv_filename, index=False)
        print(f"‚úÖ Evaluation results saved: {csv_filename}")

    # Conversation summary: one row per conversation, sessions counted by
    # their "session_"-prefixed keys.
    if conversations:
        summary_rows = [
            {
                "index": position,
                "sample_id": record.get("sample_id", "N/A"),
                "speaker_a": record.get("speaker_a", "N/A"),
                "speaker_b": record.get("speaker_b", "N/A"),
                "sessions_count": sum(
                    1 for key in record.get("sessions", {}).keys() if key.startswith("session_")
                ),
            }
            for position, record in enumerate(conversations)
        ]

        conv_csv_filename = f"dmas_conversations_summary_{timestamp}.csv"
        pd.DataFrame(summary_rows).to_csv(conv_csv_filename, index=False)
        print(f"‚úÖ Conversations summary saved: {conv_csv_filename}")

    return report

# Generate and save the report
final_report = save_results_to_files()

# Final summary: headline numbers for this run.
print(
    "\n" + "=" * 60,
    "üéØ DMAS LONG-CONTEXT MEMORY EVALUATION COMPLETE",
    "=" * 60,
    sep="\n",
)
print(f"üìÖ Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"üìä Total conversations loaded: {len(conversations) if conversations else 0}")
print(f"‚ùì Total questions loaded: {len(questions) if questions else 0}")
print(f"üß™ Unit tests passed: {'‚úÖ' if test_success else '‚ùå'}")

# Evaluation figures are only reported when at least one question was scored.
if not eval_df.empty:
    print(
        f"üéØ Questions evaluated: {len(eval_df)}",
        f"üìà Average similarity: {eval_df['similarity'].mean():.3f}",
        f"‚è±Ô∏è Average response time: {eval_df['response_time'].mean():.2f}s",
        f"üéØ Success rate: {sum(eval_df['is_similar'])/len(eval_df)*100:.1f}%",
        sep="\n",
    )

# List the files written by save_results_to_files (same conditions as there).
print("\nüìÅ Files generated:")
print("- dmas_evaluation_report_[timestamp].json")
if not eval_df.empty:
    print("- dmas_evaluation_results_[timestamp].csv")
if conversations:
    print("- dmas_conversations_summary_[timestamp].csv")

print(
    "\nüöÄ Next steps:",
    "1. Review the generated visualizations above",
    "2. Check the saved CSV files for detailed analysis",
    "3. Compare results between different memory backends",
    "4. Run additional evaluations with different question sets",
    "5. Implement cost analysis for different memory approaches",
    sep="\n",
)

print("=" * 60)
