In [1]:
# Database Inspector
# Inspect database contents and test get_data/update_by_indexes functions

import sys
import os
import json
import pandas as pd
from collections import Counter

# Add project root to path
sys.path.insert(0, os.getcwd())

from utils.enhanced_conversation_db import EnhancedConversationDB

print("✅ Database inspector loaded")

✅ Database inspector loaded


In [1]:
import sys
import os
from pathlib import Path
from layoutparser.models import Detectron2LayoutModel

# Add project root to Python path
current_path = Path.cwd()
project_root = current_path.parents[1]  # Go up two levels from notebooks/ to project root
sys.path.insert(0, str(project_root))

from rag_functions.main import process_document

sample_references = [
        "Sample reference document about AI ethics...",
        "Another reference about machine learning applications..."
    ]


output = process_document('/Users/alexander/Library/CloudStorage/Dropbox/Personal Research/cortex_bridge/paper/bsp_2.pdf', sample_references)
print(output)

  Referenced from: <FB2FD416-6C4D-3621-B677-61F07C02A3C5> /Users/alexander/miniforge3/envs/layout_parser/lib/python3.9/site-packages/torchvision/image.so
  warn(
  from .autonotebook import tqdm as notebook_tqdm
  return torch.load(f, map_location=torch.device("cpu"))
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


🔄 Switching to gemma3n:e4b...
✅ gemma3n:e4b warmed up in 0.11s
⚡ Model loaded in 0.11s
⚠️  Slow response: 23.70s
## Comprehensive Analysis of the Behavior Support Plan for John Doe

Here's a structured analysis of the provided Behavior Support Plan (BSP) for John Doe, covering key topics, findings, evidence, implications, and recommendations.

**1. Main Topics and Themes:**

*   **Behavioral Support:** The central theme is the development and implementation of a BSP to address John Doe's wandering and agitation.
*   **Geriatric Care:** The plan is framed within a geriatric care context, considering John Doe's age and potential cognitive decline.
*   **Non-Restrictive Interventions:** A strong emphasis is placed on avoiding restrictive practices and utilizing positive behavioral support strategies.
*   **Assessment-Driven Approach:** The plan is informed by assessments of John Doe's cognitive and behavioral state.
*   **Consultation and Documentation:** The plan highlights the importanc

# Inspect Database

In [5]:
import sys
import os
from pathlib import Path

# Find the project root and add to Python path
current_path = Path.cwd()
project_root = None

for parent in [current_path] + list(current_path.parents):
    if (parent / 'program_files').exists():
        project_root = parent
        break

if project_root is None:
    raise RuntimeError("Could not find project root directory")

sys.path.insert(0, str(project_root))

from datetime import datetime, timedelta
from program_files.database.enhanced_conversation_db import EnhancedConversationDB

def get_gemma_conversations_with_feedback(days_back=1):
    """
    Get conversations with Gemma from the last N days, grouped by session.
    
    Args:
        days_back (int): Number of days to look back (default: 1)
    
    Returns:
        dict: Dictionary with conversation sessions as keys and metadata as values
        Format: {
            "session_id": {
                "session_feedback": str,
                "full_text": str,
                "timestamp": datetime,
                "message_count": int
            }
        }
    """
    # Initialize the database
    db = EnhancedConversationDB()
    
    # Get all conversations with documents included
    results = db.conversations.get(include=['metadatas', 'documents'])
    
    # Calculate date range
    end_time = datetime.now()
    start_time = end_time - timedelta(days=days_back)
    
    # Filter for date range and group by session
    sessions = {}
    
    for i, metadata in enumerate(results['metadatas']):
        timestamp_str = metadata.get('timestamp')
        if not timestamp_str:
            continue
        
        try:
            # Parse timestamp
            conv_time = datetime.fromisoformat(timestamp_str.replace('Z', '+00:00')) if 'T' in timestamp_str else datetime.strptime(timestamp_str, '%Y-%m-%d %H:%M:%S')
            
            # Check if within date range
            if start_time <= conv_time <= end_time:
                session_id = metadata.get('session_id')
                if session_id not in sessions:
                    sessions[session_id] = []
                
                sessions[session_id].append({
                    'id': results['ids'][i],
                    'text': results['documents'][i],
                    'metadata': metadata,
                    'timestamp': conv_time,
                    'feedback_helpful': metadata.get('feedback_helpful', '')
                })
        except Exception as e:
            print(f"Error parsing timestamp {timestamp_str}: {e}")
            continue
    
    # Filter sessions that contain Gemma conversations and build result
    result_dict = {}
    conversation_counter = 1
    
    for session_id, session_conversations in sessions.items():
        # Check if this session has any Gemma conversations
        has_gemma = any('[GEMMA]' in conv['text'] for conv in session_conversations)
        
        if not has_gemma:
            continue  # Skip sessions without Gemma
        
        # Sort by timestamp
        session_conversations.sort(key=lambda x: x['timestamp'])
        
        # Get feedback for the session (use the first non-empty feedback found)
        session_feedback = None
        for conv in session_conversations:
            if conv['feedback_helpful'] and conv['feedback_helpful'] != 'unknown':
                session_feedback = conv['feedback_helpful']
                break
        
        # Build full conversation text
        conversation_lines = []
        for conv in session_conversations:
            # Extract just the speaker text (remove speaker/role prefix and [GEMMA] tag)
            full_text = conv['text']
            
            # Remove the speaker/role prefix (e.g., "Speaker_A (user): ")
            if ': ' in full_text:
                speaker_text = full_text.split(': ', 1)[1]
            else:
                speaker_text = full_text
            
            # Remove [GEMMA] tag if present
            speaker_text = speaker_text.replace(' [GEMMA]', '')
            
            # Add speaker name and text
            speaker = conv['metadata'].get('speaker', 'Unknown')
            conversation_lines.append(f"{speaker}: {speaker_text}")
        
        full_text = "\n".join(conversation_lines)
        
        # Add to result dictionary
        result_dict[f"conversation_{conversation_counter}"] = {
            "session_feedback": session_feedback,
            "full_text": full_text,
            "session_id": session_id,
            "timestamp": session_conversations[0]['timestamp'],
            "message_count": len(session_conversations)
        }
        
        conversation_counter += 1
    
    return result_dict

# Example usage:
conversations = get_gemma_conversations_with_feedback(days_back=1)

# Display the results
for conv_key, conv_data in conversations.items():
    print(f"\n{conv_key.upper()}:")
    print(f"Session Feedback: {conv_data['session_feedback']}")
    print(f"Session ID: {conv_data['session_id']}")
    print(f"Timestamp: {conv_data['timestamp']}")
    print(f"Message Count: {conv_data['message_count']}")
    print(f"Full Text:\n{conv_data['full_text']}")
    print("-" * 50)


CONVERSATION_1:
Session Feedback: True
Session ID: session_20250805_115307_390
Timestamp: 2025-08-05 11:53:07.390773
Message Count: 5
Full Text:
Speaker_A: can you help me
Gemma: Yes, I can definitely try! 😊

Please tell me what you need help with. The more information you can give me, the better I can assist you.  For example:

* **What are you trying to do?** (e.g., write an email, solve a math problem, find information, brainstorm ideas)
* **What's the problem you're encountering?** (e.g., I don't know where to start, I'm getting an error message, I'm stuck on a specific step)
* **What have you tried already?** (This helps me avoid suggesting things you've already done.)
* **What kind of help are you looking for?** (e.g., guidance, suggestions, code, explanations)

I'm here to help in any way I can. Just let me know what's on your mind!
Speaker_A: what is your name
Gemma: As a large language model, I don't have a name in the traditional sense. I was created by the Gemma team at Goo

In [7]:
conversations

{'conversation_1': {'session_feedback': 'True',
  'full_text': "Speaker_A: can you help me\nGemma: Yes, I can definitely try! 😊\n\nPlease tell me what you need help with. The more information you can give me, the better I can assist you.  For example:\n\n* **What are you trying to do?** (e.g., write an email, solve a math problem, find information, brainstorm ideas)\n* **What's the problem you're encountering?** (e.g., I don't know where to start, I'm getting an error message, I'm stuck on a specific step)\n* **What have you tried already?** (This helps me avoid suggesting things you've already done.)\n* **What kind of help are you looking for?** (e.g., guidance, suggestions, code, explanations)\n\nI'm here to help in any way I can. Just let me know what's on your mind!\nSpeaker_A: what is your name\nGemma: As a large language model, I don't have a name in the traditional sense. I was created by the Gemma team at Google DeepMind. You can think of me as an open-weights AI assistant! I'm

In [9]:
import json

with open('conversations.json', 'w') as f:
    json.dump(conversations, f, indent=4, default=str)
print(f"✅ Saved to conversations.json")

✅ Saved to conversations.json


# ML Clustering text

In [6]:
# Test 2: ML Clustering System Verification

print("🧪 ML CLUSTERING SYSTEM TEST")
print("=" * 50)

# Test ML clustering system
from ml.speaker_clustering import SpeakerClustering
clustering = SpeakerClustering()

# Check initial state
initial_stats = clustering.get_clustering_stats()
print(f"✅ Initial Clustering State:")
print(f"   Is clustered: {initial_stats['is_clustered']}")

# Create synthetic training data
import numpy as np

# Generate synthetic audio features for testing
def generate_synthetic_features(speaker_type="male", num_samples=5):
    """Generate synthetic audio features for testing"""
    features_list = []
    metadata_list = []
    
    base_features = {
        'male': {
            'energy': 1200.0, 'pitch_estimate': 100.0, 'formant_1': 400.0
        },
        'female': {
            'energy': 2000.0, 'pitch_estimate': 180.0, 'formant_1': 600.0
        }
    }
    
    base = base_features[speaker_type]
    
    for i in range(num_samples):
        # Add some variation
        features = {
            'energy': base['energy'] + np.random.normal(0, 100),
            'pitch_estimate': base['pitch_estimate'] + np.random.normal(0, 10),
            'zero_crossings': 35 + np.random.normal(0, 5),
            'spectral_centroid': 600 + np.random.normal(0, 50),
            'energy_variance': 150 + np.random.normal(0, 20),
            'peak_amplitude': 2500 + np.random.normal(0, 200),
            'rms_energy': 1000 + np.random.normal(0, 100),
            'mfcc_1': 0.3 + np.random.normal(0, 0.05),
            'mfcc_2': 0.2 + np.random.normal(0, 0.05),
            'mfcc_3': 0.1 + np.random.normal(0, 0.05),
            'formant_1': base['formant_1'] + np.random.normal(0, 20),
            'formant_2': 1200 + np.random.normal(0, 50),
            'jitter': 0.018 + np.random.normal(0, 0.005),
            'shimmer': 0.025 + np.random.normal(0, 0.005)
        }
        
        features_list.append(list(features.values()))
        metadata_list.append({
            'conversation_id': f'test_conv_{speaker_type}_{i}',
            'session_id': f'test_session_{speaker_type}',
            'speaker': f'{speaker_type.capitalize()}Speaker',
            'timestamp': datetime.now().isoformat(),
            'feature_count': 14
        })
    
    return features_list, metadata_list

# Generate test data
male_features, male_metadata = generate_synthetic_features("male", 8)
female_features, female_metadata = generate_synthetic_features("female", 8)

# Combine all features
all_features = male_features + female_features
all_metadata = male_metadata + female_metadata

print(f"✅ Generated Test Data:")
print(f"   Total samples: {len(all_features)}")
print(f"   Male samples: {len(male_features)}")
print(f"   Female samples: {len(female_features)}")
print(f"   Feature dimensions: {len(all_features[0])}")

# Test clustering
print(f"\n�� Testing K-means Clustering...")
result = clustering.perform_clustering(
    all_features, all_metadata,
    method='kmeans',
    n_clusters=2
)

if result['success']:
    print(f"✅ Clustering Successful!")
    print(f"   Clusters: {result['n_clusters']}")
    print(f"   Samples: {result['n_samples']}")
    print(f"   Silhouette score: {result['silhouette_score']:.3f}")
    print(f"   Method: {result['method']}")
    
    # Test speaker identification
    test_features = {
        'energy': 1300.0, 'pitch_estimate': 110.0, 'zero_crossings': 38,
        'spectral_centroid': 650.0, 'energy_variance': 160.0, 'peak_amplitude': 2600.0,
        'rms_energy': 1050.0, 'mfcc_1': 0.32, 'mfcc_2': 0.22, 'mfcc_3': 0.12,
        'formant_1': 420.0, 'formant_2': 1250.0, 'jitter': 0.019, 'shimmer': 0.026
    }
    
    speaker_name, confidence = clustering.identify_speaker_from_features(test_features)
    print(f"✅ Speaker Identification Test:")
    print(f"   Identified speaker: {speaker_name}")
    print(f"   Confidence: {confidence:.3f}")
    
else:
    print(f"❌ Clustering failed: {result.get('reason', 'unknown')}")

print("🎉 ML clustering system is working correctly!")

🧪 ML CLUSTERING SYSTEM TEST
✅ Initial Clustering State:
   Is clustered: False
✅ Generated Test Data:
   Total samples: 16
   Male samples: 8
   Female samples: 8
   Feature dimensions: 14

�� Testing K-means Clustering...
✅ Clustering Successful!
   Clusters: 2
   Samples: 16
   Silhouette score: 0.187
   Method: kmeans


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AttributeError: 'SpeakerClustering' object has no attribute 'identify_speaker_from_features'

In [None]:
# integration test

# Test 3: Database + ML Integration Test
print("🧪 INTEGRATION TEST")
print("=" * 50)

# Test the complete pipeline
from speech.speech_processor import SpeakerDetector
from ml.speaker_clustering import SpeakerClustering

# Initialize components
db = EnhancedConversationDB()
clustering = SpeakerClustering()
speaker_detector = SpeakerDetector(
    enhanced_db=db,
    speaker_clustering=clustering
)

print(f"✅ Components Initialized:")
print(f"   Database: {type(db).__name__}")
print(f"   Clustering: {type(clustering).__name__}")
print(f"   Speaker Detector: {type(speaker_detector).__name__}")

# Test speaker detection with ML
import numpy as np

# Simulate audio processing
dummy_audio = np.random.randint(-1000, 1000, size=2048, dtype=np.int16).tobytes()

print(f"\n🔧 Testing Speaker Detection with ML:")
print(f"   Initial speaker: {speaker_detector.current_speaker}")

# Process multiple audio frames to build up features
for i in range(10):
    speaker_detector.update_speaker_count(dummy_audio, 0)

print(f"   After processing: {speaker_detector.current_speaker}")
print(f"   Speaker changes: {speaker_detector.speaker_changes}")
print(f"   Feature buffer size: {len(speaker_detector.feature_buffer)}")

# Test feature extraction
current_features = speaker_detector.get_current_features()
print(f"   Current features available: {current_features is not None}")
if current_features:
    print(f"   Feature count: {len(current_features)}")

# Test database integration
if current_features:
    # Add to database
    db.add_conversation_with_audio(
        session_id="integration_test",
        text="Integration test conversation",
        speaker=speaker_detector.current_speaker,
        role="user",
        is_gemma_mode=False,
        audio_features=current_features
    )
    
    # Verify storage
    stats = db.get_conversation_stats()
    print(f"✅ Database Integration:")
    print(f"   Total conversations: {stats['total_conversations']}")
    print(f"   Audio features: {stats['total_audio_features']}")

print("🎉 Integration test completed successfully!")

In [None]:
# training pipeline test

# Test 4: Training Pipeline Test
print("🧪 TRAINING PIPELINE TEST")
print("=" * 50)

# Test the training script functionality
import sys
import os

# Add the tests directory to path
sys.path.append('tests')

# Import the training function
from train_unsupervised_speakers_new import train_unsupervised_speakers, analyze_clustering_quality

print("✅ Training functions imported successfully")

# Check if we have enough data for training
stats = db.get_conversation_stats()
print(f"📊 Current Data Status:")
print(f"   Audio features: {stats['total_audio_features']}")
print(f"   Need for training: 10+")

if stats['total_audio_features'] >= 10:
    print("✅ Sufficient data for training!")
    print("   Run: train_unsupervised_speakers() to train clustering")
else:
    print("⏳ Need more data for training")
    print("   Continue using the system to collect more audio features")

# Test analysis function
print(f"\n🔍 Testing Analysis Function:")
try:
    analyze_clustering_quality()
    print("✅ Analysis function works!")
except Exception as e:
    print(f"⚠️  Analysis function error: {e}")

print("🎉 Training pipeline test completed!")

# Delete database

In [3]:
import shutil
import os

if os.path.exists("data/vector_db"):
    shutil.rmtree("data/vector_db")
    print("✅ Old database deleted - ready for 14-feature data")

✅ Old database deleted - ready for 14-feature data


In [2]:
# Final database clear
import shutil
import os

if os.path.exists("data/vector_db"):
    shutil.rmtree("data/vector_db")
    print("✅ Database cleared for fresh start")

# Create completely fresh database
db = EnhancedConversationDB()
print("✅ Fresh database ready")

✅ Database cleared for fresh start


NameError: name 'EnhancedConversationDB' is not defined