In [1]:
# Database Inspector
# Inspect database contents and test get_data/update_by_indexes functions

import sys
import os
import json
import pandas as pd
from collections import Counter

# Add project root to path
sys.path.insert(0, os.getcwd())

from utils.enhanced_conversation_db import EnhancedConversationDB

print("✅ Database inspector loaded")

✅ Database inspector loaded


In [2]:
# Database Overview
db = EnhancedConversationDB()
stats = db.get_conversation_stats()

print("📊 Database Statistics:")
for key, value in stats.items():
    print(f"  {key}: {value}")

print(f"\n📈 Audio coverage: {stats['conversations_with_audio']}/{stats['total_conversations']} ({stats['conversations_with_audio']/max(stats['total_conversations'],1)*100:.1f}%)")

📊 Database Statistics:
  total_conversations: 27
  total_audio_features: 27
  conversations_with_audio: 27

📈 Audio coverage: 27/27 (100.0%)


In [3]:
# Audio Features Inspection
# Get all audio features using the general get_data function
features, metadata = db.get_data("audio_features", return_features=True)

print(f"🎵 Audio Features: {len(features)} samples")

if len(features) > 0:
    # Check dimensions
    dimensions = [len(f) for f in features]
    dim_counts = Counter(dimensions)
    print(f"\n📏 Feature dimensions: {dict(dim_counts)}")
    
    # Show sample features
    print(f"\n🔍 Sample features (first 3):")
    for i, (feat, meta) in enumerate(zip(features[:3], metadata[:3])):
        print(f"  Sample {i}: {len(feat)}D - {feat[:3]}... | Speaker: {meta.get('speaker', 'Unknown')}")
    
    # Get feature names from raw document
    raw_data = db.audio_features.get()
    if raw_data['documents']:
        latest_doc = json.loads(raw_data['documents'][-1])
        print(f"\n📝 Feature names: {latest_doc.get('feature_names', 'Not available')}")
else:
    print("No audio features found")

🎵 Audio Features: 27 samples

📏 Feature dimensions: {9: 27}

🔍 Sample features (first 3):
  Sample 0: 9D - [133.470703125, 168.15994262695312, 0.13678553700447083]... | Speaker: Speaker I
  Sample 1: 9D - [24.6630859375, 30.28441047668457, 0.02589154802262783]... | Speaker: Speaker R
  Sample 2: 9D - [344.06787109375, 407.9437255859375, 0.20420126616954803]... | Speaker: Speaker H

📝 Feature names: ['energy', 'rms_energy', 'zcr', 'spectral_centroid', 'spectral_rolloff', 'pitch', 'mfcc1', 'mfcc2', 'mfcc3']


In [4]:
metadata

[{'timestamp': '2025-08-04T18:00:56.641535',
  'conversation_id': 'session_20250804_180050_295_20250804_180056_536824',
  'speaker': 'Speaker I',
  'ml_speaker': 'C',
  'test_field': 'test_value_updated',
  'gmm_speaker': 'C',
  'test_number': 42.5,
  'gmm_confidence': 0.9999999999999992,
  'session_id': 'session_20250804_180050_295',
  'ml_speaker_confidence': 0.9999999999999992},
 {'ml_speaker_confidence': 1.0,
  'session_id': 'session_20250804_180050_295',
  'gmm_confidence': 1.0,
  'gmm_speaker': 'D',
  'timestamp': '2025-08-04T18:01:02.088907',
  'ml_speaker': 'D',
  'conversation_id': 'session_20250804_180050_295_20250804_180102_008583',
  'speaker': 'Speaker R'},
 {'session_id': 'session_20250804_180050_295',
  'speaker': 'Speaker H',
  'gmm_confidence': 1.0,
  'ml_speaker_confidence': 1.0,
  'timestamp': '2025-08-04T18:01:06.715642',
  'gmm_speaker': 'K',
  'ml_speaker': 'K',
  'conversation_id': 'session_20250804_180050_295_20250804_180106_641743'},
 {'gmm_speaker': 'D',
  'sp

In [4]:
# Test get_data Function
print("🧪 Testing get_data function:")

# Test 1: Audio features with features=True
features1, meta1 = db.get_data("audio_features", return_features=True)
print(f"✅ Audio features (return_features=True): {len(features1)} samples")
if len(features1) > 0:
    print(f"   Sample type: {type(features1[0])} with {len(features1[0])} values")

# Test 2: Audio features with features=False (raw documents)
docs1, meta2 = db.get_data("audio_features", return_features=False)
print(f"✅ Audio features (return_features=False): {len(docs1)} documents")
if len(docs1) > 0:
    print(f"   Sample type: {type(docs1[0])} (JSON string)")

# Test 3: Conversations
docs2, meta3 = db.get_data("conversations", return_features=False)
print(f"✅ Conversations: {len(docs2)} documents")
if len(docs2) > 0:
    print(f"   Sample: '{docs2[0][:50]}...'")

print("\n🎯 get_data function works correctly!")

🧪 Testing get_data function:
✅ Audio features (return_features=True): 15 samples
   Sample type: <class 'list'> with 9 values
✅ Audio features (return_features=False): 15 documents
   Sample type: <class 'str'> (JSON string)
✅ Conversations: 15 documents
   Sample: 'Speaker I (user): the two friends...'

🎯 get_data function works correctly!


In [5]:
# Test update_by_indexes Function
print("🧪 Testing update_by_indexes function:")

# Test update on audio features
if len(features) > 0:
    # Add a test field to first sample
    test_updates = {
        0: {'test_field': 'test_value_updated', 'test_number': 42.5}
    }
    
    print(f"📝 Updating index 0 with: {test_updates[0]}")
    updated_count = db.update_by_indexes(test_updates, "audio_features")
    print(f"✅ Updated {updated_count} entries")
    
    # Verify the update
    _, updated_meta = db.get_data("audio_features", return_features=True)
    if len(updated_meta) > 0:
        first_meta = updated_meta[0]
        print(f"🔍 Verification - Index 0 metadata:")
        print(f"   test_field: {first_meta.get('test_field', 'NOT FOUND')}")
        print(f"   test_number: {first_meta.get('test_number', 'NOT FOUND')}")
        print(f"   speaker: {first_meta.get('speaker', 'NOT FOUND')}")
        
        if first_meta.get('test_field') == 'test_value_updated':
            print("✅ update_by_indexes works correctly!")
        else:
            print("❌ Update verification failed")
else:
    print("⚠️ No audio features to test with")

🧪 Testing update_by_indexes function:
📝 Updating index 0 with: {'test_field': 'test_value_updated', 'test_number': 42.5}
✅ Updated 1 entries
🔍 Verification - Index 0 metadata:
   test_field: test_value_updated
   test_number: 42.5
   speaker: Speaker I
✅ update_by_indexes works correctly!


In [6]:
# Quick Actions and Commands
print("⚡ Quick Actions:")
print("\n# Reset database:")
print("!python reset_database.py")

print("\n# Run GMM clustering:")
print("!python speaker_gmm/gmm_clustering.py auto")
print("!python speaker_gmm/gmm_clustering.py update")

print("\n# Test pipeline:")
print("!python main.py")

⚡ Quick Actions:

# Reset database:
!python reset_database.py

# Run GMM clustering:
!python speaker_gmm/gmm_clustering.py auto
!python speaker_gmm/gmm_clustering.py update

# Test pipeline:
!python main.py


# ML Clustering text

In [6]:
# Test 2: ML Clustering System Verification

print("🧪 ML CLUSTERING SYSTEM TEST")
print("=" * 50)

# Test ML clustering system
from ml.speaker_clustering import SpeakerClustering
clustering = SpeakerClustering()

# Check initial state
initial_stats = clustering.get_clustering_stats()
print(f"✅ Initial Clustering State:")
print(f"   Is clustered: {initial_stats['is_clustered']}")

# Create synthetic training data
import numpy as np

# Generate synthetic audio features for testing
def generate_synthetic_features(speaker_type="male", num_samples=5):
    """Generate synthetic audio features for testing"""
    features_list = []
    metadata_list = []
    
    base_features = {
        'male': {
            'energy': 1200.0, 'pitch_estimate': 100.0, 'formant_1': 400.0
        },
        'female': {
            'energy': 2000.0, 'pitch_estimate': 180.0, 'formant_1': 600.0
        }
    }
    
    base = base_features[speaker_type]
    
    for i in range(num_samples):
        # Add some variation
        features = {
            'energy': base['energy'] + np.random.normal(0, 100),
            'pitch_estimate': base['pitch_estimate'] + np.random.normal(0, 10),
            'zero_crossings': 35 + np.random.normal(0, 5),
            'spectral_centroid': 600 + np.random.normal(0, 50),
            'energy_variance': 150 + np.random.normal(0, 20),
            'peak_amplitude': 2500 + np.random.normal(0, 200),
            'rms_energy': 1000 + np.random.normal(0, 100),
            'mfcc_1': 0.3 + np.random.normal(0, 0.05),
            'mfcc_2': 0.2 + np.random.normal(0, 0.05),
            'mfcc_3': 0.1 + np.random.normal(0, 0.05),
            'formant_1': base['formant_1'] + np.random.normal(0, 20),
            'formant_2': 1200 + np.random.normal(0, 50),
            'jitter': 0.018 + np.random.normal(0, 0.005),
            'shimmer': 0.025 + np.random.normal(0, 0.005)
        }
        
        features_list.append(list(features.values()))
        metadata_list.append({
            'conversation_id': f'test_conv_{speaker_type}_{i}',
            'session_id': f'test_session_{speaker_type}',
            'speaker': f'{speaker_type.capitalize()}Speaker',
            'timestamp': datetime.now().isoformat(),
            'feature_count': 14
        })
    
    return features_list, metadata_list

# Generate test data
male_features, male_metadata = generate_synthetic_features("male", 8)
female_features, female_metadata = generate_synthetic_features("female", 8)

# Combine all features
all_features = male_features + female_features
all_metadata = male_metadata + female_metadata

print(f"✅ Generated Test Data:")
print(f"   Total samples: {len(all_features)}")
print(f"   Male samples: {len(male_features)}")
print(f"   Female samples: {len(female_features)}")
print(f"   Feature dimensions: {len(all_features[0])}")

# Test clustering
print(f"\n�� Testing K-means Clustering...")
result = clustering.perform_clustering(
    all_features, all_metadata,
    method='kmeans',
    n_clusters=2
)

if result['success']:
    print(f"✅ Clustering Successful!")
    print(f"   Clusters: {result['n_clusters']}")
    print(f"   Samples: {result['n_samples']}")
    print(f"   Silhouette score: {result['silhouette_score']:.3f}")
    print(f"   Method: {result['method']}")
    
    # Test speaker identification
    test_features = {
        'energy': 1300.0, 'pitch_estimate': 110.0, 'zero_crossings': 38,
        'spectral_centroid': 650.0, 'energy_variance': 160.0, 'peak_amplitude': 2600.0,
        'rms_energy': 1050.0, 'mfcc_1': 0.32, 'mfcc_2': 0.22, 'mfcc_3': 0.12,
        'formant_1': 420.0, 'formant_2': 1250.0, 'jitter': 0.019, 'shimmer': 0.026
    }
    
    speaker_name, confidence = clustering.identify_speaker_from_features(test_features)
    print(f"✅ Speaker Identification Test:")
    print(f"   Identified speaker: {speaker_name}")
    print(f"   Confidence: {confidence:.3f}")
    
else:
    print(f"❌ Clustering failed: {result.get('reason', 'unknown')}")

print("🎉 ML clustering system is working correctly!")

🧪 ML CLUSTERING SYSTEM TEST
✅ Initial Clustering State:
   Is clustered: False
✅ Generated Test Data:
   Total samples: 16
   Male samples: 8
   Female samples: 8
   Feature dimensions: 14

�� Testing K-means Clustering...
✅ Clustering Successful!
   Clusters: 2
   Samples: 16
   Silhouette score: 0.187
   Method: kmeans


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


AttributeError: 'SpeakerClustering' object has no attribute 'identify_speaker_from_features'

In [None]:
# integration test

# Test 3: Database + ML Integration Test
print("🧪 INTEGRATION TEST")
print("=" * 50)

# Test the complete pipeline
from speech.speech_processor import SpeakerDetector
from ml.speaker_clustering import SpeakerClustering

# Initialize components
db = EnhancedConversationDB()
clustering = SpeakerClustering()
speaker_detector = SpeakerDetector(
    enhanced_db=db,
    speaker_clustering=clustering
)

print(f"✅ Components Initialized:")
print(f"   Database: {type(db).__name__}")
print(f"   Clustering: {type(clustering).__name__}")
print(f"   Speaker Detector: {type(speaker_detector).__name__}")

# Test speaker detection with ML
import numpy as np

# Simulate audio processing
dummy_audio = np.random.randint(-1000, 1000, size=2048, dtype=np.int16).tobytes()

print(f"\n🔧 Testing Speaker Detection with ML:")
print(f"   Initial speaker: {speaker_detector.current_speaker}")

# Process multiple audio frames to build up features
for i in range(10):
    speaker_detector.update_speaker_count(dummy_audio, 0)

print(f"   After processing: {speaker_detector.current_speaker}")
print(f"   Speaker changes: {speaker_detector.speaker_changes}")
print(f"   Feature buffer size: {len(speaker_detector.feature_buffer)}")

# Test feature extraction
current_features = speaker_detector.get_current_features()
print(f"   Current features available: {current_features is not None}")
if current_features:
    print(f"   Feature count: {len(current_features)}")

# Test database integration
if current_features:
    # Add to database
    db.add_conversation_with_audio(
        session_id="integration_test",
        text="Integration test conversation",
        speaker=speaker_detector.current_speaker,
        role="user",
        is_gemma_mode=False,
        audio_features=current_features
    )
    
    # Verify storage
    stats = db.get_conversation_stats()
    print(f"✅ Database Integration:")
    print(f"   Total conversations: {stats['total_conversations']}")
    print(f"   Audio features: {stats['total_audio_features']}")

print("🎉 Integration test completed successfully!")

In [None]:
# training pipeline test

# Test 4: Training Pipeline Test
print("🧪 TRAINING PIPELINE TEST")
print("=" * 50)

# Test the training script functionality
import sys
import os

# Add the tests directory to path
sys.path.append('tests')

# Import the training function
from train_unsupervised_speakers_new import train_unsupervised_speakers, analyze_clustering_quality

print("✅ Training functions imported successfully")

# Check if we have enough data for training
stats = db.get_conversation_stats()
print(f"📊 Current Data Status:")
print(f"   Audio features: {stats['total_audio_features']}")
print(f"   Need for training: 10+")

if stats['total_audio_features'] >= 10:
    print("✅ Sufficient data for training!")
    print("   Run: train_unsupervised_speakers() to train clustering")
else:
    print("⏳ Need more data for training")
    print("   Continue using the system to collect more audio features")

# Test analysis function
print(f"\n🔍 Testing Analysis Function:")
try:
    analyze_clustering_quality()
    print("✅ Analysis function works!")
except Exception as e:
    print(f"⚠️  Analysis function error: {e}")

print("🎉 Training pipeline test completed!")

# Delete database

In [3]:
import shutil
import os

if os.path.exists("data/vector_db"):
    shutil.rmtree("data/vector_db")
    print("✅ Old database deleted - ready for 14-feature data")

✅ Old database deleted - ready for 14-feature data


In [2]:
# Final database clear
import shutil
import os

if os.path.exists("data/vector_db"):
    shutil.rmtree("data/vector_db")
    print("✅ Database cleared for fresh start")

# Create completely fresh database
db = EnhancedConversationDB()
print("✅ Fresh database ready")

✅ Database cleared for fresh start


NameError: name 'EnhancedConversationDB' is not defined