# Face Duplicate Detection Demo

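This notebook demonstrates the duplicate-face detection system for bicycle videos. It uses GPU acceleration when CUDA is available and falls back to the CPU otherwise. The demo runs on a synthetic frame and simulated detections, so no video files are required.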

In [None]:
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import pandas as pd
import json
from PIL import Image

# Import our detection system
from src import DetectionConfig, DuplicateFaceDetector, FaceDetector
from src.models import BoundingBox

# Set up plotting
# Start from matplotlib's built-in default style so figures do not depend on
# any style sheet left active in the session.
plt.style.use('default')
# Use seaborn's "husl" palette as the default colour cycle.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## 1. System Setup and GPU Check

In [None]:
import torch

# Report interpreter / framework versions, then enumerate CUDA devices
# (or warn that the demo will fall back to the CPU).
for header_line in (
    "System Information:",
    f"Python version: {sys.version}",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
):
    print(header_line)

if not torch.cuda.is_available():
    print("⚠️  GPU not available - will use CPU (slower performance)")
else:
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU count: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        # total_memory is in bytes; reported here in decimal gigabytes
        device_props = torch.cuda.get_device_properties(i)
        memory_gb = device_props.total_memory / 1e9
        print(f"  Memory: {memory_gb:.1f} GB")

## 2. Configuration Setup

In [None]:
# Configure detection parameters
# Resolve the compute device first: CUDA when PyTorch can see a GPU, else CPU.
selected_device = "cpu"
if torch.cuda.is_available():
    selected_device = "cuda"

config = DetectionConfig(
    # Higher threshold for cleaner detections
    min_confidence=0.6,
    # Distance threshold for face matching
    match_threshold=0.5,
    # Adjust based on GPU memory
    batch_size=8,
    log_level="INFO",
    device=selected_device,
)

# Echo the effective settings so the run is self-describing
print("Detection Configuration:")
print(f"  Min confidence: {config.min_confidence}")
print(f"  Match threshold: {config.match_threshold}")
print(f"  Batch size: {config.batch_size}")
print(f"  Device: {config.device}")

## 3. Single Frame Face Detection Demo

In [None]:
# Initialize face detector
# Built once from the shared `config`; later cells call detect_faces() and
# extract_embeddings() on this same instance.
face_detector = FaceDetector(config)

# Create a sample frame (you can replace this with actual video frame)
def create_sample_frame_with_faces():
    """Draw a 480x640 three-channel uint8 test image with three cartoon faces.

    Each "face" is a filled rectangle with two filled circles for eyes and a
    half-ellipse outline for a mouth, painted over a uniform background.

    Returns:
        The image as a NumPy array of shape (480, 640, 3), dtype uint8.
    """
    # Uniform background; the 3-tuple is broadcast across the channel axis
    frame = np.full((480, 640, 3), (50, 100, 150), dtype=np.uint8)

    feature_color = (50, 50, 50)  # shared by eyes and mouth
    face_rects = ((100, 150, 80, 100), (300, 200, 90, 110), (450, 100, 75, 95))

    for idx, (left, top, width, height) in enumerate(face_rects):
        # Each successive face is filled with a slightly lighter colour
        fill = (200 + idx * 20, 180 + idx * 15, 160 + idx * 10)
        cv2.rectangle(frame, (left, top), (left + width, top + height), fill, -1)

        # Eyes: one third and two thirds across, one third of the way down
        eye_y = top + height // 3
        for eye_x in (left + width // 3, left + 2 * width // 3):
            cv2.circle(frame, (eye_x, eye_y), 5, feature_color, -1)

        # Mouth: the 0-180 degree arc of an ellipse, two thirds of the way down
        mouth_center = (left + width // 2, top + 2 * height // 3)
        mouth_axes = (width // 4, height // 8)
        cv2.ellipse(frame, mouth_center, mouth_axes, 0, 0, 180, feature_color, 2)

    return frame

# Create and display sample frame
sample_frame = create_sample_frame_with_faces()

# Convert with COLOR_BGR2RGB before handing the image to matplotlib
sample_frame_rgb = cv2.cvtColor(sample_frame, cv2.COLOR_BGR2RGB)

fig, ax = plt.subplots(figsize=(12, 8))
ax.imshow(sample_frame_rgb)
ax.set_title("Sample Frame with Synthetic Faces")
ax.axis('off')
plt.show()

In [None]:
# Demonstrate face detection on the sample frame
print("Running face detection...")
detections = face_detector.detect_faces(sample_frame)

print(f"Found {len(detections)} faces")

# Visualize detections on a copy so `sample_frame` stays untouched for the
# embedding cell that follows.
frame_with_boxes = sample_frame.copy()

box_color = (0, 255, 0)  # Green box and label
label_font = cv2.FONT_HERSHEY_SIMPLEX
label_scale = 0.5
label_thickness = 1

# NOTE(review): cv2 drawing calls need integer pixel coordinates; this assumes
# the detector's BoundingBox fields are ints — confirm against FaceDetector.
for i, (bbox, confidence) in enumerate(detections):
    # Draw bounding box
    cv2.rectangle(
        frame_with_boxes,
        (bbox.x, bbox.y),
        (bbox.x + bbox.width, bbox.y + bbox.height),
        box_color,
        2
    )

    # Add confidence label. putText anchors the text at its bottom-left
    # corner, so a fixed `bbox.y - 10` pushes the label off the image for
    # faces near the top edge; clamp it so the full text height stays visible.
    label = f"Face {i+1}: {confidence:.2f}"
    (label_width, label_height), label_baseline = cv2.getTextSize(
        label, label_font, label_scale, label_thickness
    )
    label_y = max(bbox.y - 10, label_height + 2)
    cv2.putText(
        frame_with_boxes,
        label,
        (bbox.x, label_y),
        label_font,
        label_scale,
        box_color,
        label_thickness
    )

plt.figure(figsize=(12, 8))
plt.imshow(cv2.cvtColor(frame_with_boxes, cv2.COLOR_BGR2RGB))
plt.title(f"Face Detection Results ({len(detections)} faces found)")
plt.axis('off')
plt.show()

## 4. Face Embedding Extraction Demo

In [None]:
# Extract embeddings for detected faces
if detections:
    print("Extracting face embeddings...")
    bboxes = [bbox for bbox, _ in detections]
    embeddings = face_detector.extract_embeddings(sample_frame, bboxes)

    print(f"Extracted {len(embeddings)} embeddings")

    # Use len() rather than truthiness: it works for a list and for an
    # ndarray alike, whereas `if embeddings:` raises ValueError on an array
    # holding more than one element.
    if len(embeddings) > 0:
        first_embedding = embeddings[0]
        print(f"Embedding shape: {first_embedding.shape}")
        print(f"Embedding range: [{first_embedding.min():.3f}, {first_embedding.max():.3f}]")

        # Visualize embedding distributions for at most the first three faces
        shown_embeddings = embeddings[:3]

        # squeeze=False always yields a 2-D array of Axes, so a single face
        # needs no special-casing and every embedding has a matching panel.
        fig, axes = plt.subplots(1, len(shown_embeddings), figsize=(15, 4), squeeze=False)

        for i, (ax, embedding) in enumerate(zip(axes[0], shown_embeddings)):
            ax.hist(embedding, bins=50, alpha=0.7)
            ax.set_title(f"Face {i+1} Embedding Distribution")
            ax.set_xlabel("Embedding Value")
            ax.set_ylabel("Frequency")

        plt.tight_layout()
        plt.show()
else:
    print("No faces detected for embedding extraction.")

## 5. Simulated Video Processing Demo

In [None]:
# Simulate processing multiple video frames
from src.models import FaceDetection

def simulate_video_processing():
    """Fabricate face detections for three videos with recurring identities.

    Four base identities each get a random 512-dimensional embedding. Every
    video receives 2-4 detections; each detection copies one identity's
    embedding and adds Gaussian noise scaled by 0.1. From the second video
    onward there is a 40% chance that the identity is drawn only from
    person_A / person_B, which makes cross-video duplicates more likely.

    The global NumPy RNG is seeded with 42, so repeated calls return the same
    data (and leave the global RNG in the same state).

    Returns:
        list of FaceDetection objects with temporary ids ``temp_face_<n>``.
    """
    video_names = ["bicycle_video_001.mp4", "bicycle_video_002.mp4", "bicycle_video_003.mp4"]

    # For reproducible results; every draw below comes from this seeded stream
    np.random.seed(42)

    # One base embedding per person, drawn in A, B, C, D order
    identity_names = ("person_A", "person_B", "person_C", "person_D")
    person_embeddings = {name: np.random.randn(512) for name in identity_names}
    recurring_people = ["person_A", "person_B"]

    results = []

    for video_idx, video_name in enumerate(video_names):
        # Each video has 2-4 detections
        for det_idx in range(np.random.randint(2, 5)):
            # The first video never forces a recurring identity (and, because
            # of short-circuiting, does not consume a random draw for it)
            force_recurring = video_idx > 0 and np.random.random() < 0.4
            if force_recurring:
                candidates = recurring_people
            else:
                candidates = list(person_embeddings.keys())
            person = np.random.choice(candidates)

            # Small noise on top of the base embedding mimics detection variance
            embedding = person_embeddings[person] + np.random.randn(512) * 0.1

            # Jittered box; draws happen in x, y, width, height order
            jittered_box = BoundingBox(
                x=100 + np.random.randint(-50, 50),
                y=150 + np.random.randint(-30, 30),
                width=80 + np.random.randint(-10, 20),
                height=100 + np.random.randint(-15, 15)
            )
            confidence = 0.7 + np.random.random() * 0.3

            # Running index across all videos doubles as the temporary id
            detection_id = len(results)
            results.append(
                FaceDetection(
                    face_id=f"temp_face_{detection_id}",
                    video_filename=video_name,
                    timestamp=f"00:00:{det_idx*5:02d}.000",
                    bounding_box=jittered_box,
                    confidence=confidence,
                    embedding=embedding.tolist()
                )
            )

    return results

# Generate simulated detections
print("Generating simulated video detections...")
simulated_detections = simulate_video_processing()

# Derive the video count from the data instead of hard-coding it, so the
# message stays truthful if the simulation's video list ever changes.
simulated_video_names = {det.video_filename for det in simulated_detections}
print(
    f"Generated {len(simulated_detections)} face detections "
    f"across {len(simulated_video_names)} videos"
)

# Display detection summary: one row per detection, pre-formatted for printing
detection_summary = pd.DataFrame([
    {
        'Video': det.video_filename,
        'Timestamp': det.timestamp,
        'Confidence': f"{det.confidence:.3f}",
        'Bbox': f"({det.bounding_box.x}, {det.bounding_box.y}, {det.bounding_box.width}, {det.bounding_box.height})"
    }
    for det in simulated_detections
])

print("\nDetection Summary:")
print(detection_summary.to_string(index=False))

## 6. Face Clustering Demo

In [None]:
# Demonstrate face clustering to find duplicates
from src.face_clusterer import FaceClusterer

print("Running face clustering to identify duplicates...")
face_clusterer = FaceClusterer(config)

# Cluster the simulated detections
clustered_detections = face_clusterer.find_duplicate_faces(simulated_detections)

# Analyze clustering results in a single pass: for every face_id, count its
# detections and collect the videos it appears in.
face_counts = {}
videos_by_face = {}
for detection in clustered_detections:
    face_counts[detection.face_id] = face_counts.get(detection.face_id, 0) + 1
    videos_by_face.setdefault(detection.face_id, set()).add(detection.video_filename)

print(f"Clustering completed. Found {len(face_counts)} unique faces")

# Find duplicate groups: any face_id carried by more than one detection
duplicate_groups = {fid: count for fid, count in face_counts.items() if count > 1}

print("\nDuplicate Analysis:")
print(f"  Total detections: {len(clustered_detections)}")
print(f"  Unique faces: {len(face_counts)}")
print(f"  Duplicate groups: {len(duplicate_groups)}")

if duplicate_groups:
    print("\nDuplicate Groups:")
    # Largest groups first; ties keep first-seen order because sorted() is stable
    for face_id, count in sorted(duplicate_groups.items(), key=lambda x: x[1], reverse=True):
        # Sort the names: joining a raw set prints in an order that can change
        # between kernel restarts, which makes the output irreproducible.
        group_videos = sorted(videos_by_face[face_id])
        print(f"  {face_id}: {count} detections across {len(group_videos)} videos")
        print(f"    Videos: {', '.join(group_videos)}")

# Visualize clustering results
fig, (ax_faces, ax_videos) = plt.subplots(1, 2, figsize=(12, 6))

# Plot 1: Face count distribution
detections_per_face = list(face_counts.values())
# default=1 keeps the bin edges valid when clustering returned nothing
max_detections_per_face = max(detections_per_face, default=1)
ax_faces.hist(
    detections_per_face,
    bins=range(1, max_detections_per_face + 2),
    # Centre each bar on its integer value; with the default alignment the bar
    # for k spans [k, k+1) and sits to the right of its tick.
    align='left',
    alpha=0.7,
    edgecolor='black',
)
ax_faces.set_xlabel('Number of Detections per Face')
ax_faces.set_ylabel('Number of Faces')
ax_faces.set_title('Face Detection Frequency Distribution')
ax_faces.set_xticks(list(range(1, max_detections_per_face + 1)))

# Plot 2: Video distribution
video_counts = {}
for detection in clustered_detections:
    video_counts[detection.video_filename] = video_counts.get(detection.video_filename, 0) + 1

video_names = list(video_counts.keys())
detections_per_video = list(video_counts.values())
bar_positions = list(range(len(video_names)))
ax_videos.bar(bar_positions, detections_per_video, alpha=0.7)
ax_videos.set_xlabel('Video')
ax_videos.set_ylabel('Number of Face Detections')
ax_videos.set_title('Face Detections per Video')
ax_videos.set_xticks(bar_positions)
# Label each bar with the text between the last "_" and the first following "."
ax_videos.set_xticklabels([v.split('_')[-1].split('.')[0] for v in video_names])

plt.tight_layout()
plt.show()

## 7. Generate and Analyze Report

In [None]:
# Generate a comprehensive report
detector = DuplicateFaceDetector(config)
processing_time = 45.6  # Simulated processing time

report = detector.generate_report(clustered_detections, processing_time)

# Headline figures, printed one per line
report_lines = (
    "Detection Report Summary:",
    f"  Total faces detected: {report.total_faces}",
    f"  Unique faces: {report.unique_faces}",
    f"  Duplicate groups: {report.duplicate_groups}",
    f"  Processing time: {report.processing_time:.1f} seconds",
    f"  Detection rate: {report.total_faces/report.processing_time:.1f} faces/second",
)
for report_line in report_lines:
    print(report_line)

# Save report to file (path is relative to the current working directory)
output_path = Path("demo_results.json")
report.to_json(output_path)
print(f"\nReport saved to: {output_path}")

# Display sample detections: the first three entries of the report
print("\nSample Detection Entries:")
sample_detections = report.detections[:3]
for i, detection in enumerate(sample_detections):
    entry_lines = (
        f"\nDetection {i+1}:",
        f"  Face ID: {detection.face_id}",
        f"  Video: {detection.video_filename}",
        f"  Timestamp: {detection.timestamp}",
        f"  Confidence: {detection.confidence:.3f}",
        f"  Bounding Box: [{detection.bounding_box.x}, {detection.bounding_box.y}, {detection.bounding_box.width}, {detection.bounding_box.height}]",
    )
    for entry_line in entry_lines:
        print(entry_line)

## 8. Performance Analysis

In [None]:
# Analyze performance characteristics
print("Performance Analysis:")


def _simulated_rate(batch_size, device):
    """Return the illustrative (not measured) processing rate for one batch size.

    Args:
        batch_size: Number of frames per batch.
        device: "cuda" selects the GPU curve; anything else uses the CPU curve.

    Returns:
        The simulated rate as a number.
    """
    if device != "cuda":
        # CPU performance: much slower, linear scaling
        return 8 + batch_size * 0.5

    base_fps = 45
    efficiency = min(1.0, batch_size / 8)  # Efficiency peaks around batch_size 8
    # Beyond 16 the rate is scaled down by 5% per extra item, floored at 0.7
    memory_penalty = max(0.7, 1 - (batch_size - 16) * 0.05) if batch_size > 16 else 1.0
    return base_fps * efficiency * memory_penalty


# Simulate performance for different batch sizes. The configured batch size is
# always part of the sweep: looking it up with .index() would otherwise raise
# ValueError as soon as someone tunes batch_size to a value not listed here.
batch_sizes = sorted({1, 2, 4, 8, 16, 32, config.batch_size})
simulated_fps = [_simulated_rate(size, config.device) for size in batch_sizes]

# Plot performance curve
plt.figure(figsize=(10, 6))
plt.plot(batch_sizes, simulated_fps, 'o-', linewidth=2, markersize=8)
plt.xlabel('Batch Size')
plt.ylabel('Processing Rate (detections/second)')
plt.title(f'Performance vs Batch Size ({config.device.upper()})')
plt.grid(True, alpha=0.3)
plt.xticks(batch_sizes)

# Highlight current configuration (current_idx is reused by the summary cell)
current_idx = batch_sizes.index(config.batch_size)
plt.plot(config.batch_size, simulated_fps[current_idx], 'ro', markersize=12,
         label=f'Current Config (batch_size={config.batch_size})')
plt.legend()

plt.tight_layout()
plt.show()

# Memory usage estimation
print("\nEstimated Memory Usage (1080p frames):")
frame_size_mb = (1920 * 1080 * 3) / (1024**2)  # 3 bytes per pixel, in 1024-based MB
batch_memory_mb = frame_size_mb * config.batch_size
model_memory_mb = 800  # Estimated model memory
total_memory_mb = batch_memory_mb + model_memory_mb

print(f"  Frame size: {frame_size_mb:.1f} MB")
print(f"  Batch memory: {batch_memory_mb:.1f} MB")
print(f"  Model memory: {model_memory_mb} MB")
print(f"  Total GPU memory: {total_memory_mb:.1f} MB ({total_memory_mb/1024:.1f} GB)")

if config.device == "cuda" and torch.cuda.is_available():
    # Express device capacity in the same 1024-based units as the estimate.
    # Dividing bytes by 1e9 here made the denominator ~7% too large and so
    # understated the usage percentage.
    # NOTE(review): capacity is read from device 0 regardless of which GPU the
    # detector actually runs on — confirm for multi-GPU machines.
    available_memory_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    usage_percent = (total_memory_mb/1024) / available_memory_gb * 100
    print(f"  GPU memory usage: {usage_percent:.1f}% of {available_memory_gb:.1f} GB")

    if usage_percent > 80:
        print("  ⚠️  High memory usage - consider reducing batch size")
    elif usage_percent < 40:
        print("  ✅ Memory usage looks good - could increase batch size for better performance")

## 9. Summary and Recommendations

In [None]:
# Closing summary: configuration recap, demo results, recommendations, next steps
banner = "=" * 60
print(banner)
print("FACE DUPLICATE DETECTION DEMO SUMMARY")
print(banner)

print("\n🔧 System Configuration:")
print(f"   Device: {config.device.upper()}")
print(f"   Batch Size: {config.batch_size}")
print(f"   Min Confidence: {config.min_confidence}")
print(f"   Match Threshold: {config.match_threshold}")

# Recompute the headline numbers from the clustering outputs
unique_face_total = len({d.face_id for d in clustered_detections})
duplicate_group_total = sum(1 for count in face_counts.values() if count > 1)

print("\n📊 Demo Results:")
print(f"   Total Detections: {len(clustered_detections)}")
print(f"   Unique Faces: {unique_face_total}")
print(f"   Duplicate Groups: {duplicate_group_total}")
print(f"   Estimated Processing Rate: {simulated_fps[current_idx]:.1f} detections/second")

print("\n💡 Recommendations:")

# Device-specific advice first, then the general tips
if config.device != "cpu":
    if config.batch_size < 8:
        print("   • Consider increasing batch size to 8-16 for better GPU utilization")
    print("   • Current GPU configuration looks good for production use")
else:
    print("   • Install CUDA-enabled PyTorch for 5-10x performance improvement")
    print("   • GPU acceleration is highly recommended for production use")

print("   • For bicycle videos, skip_frames=30 (1 fps) provides good accuracy/speed balance")
print(f"   • Adjust match_threshold based on validation results (current: {config.match_threshold})")
print("   • Monitor GPU memory usage during processing of large videos")

print("\n🚀 Next Steps:")
for next_step in (
    "   1. Place video files in data/Videos/ directory",
    "   2. Run: python detect_duplicates.py detect --videos-dir data/Videos --output-file results.json",
    "   3. Analyze results and tune parameters as needed",
    "   4. Set up automated processing pipeline for production use",
):
    print(next_step)

print("\n" + banner)