From d129981be6703bcc12c0814b2472a03f118f84a7 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 14 Aug 2025 14:32:57 +0200 Subject: [PATCH 1/9] Update --- README.md | 167 +++------ docs/explanation/architecture.md | 218 ++++++++++- docs/explanation/crate-structure.md | 4 - docs/explanation/design.md | 4 - docs/explanation/index.md | 105 +++++- docs/explanation/performance.md | 4 - docs/explanation/replication.md | 273 +++++++++++++- docs/getting-started/first-pipeline.md | 4 - docs/getting-started/installation.md | 4 - docs/getting-started/quickstart.md | 4 - docs/how-to/configure-postgres.md | 328 ++++++++++++++++- docs/how-to/custom-destinations.md | 295 ++++++++++++++- docs/how-to/debugging.md | 492 ++++++++++++++++++++++++- docs/how-to/index.md | 80 +++- docs/how-to/performance.md | 4 - docs/how-to/schema-changes.md | 4 - docs/how-to/testing.md | 4 - docs/index.md | 116 ++++-- docs/reference/index.md | 102 ++++- docs/tutorials/first-pipeline.md | 230 ++++++++++++ docs/tutorials/index.md | 57 ++- docs/tutorials/memory-destination.md | 4 - docs/tutorials/testing-pipelines.md | 4 - 23 files changed, 2292 insertions(+), 215 deletions(-) delete mode 100644 docs/explanation/crate-structure.md delete mode 100644 docs/explanation/design.md delete mode 100644 docs/explanation/performance.md delete mode 100644 docs/getting-started/first-pipeline.md delete mode 100644 docs/getting-started/installation.md delete mode 100644 docs/getting-started/quickstart.md delete mode 100644 docs/how-to/performance.md delete mode 100644 docs/how-to/schema-changes.md delete mode 100644 docs/how-to/testing.md create mode 100644 docs/tutorials/first-pipeline.md delete mode 100644 docs/tutorials/memory-destination.md delete mode 100644 docs/tutorials/testing-pipelines.md diff --git a/README.md b/README.md index b8b17c668..a93c49f2f 100644 --- a/README.md +++ b/README.md @@ -11,171 +11,112 @@

Build real-time Postgres replication applications in Rust
- Documentation + πŸ“– Documentation Β· - Examples + πŸ’‘ Examples Β· - Issues + πŸ› Issues

-**ETL** is a Rust framework by [Supabase](https://supabase.com) that enables you to build high-performance, real-time data replication applications for PostgreSQL. Whether you're creating ETL pipelines, implementing CDC (Change Data Capture), or building custom data synchronization solutions, ETL provides the building blocks you need. +**ETL** is a Rust framework by [Supabase](https://supabase.com) that enables you to build high-performance, real-time data replication applications for PostgreSQL. Stream changes as they happen, route to multiple destinations, and build robust data pipelines with minimal complexity. -Built on top of PostgreSQL's [logical streaming replication protocol](https://www.postgresql.org/docs/current/protocol-logical-replication.html), ETL handles the low-level complexities of database replication while providing a clean, Rust-native API that guides you towards the pit of success. +Built on PostgreSQL's [logical replication protocol](https://www.postgresql.org/docs/current/protocol-logical-replication.html), ETL handles the complexities so you can focus on your data. -## Table of Contents +## ✨ Key Features -- [Features](#features) -- [Installation](#installation) -- [Quickstart](#quickstart) -- [Database Setup](#database-setup) -- [Running Tests](#running-tests) -- [Docker](#docker) -- [Architecture](#architecture) -- [Troubleshooting](#troubleshooting) -- [License](#license) +- πŸš€ **Real-time streaming** - Changes flow instantly from PostgreSQL +- πŸ”„ **Multiple destinations** - BigQuery, custom APIs, and more +- πŸ›‘οΈ **Built-in resilience** - Automatic retries and recovery +- ⚑ **High performance** - Efficient batching and parallel processing +- πŸ”§ **Extensible** - Plugin architecture for any destination -## Features - -**Core Capabilities:** -- πŸš€ **Real-time replication**: Stream changes from PostgreSQL as they happen -- πŸ”„ **Multiple destinations**: Support for various data warehouses and databases (coming soon) -- πŸ›‘οΈ **Fault tolerance**: Built-in error handling, retries, and recovery mechanisms -- ⚑ **High performance**: Efficient batching and parallel processing -- πŸ”§ **Extensible**: Plugin architecture for custom destinations - -**Supported Destinations:** -- [x] **BigQuery** - Google Cloud's data warehouse -- [ ] **Apache Iceberg** (planned) - Open table format for analytics -- [ ] **DuckDB** (planned) - In-process analytical database - -## Installation - -Add ETL to your Rust project via git dependencies in `Cargo.toml`: - -```toml -[dependencies] -etl = { git = "https://github.com/supabase/etl" } -``` - -> **Note**: ETL is currently distributed via Git while we prepare for the initial crates.io release. 
- -## Quickstart - -Get up and running with ETL in minutes using the built-in memory destination: +## 🚦 Quick Start ```rust -use etl::config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}; -use etl::pipeline::Pipeline; -use etl::destination::memory::MemoryDestination; -use etl::store::both::memory::MemoryStore; +use etl::{ + config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}, + destination::memory::MemoryDestination, + pipeline::Pipeline, + store::both::memory::MemoryStore, +}; #[tokio::main] async fn main() -> Result<(), Box> { // Configure PostgreSQL connection - let pg_connection_config = PgConnectionConfig { + let pg_config = PgConnectionConfig { host: "localhost".to_string(), port: 5432, name: "mydb".to_string(), username: "postgres".to_string(), - password: Some("password".into()), - tls: TlsConfig { - trusted_root_certs: String::new(), - enabled: false, - }, + password: Some("password".to_string().into()), + tls: TlsConfig { enabled: false, trusted_root_certs: String::new() }, }; + // Create memory-based store and destination for testing + let store = MemoryStore::new(); + let destination = MemoryDestination::new(); + // Configure the pipeline - let pipeline_config = PipelineConfig { + let config = PipelineConfig { id: 1, publication_name: "my_publication".to_string(), - pg_connection: pg_connection_config, - batch: BatchConfig { - max_size: 1000, - max_fill_ms: 5000, - }, + pg_connection: pg_config, + batch: BatchConfig { max_size: 1000, max_fill_ms: 5000 }, table_error_retry_delay_ms: 10000, max_table_sync_workers: 4, }; - // Create in-memory store and destination for testing - let store = MemoryStore::new(); - let destination = MemoryDestination::new(); - // Create and start the pipeline - let mut pipeline = Pipeline::new(1, pipeline_config, store, destination); + let mut pipeline = Pipeline::new(1, config, store, destination); pipeline.start().await?; - + + // Pipeline will run until stopped + pipeline.wait().await?; + Ok(()) } ``` -**Need production destinations?** Add the `etl-destinations` crate with specific features: - -```toml -[dependencies] -etl = { git = "https://github.com/supabase/etl" } -etl-destinations = { git = "https://github.com/supabase/etl", features = ["bigquery"] } -``` +**Want to try it?** β†’ [**Build your first pipeline in 15 minutes**](https://supabase.github.io/etl/tutorials/first-pipeline/) πŸ“š -For comprehensive examples and tutorials, visit the [etl-examples](etl-examples/README.md) crate and our [documentation](https://supabase.github.io/etl). +## πŸ“š Learn More -## Database Setup +Our comprehensive documentation covers everything you need: -Before running the examples, tests, or the API and replicator components, you'll need to set up a PostgreSQL database. -We provide a convenient script to help you with this setup. For detailed instructions on how to use the database setup script, please refer to our [Database Setup Guide](docs/guides/database-setup.md). 
+- **πŸŽ“ [Tutorials](https://supabase.github.io/etl/tutorials/)** - Step-by-step learning experiences +- **πŸ”§ [How-To Guides](https://supabase.github.io/etl/how-to/)** - Practical solutions for common tasks +- **πŸ“– [Reference](https://supabase.github.io/etl/reference/)** - Complete API documentation +- **πŸ’‘ [Explanations](https://supabase.github.io/etl/explanation/)** - Architecture and design decisions -## Running Tests +## πŸ“¦ Installation -To run the test suite: +Add to your `Cargo.toml`: -```bash -cargo test --all-features +```toml +[dependencies] +etl = { git = "https://github.com/supabase/etl" } ``` -## Docker +> **Note**: ETL will be available on crates.io soon! -The repository includes Docker support for both the `replicator` and `api` components: +## πŸ—οΈ Development ```bash -# Build replicator image -docker build -f ./etl-replicator/Dockerfile . +# Run tests +cargo test --all-features -# Build api image +# Build Docker images +docker build -f ./etl-replicator/Dockerfile . docker build -f ./etl-api/Dockerfile . ``` -## Architecture - -For a detailed explanation of the ETL architecture and design decisions, please refer to our [Design Document](docs/design/etl-crate-design.md). +## πŸ“„ License -## Troubleshooting - -### Too Many Open Files Error - -If you see the following error when running tests on macOS: - -``` -called `Result::unwrap()` on an `Err` value: Os { code: 24, kind: Uncategorized, message: "Too many open files" } -``` - -Raise the limit of open files per process with: - -```bash -ulimit -n 10000 -``` - -### Performance Considerations - -Currently, the system parallelizes the copying of different tables, but each individual table is still copied in sequential batches. -This limits performance for large tables. We plan to address this once the ETL system reaches greater stability. - -## License - -Distributed under the Apache-2.0 License. See `LICENSE` for more information. +Apache-2.0 License - see [`LICENSE`](LICENSE) for details. ---

Made with ❀️ by the Supabase team -

+

diff --git a/docs/explanation/architecture.md b/docs/explanation/architecture.md index b62abfa82..1d757e915 100644 --- a/docs/explanation/architecture.md +++ b/docs/explanation/architecture.md @@ -1,4 +1,216 @@ -# ETL Architecture +--- +type: explanation +title: ETL Architecture Overview +last_reviewed: 2025-01-14 +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# ETL Architecture Overview + +**Understanding how ETL components work together to replicate data from PostgreSQL** + +ETL's architecture is built around a few key abstractions that work together to provide reliable, high-performance data replication. This document explains how these components interact and why they're designed the way they are. + +## The Big Picture + +At its core, ETL connects PostgreSQL's logical replication stream to configurable destination systems: + +``` +PostgreSQL ETL Pipeline Destination +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ WAL Stream │──▷│ Data Processing │────▷│ BigQuery β”‚ +β”‚ Publicationsβ”‚ β”‚ Batching β”‚ β”‚ Custom API β”‚ +β”‚ Repl. Slots β”‚ β”‚ Error Handling β”‚ β”‚ Memory β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” + β”‚ State Store β”‚ + β”‚ Schema Info β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +The architecture separates concerns to make the system extensible, testable, and maintainable. + +## Core Components + +### Pipeline: The Orchestrator + +The [`Pipeline`](../reference/pipeline/) is ETL's central component that coordinates all other parts: + +**Responsibilities:** +- Establishes connection to PostgreSQL replication stream +- Manages initial table synchronization ("backfill") +- Processes ongoing change events from WAL +- Coordinates batching and delivery to destinations +- Handles errors and retries + +**Why this design?** By centralizing orchestration in one component, we can ensure consistent behavior across all operations while keeping the interface simple for users. + +### Destinations: Where Data Goes + +The [`Destination`](../reference/destination-trait/) trait defines how data leaves ETL: + +```rust +trait Destination { + async fn write_batch(&mut self, batch: BatchedData) -> Result<(), DestinationError>; + async fn flush(&mut self) -> Result<(), DestinationError>; +} +``` + +**Built-in implementations:** +- [`MemoryDestination`](../reference/memory-destination/) - For testing and development +- [`BigQueryDestination`](../reference/bigquery-destination/) - Google BigQuery integration + +**Why this abstraction?** The trait allows ETL to support any output system while providing consistent batching, error handling, and retry behavior. New destinations get all the pipeline reliability features automatically. 
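+
+To make the extension point concrete, here is a minimal sketch of a destination that only logs what it receives. It follows the simplified trait shape shown above, so the names (`Destination`, `BatchedData`, `DestinationError`) and the `changes` field are illustrative rather than the crate's exact API:
+
+```rust
+use async_trait::async_trait;
+
+/// Illustrative destination that logs each batch instead of writing it anywhere.
+struct LoggingDestination;
+
+#[async_trait]
+impl Destination for LoggingDestination {
+    async fn write_batch(&mut self, batch: BatchedData) -> Result<(), DestinationError> {
+        // A real destination would convert these changes and send them to its target system.
+        println!("received a batch with {} changes", batch.changes.len());
+        Ok(())
+    }
+
+    async fn flush(&mut self) -> Result<(), DestinationError> {
+        // Nothing is buffered here, so there is nothing to flush.
+        Ok(())
+    }
+}
+```
+
+Because the pipeline owns batching, retries, and state tracking, even a destination this small inherits those guarantees without extra code.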
+ +### Stores: Managing State and Schemas + +ETL uses two types of storage via the [`Store`](../reference/store-trait/) trait: + +**State storage** tracks replication progress: +- WAL positions for recovery +- Table synchronization status +- Retry counters and backoff timers + +**Schema storage** manages table structures: +- Column names and types +- Primary key information +- Schema evolution tracking + +**Implementation options:** +- [`MemoryStore`](../reference/memory-store/) - Fast, but loses state on restart +- [`PostgresStore`](../reference/postgres-store/) - Persistent, production-ready + +**Why separate storage?** This allows ETL to work in different deployment scenarios: development (memory), cloud-native (external databases), or embedded (SQLite, eventually). + +## Data Flow Architecture + +### Initial Synchronization + +When a pipeline starts, ETL performs a full synchronization of existing data: + +1. **Discovery:** Query PostgreSQL catalogs to find tables in the publication +2. **Schema capture:** Extract column information and primary keys +3. **Snapshot:** Copy existing rows in batches to the destination +4. **State tracking:** Record progress to support resumption + +This ensures the destination has complete data before processing real-time changes. + +### Ongoing Replication + +After initial sync, ETL processes the PostgreSQL WAL stream: + +1. **Stream connection:** Attach to the replication slot +2. **Event parsing:** Decode WAL records into structured changes +3. **Batching:** Group changes for efficient destination writes +4. **Delivery:** Send batches to destinations with retry logic +5. **Acknowledgment:** Confirm WAL position to PostgreSQL + +### Error Handling Strategy + +ETL's error handling follows a layered approach: + +**Transient errors** (network issues, destination overload): +- Exponential backoff retry +- Circuit breaker to prevent cascading failures +- Eventual resumption from last known good state + +**Permanent errors** (schema mismatches, authentication failures): +- Immediate pipeline halt +- Clear error reporting to operators +- Manual intervention required + +**Partial failures** (some tables succeed, others fail): +- Per-table error tracking +- Independent retry schedules +- Healthy tables continue processing + +## Scalability Patterns + +### Vertical Scaling + +ETL supports scaling up through configuration: + +- **Batch sizes:** Larger batches for higher throughput +- **Worker threads:** Parallel table synchronization +- **Buffer sizes:** More memory for better batching + +### Horizontal Scaling + +For massive databases, ETL supports: + +- **Multiple pipelines:** Split tables across different pipeline instances +- **Destination sharding:** Route different tables to different destinations +- **Read replicas:** Reduce load on primary database + +### Resource Management + +ETL is designed to be resource-predictable: + +- **Memory bounds:** Configurable limits on batch sizes and buffers +- **Connection pooling:** Reuse PostgreSQL connections efficiently +- **Backpressure:** Slow down if destinations can't keep up + +## Extension Points + +### Custom Destinations + +The [`Destination`](../reference/destination-trait/) trait makes it straightforward to add support for new output systems: + +- **REST APIs:** HTTP-based services +- **Message queues:** Kafka, RabbitMQ, etc. 
+- **Databases:** Any database with bulk insert capabilities +- **File systems:** Parquet, JSON, CSV outputs + +### Custom Stores + +The [`Store`](../reference/store-trait/) trait allows different persistence strategies: + +- **Cloud databases:** RDS, CloudSQL, etc. +- **Key-value stores:** Redis, DynamoDB +- **Local storage:** SQLite, embedded databases + +### Plugin Architecture + +ETL's trait-based design enables: + +- **Runtime plugin loading:** Dynamic destination discovery +- **Configuration-driven setup:** Choose implementations via config +- **Testing isolation:** Mock implementations for unit tests + +## Design Philosophy + +### Correctness First + +ETL prioritizes data consistency over raw speed: +- **At-least-once delivery:** Better to duplicate than lose data +- **State durability:** Persist progress before acknowledging +- **Schema safety:** Validate destination compatibility + +### Operational Simplicity + +ETL aims to be easy to operate: +- **Clear error messages:** Actionable information for operators +- **Predictable behavior:** Minimal configuration surprises +- **Observable:** Built-in metrics and logging + +### Performance Where It Matters + +ETL optimizes the bottlenecks: +- **Batching:** Amortize per-operation overhead +- **Async I/O:** Maximize network utilization +- **Zero-copy:** Minimize data copying where possible + +## Next Steps + +Now that you understand ETL's architecture: + +- **See it in action** β†’ [Build your first pipeline](../tutorials/first-pipeline/) +- **Learn about performance** β†’ [Performance characteristics](performance/) +- **Understand the foundation** β†’ [PostgreSQL logical replication](replication/) +- **Compare with alternatives** β†’ [ETL vs. other tools](comparisons/) + +## See Also + +- [Design decisions](design/) - Why ETL is built the way it is +- [Crate structure](crate-structure/) - How code is organized +- [State management](state-management/) - Deep dive on state handling \ No newline at end of file diff --git a/docs/explanation/crate-structure.md b/docs/explanation/crate-structure.md deleted file mode 100644 index 4450f2242..000000000 --- a/docs/explanation/crate-structure.md +++ /dev/null @@ -1,4 +0,0 @@ -# Crate Structure - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/explanation/design.md b/docs/explanation/design.md deleted file mode 100644 index 1e0d3261a..000000000 --- a/docs/explanation/design.md +++ /dev/null @@ -1,4 +0,0 @@ -# Design Philosophy - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/explanation/index.md b/docs/explanation/index.md index 445116398..92d766c35 100644 --- a/docs/explanation/index.md +++ b/docs/explanation/index.md @@ -1,4 +1,103 @@ -# Explanation +--- +type: explanation +title: Understanding ETL +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# Explanations + +**Deep dives into ETL concepts, architecture, and design decisions** + +Explanations help you build mental models of how ETL works and why it's designed the way it is. These topics provide background knowledge, compare alternatives, and explore the reasoning behind key architectural choices. + +## Core Concepts + +### [ETL Architecture Overview](architecture/) +**The big picture of how ETL components work together** + +Understand the relationship between pipelines, destinations, stores, and the PostgreSQL replication protocol. 
Learn how data flows through the system and where extension points exist. + +*Topics covered:* Component architecture, data flow, extension patterns, scalability considerations. + +### [Why Postgres Logical Replication?](replication/) +**The foundation technology and its trade-offs** + +Explore how PostgreSQL's logical replication works, why ETL builds on this foundation, and how it compares to other change data capture approaches. + +*Topics covered:* WAL-based replication, publications and subscriptions, alternatives like triggers or polling, performance characteristics. + +### [Design Decisions and Trade-offs](design/) +**Key choices that shape ETL's behavior** + +Learn about the major design decisions in ETL, the problems they solve, and the trade-offs they represent. Understanding these choices helps you use ETL effectively. + +*Topics covered:* Rust as implementation language, async architecture, batching strategy, error handling philosophy. + +## System Characteristics + +### [Performance and Scalability](performance/) +**How ETL behaves under different loads and configurations** + +Understand ETL's performance characteristics, bottlenecks, and scaling patterns. Learn how different configuration choices affect throughput and resource usage. + +*Topics covered:* Throughput patterns, memory usage, network considerations, scaling strategies. + +### [Crate Structure and Organization](crate-structure/) +**How ETL's modular design supports different use cases** + +Explore how ETL is organized into multiple crates, what each crate provides, and how they work together. Understand the reasoning behind this modular architecture. + +*Topics covered:* Core vs. optional crates, dependency management, feature flags, extensibility. + +## Integration Patterns + +### [Working with Destinations](destinations-explained/) +**Understanding the destination abstraction and ecosystem** + +Learn how destinations work conceptually, why they're designed as they are, and how to choose between different destination options. + +*Topics covered:* Destination trait design, batching strategy, error handling patterns, building ecosystems. + +### [State Management Philosophy](state-management/) +**How ETL tracks replication state and schema changes** + +Understand ETL's approach to managing replication state, handling schema evolution, and ensuring consistency across restarts. + +*Topics covered:* State storage options, schema change handling, consistency guarantees, recovery behavior. + +## Broader Context + +### [ETL vs. Other Replication Tools](comparisons/) +**How ETL fits in the data replication landscape** + +Compare ETL to other PostgreSQL replication tools, general-purpose ETL systems, and cloud-managed solutions. Understand when to choose each approach. + +*Topics covered:* Tool comparisons, use case fit, ecosystem integration, operational trade-offs. + +### [Future Directions](roadmap/) +**Where ETL is heading and how to influence its evolution** + +Learn about planned features, architectural improvements, and community priorities. Understand how to contribute to ETL's development. + +*Topics covered:* Planned features, architectural evolution, community involvement, contribution guidelines. + +## Reading Guide + +**New to data replication?** Start with [Postgres Logical Replication](replication/) to understand the foundation technology. + +**Coming from other tools?** Jump to [ETL vs. Other Tools](comparisons/) to see how ETL fits in the landscape. 
+ +**Planning a production deployment?** Read [Architecture](architecture/) and [Performance](performance/) to understand system behavior. + +**Building extensions?** Focus on [Crate Structure](crate-structure/) and [Destinations](destinations-explained/) for extension patterns. + +## Next Steps + +After building conceptual understanding: +- **Start building** β†’ [Tutorials](../tutorials/) +- **Solve specific problems** β†’ [How-To Guides](../how-to/) +- **Look up technical details** β†’ [Reference](../reference/) + +## Contributing to Explanations + +Found gaps in these explanations? See something that could be clearer? +[Open an issue](https://github.com/supabase/etl/issues) or contribute improvements to help other users build better mental models of ETL. \ No newline at end of file diff --git a/docs/explanation/performance.md b/docs/explanation/performance.md deleted file mode 100644 index c963daad9..000000000 --- a/docs/explanation/performance.md +++ /dev/null @@ -1,4 +0,0 @@ -# Performance Model - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/explanation/replication.md b/docs/explanation/replication.md index 04613fb6e..2aa7a5442 100644 --- a/docs/explanation/replication.md +++ b/docs/explanation/replication.md @@ -1,4 +1,271 @@ -# Replication Protocol +--- +type: explanation +title: Why PostgreSQL Logical Replication? +last_reviewed: 2025-01-14 +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# Why PostgreSQL Logical Replication? + +**Understanding the foundation technology that powers ETL and its advantages over alternatives** + +PostgreSQL logical replication is the core technology that ETL builds upon. This document explains how it works, why it's well-suited for ETL use cases, and how it compares to other change data capture approaches. + +## What is Logical Replication? + +Logical replication streams changes from PostgreSQL databases at the **logical level** (rows and operations) rather than the **physical level** (disk blocks and binary changes). This means ETL receives structured, interpretable data changes that can be easily transformed and routed to different destinations. + +### Key Characteristics + +- **Row-based:** Changes are captured as individual row operations (INSERT, UPDATE, DELETE) +- **Selective:** Choose which tables to replicate via publications +- **Real-time:** Changes stream immediately as they're committed +- **Durable:** Uses PostgreSQL's Write-Ahead Log (WAL) for reliability +- **Ordered:** Changes arrive in commit order within each table + +## How Logical Replication Works + +### The WAL-Based Foundation + +PostgreSQL's logical replication is built on its Write-Ahead Log (WAL): + +1. **Transaction commits** are written to WAL before being applied to data files +2. **Logical decoding** translates WAL entries into structured change events +3. **Replication slots** track which changes have been consumed +4. 
**Publications** define which tables and operations to replicate + +``` +Application PostgreSQL ETL Pipeline + β”‚ β”‚ β”‚ + │──── INSERT ────│ β”‚ + β”‚ │──── WAL entry ────────│ + β”‚ β”‚ │──── Structured change + β”‚ β”‚ β”‚ (table, operation, data) + │◄─── SUCCESS ───│ β”‚ +``` + +### Publications and Subscriptions + +**Publications** define what to replicate: + +```sql +-- Replicate specific tables +CREATE PUBLICATION app_data FOR TABLE users, orders, products; + +-- Replicate all tables (use with caution) +CREATE PUBLICATION all_data FOR ALL TABLES; + +-- Replicate only specific operations +CREATE PUBLICATION inserts_only FOR TABLE users WITH (publish = 'insert'); +``` + +**Replication slots** track consumption: + +```sql +-- ETL creates and manages these automatically +SELECT pg_create_logical_replication_slot('etl_slot', 'pgoutput'); +``` + +### Data Consistency Guarantees + +Logical replication provides strong consistency: + +- **Transactional consistency:** All changes from a transaction arrive together +- **Ordering guarantees:** Changes within a table maintain commit order +- **Durability:** WAL ensures no committed changes are lost +- **At-least-once delivery:** Changes may be delivered multiple times but never lost + +## Why ETL Uses Logical Replication + +### Real-Time Performance + +Unlike polling-based approaches, logical replication provides **immediate change notification**: + +- **Low latency:** Changes stream as they happen (milliseconds to seconds) +- **No database overhead:** No impact on application queries +- **Efficient bandwidth:** Only actual changes are transmitted + +### Operational Simplicity + +Logical replication is **built into PostgreSQL**: + +- **No triggers to maintain:** Changes are captured automatically +- **No application changes:** Existing applications work unchanged +- **Reliable recovery:** Built-in WAL retention and replay +- **Minimal configuration:** Just enable logical replication and create publications + +### Complete Change Capture + +Captures **all types of changes**: + +- **DML operations:** INSERT, UPDATE, DELETE operations +- **Bulk operations:** COPY, bulk updates, and imports +- **Transaction boundaries:** Commit and rollback information +- **Schema information:** Column types and table structure + +## Comparing Replication Approaches + +### Logical Replication vs. Physical Replication + +| Aspect | Logical Replication | Physical Replication | +|--------|-------------------|-------------------| +| **Granularity** | Table/row level | Entire database cluster | +| **Selectivity** | Choose specific tables | All or nothing | +| **Version compatibility** | Cross-version support | Same major version only | +| **Overhead** | Moderate (logical decoding) | Low (binary copy) | +| **Use case** | ETL, selective sync | Backup, disaster recovery | + +### Logical Replication vs. Trigger-Based CDC + +| Aspect | Logical Replication | Trigger-Based CDC | +|--------|-------------------|-----------------| +| **Performance impact** | Minimal on source | High (trigger execution) | +| **Change coverage** | All operations including bulk | Only row-by-row operations | +| **Maintenance** | Built-in PostgreSQL feature | Custom triggers to maintain | +| **Reliability** | WAL-based durability | Depends on trigger implementation | +| **Schema changes** | Handles automatically | Triggers need updates | + +### Logical Replication vs. 
Query-Based Polling + +| Aspect | Logical Replication | Query-Based Polling | +|--------|-------------------|-------------------| +| **Latency** | Real-time (seconds) | Polling interval (minutes) | +| **Source load** | Minimal | Repeated full table scans | +| **Delete detection** | Automatic | Requires soft deletes | +| **Infrastructure** | Simple (ETL + PostgreSQL) | Complex (schedulers, state tracking) | +| **Change ordering** | Guaranteed | Can miss intermediate states | + +## Limitations and Considerations + +### What Logical Replication Doesn't Capture + +- **DDL operations:** Schema changes (CREATE, ALTER, DROP) are not replicated +- **TRUNCATE operations:** Not captured by default (can be enabled in PostgreSQL 11+) +- **Sequence changes:** nextval() calls on sequences +- **Large object changes:** BLOB/CLOB modifications +- **Temporary table operations:** Temp tables are not replicated + +### Performance Considerations + +**WAL generation overhead:** +- Logical replication increases WAL volume by ~10-30% +- More detailed logging required for logical decoding +- May require WAL retention tuning for catch-up scenarios + +**Replication slot management:** +- Unused slots prevent WAL cleanup (disk space growth) +- Slow consumers can cause WAL buildup +- Need monitoring and automatic cleanup + +**Network bandwidth:** +- All change data flows over network +- Large transactions can cause bandwidth spikes +- Consider batching and compression for high-volume scenarios + +## ETL's Enhancements to Logical Replication + +ETL builds on PostgreSQL's logical replication with additional features: + +### Intelligent Batching + +- **Configurable batch sizes:** Balance latency vs. throughput +- **Time-based batching:** Ensure maximum latency bounds +- **Backpressure handling:** Slow down if destinations can't keep up + +### Error Handling and Recovery + +- **Retry logic:** Handle transient destination failures +- **Circuit breakers:** Prevent cascade failures +- **State persistence:** Resume from exact WAL positions after restarts + +### Multi-Destination Routing + +- **Fan-out replication:** Send same data to multiple destinations +- **Selective routing:** Different tables to different destinations +- **Transformation pipelines:** Modify data en route to destinations + +### Operational Features + +- **Metrics and monitoring:** Track replication lag, throughput, errors +- **Schema change detection:** Automatic handling of table structure changes +- **Resource management:** Memory and connection pooling + +## Use Cases and Patterns + +### Real-Time Analytics + +Stream transactional data to analytical systems: + +``` +PostgreSQL (OLTP) ──ETL──▷ BigQuery (OLAP) + β”‚ β”‚ + β”œβ”€β”€ Users insert orders β”œβ”€β”€ Real-time dashboards + β”œβ”€β”€ Inventory updates β”œβ”€β”€ Business intelligence + └── Payment processing └── Data science workflows +``` + +### Event-Driven Architecture + +Use database changes as event sources: + +``` +PostgreSQL ──ETL──▷ Event Bus ──▷ Microservices + β”‚ β”‚ β”‚ + β”œβ”€β”€ Order created β”œβ”€β”€ Events β”œβ”€β”€ Email service + β”œβ”€β”€ User updated β”œβ”€β”€ Topics β”œβ”€β”€ Notification service + └── Inventory low └── Streams └── Recommendation engine +``` + +### Data Lake Ingestion + +Continuously populate data lakes: + +``` +PostgreSQL ──ETL──▷ Data Lake ──▷ ML/Analytics + β”‚ β”‚ β”‚ + β”œβ”€β”€ App database β”œβ”€β”€ Parquet β”œβ”€β”€ Feature stores + β”œβ”€β”€ User behavior β”œβ”€β”€ Delta β”œβ”€β”€ Model training + └── Business data └── Iceberg └── Batch processing 
+``` + +## Choosing Logical Replication + +**Logical replication is ideal when you need:** + +- Real-time or near real-time change capture +- Selective table replication +- Cross-version or cross-platform data movement +- Minimal impact on source database performance +- Built-in reliability and durability guarantees + +**Consider alternatives when you need:** + +- **Immediate consistency:** Use synchronous replication or 2PC +- **Schema change replication:** Consider schema migration tools +- **Cross-database replication:** Look at database-specific solutions +- **Complex transformations:** ETL tools might be simpler + +## Future of Logical Replication + +PostgreSQL continues to enhance logical replication: + +- **Row-level security:** Filter replicated data by user permissions +- **Binary protocol improvements:** Faster, more efficient encoding +- **Cross-version compatibility:** Better support for version differences +- **Performance optimizations:** Reduced overhead and increased throughput + +ETL evolves alongside these improvements, providing a stable interface while leveraging new capabilities as they become available. + +## Next Steps + +Now that you understand the foundation: + +- **See it in practice** β†’ [ETL Architecture](architecture/) +- **Compare alternatives** β†’ [ETL vs. Other Tools](comparisons/) +- **Build your first pipeline** β†’ [First Pipeline Tutorial](../tutorials/first-pipeline/) +- **Configure PostgreSQL** β†’ [PostgreSQL Setup](../how-to/configure-postgres/) + +## See Also + +- [PostgreSQL Logical Replication Docs](https://www.postgresql.org/docs/current/logical-replication.html) - Official documentation +- [Design decisions](design/) - Why ETL is built the way it is +- [Performance characteristics](performance/) - Understanding ETL's behavior under load \ No newline at end of file diff --git a/docs/getting-started/first-pipeline.md b/docs/getting-started/first-pipeline.md deleted file mode 100644 index 002d829a3..000000000 --- a/docs/getting-started/first-pipeline.md +++ /dev/null @@ -1,4 +0,0 @@ -# Your First Pipeline - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md deleted file mode 100644 index a02e348a4..000000000 --- a/docs/getting-started/installation.md +++ /dev/null @@ -1,4 +0,0 @@ -# Installation - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md deleted file mode 100644 index 56ac12024..000000000 --- a/docs/getting-started/quickstart.md +++ /dev/null @@ -1,4 +0,0 @@ -# Quick Start - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/how-to/configure-postgres.md b/docs/how-to/configure-postgres.md index b208e601d..0e42e743e 100644 --- a/docs/how-to/configure-postgres.md +++ b/docs/how-to/configure-postgres.md @@ -1,4 +1,326 @@ -# Configure PostgreSQL +--- +type: how-to +audience: developers, database administrators +prerequisites: + - PostgreSQL server access with superuser privileges + - Understanding of PostgreSQL configuration + - Knowledge of PostgreSQL user management +version_last_tested: 0.1.0 +last_reviewed: 2025-01-14 +risk_level: medium +--- -!!! info "Coming Soon" - This page is under development. 
\ No newline at end of file +# Configure PostgreSQL for Replication + +**Set up PostgreSQL with the correct permissions and settings for ETL logical replication** + +This guide walks you through configuring PostgreSQL to support logical replication for ETL, including WAL settings, user permissions, and publication setup. + +## Goal + +Configure PostgreSQL to: + +- Enable logical replication at the server level +- Create appropriate user accounts with minimal required permissions +- Set up publications for the tables you want to replicate +- Configure replication slots for reliable WAL consumption + +## Prerequisites + +- PostgreSQL 12 or later +- Superuser access to the PostgreSQL server +- Ability to restart PostgreSQL server (for configuration changes) +- Network connectivity from ETL to PostgreSQL + +## Decision Points + +**Choose your approach based on your environment:** + +| Environment | Security Level | Recommended Setup | +|-------------|----------------|-------------------| +| **Development** | Low | Single superuser account | +| **Staging** | Medium | Dedicated replication user with specific permissions | +| **Production** | High | Least-privilege user with row-level security | + +## Configuration Steps + +### Step 1: Enable Logical Replication + +Edit your PostgreSQL configuration file (usually `postgresql.conf`): + +```ini +# Enable logical replication +wal_level = logical + +# Increase max replication slots (default is 10) +max_replication_slots = 20 + +# Increase max WAL senders (default is 10) +max_wal_senders = 20 + +# Optional: Allow more WAL between checkpoints for better write performance +max_wal_size = 2GB +checkpoint_completion_target = 0.9 +``` + +**If using PostgreSQL 13+**, also consider: + +```ini +# Drop replication connections that are unresponsive for longer than this +wal_sender_timeout = 60s + +# Improve WAL retention for catching up +wal_keep_size = 1GB +``` + +**Restart PostgreSQL** to apply these settings: + +```bash +# On systemd systems +sudo systemctl restart postgresql + +# On other systems +sudo pg_ctl restart -D /path/to/data/directory +``` + +### Step 2: Create a Replication User + +Create a dedicated user with appropriate permissions: + +```sql +-- Create replication user +CREATE USER etl_replicator WITH PASSWORD 'secure_password_here'; + +-- Grant replication privileges +ALTER USER etl_replicator REPLICATION; + +-- Grant connection privileges +GRANT CONNECT ON DATABASE your_database TO etl_replicator; + +-- Grant schema usage (adjust schema names as needed) +GRANT USAGE ON SCHEMA public TO etl_replicator; + +-- Grant select on specific tables (more secure than all tables) +GRANT SELECT ON TABLE users, orders, products TO etl_replicator; + +-- Alternative: Grant select on all tables in schema (less secure but easier) +-- GRANT SELECT ON ALL TABLES IN SCHEMA public TO etl_replicator; +-- ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO etl_replicator; +``` + +### Step 3: Configure Connection Security + +**For development (less secure):** + +Edit `pg_hba.conf` to allow connections: + +``` +# Allow local connections with password +host your_database etl_replicator localhost md5 + +# Allow connections from specific IP range +host your_database etl_replicator 10.0.0.0/8 md5 +``` + +**For production (more secure):** + +Use SSL/TLS connections: + +``` +# Require SSL connections +hostssl your_database etl_replicator 10.0.0.0/8 md5 +``` + +Reload PostgreSQL configuration: + +```sql +SELECT pg_reload_conf(); +``` + +### Step 4: Create Publications + +Connect as a
superuser or table owner and create publications: + +```sql +-- Create publication for specific tables +CREATE PUBLICATION etl_publication FOR TABLE users, orders, products; + +-- Alternative: Create publication for all tables (use with caution) +-- CREATE PUBLICATION etl_publication FOR ALL TABLES; + +-- View existing publications +SELECT * FROM pg_publication; + +-- View tables in a publication +SELECT * FROM pg_publication_tables WHERE pubname = 'etl_publication'; +``` + +### Step 5: Test the Configuration + +Verify your setup works: + +```sql +-- Test replication slot creation (as etl_replicator user) +SELECT pg_create_logical_replication_slot('test_slot', 'pgoutput'); + +-- Verify the slot was created +SELECT * FROM pg_replication_slots WHERE slot_name = 'test_slot'; + +-- Clean up test slot +SELECT pg_drop_replication_slot('test_slot'); +``` + +### Step 6: Configure ETL Connection + +Update your ETL configuration to use the new setup: + +```rust +use etl::config::{PgConnectionConfig, TlsConfig}; + +let pg_config = PgConnectionConfig { + host: "your-postgres-server.com".to_string(), + port: 5432, + name: "your_database".to_string(), + username: "etl_replicator".to_string(), + password: Some("secure_password_here".into()), + tls: TlsConfig { + enabled: true, // Enable for production + trusted_root_certs: "/path/to/ca-certificates.crt".to_string(), + }, +}; +``` + +## Validation + +Verify your configuration: + +### Test 1: Connection Test + +```bash +# Test connection from ETL server +psql -h your-postgres-server.com -p 5432 -U etl_replicator -d your_database -c "SELECT 1;" +``` + +### Test 2: Replication Permissions + +```sql +-- As etl_replicator user, verify you can: +-- 1. Create replication slots +SELECT pg_create_logical_replication_slot('validation_slot', 'pgoutput'); + +-- 2. Read from tables in the publication +SELECT COUNT(*) FROM users; + +-- 3. Access publication information +SELECT * FROM pg_publication_tables WHERE pubname = 'etl_publication'; + +-- Clean up +SELECT pg_drop_replication_slot('validation_slot'); +``` + +### Test 3: ETL Pipeline Test + +Run a simple ETL pipeline to verify end-to-end functionality: + +```rust +// Use your configuration to create a test pipeline +// This should complete initial sync successfully +``` + +## Troubleshooting + +### "ERROR: logical decoding requires wal_level >= logical" + +**Solution:** Update `postgresql.conf` with `wal_level = logical` and restart PostgreSQL. 
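+
+A quick way to confirm the server actually picked up the new value after the restart (a plain PostgreSQL check, nothing ETL-specific):
+
+```sql
+-- Should return 'logical' once the restarted server is using the new configuration
+SHOW wal_level;
+```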
+ +### "ERROR: permission denied to create replication slot" + +**Solutions:** +- Ensure user has `REPLICATION` privilege: `ALTER USER etl_replicator REPLICATION;` +- Check if you're connecting to the right database +- Verify `pg_hba.conf` allows the connection + +### "ERROR: publication does not exist" + +**Solutions:** +- Verify publication name matches exactly: `SELECT * FROM pg_publication;` +- Ensure you're connected to the correct database +- Check if publication was created by another user + +### "Connection refused" or timeout issues + +**Solutions:** +- Check `postgresql.conf` has `listen_addresses = '*'` (or specific IPs) +- Verify `pg_hba.conf` allows your connection +- Check firewall settings on PostgreSQL server +- Confirm PostgreSQL is running: `sudo systemctl status postgresql` + +### "ERROR: too many replication slots" + +**Solutions:** +- Increase `max_replication_slots` in `postgresql.conf` +- Clean up unused replication slots: `SELECT pg_drop_replication_slot('unused_slot');` +- Monitor slot usage: `SELECT * FROM pg_replication_slots;` + +## Security Best Practices + +### Principle of Least Privilege + +- **Don't use superuser accounts** for ETL in production +- **Grant SELECT only on tables** that need replication +- **Use specific database names** instead of template1 or postgres +- **Limit connection sources** with specific IP ranges in pg_hba.conf + +### Network Security + +- **Always use SSL/TLS** in production: `hostssl` in pg_hba.conf +- **Use certificate authentication** for highest security +- **Restrict network access** with firewalls and VPCs +- **Monitor connections** with log_connections = on + +### Operational Security + +- **Rotate passwords regularly** for replication users +- **Monitor replication slots** for unused or stalled slots +- **Set up alerting** for replication lag and failures +- **Audit publication changes** in your change management process + +## Performance Considerations + +### WAL Configuration + +```ini +# For high-throughput systems +wal_buffers = 16MB +checkpoint_completion_target = 0.9 +wal_writer_delay = 200ms +commit_delay = 1000 +``` + +### Monitoring Queries + +Track replication performance: + +```sql +-- Monitor replication lag +SELECT + slot_name, + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as lag +FROM pg_replication_slots; + +-- Monitor WAL generation rate +SELECT pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')) as total_wal; +``` + +## Next Steps + +- **Build your first pipeline** β†’ [First ETL Pipeline](../tutorials/first-pipeline/) +- **Handle schema changes** β†’ [Schema Change Management](schema-changes/) +- **Optimize performance** β†’ [Performance Tuning](performance/) +- **Set up monitoring** β†’ [Debugging Guide](debugging/) + +## See Also + +- [PostgreSQL Logical Replication Documentation](https://www.postgresql.org/docs/current/logical-replication.html) +- [ETL Architecture](../explanation/architecture/) - Understanding how ETL uses these settings +- [Connection Configuration Reference](../reference/pg-connection-config/) - All available connection options \ No newline at end of file diff --git a/docs/how-to/custom-destinations.md b/docs/how-to/custom-destinations.md index 2a4a34346..87a0c1d9b 100644 --- a/docs/how-to/custom-destinations.md +++ b/docs/how-to/custom-destinations.md @@ -1,4 +1,293 @@ -# Implement Custom Destinations +--- +type: how-to +audience: developers +prerequisites: + - Complete first pipeline tutorial + - Rust async/await knowledge + - Understanding of your 
target system's API +version_last_tested: 0.1.0 +last_reviewed: 2025-01-14 +risk_level: medium +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# Build Custom Destinations + +**Create destination implementations for systems not supported out of the box** + +This guide walks you through implementing the [`Destination`](../../reference/destination-trait/) trait to send replicated data to custom storage systems, APIs, or data warehouses. + +## Goal + +Build a custom destination that receives batched data changes from ETL and writes them to your target system with proper error handling and retry logic. + +## Prerequisites + +- Completed [first pipeline tutorial](../../tutorials/first-pipeline/) +- Access to your target system (database, API, etc.) +- Understanding of your target system's data ingestion patterns +- Rust knowledge of traits and async programming + +## Decision Points + +**Choose your approach based on your target system:** + +| Target System | Key Considerations | Recommended Pattern | +|---------------|-------------------|-------------------| +| **REST API** | Rate limiting, authentication | Batch with retry backoff | +| **Database** | Transaction support, connection pooling | Bulk insert transactions | +| **File System** | File formats, compression | Append or rotate files | +| **Message Queue** | Ordering guarantees, partitioning | Individual message sending | + +## Implementation Steps + +### Step 1: Define Your Destination Struct + +Create a new file `src/my_destination.rs`: + +```rust +use etl::destination::base::{Destination, DestinationError}; +use etl::types::pipeline::BatchedData; +use async_trait::async_trait; + +pub struct MyCustomDestination { + // Configuration fields + api_endpoint: String, + auth_token: String, + batch_size: usize, +} + +impl MyCustomDestination { + pub fn new(api_endpoint: String, auth_token: String) -> Self { + Self { + api_endpoint, + auth_token, + batch_size: 1000, + } + } +} +``` + +### Step 2: Implement the Destination Trait + +Add the core trait implementation: + +```rust +#[async_trait] +impl Destination for MyCustomDestination { + async fn write_batch(&mut self, batch: BatchedData) -> Result<(), DestinationError> { + // Convert ETL data to your target format + let payload = self.convert_batch_to_target_format(&batch)?; + + // Send to your target system with retries + self.send_with_retries(payload).await?; + + Ok(()) + } + + async fn flush(&mut self) -> Result<(), DestinationError> { + // Implement any final cleanup or flush logic + Ok(()) + } +} +``` + +### Step 3: Implement Data Conversion + +Add conversion logic specific to your target system: + +```rust +impl MyCustomDestination { + fn convert_batch_to_target_format(&self, batch: &BatchedData) -> Result { + let mut records = Vec::new(); + + for change in &batch.changes { + match change.operation { + Operation::Insert => { + records.push(json!({ + "action": "insert", + "table": change.table_name, + "data": change.new_values, + "timestamp": change.timestamp + })); + } + Operation::Update => { + records.push(json!({ + "action": "update", + "table": change.table_name, + "old_data": change.old_values, + "new_data": change.new_values, + "timestamp": change.timestamp + })); + } + Operation::Delete => { + records.push(json!({ + "action": "delete", + "table": change.table_name, + "data": change.old_values, + "timestamp": change.timestamp + })); + } + } + } + + serde_json::to_string(&records) + .map_err(|e| 
DestinationError::SerializationError(e.to_string())) + } +} +``` + +### Step 4: Add Error Handling and Retries + +Implement robust error handling: + +```rust +impl MyCustomDestination { + async fn send_with_retries(&self, payload: String) -> Result<(), DestinationError> { + let mut attempts = 0; + let max_attempts = 3; + + while attempts < max_attempts { + match self.send_to_target(&payload).await { + Ok(_) => return Ok(()), + Err(e) if self.is_retryable_error(&e) => { + attempts += 1; + if attempts < max_attempts { + let backoff_ms = 2_u64.pow(attempts) * 1000; + tokio::time::sleep(Duration::from_millis(backoff_ms)).await; + continue; + } + } + Err(e) => return Err(e), + } + } + + Err(DestinationError::RetryExhausted(format!("Failed after {} attempts", max_attempts))) + } + + async fn send_to_target(&self, payload: &str) -> Result<(), DestinationError> { + let client = reqwest::Client::new(); + let response = client + .post(&self.api_endpoint) + .header("Authorization", format!("Bearer {}", self.auth_token)) + .header("Content-Type", "application/json") + .body(payload.to_string()) + .send() + .await + .map_err(|e| DestinationError::NetworkError(e.to_string()))?; + + if !response.status().is_success() { + return Err(DestinationError::HttpError( + response.status().as_u16(), + format!("Request failed: {}", response.text().await.unwrap_or_default()) + )); + } + + Ok(()) + } + + fn is_retryable_error(&self, error: &DestinationError) -> bool { + match error { + DestinationError::NetworkError(_) => true, + DestinationError::HttpError(status, _) => { + // Retry on 5xx server errors and some 4xx errors + *status >= 500 || *status == 429 + } + _ => false, + } + } +} +``` + +### Step 5: Use Your Custom Destination + +In your main application: + +```rust +use etl::pipeline::Pipeline; +use etl::store::both::memory::MemoryStore; + +#[tokio::main] +async fn main() -> Result<(), Box> { + let store = MemoryStore::new(); + let destination = MyCustomDestination::new( + "https://api.example.com/ingest".to_string(), + "your-auth-token".to_string() + ); + + let mut pipeline = Pipeline::new(pipeline_config, store, destination); + pipeline.start().await?; + + Ok(()) +} +``` + +## Validation + +Test your custom destination: + +1. **Unit tests** for data conversion logic +2. **Integration tests** with a test target system +3. **Error simulation** to verify retry behavior +4. **Load testing** with realistic data volumes + +```rust +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_data_conversion() { + let destination = MyCustomDestination::new( + "http://test".to_string(), + "token".to_string() + ); + + // Create test batch + let batch = create_test_batch(); + + // Test conversion + let result = destination.convert_batch_to_target_format(&batch); + assert!(result.is_ok()); + + // Verify JSON structure + let json: serde_json::Value = serde_json::from_str(&result.unwrap()).unwrap(); + assert!(json.is_array()); + } +} +``` + +## Troubleshooting + +**Data not appearing in target system:** +- Enable debug logging to see conversion output +- Check target system's ingestion logs +- Verify authentication credentials + +**High error rates:** +- Review retry logic and backoff timing +- Check if target system has rate limits +- Consider implementing circuit breaker pattern + +**Performance issues:** +- Profile data conversion logic +- Consider batch size tuning +- Implement connection pooling for database destinations + +## Rollback + +If your destination isn't working: +1. 
Switch back to [`MemoryDestination`](../../reference/memory-destination/) for testing +2. Check ETL logs for specific error messages +3. Test destination logic in isolation + +## Next Steps + +- **Add monitoring** β†’ [Performance monitoring](performance/) +- **Handle schema changes** β†’ [Schema change handling](schema-changes/) +- **Production deployment** β†’ [Debugging guide](debugging/) + +## See Also + +- [Destination API Reference](../../reference/destination-trait/) - Complete trait documentation +- [BigQuery destination example](https://github.com/supabase/etl/blob/main/etl-destinations/src/bigquery/) - Real-world implementation +- [Error handling patterns](../../explanation/error-handling/) - Best practices for error management \ No newline at end of file diff --git a/docs/how-to/debugging.md b/docs/how-to/debugging.md index 199c490ef..92a509e2b 100644 --- a/docs/how-to/debugging.md +++ b/docs/how-to/debugging.md @@ -1,4 +1,490 @@ -# Debug Replication Issues +--- +type: how-to +audience: developers, operators +prerequisites: + - Basic understanding of ETL pipelines + - Access to PostgreSQL and ETL logs + - Familiarity with ETL configuration +version_last_tested: 0.1.0 +last_reviewed: 2025-01-14 +risk_level: low +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# Debug Pipeline Issues + +**Diagnose and resolve common ETL pipeline problems quickly and systematically** + +This guide helps you identify, diagnose, and fix issues with ETL pipelines using a structured troubleshooting approach. + +## Goal + +Learn to systematically debug ETL issues: + +- Identify the source of pipeline problems +- Use logging and monitoring to diagnose issues +- Apply appropriate fixes for common failure patterns +- Prevent similar issues in the future + +## Prerequisites + +- Running ETL pipeline (even if failing) +- Access to PostgreSQL server and logs +- ETL application logs and configuration +- Basic SQL knowledge for diagnostic queries + +## Decision Points + +**Choose your debugging approach based on symptoms:** + +| Symptom | Most Likely Cause | Start Here | +|---------|-------------------|------------| +| Pipeline won't start | Configuration/connection issues | [Connection Problems](#connection-problems) | +| Pipeline starts but no data | Publication/replication setup | [Replication Issues](#replication-issues) | +| Pipeline stops unexpectedly | Resource/permission problems | [Runtime Failures](#runtime-failures) | +| Data missing or incorrect | Schema/destination issues | [Data Quality Problems](#data-quality-problems) | +| Slow performance | Batching/network issues | [Performance Issues](#performance-issues) | + +## Systematic Debugging Process + +### Step 1: Gather Information + +Before diving into fixes, collect diagnostic information: + +**Check ETL logs:** +```bash +# If using structured logging +grep -E "(ERROR|FATAL|PANIC)" etl.log | tail -20 + +# Look for specific patterns +grep "connection" etl.log +grep "replication slot" etl.log +grep "publication" etl.log +``` + +**Check PostgreSQL logs:** +```sql +-- Recent PostgreSQL errors +SELECT pg_current_logfile(); +-- Then check that file for errors around your pipeline start time +``` + +**Collect system information:** +```sql +-- Check replication slots +SELECT slot_name, slot_type, active, confirmed_flush_lsn +FROM pg_replication_slots; + +-- Check publications +SELECT pubname, puballtables, pubinsert, pubupdate, pubdelete +FROM pg_publication; + +-- Check database connections +SELECT pid, usename, 
application_name, state, query_start +FROM pg_stat_activity +WHERE application_name LIKE '%etl%'; +``` + +### Step 2: Identify the Problem Category + +Use this decision tree to narrow down the issue: + +``` +Pipeline fails to start? +β”œβ”€ YES β†’ Connection Problems +└─ NO β†’ Pipeline starts but... + β”œβ”€ No data flowing β†’ Replication Issues + β”œβ”€ Pipeline crashes β†’ Runtime Failures + β”œβ”€ Wrong/missing data β†’ Data Quality Problems + └─ Slow performance β†’ Performance Issues +``` + +## Common Problem Categories + +### Connection Problems + +**Symptoms:** +- "Connection refused" errors +- "Authentication failed" errors +- "Database does not exist" errors +- Pipeline exits immediately on startup + +**Diagnosis:** + +```bash +# Test basic connection +psql -h your-host -p 5432 -U etl_user -d your_db -c "SELECT 1;" + +# Test from ETL server specifically +# (run this from where ETL runs) +telnet your-host 5432 +``` + +**Common causes and fixes:** + +| Error Message | Cause | Fix | +|--------------|-------|-----| +| "Connection refused" | PostgreSQL not running or firewall | Check `systemctl status postgresql` and firewall rules | +| "Authentication failed" | Wrong password/user | Verify credentials and `pg_hba.conf` | +| "Database does not exist" | Wrong database name | Check database name in connection string | +| "SSL required" | TLS configuration mismatch | Update `TlsConfig` to match server requirements | + +### Replication Issues + +**Symptoms:** +- Pipeline starts successfully but no data flows +- "Publication not found" errors +- "Replication slot already exists" errors +- Initial sync never completes + +**Diagnosis:** + +```sql +-- Check if publication exists and has tables +SELECT schemaname, tablename +FROM pg_publication_tables +WHERE pubname = 'your_publication_name'; + +-- Check if replication slot is active +SELECT slot_name, active, confirmed_flush_lsn +FROM pg_replication_slots +WHERE slot_name = 'your_slot_name'; + +-- Check table permissions +SELECT grantee, table_schema, table_name, privilege_type +FROM information_schema.role_table_grants +WHERE grantee = 'etl_user' AND table_name = 'your_table'; +``` + +**Common fixes:** + +**Publication doesn't exist:** +```sql +CREATE PUBLICATION your_publication FOR TABLE table1, table2; +``` + +**No tables in publication:** +```sql +-- Add tables to existing publication +ALTER PUBLICATION your_publication ADD TABLE missing_table; +``` + +**Permission denied on tables:** +```sql +GRANT SELECT ON TABLE your_table TO etl_user; +``` + +**Stale replication slot:** +```sql +-- Drop and recreate (will lose position) +SELECT pg_drop_replication_slot('stale_slot_name'); +``` + +### Runtime Failures + +**Symptoms:** +- Pipeline runs for a while then crashes +- "Out of memory" errors +- "Too many open files" errors +- Destination write failures + +**Diagnosis:** + +```bash +# Check system resources +htop # or top +df -h # disk space +ulimit -n # file descriptor limit + +# Check ETL memory usage +ps aux | grep etl +``` + +**Common fixes:** + +**Memory issues:** +```rust +// Reduce batch sizes in configuration +BatchConfig { + max_size: 500, // Reduce from 1000+ + max_fill_ms: 2000, +} +``` + +**File descriptor limits:** +```bash +# Temporary fix +ulimit -n 10000 + +# Permanent fix (add to /etc/security/limits.conf) +etl_user soft nofile 65536 +etl_user hard nofile 65536 +``` + +**Destination timeouts:** +```rust +// Add retry configuration or connection pooling +// Check destination system health and capacity +``` + +### Data 
Quality Problems
+
+**Symptoms:**
+- Some rows missing in destination
+- Data appears corrupted or truncated
+- Schema mismatch errors
+- Timestamp/timezone issues
+
+**Diagnosis:**
+
+```sql
+-- Compare row counts between source and destination
+SELECT COUNT(*) FROM source_table;
+-- vs destination count
+
+-- Check for recent schema changes
+SELECT n.nspname AS schemaname, c.relname AS tablename, a.attname, a.atttypid
+FROM pg_attribute a
+JOIN pg_class c ON a.attrelid = c.oid
+JOIN pg_namespace n ON c.relnamespace = n.oid
+WHERE n.nspname = 'public'
+  AND c.relname = 'your_table'
+  AND a.attnum > 0
+  AND NOT a.attisdropped;
+
+-- Check for problematic data types
+SELECT column_name, data_type, character_maximum_length
+FROM information_schema.columns
+WHERE table_name = 'your_table'
+  AND data_type IN ('json', 'jsonb', 'text', 'bytea');
+```
+
+**Common fixes:**
+
+**Schema evolution:**
+```sql
+-- Restart pipeline after schema changes
+-- ETL will detect and adapt to new schema
+```
+
+**Data type issues:**
+```toml
+# Enable feature flag for unknown types
+etl = { git = "https://github.com/supabase/etl", features = ["unknown-types-to-bytes"] }
+```
+
+**Character encoding problems:**
+```sql
+-- Check database encoding
+SHOW server_encoding;
+SHOW client_encoding;
+```
+
+### Performance Issues
+
+**Symptoms:**
+- Very slow initial sync
+- High replication lag
+- High CPU/memory usage
+- Destination write bottlenecks
+
+**Diagnosis:**
+
+```sql
+-- Monitor replication lag
+SELECT slot_name,
+       pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as lag
+FROM pg_replication_slots;
+
+-- Check WAL generation rate
+SELECT pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')) as total_wal;
+
+-- Monitor long-running queries
+SELECT pid, now() - pg_stat_activity.query_start AS duration, query
+FROM pg_stat_activity
+WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes';
+```
+
+**Performance tuning:**
+
+```rust
+// Optimize batch configuration
+PipelineConfig {
+    batch: BatchConfig {
+        max_size: 2000,        // Increase batch size
+        max_fill_ms: 10000,    // Allow longer batching
+    },
+    max_table_sync_workers: 8,  // Increase parallelism
+    // ... 
other config +} +``` + +```sql +-- PostgreSQL tuning +-- In postgresql.conf: +-- shared_buffers = 1GB +-- effective_cache_size = 4GB +-- wal_buffers = 16MB +-- checkpoint_completion_target = 0.9 +``` + +## Advanced Debugging Techniques + +### Enable Debug Logging + +**For ETL:** +```bash +# Set environment variable +export ETL_LOG_LEVEL=debug + +# Or in configuration +RUST_LOG=etl=debug cargo run +``` + +**For PostgreSQL:** +```sql +-- Temporarily enable detailed logging +SET log_statement = 'all'; +SET log_min_duration_statement = 0; +``` + +### Monitor Replication in Real-Time + +```sql +-- Create a monitoring query +WITH replication_status AS ( + SELECT + slot_name, + active, + pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as lag_size, + extract(EPOCH FROM (now() - pg_stat_replication.reply_time))::int as lag_seconds + FROM pg_replication_slots + LEFT JOIN pg_stat_replication ON slot_name = application_name + WHERE slot_name LIKE '%etl%' +) +SELECT * FROM replication_status; +``` + +### Test Individual Components + +**Test publication setup:** +```sql +-- Simulate ETL's publication query +SELECT schemaname, tablename +FROM pg_publication_tables +WHERE pubname = 'your_publication'; +``` + +**Test replication slot consumption:** +```sql +-- Create a test logical replication session +SELECT * FROM pg_logical_slot_get_changes('your_slot', NULL, NULL, 'pretty-print', '1'); +``` + +### Memory and Resource Analysis + +```bash +# Monitor ETL resource usage over time +while true; do + echo "$(date): $(ps -o pid,vsz,rss,pcpu -p $(pgrep etl))" + sleep 30 +done >> etl_resources.log + +# Analyze memory patterns +cat etl_resources.log | grep -E "RSS|VSZ" | tail -20 +``` + +## Prevention Best Practices + +### Configuration Validation + +```rust +// Always validate configuration before starting +impl PipelineConfig { + pub fn validate(&self) -> Result<(), ConfigError> { + if self.batch.max_size > 10000 { + return Err(ConfigError::BatchSizeTooLarge); + } + // ... other validations + } +} +``` + +### Health Checks + +```rust +// Implement health check endpoints +async fn health_check() -> Result { + // Check PostgreSQL connection + // Check replication slot status + // Check destination connectivity + // Return overall status +} +``` + +### Monitoring and Alerting + +```sql +-- Set up monitoring queries to run periodically +-- Alert on: +-- - Replication lag > 1GB or 5 minutes +-- - Inactive replication slots +-- - Failed pipeline restarts +-- - Unusual error rates +``` + +## Recovery Procedures + +### Recovering from WAL Position Loss + +```sql +-- If replication slot is lost, you may need to recreate +-- WARNING: This will cause a full resync +SELECT pg_create_logical_replication_slot('new_slot_name', 'pgoutput'); +``` + +### Handling Destination Failures + +```rust +// ETL typically handles this automatically with retries +// For manual intervention: +// 1. Fix destination issues +// 2. ETL will resume from last known WAL position +// 3. May see duplicate data (destinations should handle this) +``` + +### Schema Change Recovery + +```sql +-- After schema changes, ETL usually adapts automatically +-- If not, restart the pipeline to force schema refresh +``` + +## Getting Help + +When you need additional support: + +1. **Search existing issues:** Check [GitHub issues](https://github.com/supabase/etl/issues) +2. **Collect diagnostic information:** Use queries and commands from this guide +3. **Prepare a minimal reproduction:** Isolate the problem to its essential parts +4. 
**Open an issue:** Include PostgreSQL version, ETL version, configuration, and logs + +### Information to Include in Bug Reports + +- ETL version and build information +- PostgreSQL version and configuration relevant settings +- Complete error messages and stack traces +- Configuration files (with sensitive information redacted) +- Steps to reproduce the issue +- Expected vs. actual behavior + +## Next Steps + +After resolving your immediate issue: + +- **Optimize performance** β†’ [Performance Tuning](performance/) +- **Implement monitoring** β†’ [Monitoring best practices](../explanation/monitoring/) +- **Plan for schema changes** β†’ [Schema Change Handling](schema-changes/) +- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) + +## See Also + +- [PostgreSQL setup guide](configure-postgres/) - Prevent configuration issues +- [Performance optimization](performance/) - Tune for better throughput +- [ETL architecture](../explanation/architecture/) - Understand system behavior \ No newline at end of file diff --git a/docs/how-to/index.md b/docs/how-to/index.md index ffd78f6d2..9bf8101a0 100644 --- a/docs/how-to/index.md +++ b/docs/how-to/index.md @@ -1,4 +1,78 @@ -# How-to Guides +--- +type: how-to +title: How-To Guides +--- -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +# How-To Guides + +**Practical solutions for common ETL tasks** + +How-to guides provide step-by-step instructions for accomplishing specific goals when working with ETL. Each guide assumes you're already familiar with ETL basics and focuses on the task at hand. + +## Database Configuration + +### [Configure PostgreSQL for Replication](configure-postgres/) +Set up PostgreSQL with the correct permissions, settings, and publications for ETL pipelines. + +**When to use:** Setting up a new PostgreSQL source for replication. + +## Destinations and Output + +### [Build Custom Destinations](custom-destinations/) +Create your own destination implementations for specific data warehouses or storage systems. + +**When to use:** ETL doesn't support your target system out of the box. + +### [Handle Schema Changes](schema-changes/) +Manage table schema changes without breaking your replication pipeline. + +**When to use:** Your source database schema evolves over time. + +## Operations and Monitoring + +### [Debug Pipeline Issues](debugging/) +Diagnose and resolve common pipeline problems like connection failures, data inconsistencies, and performance bottlenecks. + +**When to use:** Your pipeline isn't working as expected. + +### [Optimize Performance](performance/) +Tune your ETL pipeline for maximum throughput and minimal resource usage. + +**When to use:** Your pipeline is working but needs to handle more data or run faster. + +### [Test ETL Pipelines](testing/) +Build comprehensive test suites for your ETL applications using mocks and test utilities. + +**When to use:** Ensuring reliability before deploying to production. + +## Before You Start + +**Prerequisites:** +- Complete the [first pipeline tutorial](../tutorials/first-pipeline/) +- Have a working ETL development environment +- Understanding of your specific use case requirements + +## Guide Structure + +Each how-to guide follows this pattern: + +1. **Goal statement** - What you'll accomplish +2. **Prerequisites** - Required setup and knowledge +3. **Decision points** - Key choices that affect the approach +4. **Step-by-step procedure** - Actions to take +5. **Validation** - How to verify success +6. 
**Troubleshooting** - Common issues and solutions + +## Next Steps + +After solving your immediate problem: +- **Learn more concepts** β†’ [Explanations](../explanation/) +- **Look up technical details** β†’ [Reference](../reference/) +- **Build foundational knowledge** β†’ [Tutorials](../tutorials/) + +## Need Help? + +If these guides don't cover your specific situation: +1. Check if it's addressed in [Debugging](debugging/) +2. Search existing [GitHub issues](https://github.com/supabase/etl/issues) +3. [Open a new issue](https://github.com/supabase/etl/issues/new) with details about your use case \ No newline at end of file diff --git a/docs/how-to/performance.md b/docs/how-to/performance.md deleted file mode 100644 index 7826051e4..000000000 --- a/docs/how-to/performance.md +++ /dev/null @@ -1,4 +0,0 @@ -# Optimize Performance - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/how-to/schema-changes.md b/docs/how-to/schema-changes.md deleted file mode 100644 index aabf712e3..000000000 --- a/docs/how-to/schema-changes.md +++ /dev/null @@ -1,4 +0,0 @@ -# Handle Schema Changes - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/how-to/testing.md b/docs/how-to/testing.md deleted file mode 100644 index 48234cc70..000000000 --- a/docs/how-to/testing.md +++ /dev/null @@ -1,4 +0,0 @@ -# Set Up Tests - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 160c6859d..01bf0a7fa 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,60 +1,110 @@ --- hide: - navigation +title: ETL Documentation --- -# ETL +# ETL Documentation -!!! info "Coming Soon" - ETL docs are coming soon! +**Build real-time Postgres replication applications in Rust** -Welcome to the ETL project, a Rust-based collection of tooling designed to build efficient and reliable Postgres replication applications. This documentation page provides an overview of the ETL project, the benefits of using ETL, the advantages of implementing it in Rust, and an introduction to Postgres logical replication. It also outlines the resources available in this documentation to help you get started. +ETL is a Rust framework by [Supabase](https://supabase.com) that enables you to build high-performance, real-time data replication applications for PostgreSQL. Whether you're creating ETL pipelines, implementing CDC (Change Data Capture), or building custom data synchronization solutions, ETL provides the building blocks you need. -## What is ETL +## Getting Started -ETL is a collection of Rust crates which can be used to build replication data pipelines on top of [Postgres's logical replication protocol](https://www.postgresql.org/docs/current/protocol-logical-replication.html). It provides a high-level API to work with Postgres logical replication, allowing developers to focus on building their applications without worrying about the low-level details of the replication protocol. The ETL crate abstracts away the complexities of managing replication slots, publications, and subscriptions, enabling you to create robust data pipelines that can continually copy data from Postgres to various destinations like BigQuery and other OLAP databases. +Choose your path based on your needs: -## What is Postgres Logical Replication? +### New to ETL? 
+Start with our **[Tutorials](tutorials/)** to learn ETL through hands-on examples: -Postgres logical replication is a method for replicating data between PostgreSQL databases at the logical (table or row) level, rather than the physical (block-level) level. It allows selective replication of specific tables or data subsets, making it ideal for scenarios like data warehousing, real-time analytics, or cross-database synchronization. +- [Build your first ETL pipeline](tutorials/first-pipeline/) - Complete beginner's guide (15 minutes) +- [Set up memory-based testing](tutorials/memory-destination/) - Test your pipeline locally (10 minutes) +- [Testing ETL pipelines](tutorials/testing-pipelines/) - Ensure reliability (20 minutes) -Logical replication uses a publish/subscribe model, where a source database (publisher) sends changes to a replication slot, and a destination system (subscriber) applies those changes to its own tables. This approach supports selective data replication and is compatible with different PostgreSQL versions or even external systems. +### Ready to solve specific problems? +Jump to our **[How-To Guides](how-to/)** for practical solutions: -### How Does Postgres Logical Replication Work? +- [Configure PostgreSQL for replication](how-to/configure-postgres/) +- [Build custom destinations](how-to/custom-destinations/) +- [Debug pipeline issues](how-to/debugging/) +- [Handle schema changes](how-to/schema-changes/) +- [Optimize performance](how-to/performance/) -Postgres logical replication operates through the following steps: +### Need detailed technical information? +Consult our **[Reference](reference/)** documentation: -**Publication Creation**: A publication is created in the source database, specifying which tables or data to replicate. For example: +- API reference +- Configuration options +- Error codes and messages -```sql -create publication my_publication for table orders, customers; -``` +### Want to understand the bigger picture? +Read our **[Explanations](explanation/)** for deeper insights: -**Replication Slot**: A logical replication slot is created on the source database to track changes (inserts, updates, deletes) for the published tables. The slot ensures that changes are preserved until they are consumed by a subscriber. +- [ETL architecture overview](explanation/architecture/) +- [Why Postgres logical replication?](explanation/replication/) +- [Performance characteristics](explanation/performance/) +- [Design decisions](explanation/design/) -**Subscription Setup**: The destination system (subscriber) creates a subscription that connects to the publication, specifying the source database and replication slot. For example: +## Core Concepts -```sql -create subscription my_subscription -connection 'host=localhost port=5432 dbname=postgres user=postgres password=password' -publication my_publication; -``` +**Postgres Logical Replication** streams data changes from PostgreSQL databases in real-time using the Write-Ahead Log (WAL). ETL builds on this foundation to provide: + +- πŸš€ **Real-time replication** - Stream changes as they happen +- πŸ”„ **Multiple destinations** - BigQuery and more coming soon +- πŸ›‘οΈ **Fault tolerance** - Built-in error handling and recovery +- ⚑ **High performance** - Efficient batching and parallel processing +- πŸ”§ **Extensible** - Plugin architecture for custom destinations -**Change Data Capture (CDC)**: The source database streams changes (via the Write-Ahead Log, or WAL) to the replication slot. 
The subscriber receives these changes and applies them to its tables, maintaining data consistency. +## Quick Example -This process enables real-time data synchronization with minimal overhead, making it suitable for ETL workflows where data needs to be transformed and loaded into destinations like data warehouses or analytical databases. +```rust +use etl::{ + config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}, + destination::memory::MemoryDestination, + pipeline::Pipeline, + store::both::memory::MemoryStore, +}; -## Why Use ETL +#[tokio::main] +async fn main() -> Result<(), Box> { + // Configure PostgreSQL connection + let pg_config = PgConnectionConfig { + host: "localhost".to_string(), + port: 5432, + name: "mydb".to_string(), + username: "postgres".to_string(), + password: Some("password".to_string().into()), + tls: TlsConfig { enabled: false, trusted_root_certs: String::new() }, + }; -ETL provides a set of building blocks to construct data pipelines which can continually copy data from Postgres to other systems. It abstracts away the low-level details of the logical replication protocol and provides a high-level API to work with. This allows developers to focus on building their applications without worrying about the intricacies of the replication protocol. + // Create memory-based store and destination for testing + let store = MemoryStore::new(); + let destination = MemoryDestination::new(); -### Why is ETL Written in Rust? + // Configure the pipeline + let config = PipelineConfig { + id: 1, + publication_name: "my_publication".to_string(), + pg_connection: pg_config, + batch: BatchConfig { max_size: 1000, max_fill_ms: 5000 }, + table_error_retry_delay_ms: 10000, + max_table_sync_workers: 4, + }; -The ETL crate is written in Rust to leverage the language's unique strengths, making it an ideal choice for building robust data pipelines: + // Create and start the pipeline + let mut pipeline = Pipeline::new(1, config, store, destination); + pipeline.start().await?; + + // Pipeline will run until stopped + pipeline.wait().await?; + + Ok(()) +} +``` -- **Performance**: Rust's zero-cost abstractions and low-level control enable high-performance data processing, critical for handling large-scale ETL workloads. -- **Safety**: Rust's strong type system and memory safety guarantees minimize bugs and ensure reliable data handling, reducing the risk of data corruption or crashes. -- **Concurrency**: Rust’s ownership model and async capabilities allow efficient parallel processing, ideal for managing complex, high-throughput ETL pipelines. -- **Ecosystem Integration**: Rust’s growing ecosystem and compatibility with modern cloud and database technologies make it a natural fit for Postgres-focused infrastructure. +## Next Steps -By using Rust, the ETL crate provides a fast, safe, and scalable solution for building Postgres replication applications. +- **First time using ETL?** β†’ Start with [Build your first pipeline](tutorials/first-pipeline/) +- **Have a specific goal?** β†’ Browse [How-To Guides](how-to/) +- **Need technical details?** β†’ Check the [Reference](reference/) +- **Want to understand ETL deeply?** β†’ Read [Explanations](explanation/) diff --git a/docs/reference/index.md b/docs/reference/index.md index 1d8074836..3806b1dfd 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -1,4 +1,102 @@ +--- +type: reference +title: API Reference +--- + # Reference -!!! info "Coming Soon" - This page is under development. 
\ No newline at end of file +**Technical documentation for ETL configuration and usage** + +## API Documentation + +Complete API documentation is available through Rust's built-in documentation system. We publish comprehensive rustdoc documentation that covers all public APIs, traits, and configuration structures. + +**View the API docs:** [Rust API Documentation](https://supabase.github.io/etl/docs/) *(coming soon)* + +The rustdoc includes: + +- All public APIs with detailed descriptions +- Code examples for major components +- Trait implementations and bounds +- Configuration structures and their fields +- Error types and their variants + +## Feature Flags + +ETL supports the following Cargo features: + +| Feature | Description | Default | +|---------|-------------|---------| +| `unknown-types-to-bytes` | Convert unknown PostgreSQL types to byte arrays | βœ“ | +| `test-utils` | Include testing utilities and helpers | - | +| `failpoints` | Enable failure injection for testing | - | + +## Environment Variables + +| Variable | Purpose | Default | +|----------|---------|---------| +| `ETL_LOG_LEVEL` | Logging verbosity (error, warn, info, debug, trace) | `info` | +| `ETL_METRICS_ENABLED` | Enable metrics collection | `false` | + +## Error Codes + +### Pipeline Errors + +| Code | Description | Action | +|------|-------------|---------| +| `P001` | Connection to PostgreSQL failed | Check connection configuration | +| `P002` | Publication not found | Verify publication exists | +| `P003` | Replication slot creation failed | Check PostgreSQL permissions | + +### Destination Errors + +| Code | Description | Action | +|------|-------------|---------| +| `D001` | Batch write failed | Check destination system health | +| `D002` | Authentication failed | Verify credentials | +| `D003` | Data serialization error | Check data format compatibility | + +## Compatibility + +### Supported Versions + +- **Rust:** 1.75 or later +- **PostgreSQL:** 12, 13, 14, 15, 16 +- **Tokio:** 1.0 or later + +### Platform Support + +- **Linux:** Full support (x86_64, aarch64) +- **macOS:** Full support (Intel, Apple Silicon) +- **Windows:** Experimental support + +## Performance Characteristics + +### Memory Usage +- **Base overhead:** ~10MB per pipeline +- **Per-table overhead:** ~1MB +- **Batch memory:** Configurable via `BatchConfig` + +### Throughput +- **Typical range:** 10,000-100,000 operations/second +- **Factors:** Network latency, batch size, destination performance +- **Bottlenecks:** Usually destination write speed + +## Navigation + +**By component type:** +- [Pipeline APIs](pipeline/) - Core orchestration +- [Destination APIs](destinations/) - Data output interfaces +- [Store APIs](stores/) - State management +- [Configuration](config/) - All configuration structures + +**By use case:** +- [Testing](testing/) - Test utilities and mocks +- [Monitoring](monitoring/) - Metrics and observability +- [Extensions](extensions/) - Building custom components + +## See Also + +- [How-to guides](../how-to/) - Task-oriented instructions +- [Tutorials](../tutorials/) - Learning-oriented lessons +- [Explanations](../explanation/) - Understanding-oriented discussions \ No newline at end of file diff --git a/docs/tutorials/first-pipeline.md b/docs/tutorials/first-pipeline.md new file mode 100644 index 000000000..3bb4fe17f --- /dev/null +++ b/docs/tutorials/first-pipeline.md @@ -0,0 +1,230 @@ +--- +type: tutorial +audience: developers +prerequisites: + - Rust 1.75 or later + - PostgreSQL server (local or remote) + - Basic Rust 
and SQL knowledge +version_last_tested: 0.1.0 +last_reviewed: 2025-01-14 +estimated_time: 15 +--- + +# Build Your First ETL Pipeline + +**Learn the fundamentals by building a working pipeline in 15 minutes** + +By the end of this tutorial, you'll have a complete ETL pipeline that streams data changes from PostgreSQL to a memory destination in real-time. You'll see how to set up publications, configure pipelines, and handle live data replication. + +![Pipeline outcome diagram showing data flowing from PostgreSQL through ETL to memory destination] + +## What You'll Build + +A real-time data pipeline that: +- Monitors a PostgreSQL table for changes +- Streams INSERT, UPDATE, and DELETE operations +- Stores replicated data in memory for immediate access + +## Who This Tutorial Is For + +- Rust developers new to ETL +- Anyone interested in PostgreSQL logical replication +- Developers building data synchronization tools + +**Time required:** 15 minutes +**Difficulty:** Beginner + +## Safety Note + +This tutorial uses an isolated test database. To clean up, simply drop the test database when finished. No production data is affected. + +## Step 1: Set Up Your Environment + +Create a new Rust project for this tutorial: + +```bash +cargo new etl-tutorial +cd etl-tutorial +``` + +Add ETL to your dependencies in `Cargo.toml`: + +```toml +[dependencies] +etl = { git = "https://github.com/supabase/etl" } +etl-config = { git = "https://github.com/supabase/etl" } +tokio = { version = "1.0", features = ["full"] } +``` + +**Checkpoint:** Run `cargo check` - it should compile successfully. + +## Step 2: Prepare PostgreSQL + +Connect to your PostgreSQL server and create a test database: + +```sql +CREATE DATABASE etl_tutorial; +\c etl_tutorial + +-- Create a sample table +CREATE TABLE users ( + id SERIAL PRIMARY KEY, + name TEXT NOT NULL, + email TEXT UNIQUE NOT NULL, + created_at TIMESTAMP DEFAULT NOW() +); + +-- Insert sample data +INSERT INTO users (name, email) VALUES + ('Alice Johnson', 'alice@example.com'), + ('Bob Smith', 'bob@example.com'); +``` + +Create a publication for replication: + +```sql +CREATE PUBLICATION my_publication FOR TABLE users; +``` + +**Checkpoint:** Verify the publication exists: +```sql +SELECT * FROM pg_publication WHERE pubname = 'my_publication'; +``` +You should see one row returned. 
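+
+If the publication check fails, or the pipeline later reports that it cannot create a replication slot, the usual culprit is that logical replication is not enabled on the server. As an optional sanity check (assuming you have privileges to change server settings), verify the WAL level now:
+
+```sql
+-- Logical replication requires wal_level = 'logical'
+SHOW wal_level;
+
+-- If the value is not 'logical', update it and restart PostgreSQL:
+-- ALTER SYSTEM SET wal_level = 'logical';
+```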
+ +## Step 3: Configure Your Pipeline + +Replace the contents of `src/main.rs`: + +```rust +use etl::config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}; +use etl::pipeline::Pipeline; +use etl::destination::memory::MemoryDestination; +use etl::store::both::memory::MemoryStore; +use std::error::Error; + +#[tokio::main] +async fn main() -> Result<(), Box> { + // Configure PostgreSQL connection + let pg_connection_config = PgConnectionConfig { + host: "localhost".to_string(), + port: 5432, + name: "etl_tutorial".to_string(), + username: "postgres".to_string(), + password: Some("your_password".into()), + tls: TlsConfig { + trusted_root_certs: String::new(), + enabled: false, + }, + }; + + // Configure pipeline behavior + let pipeline_config = PipelineConfig { + id: 1, + publication_name: "my_publication".to_string(), + pg_connection: pg_connection_config, + batch: BatchConfig { + max_size: 1000, + max_fill_ms: 5000, + }, + table_error_retry_delay_ms: 10000, + max_table_sync_workers: 4, + }; + + // Create stores and destination + let store = MemoryStore::new(); + let destination = MemoryDestination::new(); + + println!("Starting ETL pipeline..."); + + // Create and start the pipeline + let mut pipeline = Pipeline::new(pipeline_config, store, destination); + pipeline.start().await?; + + Ok(()) +} +``` + +**Important:** Replace `"your_password"` with your PostgreSQL password. + +## Step 4: Start Your Pipeline + +Run your pipeline: + +```bash +cargo run +``` + +You should see output like: +``` +Starting ETL pipeline... +Pipeline started successfully +Syncing table: users +Initial sync completed: 2 rows +Listening for changes... +``` + +**Checkpoint:** Your pipeline is now running and has completed initial synchronization. + +## Step 5: Test Real-Time Replication + +With your pipeline running, open a new terminal and connect to PostgreSQL: + +```bash +psql -d etl_tutorial +``` + +Make some changes to test replication: + +```sql +-- Insert a new user +INSERT INTO users (name, email) VALUES ('Charlie Brown', 'charlie@example.com'); + +-- Update an existing user +UPDATE users SET name = 'Alice Cooper' WHERE email = 'alice@example.com'; + +-- Delete a user +DELETE FROM users WHERE email = 'bob@example.com'; +``` + +**Checkpoint:** In your pipeline terminal, you should see log messages indicating these changes were captured and processed. + +## Step 6: Verify Data Replication + +The data is now replicated in your memory destination. While this tutorial uses memory (perfect for testing), the same pattern works with BigQuery, DuckDB, or custom destinations. + +Stop your pipeline with `Ctrl+C`. + +**Checkpoint:** You've successfully built and tested a complete ETL pipeline! 
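+
+If you'd like to inspect the replicated data programmatically instead of relying on log output, one option is to keep a clone of the memory destination before handing it to the pipeline and print its contents after making changes. This is a minimal sketch meant to be merged into the Step 3 program; it assumes the `table_rows()` inspection accessor on `MemoryDestination`:
+
+```rust
+// Clone the destination before the pipeline takes ownership of it.
+let destination = MemoryDestination::new();
+let inspector = destination.clone();
+
+// ... build and start the pipeline with `destination`, exactly as in Step 3 ...
+
+// After the changes from Step 5 have been replicated, print the copied rows.
+for (table_id, table_rows) in inspector.table_rows().await {
+    println!("Table ({:?}): {:?}", table_id, table_rows);
+}
+```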
+ +## What You've Learned + +You've mastered the core ETL concepts: + +- **Publications** define which tables to replicate +- **Pipeline configuration** controls behavior and performance +- **Memory destinations** provide fast, local testing +- **Real-time replication** captures all data changes automatically + +## Cleanup + +Remove the test database: + +```sql +DROP DATABASE etl_tutorial; +``` + +## Next Steps + +Now that you understand the basics: + +- **Add robust testing** β†’ [Testing ETL Pipelines](testing-pipelines/) +- **Connect to BigQuery** β†’ [How to Set Up BigQuery Destination](../how-to/custom-destinations/) +- **Handle production scenarios** β†’ [How to Debug Pipeline Issues](../how-to/debugging/) +- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) + +## See Also + +- [Memory Destination Tutorial](memory-destination/) - Deep dive into testing with memory +- [API Reference](../reference/) - Complete configuration options +- [Performance Guide](../how-to/performance/) - Optimize your pipelines \ No newline at end of file diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 9cc1257f1..7a1e1bfdb 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -1,4 +1,57 @@ +--- +type: tutorial +title: Tutorials +--- + # Tutorials -!!! info "Coming Soon" - This page is under development. \ No newline at end of file +**Learn ETL through guided, hands-on experiences** + +Tutorials provide step-by-step learning paths that take you from zero knowledge to working applications. Each tutorial is designed to be completed successfully by following the exact steps provided. + +## Getting Started + +### [Build Your First ETL Pipeline](first-pipeline/) +**15 minutes** β€’ **Beginner** + +Create a complete ETL pipeline that replicates data from PostgreSQL to a memory destination. You'll learn the core concepts of publications, replication slots, and pipeline configuration. + +*What you'll build:* A working pipeline that streams changes from a sample PostgreSQL table to an in-memory destination. + +## Before You Start + +**Prerequisites for all tutorials:** + +- Rust installed (1.75 or later) +- PostgreSQL server (local or remote) +- Basic familiarity with Rust and SQL + +**What you'll need:** + +- A terminal/command line +- Your favorite text editor +- About 30-60 minutes total time + +## Tutorial Structure + +Each tutorial follows the same pattern: + +1. **Clear outcome** - See exactly what you'll build +2. **Step-by-step instructions** - No guessing, just follow along +3. **Immediate feedback** - See results after each major step +4. **Clean completion** - Working code you can build upon + +## Next Steps + +After completing the tutorials: +- **Solve specific problems** β†’ [How-To Guides](../how-to/) +- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) +- **Look up technical details** β†’ [API Reference](../reference/) + +## Need Help? + +If you get stuck: +1. Double-check the prerequisites +2. Ensure your PostgreSQL setup matches the requirements +3. Check our [debugging guide](../how-to/debugging/) +4. [Open an issue](https://github.com/supabase/etl/issues) with your specific problem \ No newline at end of file diff --git a/docs/tutorials/memory-destination.md b/docs/tutorials/memory-destination.md deleted file mode 100644 index e97cc0889..000000000 --- a/docs/tutorials/memory-destination.md +++ /dev/null @@ -1,4 +0,0 @@ -# Memory Destination - -!!! 
info "Coming Soon" - This page is under development. \ No newline at end of file diff --git a/docs/tutorials/testing-pipelines.md b/docs/tutorials/testing-pipelines.md deleted file mode 100644 index 44bd431da..000000000 --- a/docs/tutorials/testing-pipelines.md +++ /dev/null @@ -1,4 +0,0 @@ -# Testing Pipelines - -!!! info "Coming Soon" - This page is under development. \ No newline at end of file From 0847b6339ef541ae8c2da836733e6e53aaa659c9 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 14 Aug 2025 16:16:26 +0200 Subject: [PATCH 2/9] Update --- docs/explanation/architecture.md | 52 ++- docs/how-to/custom-destinations.md | 293 ------------- docs/reference/index.md | 94 +--- docs/test-mermaid.md | 39 ++ docs/tutorials/custom-implementations.md | 528 +++++++++++++++++++++++ docs/tutorials/first-pipeline.md | 63 ++- docs/tutorials/index.md | 18 + mkdocs.yaml | 23 +- 8 files changed, 685 insertions(+), 425 deletions(-) delete mode 100644 docs/how-to/custom-destinations.md create mode 100644 docs/test-mermaid.md create mode 100644 docs/tutorials/custom-implementations.md diff --git a/docs/explanation/architecture.md b/docs/explanation/architecture.md index 1d757e915..e945cadd5 100644 --- a/docs/explanation/architecture.md +++ b/docs/explanation/architecture.md @@ -14,18 +14,46 @@ ETL's architecture is built around a few key abstractions that work together to At its core, ETL connects PostgreSQL's logical replication stream to configurable destination systems: -``` -PostgreSQL ETL Pipeline Destination -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ WAL Stream │──▷│ Data Processing │────▷│ BigQuery β”‚ -β”‚ Publicationsβ”‚ β”‚ Batching β”‚ β”‚ Custom API β”‚ -β”‚ Repl. Slots β”‚ β”‚ Error Handling β”‚ β”‚ Memory β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ - β”‚ - β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” - β”‚ State Store β”‚ - β”‚ Schema Info β”‚ - β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +```mermaid +flowchart LR + subgraph PostgreSQL + A["WAL Stream
Publications
Replication Slots"] + end + + subgraph ETL_Pipeline[ETL Pipeline] + subgraph ApplyWorker[Apply Worker] + B1["CDC Events Processing and Tables Synchronization"] + end + + subgraph TableSyncWorkers[Table Sync Workers] + B2["Table 1 Sync + CDC"] + B3["Table 2 Sync + CDC"] + B4["Table N Sync + CDC"] + end + end + + subgraph Destination[Destination] + Dest["BigQuery
Custom API
Memory"] + end + + subgraph Store[Store] + subgraph StateStore[State Store] + D1["Memory
PostgreSQL"] + end + + subgraph SchemaStore[Schema Store] + D2["Memory
PostgreSQL"] + end + end + + A --> ApplyWorker + ApplyWorker --> TableSyncWorkers + + ApplyWorker --> Destination + TableSyncWorkers --> Destination + + ApplyWorker --> Store + TableSyncWorkers --> Store ``` The architecture separates concerns to make the system extensible, testable, and maintainable. diff --git a/docs/how-to/custom-destinations.md b/docs/how-to/custom-destinations.md deleted file mode 100644 index 87a0c1d9b..000000000 --- a/docs/how-to/custom-destinations.md +++ /dev/null @@ -1,293 +0,0 @@ ---- -type: how-to -audience: developers -prerequisites: - - Complete first pipeline tutorial - - Rust async/await knowledge - - Understanding of your target system's API -version_last_tested: 0.1.0 -last_reviewed: 2025-01-14 -risk_level: medium ---- - -# Build Custom Destinations - -**Create destination implementations for systems not supported out of the box** - -This guide walks you through implementing the [`Destination`](../../reference/destination-trait/) trait to send replicated data to custom storage systems, APIs, or data warehouses. - -## Goal - -Build a custom destination that receives batched data changes from ETL and writes them to your target system with proper error handling and retry logic. - -## Prerequisites - -- Completed [first pipeline tutorial](../../tutorials/first-pipeline/) -- Access to your target system (database, API, etc.) -- Understanding of your target system's data ingestion patterns -- Rust knowledge of traits and async programming - -## Decision Points - -**Choose your approach based on your target system:** - -| Target System | Key Considerations | Recommended Pattern | -|---------------|-------------------|-------------------| -| **REST API** | Rate limiting, authentication | Batch with retry backoff | -| **Database** | Transaction support, connection pooling | Bulk insert transactions | -| **File System** | File formats, compression | Append or rotate files | -| **Message Queue** | Ordering guarantees, partitioning | Individual message sending | - -## Implementation Steps - -### Step 1: Define Your Destination Struct - -Create a new file `src/my_destination.rs`: - -```rust -use etl::destination::base::{Destination, DestinationError}; -use etl::types::pipeline::BatchedData; -use async_trait::async_trait; - -pub struct MyCustomDestination { - // Configuration fields - api_endpoint: String, - auth_token: String, - batch_size: usize, -} - -impl MyCustomDestination { - pub fn new(api_endpoint: String, auth_token: String) -> Self { - Self { - api_endpoint, - auth_token, - batch_size: 1000, - } - } -} -``` - -### Step 2: Implement the Destination Trait - -Add the core trait implementation: - -```rust -#[async_trait] -impl Destination for MyCustomDestination { - async fn write_batch(&mut self, batch: BatchedData) -> Result<(), DestinationError> { - // Convert ETL data to your target format - let payload = self.convert_batch_to_target_format(&batch)?; - - // Send to your target system with retries - self.send_with_retries(payload).await?; - - Ok(()) - } - - async fn flush(&mut self) -> Result<(), DestinationError> { - // Implement any final cleanup or flush logic - Ok(()) - } -} -``` - -### Step 3: Implement Data Conversion - -Add conversion logic specific to your target system: - -```rust -impl MyCustomDestination { - fn convert_batch_to_target_format(&self, batch: &BatchedData) -> Result { - let mut records = Vec::new(); - - for change in &batch.changes { - match change.operation { - Operation::Insert => { - records.push(json!({ - "action": "insert", - 
"table": change.table_name, - "data": change.new_values, - "timestamp": change.timestamp - })); - } - Operation::Update => { - records.push(json!({ - "action": "update", - "table": change.table_name, - "old_data": change.old_values, - "new_data": change.new_values, - "timestamp": change.timestamp - })); - } - Operation::Delete => { - records.push(json!({ - "action": "delete", - "table": change.table_name, - "data": change.old_values, - "timestamp": change.timestamp - })); - } - } - } - - serde_json::to_string(&records) - .map_err(|e| DestinationError::SerializationError(e.to_string())) - } -} -``` - -### Step 4: Add Error Handling and Retries - -Implement robust error handling: - -```rust -impl MyCustomDestination { - async fn send_with_retries(&self, payload: String) -> Result<(), DestinationError> { - let mut attempts = 0; - let max_attempts = 3; - - while attempts < max_attempts { - match self.send_to_target(&payload).await { - Ok(_) => return Ok(()), - Err(e) if self.is_retryable_error(&e) => { - attempts += 1; - if attempts < max_attempts { - let backoff_ms = 2_u64.pow(attempts) * 1000; - tokio::time::sleep(Duration::from_millis(backoff_ms)).await; - continue; - } - } - Err(e) => return Err(e), - } - } - - Err(DestinationError::RetryExhausted(format!("Failed after {} attempts", max_attempts))) - } - - async fn send_to_target(&self, payload: &str) -> Result<(), DestinationError> { - let client = reqwest::Client::new(); - let response = client - .post(&self.api_endpoint) - .header("Authorization", format!("Bearer {}", self.auth_token)) - .header("Content-Type", "application/json") - .body(payload.to_string()) - .send() - .await - .map_err(|e| DestinationError::NetworkError(e.to_string()))?; - - if !response.status().is_success() { - return Err(DestinationError::HttpError( - response.status().as_u16(), - format!("Request failed: {}", response.text().await.unwrap_or_default()) - )); - } - - Ok(()) - } - - fn is_retryable_error(&self, error: &DestinationError) -> bool { - match error { - DestinationError::NetworkError(_) => true, - DestinationError::HttpError(status, _) => { - // Retry on 5xx server errors and some 4xx errors - *status >= 500 || *status == 429 - } - _ => false, - } - } -} -``` - -### Step 5: Use Your Custom Destination - -In your main application: - -```rust -use etl::pipeline::Pipeline; -use etl::store::both::memory::MemoryStore; - -#[tokio::main] -async fn main() -> Result<(), Box> { - let store = MemoryStore::new(); - let destination = MyCustomDestination::new( - "https://api.example.com/ingest".to_string(), - "your-auth-token".to_string() - ); - - let mut pipeline = Pipeline::new(pipeline_config, store, destination); - pipeline.start().await?; - - Ok(()) -} -``` - -## Validation - -Test your custom destination: - -1. **Unit tests** for data conversion logic -2. **Integration tests** with a test target system -3. **Error simulation** to verify retry behavior -4. 
**Load testing** with realistic data volumes - -```rust -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_data_conversion() { - let destination = MyCustomDestination::new( - "http://test".to_string(), - "token".to_string() - ); - - // Create test batch - let batch = create_test_batch(); - - // Test conversion - let result = destination.convert_batch_to_target_format(&batch); - assert!(result.is_ok()); - - // Verify JSON structure - let json: serde_json::Value = serde_json::from_str(&result.unwrap()).unwrap(); - assert!(json.is_array()); - } -} -``` - -## Troubleshooting - -**Data not appearing in target system:** -- Enable debug logging to see conversion output -- Check target system's ingestion logs -- Verify authentication credentials - -**High error rates:** -- Review retry logic and backoff timing -- Check if target system has rate limits -- Consider implementing circuit breaker pattern - -**Performance issues:** -- Profile data conversion logic -- Consider batch size tuning -- Implement connection pooling for database destinations - -## Rollback - -If your destination isn't working: -1. Switch back to [`MemoryDestination`](../../reference/memory-destination/) for testing -2. Check ETL logs for specific error messages -3. Test destination logic in isolation - -## Next Steps - -- **Add monitoring** β†’ [Performance monitoring](performance/) -- **Handle schema changes** β†’ [Schema change handling](schema-changes/) -- **Production deployment** β†’ [Debugging guide](debugging/) - -## See Also - -- [Destination API Reference](../../reference/destination-trait/) - Complete trait documentation -- [BigQuery destination example](https://github.com/supabase/etl/blob/main/etl-destinations/src/bigquery/) - Real-world implementation -- [Error handling patterns](../../explanation/error-handling/) - Best practices for error management \ No newline at end of file diff --git a/docs/reference/index.md b/docs/reference/index.md index 3806b1dfd..8340678a2 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -5,95 +5,11 @@ title: API Reference # Reference -**Technical documentation for ETL configuration and usage** - -## API Documentation - -Complete API documentation is available through Rust's built-in documentation system. We publish comprehensive rustdoc documentation that covers all public APIs, traits, and configuration structures. 
- -**View the API docs:** [Rust API Documentation](https://supabase.github.io/etl/docs/) *(coming soon)* - -The rustdoc includes: - -- All public APIs with detailed descriptions -- Code examples for major components -- Trait implementations and bounds -- Configuration structures and their fields -- Error types and their variants - -## Feature Flags - -ETL supports the following Cargo features: - -| Feature | Description | Default | -|---------|-------------|---------| -| `unknown-types-to-bytes` | Convert unknown PostgreSQL types to byte arrays | βœ“ | -| `test-utils` | Include testing utilities and helpers | - | -| `failpoints` | Enable failure injection for testing | - | - -## Environment Variables - -| Variable | Purpose | Default | -|----------|---------|---------| -| `ETL_LOG_LEVEL` | Logging verbosity (error, warn, info, debug, trace) | `info` | -| `ETL_METRICS_ENABLED` | Enable metrics collection | `false` | - -## Error Codes - -### Pipeline Errors - -| Code | Description | Action | -|------|-------------|---------| -| `P001` | Connection to PostgreSQL failed | Check connection configuration | -| `P002` | Publication not found | Verify publication exists | -| `P003` | Replication slot creation failed | Check PostgreSQL permissions | - -### Destination Errors - -| Code | Description | Action | -|------|-------------|---------| -| `D001` | Batch write failed | Check destination system health | -| `D002` | Authentication failed | Verify credentials | -| `D003` | Data serialization error | Check data format compatibility | - -## Compatibility - -### Supported Versions - -- **Rust:** 1.75 or later -- **PostgreSQL:** 12, 13, 14, 15, 16 -- **Tokio:** 1.0 or later - -### Platform Support - -- **Linux:** Full support (x86_64, aarch64) -- **macOS:** Full support (Intel, Apple Silicon) -- **Windows:** Experimental support - -## Performance Characteristics - -### Memory Usage -- **Base overhead:** ~10MB per pipeline -- **Per-table overhead:** ~1MB -- **Batch memory:** Configurable via `BatchConfig` - -### Throughput -- **Typical range:** 10,000-100,000 operations/second -- **Factors:** Network latency, batch size, destination performance -- **Bottlenecks:** Usually destination write speed - -## Navigation - -**By component type:** -- [Pipeline APIs](pipeline/) - Core orchestration -- [Destination APIs](destinations/) - Data output interfaces -- [Store APIs](stores/) - State management -- [Configuration](config/) - All configuration structures - -**By use case:** -- [Testing](testing/) - Test utilities and mocks -- [Monitoring](monitoring/) - Metrics and observability -- [Extensions](extensions/) - Building custom components +Complete API documentation is available through Rust's built-in documentation system. We will publish comprehensive rustdoc documentation that covers all public APIs, traits, and configuration structures. +Right now the docs are accessible via the code or by running: +```shell +cargo doc --workspace --all-features --no-deps --open +``` ## See Also diff --git a/docs/test-mermaid.md b/docs/test-mermaid.md new file mode 100644 index 000000000..4d644b200 --- /dev/null +++ b/docs/test-mermaid.md @@ -0,0 +1,39 @@ +# Mermaid Test + +This page tests Mermaid diagram rendering in MkDocs. + +## Simple Flowchart + +```mermaid +flowchart TD + A[Start] --> B{Is it?} + B -->|Yes| C[OK] + C --> D[Rethink] + D --> B + B ---->|No| E[End] +``` + +## Sequence Diagram + +```mermaid +sequenceDiagram + participant Alice + participant Bob + Alice->>John: Hello John, how are you? 
+ loop Healthcheck + John->>John: Fight against hypochondria + end + Note right of John: Rational thoughts
 prevail!
+    John-->>Alice: Great!
+    John->>Bob: How about you?
+    Bob-->>John: Jolly good!
+```
+
+## Database Schema Example
+
+```mermaid
+erDiagram
+    CUSTOMER ||--o{ ORDER : places
+    ORDER ||--|{ LINE-ITEM : contains
+    CUSTOMER }|..|{ DELIVERY-ADDRESS : uses
+```
\ No newline at end of file
diff --git a/docs/tutorials/custom-implementations.md b/docs/tutorials/custom-implementations.md
new file mode 100644
index 000000000..faa54f2aa
--- /dev/null
+++ b/docs/tutorials/custom-implementations.md
@@ -0,0 +1,528 @@
+---
+type: tutorial
+audience: developers
+prerequisites:
+  - Complete first pipeline tutorial
+  - Advanced Rust knowledge (traits, async, Arc/Mutex)
+  - Understanding of ETL architecture
+version_last_tested: 0.1.0
+last_reviewed: 2025-01-14
+estimated_time: 25
+---
+
+# Build Custom Stores and Destinations
+
+**Learn ETL's extension patterns by implementing simple custom components**
+
+This tutorial teaches you ETL's design patterns by implementing minimal custom stores and destinations. You'll understand the separation between state and schema storage, and learn the patterns needed for production extensions.
+
+## What You'll Build
+
+Simple custom implementations to understand the patterns:
+
+- **Custom in-memory store** with logging to see the flow
+- **Custom HTTP destination** with basic retry logic
+- Understanding of ETL's architectural contracts
+
+**Time required:** 25 minutes
+**Difficulty:** Advanced
+
+## Understanding ETL's Storage Design
+
+ETL separates storage into two focused traits:
+
+### SchemaStore: Table Structure Information
+
+```rust
+pub trait SchemaStore {
+    // Get cached schema (fast reads from memory)
+    fn get_table_schema(&self, table_id: &TableId) -> EtlResult<Option<Arc<TableSchema>>>;
+
+    // Load schemas once at startup into cache
+    fn load_table_schemas(&self) -> EtlResult<usize>;
+
+    // Store schema in both cache and persistent store
+    fn store_table_schema(&self, schema: TableSchema) -> EtlResult<()>;
+}
+```
+
+### StateStore: Replication Progress Tracking
+
+```rust
+pub trait StateStore {
+    // Track replication progress (Pending β†’ Syncing β†’ Streaming)
+    fn get_table_replication_state(&self, table_id: TableId) -> EtlResult<Option<TableReplicationPhase>>;
+
+    // Update progress in cache and persistent store
+    fn update_table_replication_state(&self, table_id: TableId, state: TableReplicationPhase) -> EtlResult<()>;
+
+    // Map source table IDs to destination names
+    fn get_table_mapping(&self, source_table_id: &TableId) -> EtlResult<Option<String>>;
+}
+```
+
+**Key Design Principles:**
+
+- **Cache-first**: All reads from memory for performance
+- **Dual writes**: Updates go to both cache and persistent store
+- **Load-once**: Load persistent data into cache at startup only
+- **Thread-safe**: Arc/Mutex for concurrent worker access
+
+## Step 1: Create Simple Custom Store
+
+Create `src/custom_store.rs`:
+
+```rust
+use etl_postgres::schema::{TableId, TableSchema};
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+use tracing::info;
+
+use etl::error::EtlResult;
+use etl::state::table::TableReplicationPhase;
+use etl::store::schema::SchemaStore;
+use etl::store::state::StateStore;
+
+/// Educational custom store showing ETL's patterns. 
+/// +/// This demonstrates: +/// - Cache-first design (all reads from memory) +/// - Dual-write pattern (cache + "persistent" store) +/// - Thread safety with Arc/Mutex +/// - Separation of schema vs state concerns +#[derive(Debug, Clone)] +pub struct CustomStore { + // In-memory caches (the source of truth for reads) + schema_cache: Arc>>>, + state_cache: Arc>>, + mapping_cache: Arc>>, + + // "Persistent" storage simulation (in reality, this would be Redis, SQLite, etc.) + persistent_schemas: Arc>>, + persistent_states: Arc>>, + persistent_mappings: Arc>>, +} + +impl CustomStore { + pub fn new() -> Self { + info!("Creating custom store with cache-first architecture"); + Self { + schema_cache: Arc::new(Mutex::new(HashMap::new())), + state_cache: Arc::new(Mutex::new(HashMap::new())), + mapping_cache: Arc::new(Mutex::new(HashMap::new())), + persistent_schemas: Arc::new(Mutex::new(HashMap::new())), + persistent_states: Arc::new(Mutex::new(HashMap::new())), + persistent_mappings: Arc::new(Mutex::new(HashMap::new())), + } + } +} + +impl SchemaStore for CustomStore { + async fn get_table_schema(&self, table_id: &TableId) -> EtlResult>> { + // Always read from cache (never from persistent store) + let cache = self.schema_cache.lock().await; + let result = cache.get(table_id).cloned(); + info!("Schema cache read for table {}: {}", table_id.0, result.is_some()); + Ok(result) + } + + async fn get_table_schemas(&self) -> EtlResult>> { + let cache = self.schema_cache.lock().await; + Ok(cache.values().cloned().collect()) + } + + async fn load_table_schemas(&self) -> EtlResult { + info!("Loading schemas from 'persistent' store into cache (startup only)"); + + // In production: read from database/file/Redis + let persistent = self.persistent_schemas.lock().await; + let mut cache = self.schema_cache.lock().await; + + for (table_id, schema) in persistent.iter() { + cache.insert(*table_id, Arc::new(schema.clone())); + } + + let loaded_count = persistent.len(); + info!("Loaded {} schemas into cache", loaded_count); + Ok(loaded_count) + } + + async fn store_table_schema(&self, table_schema: TableSchema) -> EtlResult<()> { + let table_id = table_schema.id; + info!("Storing schema for table {} (dual-write: cache + persistent)", table_id.0); + + // Write to persistent store first (in production: database transaction) + { + let mut persistent = self.persistent_schemas.lock().await; + persistent.insert(table_id, table_schema.clone()); + } + + // Then update cache + { + let mut cache = self.schema_cache.lock().await; + cache.insert(table_id, Arc::new(table_schema)); + } + + Ok(()) + } +} + +impl StateStore for CustomStore { + async fn get_table_replication_state(&self, table_id: TableId) -> EtlResult> { + let cache = self.state_cache.lock().await; + let result = cache.get(&table_id).copied(); + info!("State cache read for table {}: {:?}", table_id.0, result); + Ok(result) + } + + async fn get_table_replication_states(&self) -> EtlResult> { + let cache = self.state_cache.lock().await; + Ok(cache.clone()) + } + + async fn load_table_replication_states(&self) -> EtlResult { + info!("Loading states from 'persistent' store into cache"); + + let persistent = self.persistent_states.lock().await; + let mut cache = self.state_cache.lock().await; + + *cache = persistent.clone(); + let loaded_count = persistent.len(); + info!("Loaded {} states into cache", loaded_count); + Ok(loaded_count) + } + + async fn update_table_replication_state(&self, table_id: TableId, state: TableReplicationPhase) -> EtlResult<()> { + 
info!("Updating state for table {} to {:?} (dual-write)", table_id.0, state); + + // Write to persistent store first + { + let mut persistent = self.persistent_states.lock().await; + persistent.insert(table_id, state); + } + + // Then update cache + { + let mut cache = self.state_cache.lock().await; + cache.insert(table_id, state); + } + + Ok(()) + } + + async fn rollback_table_replication_state(&self, _table_id: TableId) -> EtlResult { + // Simplified for tutorial - in production, you'd track state history + todo!("Implement state history tracking for rollback") + } + + async fn get_table_mapping(&self, source_table_id: &TableId) -> EtlResult> { + let cache = self.mapping_cache.lock().await; + Ok(cache.get(source_table_id).cloned()) + } + + async fn get_table_mappings(&self) -> EtlResult> { + let cache = self.mapping_cache.lock().await; + Ok(cache.clone()) + } + + async fn load_table_mappings(&self) -> EtlResult { + info!("Loading mappings from 'persistent' store into cache"); + let persistent = self.persistent_mappings.lock().await; + let mut cache = self.mapping_cache.lock().await; + *cache = persistent.clone(); + Ok(persistent.len()) + } + + async fn store_table_mapping(&self, source_table_id: TableId, destination_table_id: String) -> EtlResult<()> { + info!("Storing mapping: {} -> {} (dual-write)", source_table_id.0, destination_table_id); + + // Write to both stores + { + let mut persistent = self.persistent_mappings.lock().await; + persistent.insert(source_table_id, destination_table_id.clone()); + } + { + let mut cache = self.mapping_cache.lock().await; + cache.insert(source_table_id, destination_table_id); + } + + Ok(()) + } +} +``` + +## Step 2: Create Simple HTTP Destination + +Create `src/http_destination.rs`: + +```rust +use reqwest::Client; +use serde_json::json; +use std::time::Duration; +use tracing::{info, warn}; + +use etl::destination::Destination; +use etl::error::{EtlError, EtlResult}; +use etl::types::{Event, TableId, TableRow}; + +/// Simple HTTP destination showing core patterns. 
+/// +/// Demonstrates: +/// - Implementing the Destination trait +/// - Basic retry logic with exponential backoff +/// - Error handling (retry vs fail-fast) +/// - Data serialization for API compatibility +pub struct HttpDestination { + client: Client, + base_url: String, + max_retries: usize, +} + +impl HttpDestination { + pub fn new(base_url: String) -> EtlResult { + let client = Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .map_err(|e| EtlError::new("Failed to create HTTP client".into(), e.into()))?; + + Ok(Self { + client, + base_url, + max_retries: 3, + }) + } + + /// Simple retry with exponential backoff + async fn retry_request(&self, mut operation: F) -> EtlResult<()> + where + F: FnMut() -> Fut, + Fut: std::future::Future>, + { + for attempt in 0..self.max_retries { + match operation().await { + Ok(response) if response.status().is_success() => { + info!("HTTP request succeeded on attempt {}", attempt + 1); + return Ok(()); + } + Ok(response) => { + let status = response.status(); + warn!("HTTP request failed with status {}, attempt {}", status, attempt + 1); + + // Retry on server errors, fail fast on client errors + if !status.is_server_error() { + return Err(EtlError::new( + "HTTP client error".into(), + anyhow::anyhow!("Status: {}", status).into(), + )); + } + } + Err(e) => { + warn!("HTTP request network error on attempt {}: {}", attempt + 1, e); + } + } + + // Exponential backoff: 500ms, 1s, 2s + let delay = Duration::from_millis(500 * 2_u64.pow(attempt as u32)); + tokio::time::sleep(delay).await; + } + + Err(EtlError::new("HTTP request failed after retries".into(), anyhow::anyhow!("Max retries exceeded").into())) + } +} + +impl Destination for HttpDestination { + async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> { + info!("HTTP: Truncating table {}", table_id.0); + + let url = format!("{}/tables/{}/truncate", self.base_url, table_id.0); + let operation = || self.client.delete(&url).send(); + + self.retry_request(operation).await?; + Ok(()) + } + + async fn write_table_rows(&self, table_id: TableId, table_rows: Vec) -> EtlResult<()> { + if table_rows.is_empty() { + return Ok(()); + } + + info!("HTTP: Writing {} rows for table {}", table_rows.len(), table_id.0); + + // Simple serialization - in production you'd handle all data types properly + let rows_json: Vec<_> = table_rows.iter().map(|row| { + json!({ + "values": row.values.iter().map(|v| format!("{:?}", v)).collect::>() + }) + }).collect(); + + let payload = json!({ + "table_id": table_id.0, + "rows": rows_json + }); + + let url = format!("{}/tables/{}/rows", self.base_url, table_id.0); + let operation = || self.client.post(&url).json(&payload).send(); + + self.retry_request(operation).await?; + Ok(()) + } + + async fn write_events(&self, events: Vec) -> EtlResult<()> { + if events.is_empty() { + return Ok(()); + } + + info!("HTTP: Writing {} events", events.len()); + + // Simple event serialization + let events_json: Vec<_> = events.iter().map(|event| { + json!({ + "event_type": format!("{:?}", event), + "timestamp": chrono::Utc::now() + }) + }).collect(); + + let payload = json!({ + "events": events_json + }); + + let url = format!("{}/events", self.base_url); + let operation = || self.client.post(&url).json(&payload).send(); + + self.retry_request(operation).await?; + Ok(()) + } +} +``` + +## Step 3: Use Your Custom Components + +Create `src/main.rs`: + +```rust +mod custom_store; +mod http_destination; + +use custom_store::CustomStore; +use http_destination::HttpDestination; 
+use etl::config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}; +use etl::pipeline::Pipeline; +use tracing::{info, Level}; + +#[tokio::main] +async fn main() -> Result<(), Box> { + tracing_subscriber::fmt().with_max_level(Level::INFO).init(); + + info!("Starting ETL with custom store and destination"); + + // Create custom components + let store = CustomStore::new(); + let destination = HttpDestination::new("https://httpbin.org/post".to_string())?; + + // Standard PostgreSQL config + let pipeline_config = PipelineConfig { + id: 1, + publication_name: "my_publication".to_string(), + pg_connection: PgConnectionConfig { + host: "localhost".to_string(), + port: 5432, + name: "postgres".to_string(), + username: "postgres".to_string(), + password: Some("your_password".to_string().into()), + tls: TlsConfig { enabled: false, trusted_root_certs: String::new() }, + }, + batch: BatchConfig { max_size: 100, max_fill_ms: 5000 }, + table_error_retry_delay_ms: 10000, + max_table_sync_workers: 2, + }; + + // Create pipeline with custom components + let mut pipeline = Pipeline::new(pipeline_config, store, destination); + pipeline.start().await?; + pipeline.wait().await?; + + Ok(()) +} +``` + +Add dependencies to `Cargo.toml`: + +```toml +[dependencies] +etl = { git = "https://github.com/supabase/etl" } +tokio = { version = "1.0", features = ["full"] } +reqwest = { version = "0.11", features = ["json"] } +serde_json = "1.0" +chrono = { version = "0.4", features = ["serde"] } +tracing = "0.1" +tracing-subscriber = "0.3" +anyhow = "1.0" +``` + +## Key Patterns You've Learned + +### Store Architecture +- **Cache-first reads**: Never hit persistent storage for reads +- **Dual-write updates**: Write to persistent then cache atomically +- **Startup loading**: Load persistent data into cache once +- **Thread safety**: Arc/Mutex for concurrent worker access + +### Destination Patterns +- **Retry logic**: Exponential backoff for transient failures +- **Error classification**: Retry server errors, fail fast on client errors +- **Data transformation**: Convert ETL types to API-friendly formats +- **Batching awareness**: Handle empty batches gracefully + +### Production Extensions + +For real production use, extend these patterns: + +```rust +// Custom Store Extensions +impl CustomStore { + // Add connection pooling for database stores + async fn with_database_pool() -> Self { /* ... */ } + + // Add metrics collection + async fn get_cache_metrics(&self) -> Metrics { /* ... */ } + + // Add state history for rollbacks + async fn track_state_history(&self, table_id: TableId, state: TableReplicationPhase) { /* ... */ } +} + +// Custom Destination Extensions +impl HttpDestination { + // Add circuit breaker pattern + async fn should_break_circuit(&self) -> bool { /* ... */ } + + // Add authentication handling + async fn refresh_auth_token(&mut self) -> EtlResult<()> { /* ... */ } + + // Add request batching + async fn batch_multiple_requests(&self, requests: Vec) -> EtlResult<()> { /* ... 
*/ } +} +``` + +## What You've Learned + +You now understand ETL's extension patterns: + +- **Storage separation**: Schema vs state concerns with different access patterns +- **Cache-first architecture**: Fast reads from memory, dual writes for consistency +- **Thread-safe design**: Arc/Mutex patterns for concurrent access +- **Retry patterns**: Exponential backoff with error classification +- **Trait contracts**: What ETL expects from custom implementations + +## Next Steps + +- **Test your implementations** β†’ [Testing ETL Pipelines](testing-pipelines/) +- **Debug issues** β†’ [Debugging Guide](../how-to/debugging/) +- **Understand architecture** β†’ [ETL Architecture](../explanation/architecture/) +- **See production examples** β†’ [Custom Destinations Guide](../how-to/custom-destinations/) + +## See Also + +- [State management explanation](../explanation/state-management/) - Deep dive on ETL's state handling +- [Architecture overview](../explanation/architecture/) - Understanding component relationships +- [API reference](../reference/) - Complete trait documentation \ No newline at end of file diff --git a/docs/tutorials/first-pipeline.md b/docs/tutorials/first-pipeline.md index 3bb4fe17f..c5e31f8fa 100644 --- a/docs/tutorials/first-pipeline.md +++ b/docs/tutorials/first-pipeline.md @@ -16,8 +16,6 @@ estimated_time: 15 By the end of this tutorial, you'll have a complete ETL pipeline that streams data changes from PostgreSQL to a memory destination in real-time. You'll see how to set up publications, configure pipelines, and handle live data replication. -![Pipeline outcome diagram showing data flowing from PostgreSQL through ETL to memory destination] - ## What You'll Build A real-time data pipeline that: @@ -52,7 +50,6 @@ Add ETL to your dependencies in `Cargo.toml`: ```toml [dependencies] etl = { git = "https://github.com/supabase/etl" } -etl-config = { git = "https://github.com/supabase/etl" } tokio = { version = "1.0", features = ["full"] } ``` @@ -105,20 +102,20 @@ use std::error::Error; #[tokio::main] async fn main() -> Result<(), Box> { - // Configure PostgreSQL connection + // Configure PostgreSQL connection. let pg_connection_config = PgConnectionConfig { host: "localhost".to_string(), port: 5432, - name: "etl_tutorial".to_string(), + name: "postgres".to_string(), username: "postgres".to_string(), - password: Some("your_password".into()), + password: Some("your_password".to_string().into()), tls: TlsConfig { trusted_root_certs: String::new(), enabled: false, }, }; - // Configure pipeline behavior + // Configure pipeline behavior. let pipeline_config = PipelineConfig { id: 1, publication_name: "my_publication".to_string(), @@ -131,16 +128,44 @@ async fn main() -> Result<(), Box> { max_table_sync_workers: 4, }; - // Create stores and destination + // Create stores and destination. let store = MemoryStore::new(); let destination = MemoryDestination::new(); - + + // We spawn a task to periodically print the content of the destination. + let destination_clone = destination.clone(); + tokio::spawn(async move { + loop { + println!("Destination Contents At This Time\n"); + + // Table rows are the initial rows in the table that are copied. + for (table_id, table_rows) in destination_clone.table_rows().await { + println!("Table ({:?}): {:?}", table_id, table_rows); + } + + // Events are realtime events that are sent by Postgres after the table has been copied. 
+ for event in destination_clone.events().await { + println!("Event: {:?}", event); + } + + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + + print!("\n\n"); + } + }); + println!("Starting ETL pipeline..."); - - // Create and start the pipeline + + // Create and start the pipeline. let mut pipeline = Pipeline::new(pipeline_config, store, destination); pipeline.start().await?; - + + println!("Waiting for pipeline to finish..."); + + // Wait for the pipeline to finish, without a shutdown signal it will continue to work until the + // connection is closed. + pipeline.wait().await?; + Ok(()) } ``` @@ -158,10 +183,14 @@ cargo run You should see output like: ``` Starting ETL pipeline... -Pipeline started successfully -Syncing table: users -Initial sync completed: 2 rows -Listening for changes... +Waiting for pipeline to finish... + +Destination Contents At This Time + +Destination Contents At This Time + +Table (TableId(32341)): [TableRow { values: [I32(1), String("Alice"), String("alice@example.com"), TimeStampTz(2025-08-05T11:14:54.400235Z)] }, TableRow { values: [I32(2), String("Bob"), String("bob@example.com"), TimeStampTz(2025-08-05T11:14:54.400235Z)] }, TableRow { values: [I32(3), String("Charlie"), String("charlie@example.com"), TimeStampTz(2025-08-05T11:14:54.400235Z)] }] +Table (TableId(245615)): [TableRow { values: [I32(1), Array(I32([Some(1), Some(2), None, Some(4)]))] }, TableRow { values: [I32(2), Array(I32([None, None, Some(3)]))] }, TableRow { values: [I32(3), Array(I32([Some(5), None]))] }, TableRow { values: [I32(4), Array(I32([None]))] }, TableRow { values: [I32(5), Null] }] ``` **Checkpoint:** Your pipeline is now running and has completed initial synchronization. @@ -193,8 +222,6 @@ DELETE FROM users WHERE email = 'bob@example.com'; The data is now replicated in your memory destination. While this tutorial uses memory (perfect for testing), the same pattern works with BigQuery, DuckDB, or custom destinations. -Stop your pipeline with `Ctrl+C`. - **Checkpoint:** You've successfully built and tested a complete ETL pipeline! ## What You've Learned diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index 7a1e1bfdb..b62af4cb2 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -18,6 +18,22 @@ Create a complete ETL pipeline that replicates data from PostgreSQL to a memory *What you'll build:* A working pipeline that streams changes from a sample PostgreSQL table to an in-memory destination. +### [Set Up Memory-Based Testing](memory-destination/) +**10 minutes** β€’ **Beginner** + +Learn how to use ETL's built-in memory destination for rapid prototyping and testing. Perfect for development and CI environments. + +*What you'll build:* A test environment that validates your pipeline logic without external dependencies. + +## Advanced Topics + +### [Build Custom Stores and Destinations](custom-implementations/) +**45 minutes** β€’ **Advanced** + +Implement production-ready custom stores and destinations. Learn ETL's design patterns, build persistent SQLite storage, and create HTTP-based destinations with retry logic. + +*What you'll build:* Custom SQLite store for persistent state/schema storage and HTTP destination with production-ready error handling. 
+ ## Before You Start **Prerequisites for all tutorials:** @@ -44,6 +60,7 @@ Each tutorial follows the same pattern: ## Next Steps After completing the tutorials: + - **Solve specific problems** β†’ [How-To Guides](../how-to/) - **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) - **Look up technical details** β†’ [API Reference](../reference/) @@ -51,6 +68,7 @@ After completing the tutorials: ## Need Help? If you get stuck: + 1. Double-check the prerequisites 2. Ensure your PostgreSQL setup matches the requirements 3. Check our [debugging guide](../how-to/debugging/) diff --git a/mkdocs.yaml b/mkdocs.yaml index 6bbccef56..be8361d57 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -10,28 +10,18 @@ nav: - Home: index.md - Tutorials: - Overview: tutorials/index.md - - Installation: getting-started/installation.md - - Quick Start: getting-started/quickstart.md - - Your First Pipeline: getting-started/first-pipeline.md - - Working with Destinations: tutorials/memory-destination.md - - Testing Pipelines: tutorials/testing-pipelines.md + - Your First Pipeline: tutorials/first-pipeline.md + - Custom Stores and Destinations: tutorials/custom-implementations.md - How-to Guides: - Overview: how-to/index.md - Configure PostgreSQL: how-to/configure-postgres.md - - Handle Schema Changes: how-to/schema-changes.md - - Create Custom Destinations: how-to/custom-destinations.md - Debug Replication Issues: how-to/debugging.md - - Optimize Performance: how-to/performance.md - - Test Your Pipelines: how-to/testing.md - Reference: - Overview: reference/index.md - Explanation: - Overview: explanation/index.md - Architecture: explanation/architecture.md - Replication Protocol: explanation/replication.md - - Design Philosophy: explanation/design.md - - Performance Characteristics: explanation/performance.md - - Crate Structure: explanation/crate-structure.md theme: name: "material" @@ -73,6 +63,9 @@ theme: extra_css: - stylesheets/extra.css +extra_javascript: + - https://unpkg.com/mermaid@10.6.1/dist/mermaid.min.js + extra: social: - icon: fontawesome/brands/x-twitter @@ -91,7 +84,11 @@ markdown_extensions: guess_lang: false use_pygments: true pygments_style: default - - pymdownx.superfences + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format - pymdownx.tabbed: alternate_style: true - pymdownx.snippets From 8fc29ffe9bc236dd1345c1b669c88bdb43c76e4b Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 14 Aug 2025 16:51:53 +0200 Subject: [PATCH 3/9] Update --- docs/tutorials/custom-implementations.md | 529 ++++++++++++----------- 1 file changed, 285 insertions(+), 244 deletions(-) diff --git a/docs/tutorials/custom-implementations.md b/docs/tutorials/custom-implementations.md index faa54f2aa..c84f47759 100644 --- a/docs/tutorials/custom-implementations.md +++ b/docs/tutorials/custom-implementations.md @@ -22,58 +22,48 @@ Simple custom implementations to understand the patterns: - **Custom in-memory store** with logging to see the flow - **Custom HTTP destination** with basic retry logic -- Understanding of ETL's architectural contracts **Time required:** 25 minutes **Difficulty:** Advanced -## Understanding ETL's Storage Design +## Understanding ETL's Store Design -ETL separates storage into two focused traits: +ETL is design to be a modular replication library. This means that it relies on abstractions to allow for easy extension. 
-### SchemaStore: Table Structure Information
+One core aspect of ETL is the store. The store is composed of two parts:

-```rust
-pub trait SchemaStore {
-    // Get cached schema (fast reads from memory)
-    fn get_table_schema(&self, table_id: &TableId) -> EtlResult>>;
-    
-    // Load schemas once at startup into cache
-    fn load_table_schemas(&self) -> EtlResult;
-    
-    // Store schema in both cache and persistent store
-    fn store_table_schema(&self, schema: TableSchema) -> EtlResult<()>;
-}
-```
+- **Schema Store**: Stores table schemas.
+- **State Store**: Stores replication state.

-### StateStore: Replication Progress Tracking
+Having the store as an extension point lets you implement a store tailored to your use case. For example, you might
+keep replication state in a simple text file, or only in memory. Keep in mind that the store implementation significantly
+affects the performance and safety of the pipeline: it is your responsibility to make it performant, thread-safe, and
+durable (if you need persistence). The pipeline makes no assumptions about the store; it only stores and retrieves data
+through it.

-```rust
-pub trait StateStore {
-    // Track replication progress (Pending β†’ Syncing β†’ Streaming)
-    fn get_table_replication_state(&self, table_id: TableId) -> EtlResult>;
-    
-    // Update progress in cache and persistent store
-    fn update_table_replication_state(&self, table_id: TableId, state: TableReplicationPhase) -> EtlResult<()>;
-    
-    // Map source table IDs to destination names
-    fn get_table_mapping(&self, source_table_id: &TableId) -> EtlResult>;
-}
-```
+One important thing about the stores is that they both offer `load_*` and `get_*` methods. The rationale behind this design is
+that `load_*` methods load data into a cache held inside the store, while `get_*` methods read exclusively from that cache. If there is no
+need to load data into the cache, because the store implementation doesn't persist data anywhere, you can implement `load_*` as a no-op.
+
+### `SchemaStore`: Store for Table Schemas
+
+The `SchemaStore` trait is responsible for storing and retrieving table schemas.

-**Key Design Principles:**
+ETL requires table schemas to correctly parse and handle incoming data from PostgreSQL, and to correctly map tables
+from Postgres to the destination.

-- **Cache-first**: All reads from memory for performance
-- **Dual writes**: Updates go to both cache and persistent store
-- **Load-once**: Load persistent data into cache at startup only
-- **Thread-safe**: Arc/Mutex for concurrent worker access
+### `StateStore`: Store for Replication State
+
+The `StateStore` trait is responsible for storing and retrieving replication state.
+
+The state is crucial for proper pipeline operation: it tracks the progress of replication and, if persistent, allows a
+pipeline to be safely paused and later resumed.
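+
+To make the `load_*`/`get_*` split concrete, below is a minimal sketch of a cache-only schema store. It has no persistent
+layer, so `load_table_schemas` has nothing to load. The method signatures are assumed to match the ones used in Step 1
+below, and the `usize` return value of `load_*` (the number of loaded entries) is an assumption for illustration.
+
+```rust
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::Mutex;
+
+use etl::error::EtlResult;
+use etl::store::schema::SchemaStore;
+use etl::types::{TableId, TableSchema};
+
+/// A cache-only schema store: nothing is persisted, so there is nothing to load.
+#[derive(Debug, Clone, Default)]
+pub struct CacheOnlySchemaStore {
+    schemas: Arc<Mutex<HashMap<TableId, Arc<TableSchema>>>>,
+}
+
+impl SchemaStore for CacheOnlySchemaStore {
+    async fn get_table_schema(&self, table_id: &TableId) -> EtlResult<Option<Arc<TableSchema>>> {
+        // Reads are always served from the in-memory cache.
+        Ok(self.schemas.lock().await.get(table_id).cloned())
+    }
+
+    async fn get_table_schemas(&self) -> EtlResult<Vec<Arc<TableSchema>>> {
+        Ok(self.schemas.lock().await.values().cloned().collect())
+    }
+
+    async fn load_table_schemas(&self) -> EtlResult<usize> {
+        // No persistent storage to warm the cache from, so loading is a no-op.
+        Ok(0)
+    }
+
+    async fn store_table_schema(&self, table_schema: TableSchema) -> EtlResult<()> {
+        // With no persistent layer, the usual dual write collapses to a cache write.
+        let id = table_schema.id;
+        self.schemas.lock().await.insert(id, Arc::new(table_schema));
+        Ok(())
+    }
+}
+```
+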
## Step 1: Create Simple Custom Store Create `src/custom_store.rs`: ```rust -use etl_postgres::schema::{TableId, TableSchema}; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::Mutex; @@ -83,170 +73,223 @@ use etl::error::EtlResult; use etl::state::table::TableReplicationPhase; use etl::store::schema::SchemaStore; use etl::store::state::StateStore; +use etl::types::{TableId, TableSchema}; + +#[derive(Debug, Clone)] +struct CachedEntry { + schema: Option>, + state: Option, + mapping: Option, +} + +#[derive(Debug, Clone)] +struct PersistentEntry { + schema: Option, + state: Option, + mapping: Option, +} -/// Educational custom store showing ETL's patterns. -/// -/// This demonstrates: -/// - Cache-first design (all reads from memory) -/// - Dual-write pattern (cache + "persistent" store) -/// - Thread safety with Arc/Mutex -/// - Separation of schema vs state concerns #[derive(Debug, Clone)] pub struct CustomStore { - // In-memory caches (the source of truth for reads) - schema_cache: Arc>>>, - state_cache: Arc>>, - mapping_cache: Arc>>, - - // "Persistent" storage simulation (in reality, this would be Redis, SQLite, etc.) - persistent_schemas: Arc>>, - persistent_states: Arc>>, - persistent_mappings: Arc>>, + // We simulate cached entries. + cache: Arc>>, + // We simulate persistent entries. + persistent: Arc>>, } impl CustomStore { pub fn new() -> Self { - info!("Creating custom store with cache-first architecture"); + info!("Creating custom store (2 maps: cache + persistent)"); Self { - schema_cache: Arc::new(Mutex::new(HashMap::new())), - state_cache: Arc::new(Mutex::new(HashMap::new())), - mapping_cache: Arc::new(Mutex::new(HashMap::new())), - persistent_schemas: Arc::new(Mutex::new(HashMap::new())), - persistent_states: Arc::new(Mutex::new(HashMap::new())), - persistent_mappings: Arc::new(Mutex::new(HashMap::new())), + cache: Arc::new(Mutex::new(HashMap::new())), + persistent: Arc::new(Mutex::new(HashMap::new())), } } + + fn ensure_cache_slot<'a>( + cache: &'a mut HashMap, + id: TableId, + ) -> &'a mut CachedEntry { + cache + .entry(id) + .or_insert_with(|| CachedEntry { schema: None, state: None, mapping: None }) + } + + fn ensure_persistent_slot<'a>( + persistent: &'a mut HashMap, + id: TableId, + ) -> &'a mut PersistentEntry { + persistent + .entry(id) + .or_insert_with(|| PersistentEntry { schema: None, state: None, mapping: None }) + } } impl SchemaStore for CustomStore { async fn get_table_schema(&self, table_id: &TableId) -> EtlResult>> { - // Always read from cache (never from persistent store) - let cache = self.schema_cache.lock().await; - let result = cache.get(table_id).cloned(); + let cache = self.cache.lock().await; + let result = cache.get(table_id).and_then(|e| e.schema.clone()); info!("Schema cache read for table {}: {}", table_id.0, result.is_some()); Ok(result) } async fn get_table_schemas(&self) -> EtlResult>> { - let cache = self.schema_cache.lock().await; - Ok(cache.values().cloned().collect()) + let cache = self.cache.lock().await; + Ok(cache + .values() + .filter_map(|e| e.schema.clone()) + .collect()) } async fn load_table_schemas(&self) -> EtlResult { - info!("Loading schemas from 'persistent' store into cache (startup only)"); - - // In production: read from database/file/Redis - let persistent = self.persistent_schemas.lock().await; - let mut cache = self.schema_cache.lock().await; - - for (table_id, schema) in persistent.iter() { - cache.insert(*table_id, Arc::new(schema.clone())); + info!("Loading schemas from 'persistent' into cache 
(startup)"); + let persistent = self.persistent.lock().await; + let mut cache = self.cache.lock().await; + + let mut loaded = 0; + for (id, pentry) in persistent.iter() { + if let Some(schema) = &pentry.schema { + let centry = Self::ensure_cache_slot(&mut cache, *id); + centry.schema = Some(Arc::new(schema.clone())); + loaded += 1; + } } - - let loaded_count = persistent.len(); - info!("Loaded {} schemas into cache", loaded_count); - Ok(loaded_count) + info!("Loaded {} schemas into cache", loaded); + Ok(loaded) } async fn store_table_schema(&self, table_schema: TableSchema) -> EtlResult<()> { - let table_id = table_schema.id; - info!("Storing schema for table {} (dual-write: cache + persistent)", table_id.0); + let id = table_schema.id; + info!("Storing schema for table {} (dual-write)", id.0); - // Write to persistent store first (in production: database transaction) { - let mut persistent = self.persistent_schemas.lock().await; - persistent.insert(table_id, table_schema.clone()); + let mut persistent = self.persistent.lock().await; + let p = Self::ensure_persistent_slot(&mut persistent, id); + p.schema = Some(table_schema.clone()); } - - // Then update cache { - let mut cache = self.schema_cache.lock().await; - cache.insert(table_id, Arc::new(table_schema)); + let mut cache = self.cache.lock().await; + let c = Self::ensure_cache_slot(&mut cache, id); + c.schema = Some(Arc::new(table_schema)); } - Ok(()) } } impl StateStore for CustomStore { - async fn get_table_replication_state(&self, table_id: TableId) -> EtlResult> { - let cache = self.state_cache.lock().await; - let result = cache.get(&table_id).copied(); + async fn get_table_replication_state( + &self, + table_id: TableId, + ) -> EtlResult> { + let cache = self.cache.lock().await; + let result = cache.get(&table_id).and_then(|e| e.state.clone()); info!("State cache read for table {}: {:?}", table_id.0, result); Ok(result) } - async fn get_table_replication_states(&self) -> EtlResult> { - let cache = self.state_cache.lock().await; - Ok(cache.clone()) + async fn get_table_replication_states( + &self, + ) -> EtlResult> { + let cache = self.cache.lock().await; + Ok(cache + .iter() + .filter_map(|(id, e)| e.state.clone().map(|s| (*id, s))) + .collect()) } async fn load_table_replication_states(&self) -> EtlResult { - info!("Loading states from 'persistent' store into cache"); - - let persistent = self.persistent_states.lock().await; - let mut cache = self.state_cache.lock().await; - - *cache = persistent.clone(); - let loaded_count = persistent.len(); - info!("Loaded {} states into cache", loaded_count); - Ok(loaded_count) + info!("Loading states from 'persistent' into cache"); + let persistent = self.persistent.lock().await; + let mut cache = self.cache.lock().await; + + let mut loaded = 0; + for (id, pentry) in persistent.iter() { + if let Some(state) = pentry.state.clone() { + let centry = Self::ensure_cache_slot(&mut cache, *id); + centry.state = Some(state); + loaded += 1; + } + } + info!("Loaded {} states into cache", loaded); + Ok(loaded) } - async fn update_table_replication_state(&self, table_id: TableId, state: TableReplicationPhase) -> EtlResult<()> { + async fn update_table_replication_state( + &self, + table_id: TableId, + state: TableReplicationPhase, + ) -> EtlResult<()> { info!("Updating state for table {} to {:?} (dual-write)", table_id.0, state); - - // Write to persistent store first + { - let mut persistent = self.persistent_states.lock().await; - persistent.insert(table_id, state); + let mut persistent = 
self.persistent.lock().await; + let p = Self::ensure_persistent_slot(&mut persistent, table_id); + p.state = Some(state.clone()); } - - // Then update cache { - let mut cache = self.state_cache.lock().await; - cache.insert(table_id, state); + let mut cache = self.cache.lock().await; + let c = Self::ensure_cache_slot(&mut cache, table_id); + c.state = Some(state); } - Ok(()) } - async fn rollback_table_replication_state(&self, _table_id: TableId) -> EtlResult { - // Simplified for tutorial - in production, you'd track state history + async fn rollback_table_replication_state( + &self, + _table_id: TableId, + ) -> EtlResult { todo!("Implement state history tracking for rollback") } async fn get_table_mapping(&self, source_table_id: &TableId) -> EtlResult> { - let cache = self.mapping_cache.lock().await; - Ok(cache.get(source_table_id).cloned()) + let cache = self.cache.lock().await; + Ok(cache.get(source_table_id).and_then(|e| e.mapping.clone())) } async fn get_table_mappings(&self) -> EtlResult> { - let cache = self.mapping_cache.lock().await; - Ok(cache.clone()) + let cache = self.cache.lock().await; + Ok(cache + .iter() + .filter_map(|(id, e)| e.mapping.clone().map(|m| (*id, m))) + .collect()) } async fn load_table_mappings(&self) -> EtlResult { - info!("Loading mappings from 'persistent' store into cache"); - let persistent = self.persistent_mappings.lock().await; - let mut cache = self.mapping_cache.lock().await; - *cache = persistent.clone(); - Ok(persistent.len()) + info!("Loading mappings from 'persistent' into cache"); + let persistent = self.persistent.lock().await; + let mut cache = self.cache.lock().await; + + let mut loaded = 0; + for (id, pentry) in persistent.iter() { + if let Some(m) = &pentry.mapping { + let centry = Self::ensure_cache_slot(&mut cache, *id); + centry.mapping = Some(m.clone()); + loaded += 1; + } + } + Ok(loaded) } - async fn store_table_mapping(&self, source_table_id: TableId, destination_table_id: String) -> EtlResult<()> { - info!("Storing mapping: {} -> {} (dual-write)", source_table_id.0, destination_table_id); - - // Write to both stores + async fn store_table_mapping( + &self, + source_table_id: TableId, + destination_table_id: String, + ) -> EtlResult<()> { + info!( + "Storing mapping: {} -> {} (dual-write)", + source_table_id.0, destination_table_id + ); + { - let mut persistent = self.persistent_mappings.lock().await; - persistent.insert(source_table_id, destination_table_id.clone()); + let mut persistent = self.persistent.lock().await; + let p = Self::ensure_persistent_slot(&mut persistent, source_table_id); + p.mapping = Some(destination_table_id.clone()); } { - let mut cache = self.mapping_cache.lock().await; - cache.insert(source_table_id, destination_table_id); + let mut cache = self.cache.lock().await; + let c = Self::ensure_cache_slot(&mut cache, source_table_id); + c.mapping = Some(destination_table_id); } - Ok(()) } } @@ -257,26 +300,22 @@ impl StateStore for CustomStore { Create `src/http_destination.rs`: ```rust -use reqwest::Client; -use serde_json::json; +use reqwest::{Client, Method}; +use serde_json::{Value, json}; use std::time::Duration; use tracing::{info, warn}; use etl::destination::Destination; -use etl::error::{EtlError, EtlResult}; +use etl::error::{ErrorKind, EtlError, EtlResult}; use etl::types::{Event, TableId, TableRow}; +use etl::{bail, etl_error}; + +const MAX_RETRIES: usize = 3; +const BASE_BACKOFF_MS: u64 = 500; -/// Simple HTTP destination showing core patterns. 
-/// -/// Demonstrates: -/// - Implementing the Destination trait -/// - Basic retry logic with exponential backoff -/// - Error handling (retry vs fail-fast) -/// - Data serialization for API compatibility pub struct HttpDestination { client: Client, base_url: String, - max_retries: usize, } impl HttpDestination { @@ -284,114 +323,146 @@ impl HttpDestination { let client = Client::builder() .timeout(Duration::from_secs(10)) .build() - .map_err(|e| EtlError::new("Failed to create HTTP client".into(), e.into()))?; - - Ok(Self { - client, - base_url, - max_retries: 3, - }) + .map_err(|e| etl_error!(ErrorKind::Unknown, "Failed to create HTTP client", e))?; + Ok(Self { client, base_url }) + } + + fn url(&self, path: &str) -> String { + format!( + "{}/{}", + self.base_url.trim_end_matches('/'), + path.trim_start_matches('/') + ) } - /// Simple retry with exponential backoff - async fn retry_request(&self, mut operation: F) -> EtlResult<()> - where - F: FnMut() -> Fut, - Fut: std::future::Future>, - { - for attempt in 0..self.max_retries { - match operation().await { - Ok(response) if response.status().is_success() => { - info!("HTTP request succeeded on attempt {}", attempt + 1); + /// Small, generic sender with retry + backoff. + async fn send_json(&self, method: Method, path: &str, body: Option<&Value>) -> EtlResult<()> { + let url = self.url(path); + + for attempt in 0..MAX_RETRIES { + let mut req = self.client.request(method.clone(), &url); + if let Some(b) = body { + req = req.json(b); + } + + match req.send().await { + Ok(resp) if resp.status().is_success() => { + info!( + "HTTP {} {} succeeded (attempt {})", + method, + url, + attempt + 1 + ); return Ok(()); } - Ok(response) => { - let status = response.status(); - warn!("HTTP request failed with status {}, attempt {}", status, attempt + 1); - - // Retry on server errors, fail fast on client errors + Ok(resp) => { + let status = resp.status(); + warn!( + "HTTP {} {} failed with {}, attempt {}", + method, + url, + status, + attempt + 1 + ); + // Fail-fast on 4xx if !status.is_server_error() { - return Err(EtlError::new( - "HTTP client error".into(), - anyhow::anyhow!("Status: {}", status).into(), - )); + bail!( + ErrorKind::Unknown, + "HTTP client error", + format!("Status: {}", status) + ); } } - Err(e) => { - warn!("HTTP request network error on attempt {}: {}", attempt + 1, e); - } + Err(e) => warn!( + "HTTP {} {} network error on attempt {}: {}", + method, + url, + attempt + 1, + e + ), } - + // Exponential backoff: 500ms, 1s, 2s - let delay = Duration::from_millis(500 * 2_u64.pow(attempt as u32)); + let delay = Duration::from_millis(BASE_BACKOFF_MS * 2u64.pow(attempt as u32)); tokio::time::sleep(delay).await; } - - Err(EtlError::new("HTTP request failed after retries".into(), anyhow::anyhow!("Max retries exceeded").into())) + + bail!( + ErrorKind::Unknown, + "HTTP request failed after retries", + format!("Max retries ({MAX_RETRIES}) exceeded") + ) } } impl Destination for HttpDestination { async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> { info!("HTTP: Truncating table {}", table_id.0); - - let url = format!("{}/tables/{}/truncate", self.base_url, table_id.0); - let operation = || self.client.delete(&url).send(); - - self.retry_request(operation).await?; - Ok(()) + self.send_json( + Method::DELETE, + &format!("tables/{}/truncate", table_id.0), + None, + ) + .await } - async fn write_table_rows(&self, table_id: TableId, table_rows: Vec) -> EtlResult<()> { + async fn write_table_rows( + &self, + table_id: TableId, + 
table_rows: Vec, + ) -> EtlResult<()> { if table_rows.is_empty() { return Ok(()); } - - info!("HTTP: Writing {} rows for table {}", table_rows.len(), table_id.0); - - // Simple serialization - in production you'd handle all data types properly - let rows_json: Vec<_> = table_rows.iter().map(|row| { - json!({ - "values": row.values.iter().map(|v| format!("{:?}", v)).collect::>() + + info!( + "HTTP: Writing {} rows for table {}", + table_rows.len(), + table_id.0 + ); + + // Simple serialization β€” stringify values for demo-compat. + let rows_json: Vec<_> = table_rows + .iter() + .map(|row| { + json!({ + "values": row.values.iter().map(|v| format!("{:?}", v)).collect::>() + }) }) - }).collect(); - + .collect(); + let payload = json!({ "table_id": table_id.0, "rows": rows_json }); - - let url = format!("{}/tables/{}/rows", self.base_url, table_id.0); - let operation = || self.client.post(&url).json(&payload).send(); - - self.retry_request(operation).await?; - Ok(()) + + self.send_json( + Method::POST, + &format!("tables/{}/rows", table_id.0), + Some(&payload), + ) + .await } async fn write_events(&self, events: Vec) -> EtlResult<()> { if events.is_empty() { return Ok(()); } - + info!("HTTP: Writing {} events", events.len()); - - // Simple event serialization - let events_json: Vec<_> = events.iter().map(|event| { - json!({ - "event_type": format!("{:?}", event), - "timestamp": chrono::Utc::now() + + let events_json: Vec<_> = events + .iter() + .map(|event| { + json!({ + "event_type": format!("{:?}", event), + }) }) - }).collect(); - - let payload = json!({ - "events": events_json - }); - - let url = format!("{}/events", self.base_url); - let operation = || self.client.post(&url).json(&payload).send(); - - self.retry_request(operation).await?; - Ok(()) + .collect(); + + let payload = json!({ "events": events_json }); + + self.send_json(Method::POST, "events", Some(&payload)).await } } ``` @@ -474,36 +545,6 @@ anyhow = "1.0" - **Data transformation**: Convert ETL types to API-friendly formats - **Batching awareness**: Handle empty batches gracefully -### Production Extensions - -For real production use, extend these patterns: - -```rust -// Custom Store Extensions -impl CustomStore { - // Add connection pooling for database stores - async fn with_database_pool() -> Self { /* ... */ } - - // Add metrics collection - async fn get_cache_metrics(&self) -> Metrics { /* ... */ } - - // Add state history for rollbacks - async fn track_state_history(&self, table_id: TableId, state: TableReplicationPhase) { /* ... */ } -} - -// Custom Destination Extensions -impl HttpDestination { - // Add circuit breaker pattern - async fn should_break_circuit(&self) -> bool { /* ... */ } - - // Add authentication handling - async fn refresh_auth_token(&mut self) -> EtlResult<()> { /* ... */ } - - // Add request batching - async fn batch_multiple_requests(&self, requests: Vec) -> EtlResult<()> { /* ... 
*/ } -} -``` - ## What You've Learned You now understand ETL's extension patterns: From 63290f158872182a8ae40a9b2365681a2a064704 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Thu, 14 Aug 2025 17:05:44 +0200 Subject: [PATCH 4/9] Update --- docs/explanation/architecture.md | 326 ++++++++------- docs/explanation/replication.md | 271 ------------- docs/how-to/debugging.md | 490 ----------------------- docs/tutorials/custom-implementations.md | 484 ++++++++++++++-------- mkdocs.yaml | 2 - 5 files changed, 496 insertions(+), 1077 deletions(-) delete mode 100644 docs/explanation/replication.md delete mode 100644 docs/how-to/debugging.md diff --git a/docs/explanation/architecture.md b/docs/explanation/architecture.md index e945cadd5..cb465da8a 100644 --- a/docs/explanation/architecture.md +++ b/docs/explanation/architecture.md @@ -1,18 +1,16 @@ --- type: explanation -title: ETL Architecture Overview -last_reviewed: 2025-01-14 +title: ETL Architecture Overview +last_reviewed: 2025-08-14 --- # ETL Architecture Overview **Understanding how ETL components work together to replicate data from PostgreSQL** -ETL's architecture is built around a few key abstractions that work together to provide reliable, high-performance data replication. This document explains how these components interact and why they're designed the way they are. +ETL's architecture centers around four core abstractions that work together to provide reliable, high-performance data replication: Pipeline, Destination, SchemaStore, and StateStore. This document explains how these components interact and coordinate data flow from PostgreSQL logical replication to target systems. -## The Big Picture - -At its core, ETL connects PostgreSQL's logical replication stream to configurable destination systems: +A diagram of the overall architecture is shown below: ```mermaid flowchart LR @@ -56,189 +54,233 @@ flowchart LR TableSyncWorkers --> Store ``` -The architecture separates concerns to make the system extensible, testable, and maintainable. - -## Core Components +## Core Abstractions ### Pipeline: The Orchestrator -The [`Pipeline`](../reference/pipeline/) is ETL's central component that coordinates all other parts: +The Pipeline is ETL's central component that orchestrates all replication activity. It manages worker lifecycles, coordinates data flow, and handles error recovery. -**Responsibilities:** -- Establishes connection to PostgreSQL replication stream -- Manages initial table synchronization ("backfill") -- Processes ongoing change events from WAL -- Coordinates batching and delivery to destinations -- Handles errors and retries +**Key responsibilities:** +- Establishes PostgreSQL replication connection +- Spawns and manages worker processes +- Coordinates initial table synchronization with ongoing replication +- Handles shutdown and error scenarios -**Why this design?** By centralizing orchestration in one component, we can ensure consistent behavior across all operations while keeping the interface simple for users. 
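+In code, the orchestration surface is intentionally small. The fragment below is illustrative only (imports and
+configuration omitted; see the tutorials for a complete program): a pipeline is built from a configuration, a store, and
+a destination, then started and awaited.
+
+```rust
+// Illustrative fragment: `store` implements SchemaStore + StateStore,
+// `destination` implements Destination.
+let mut pipeline = Pipeline::new(pipeline_config, store, destination);
+
+pipeline.start().await?; // connects to Postgres and spawns the apply worker
+pipeline.wait().await?;  // runs until shutdown or the connection is closed
+```
+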
+### Destination: Where Data Goes -### Destinations: Where Data Goes - -The [`Destination`](../reference/destination-trait/) trait defines how data leaves ETL: +The Destination trait defines how replicated data is delivered to target systems: ```rust -trait Destination { - async fn write_batch(&mut self, batch: BatchedData) -> Result<(), DestinationError>; - async fn flush(&mut self) -> Result<(), DestinationError>; +pub trait Destination { + fn truncate_table(&self, table_id: TableId) -> impl Future> + Send; + + fn write_table_rows( + &self, + table_id: TableId, + table_rows: Vec, + ) -> impl Future> + Send; + + fn write_events(&self, events: Vec) -> impl Future> + Send; } ``` -**Built-in implementations:** -- [`MemoryDestination`](../reference/memory-destination/) - For testing and development -- [`BigQueryDestination`](../reference/bigquery-destination/) - Google BigQuery integration - -**Why this abstraction?** The trait allows ETL to support any output system while providing consistent batching, error handling, and retry behavior. New destinations get all the pipeline reliability features automatically. - -### Stores: Managing State and Schemas +The trait provides three operations: `truncate_table` clears destination tables before bulk loading, `write_table_rows` handles bulk data insertion during initial synchronization, and `write_events` processes streaming replication changes. + +### SchemaStore: Table Structure Management + +The SchemaStore trait manages table schema information: + +```rust +pub trait SchemaStore { + fn get_table_schema( + &self, + table_id: &TableId, + ) -> impl Future>>> + Send; + + fn get_table_schemas(&self) -> impl Future>>> + Send; + + fn load_table_schemas(&self) -> impl Future> + Send; + + fn store_table_schema( + &self, + table_schema: TableSchema, + ) -> impl Future> + Send; +} +``` -ETL uses two types of storage via the [`Store`](../reference/store-trait/) trait: +The store follows a cache-first pattern: `load_table_schemas` populates an in-memory cache at startup, while `get_*` methods read only from cache for performance. `store_table_schema` implements dual-write to both persistent storage and cache. 
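+
+The intended call pattern looks roughly like this (an illustrative fragment with hypothetical variable names; error
+handling and setup omitted):
+
+```rust
+// 1. At startup, populate the in-memory cache once from persistent storage.
+let loaded = schema_store.load_table_schemas().await?;
+
+// 2. All later lookups are served from the cache only, with no storage round-trips.
+let schema = schema_store.get_table_schema(&table_id).await?;
+
+// 3. New or changed schemas are written to persistent storage and the cache together.
+schema_store.store_table_schema(table_schema).await?;
+```
+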
-**State storage** tracks replication progress: -- WAL positions for recovery -- Table synchronization status -- Retry counters and backoff timers +### StateStore: Replication Progress Tracking -**Schema storage** manages table structures: -- Column names and types -- Primary key information -- Schema evolution tracking +The StateStore trait manages replication state and table mappings: -**Implementation options:** -- [`MemoryStore`](../reference/memory-store/) - Fast, but loses state on restart -- [`PostgresStore`](../reference/postgres-store/) - Persistent, production-ready +```rust +pub trait StateStore { + fn get_table_replication_state( + &self, + table_id: TableId, + ) -> impl Future>> + Send; + + fn get_table_replication_states( + &self, + ) -> impl Future>> + Send; + + fn load_table_replication_states(&self) -> impl Future> + Send; + + fn update_table_replication_state( + &self, + table_id: TableId, + state: TableReplicationPhase, + ) -> impl Future> + Send; + + fn rollback_table_replication_state( + &self, + table_id: TableId, + ) -> impl Future> + Send; + + fn get_table_mapping( + &self, + source_table_id: &TableId, + ) -> impl Future>> + Send; + + fn get_table_mappings( + &self, + ) -> impl Future>> + Send; + + fn load_table_mappings(&self) -> impl Future> + Send; + + fn store_table_mapping( + &self, + source_table_id: TableId, + destination_table_id: String, + ) -> impl Future> + Send; +} +``` -**Why separate storage?** This allows ETL to work in different deployment scenarios: development (memory), cloud-native (external databases), or embedded (SQLite, eventually). +Like SchemaStore, StateStore uses cache-first reads with `load_*` methods for startup population and dual-write patterns for updates. The store tracks both replication progress through `TableReplicationPhase` and source-to-destination table name mappings. ## Data Flow Architecture -### Initial Synchronization - -When a pipeline starts, ETL performs a full synchronization of existing data: - -1. **Discovery:** Query PostgreSQL catalogs to find tables in the publication -2. **Schema capture:** Extract column information and primary keys -3. **Snapshot:** Copy existing rows in batches to the destination -4. **State tracking:** Record progress to support resumption - -This ensures the destination has complete data before processing real-time changes. +### Worker Coordination -### Ongoing Replication +ETL's data flow is orchestrated through two types of workers: -After initial sync, ETL processes the PostgreSQL WAL stream: +**Apply Worker** - The primary replication processor: +- Processes PostgreSQL logical replication stream +- Spawns table sync workers as needed +- Coordinates with table sync workers through shared state +- Handles final event processing for tables in `Ready` state -1. **Stream connection:** Attach to the replication slot -2. **Event parsing:** Decode WAL records into structured changes -3. **Batching:** Group changes for efficient destination writes -4. **Delivery:** Send batches to destinations with retry logic -5. 
**Acknowledgment:** Confirm WAL position to PostgreSQL +**Table Sync Workers** - Initial data synchronization: +- Perform bulk copying of existing table data +- Coordinate handoff to apply worker when synchronization completes +- Multiple workers run in parallel, limited by configured semaphore -### Error Handling Strategy +### Worker Startup Sequence -ETL's error handling follows a layered approach: +The Pipeline follows this startup sequence: -**Transient errors** (network issues, destination overload): -- Exponential backoff retry -- Circuit breaker to prevent cascading failures -- Eventual resumption from last known good state +1. **Pipeline Initialization**: Establishes PostgreSQL connection and loads cached state +2. **Apply Worker Launch**: Creates and starts the primary apply worker first +3. **Table Discovery**: Apply worker identifies tables requiring synchronization +4. **Table Sync Spawning**: Apply worker spawns table sync workers for tables in `Init` state +5. **Coordination**: Workers communicate through shared state store -**Permanent errors** (schema mismatches, authentication failures): -- Immediate pipeline halt -- Clear error reporting to operators -- Manual intervention required +The apply worker always starts first because it coordinates the overall replication process and spawns table sync workers on demand. -**Partial failures** (some tables succeed, others fail): -- Per-table error tracking -- Independent retry schedules -- Healthy tables continue processing +### Table Replication Phases -## Scalability Patterns +Each table progresses through distinct phases during replication: -### Vertical Scaling - -ETL supports scaling up through configuration: - -- **Batch sizes:** Larger batches for higher throughput -- **Worker threads:** Parallel table synchronization -- **Buffer sizes:** More memory for better batching - -### Horizontal Scaling - -For massive databases, ETL supports: - -- **Multiple pipelines:** Split tables across different pipeline instances -- **Destination sharding:** Route different tables to different destinations -- **Read replicas:** Reduce load on primary database - -### Resource Management - -ETL is designed to be resource-predictable: - -- **Memory bounds:** Configurable limits on batch sizes and buffers -- **Connection pooling:** Reuse PostgreSQL connections efficiently -- **Backpressure:** Slow down if destinations can't keep up - -## Extension Points +```rust +pub enum TableReplicationPhase { + Init, + DataSync, + FinishedCopy, + SyncWait, + Catchup { lsn: PgLsn }, + SyncDone { lsn: PgLsn }, + Ready, + Errored { reason: String, solution: Option, retry_policy: RetryPolicy }, +} +``` -### Custom Destinations +**Phase Ownership and Transitions:** -The [`Destination`](../reference/destination-trait/) trait makes it straightforward to add support for new output systems: +- **Init**: Set by pipeline when table first discovered +- **DataSync**: Table sync worker begins bulk data copying +- **FinishedCopy**: Table sync worker completes bulk copy, begins catching up with replication stream +- **SyncWait**: Table sync worker requests apply worker to pause (memory-only, not persisted) +- **Catchup**: Apply worker pauses and signals LSN position for table sync worker (memory-only) +- **SyncDone**: Table sync worker catches up to apply worker's LSN and signals completion +- **Ready**: Apply worker takes over all processing for this table +- **Errored**: Either worker encounters unrecoverable error -- **REST APIs:** HTTP-based services -- **Message 
queues:** Kafka, RabbitMQ, etc. -- **Databases:** Any database with bulk insert capabilities -- **File systems:** Parquet, JSON, CSV outputs +### Synchronization Handoff -### Custom Stores +The critical coordination happens during the transition from table sync worker to apply worker control: -The [`Store`](../reference/store-trait/) trait allows different persistence strategies: +1. **Table sync worker** completes bulk copy (`FinishedCopy`) +2. **Table sync worker** processes replication events to catch up +3. **Table sync worker** sets state to `SyncWait` (signals apply worker to pause) +4. **Apply worker** detects `SyncWait` at transaction boundary and pauses +5. **Apply worker** sets state to `Catchup` with current LSN position +6. **Table sync worker** processes events up to the `Catchup` LSN +7. **Table sync worker** sets state to `SyncDone` with final LSN and terminates +8. **Apply worker** detects `SyncDone` and transitions table to `Ready` +9. **Apply worker** resumes processing and handles all future events for the table -- **Cloud databases:** RDS, CloudSQL, etc. -- **Key-value stores:** Redis, DynamoDB -- **Local storage:** SQLite, embedded databases +This coordination ensures no events are lost during the handoff and that the table reaches a consistent state. -### Plugin Architecture +### Event Processing Flow -ETL's trait-based design enables: +**Initial Synchronization (Table Sync Worker):** +1. Truncate destination table using `Destination::truncate_table` +2. Copy existing data in batches using `Destination::write_table_rows` +3. Process replication stream events using `Destination::write_events` +4. Coordinate handoff to apply worker -- **Runtime plugin loading:** Dynamic destination discovery -- **Configuration-driven setup:** Choose implementations via config -- **Testing isolation:** Mock implementations for unit tests +**Continuous Replication (Apply Worker):** +1. Read events from PostgreSQL logical replication stream +2. Filter events for tables in `Ready` state +3. Batch events for efficiency +4. Send batches to destination using `Destination::write_events` +5. Acknowledge progress to PostgreSQL -## Design Philosophy +### Concurrency and Synchronization -### Correctness First +ETL uses several concurrency primitives to coordinate workers: -ETL prioritizes data consistency over raw speed: -- **At-least-once delivery:** Better to duplicate than lose data -- **State durability:** Persist progress before acknowledging -- **Schema safety:** Validate destination compatibility +- **Semaphore**: Limits number of concurrent table sync workers +- **Shutdown channels**: Broadcast shutdown signals to all workers +- **Shared state**: StateStore provides atomic state transitions +- **Message passing**: Workers coordinate through state changes rather than direct communication -### Operational Simplicity +The apply worker holds the semaphore permits and distributes them to table sync workers, ensuring resource bounds while allowing parallel initial synchronization. 
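+
+The snippet below sketches the semaphore pattern with plain `tokio` primitives. It is a simplified model of how a
+bounded pool of table sync workers can be expressed (mirroring `max_table_sync_workers`), not ETL's actual worker code.
+
+```rust
+use std::sync::Arc;
+use tokio::sync::Semaphore;
+
+/// Simplified model: at most `max_workers` tables are copied in parallel.
+async fn run_table_sync(table_ids: Vec<u32>, max_workers: usize) {
+    let permits = Arc::new(Semaphore::new(max_workers));
+    let mut handles = Vec::new();
+
+    for table_id in table_ids {
+        // A worker holds its permit for its whole lifetime, bounding parallelism.
+        let permit = permits
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("semaphore closed");
+
+        handles.push(tokio::spawn(async move {
+            let _permit = permit; // released when the worker finishes
+            // ... bulk copy `table_id`, then catch up with the replication stream ...
+            let _ = table_id;
+        }));
+    }
+
+    for handle in handles {
+        let _ = handle.await;
+    }
+}
+```
+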
-ETL aims to be easy to operate: -- **Clear error messages:** Actionable information for operators -- **Predictable behavior:** Minimal configuration surprises -- **Observable:** Built-in metrics and logging +## Design Rationale -### Performance Where It Matters +### Cache-First Storage Pattern -ETL optimizes the bottlenecks: -- **Batching:** Amortize per-operation overhead -- **Async I/O:** Maximize network utilization -- **Zero-copy:** Minimize data copying where possible +Both SchemaStore and StateStore separate loading from reading. This pattern provides: +- **Performance**: Fast cache-only reads during high-frequency operations +- **Consistency**: Dual-write ensures cache and persistent storage stay synchronized +- **Startup efficiency**: Bulk loading minimizes startup time -## Next Steps +### Worker Separation -Now that you understand ETL's architecture: +Separating apply workers from table sync workers enables: +- **Parallelism**: Multiple tables can synchronize concurrently +- **Resource control**: Semaphore prevents resource exhaustion +- **Clear handoff**: Explicit phase transitions ensure data consistency +- **Error isolation**: Table-level failures don't affect other tables -- **See it in action** β†’ [Build your first pipeline](../tutorials/first-pipeline/) -- **Learn about performance** β†’ [Performance characteristics](performance/) -- **Understand the foundation** β†’ [PostgreSQL logical replication](replication/) -- **Compare with alternatives** β†’ [ETL vs. other tools](comparisons/) +### State-Driven Coordination -## See Also +Using shared state for worker coordination provides: +- **Persistence**: State survives worker failures and restarts +- **Observability**: External systems can monitor replication progress +- **Recovery**: Workers can resume from last known state +- **Simplicity**: No complex message passing between workers -- [Design decisions](design/) - Why ETL is built the way it is -- [Crate structure](crate-structure/) - How code is organized -- [State management](state-management/) - Deep dive on state handling \ No newline at end of file +The architecture prioritizes data consistency and operational simplicity over raw throughput, ensuring reliable replication with clear error handling and recovery patterns. \ No newline at end of file diff --git a/docs/explanation/replication.md b/docs/explanation/replication.md deleted file mode 100644 index 2aa7a5442..000000000 --- a/docs/explanation/replication.md +++ /dev/null @@ -1,271 +0,0 @@ ---- -type: explanation -title: Why PostgreSQL Logical Replication? -last_reviewed: 2025-01-14 ---- - -# Why PostgreSQL Logical Replication? - -**Understanding the foundation technology that powers ETL and its advantages over alternatives** - -PostgreSQL logical replication is the core technology that ETL builds upon. This document explains how it works, why it's well-suited for ETL use cases, and how it compares to other change data capture approaches. - -## What is Logical Replication? - -Logical replication streams changes from PostgreSQL databases at the **logical level** (rows and operations) rather than the **physical level** (disk blocks and binary changes). This means ETL receives structured, interpretable data changes that can be easily transformed and routed to different destinations. 
- -### Key Characteristics - -- **Row-based:** Changes are captured as individual row operations (INSERT, UPDATE, DELETE) -- **Selective:** Choose which tables to replicate via publications -- **Real-time:** Changes stream immediately as they're committed -- **Durable:** Uses PostgreSQL's Write-Ahead Log (WAL) for reliability -- **Ordered:** Changes arrive in commit order within each table - -## How Logical Replication Works - -### The WAL-Based Foundation - -PostgreSQL's logical replication is built on its Write-Ahead Log (WAL): - -1. **Transaction commits** are written to WAL before being applied to data files -2. **Logical decoding** translates WAL entries into structured change events -3. **Replication slots** track which changes have been consumed -4. **Publications** define which tables and operations to replicate - -``` -Application PostgreSQL ETL Pipeline - β”‚ β”‚ β”‚ - │──── INSERT ────│ β”‚ - β”‚ │──── WAL entry ────────│ - β”‚ β”‚ │──── Structured change - β”‚ β”‚ β”‚ (table, operation, data) - │◄─── SUCCESS ───│ β”‚ -``` - -### Publications and Subscriptions - -**Publications** define what to replicate: - -```sql --- Replicate specific tables -CREATE PUBLICATION app_data FOR TABLE users, orders, products; - --- Replicate all tables (use with caution) -CREATE PUBLICATION all_data FOR ALL TABLES; - --- Replicate only specific operations -CREATE PUBLICATION inserts_only FOR TABLE users WITH (publish = 'insert'); -``` - -**Replication slots** track consumption: - -```sql --- ETL creates and manages these automatically -SELECT pg_create_logical_replication_slot('etl_slot', 'pgoutput'); -``` - -### Data Consistency Guarantees - -Logical replication provides strong consistency: - -- **Transactional consistency:** All changes from a transaction arrive together -- **Ordering guarantees:** Changes within a table maintain commit order -- **Durability:** WAL ensures no committed changes are lost -- **At-least-once delivery:** Changes may be delivered multiple times but never lost - -## Why ETL Uses Logical Replication - -### Real-Time Performance - -Unlike polling-based approaches, logical replication provides **immediate change notification**: - -- **Low latency:** Changes stream as they happen (milliseconds to seconds) -- **No database overhead:** No impact on application queries -- **Efficient bandwidth:** Only actual changes are transmitted - -### Operational Simplicity - -Logical replication is **built into PostgreSQL**: - -- **No triggers to maintain:** Changes are captured automatically -- **No application changes:** Existing applications work unchanged -- **Reliable recovery:** Built-in WAL retention and replay -- **Minimal configuration:** Just enable logical replication and create publications - -### Complete Change Capture - -Captures **all types of changes**: - -- **DML operations:** INSERT, UPDATE, DELETE operations -- **Bulk operations:** COPY, bulk updates, and imports -- **Transaction boundaries:** Commit and rollback information -- **Schema information:** Column types and table structure - -## Comparing Replication Approaches - -### Logical Replication vs. 
Physical Replication - -| Aspect | Logical Replication | Physical Replication | -|--------|-------------------|-------------------| -| **Granularity** | Table/row level | Entire database cluster | -| **Selectivity** | Choose specific tables | All or nothing | -| **Version compatibility** | Cross-version support | Same major version only | -| **Overhead** | Moderate (logical decoding) | Low (binary copy) | -| **Use case** | ETL, selective sync | Backup, disaster recovery | - -### Logical Replication vs. Trigger-Based CDC - -| Aspect | Logical Replication | Trigger-Based CDC | -|--------|-------------------|-----------------| -| **Performance impact** | Minimal on source | High (trigger execution) | -| **Change coverage** | All operations including bulk | Only row-by-row operations | -| **Maintenance** | Built-in PostgreSQL feature | Custom triggers to maintain | -| **Reliability** | WAL-based durability | Depends on trigger implementation | -| **Schema changes** | Handles automatically | Triggers need updates | - -### Logical Replication vs. Query-Based Polling - -| Aspect | Logical Replication | Query-Based Polling | -|--------|-------------------|-------------------| -| **Latency** | Real-time (seconds) | Polling interval (minutes) | -| **Source load** | Minimal | Repeated full table scans | -| **Delete detection** | Automatic | Requires soft deletes | -| **Infrastructure** | Simple (ETL + PostgreSQL) | Complex (schedulers, state tracking) | -| **Change ordering** | Guaranteed | Can miss intermediate states | - -## Limitations and Considerations - -### What Logical Replication Doesn't Capture - -- **DDL operations:** Schema changes (CREATE, ALTER, DROP) are not replicated -- **TRUNCATE operations:** Not captured by default (can be enabled in PostgreSQL 11+) -- **Sequence changes:** nextval() calls on sequences -- **Large object changes:** BLOB/CLOB modifications -- **Temporary table operations:** Temp tables are not replicated - -### Performance Considerations - -**WAL generation overhead:** -- Logical replication increases WAL volume by ~10-30% -- More detailed logging required for logical decoding -- May require WAL retention tuning for catch-up scenarios - -**Replication slot management:** -- Unused slots prevent WAL cleanup (disk space growth) -- Slow consumers can cause WAL buildup -- Need monitoring and automatic cleanup - -**Network bandwidth:** -- All change data flows over network -- Large transactions can cause bandwidth spikes -- Consider batching and compression for high-volume scenarios - -## ETL's Enhancements to Logical Replication - -ETL builds on PostgreSQL's logical replication with additional features: - -### Intelligent Batching - -- **Configurable batch sizes:** Balance latency vs. 
throughput -- **Time-based batching:** Ensure maximum latency bounds -- **Backpressure handling:** Slow down if destinations can't keep up - -### Error Handling and Recovery - -- **Retry logic:** Handle transient destination failures -- **Circuit breakers:** Prevent cascade failures -- **State persistence:** Resume from exact WAL positions after restarts - -### Multi-Destination Routing - -- **Fan-out replication:** Send same data to multiple destinations -- **Selective routing:** Different tables to different destinations -- **Transformation pipelines:** Modify data en route to destinations - -### Operational Features - -- **Metrics and monitoring:** Track replication lag, throughput, errors -- **Schema change detection:** Automatic handling of table structure changes -- **Resource management:** Memory and connection pooling - -## Use Cases and Patterns - -### Real-Time Analytics - -Stream transactional data to analytical systems: - -``` -PostgreSQL (OLTP) ──ETL──▷ BigQuery (OLAP) - β”‚ β”‚ - β”œβ”€β”€ Users insert orders β”œβ”€β”€ Real-time dashboards - β”œβ”€β”€ Inventory updates β”œβ”€β”€ Business intelligence - └── Payment processing └── Data science workflows -``` - -### Event-Driven Architecture - -Use database changes as event sources: - -``` -PostgreSQL ──ETL──▷ Event Bus ──▷ Microservices - β”‚ β”‚ β”‚ - β”œβ”€β”€ Order created β”œβ”€β”€ Events β”œβ”€β”€ Email service - β”œβ”€β”€ User updated β”œβ”€β”€ Topics β”œβ”€β”€ Notification service - └── Inventory low └── Streams └── Recommendation engine -``` - -### Data Lake Ingestion - -Continuously populate data lakes: - -``` -PostgreSQL ──ETL──▷ Data Lake ──▷ ML/Analytics - β”‚ β”‚ β”‚ - β”œβ”€β”€ App database β”œβ”€β”€ Parquet β”œβ”€β”€ Feature stores - β”œβ”€β”€ User behavior β”œβ”€β”€ Delta β”œβ”€β”€ Model training - └── Business data └── Iceberg └── Batch processing -``` - -## Choosing Logical Replication - -**Logical replication is ideal when you need:** - -- Real-time or near real-time change capture -- Selective table replication -- Cross-version or cross-platform data movement -- Minimal impact on source database performance -- Built-in reliability and durability guarantees - -**Consider alternatives when you need:** - -- **Immediate consistency:** Use synchronous replication or 2PC -- **Schema change replication:** Consider schema migration tools -- **Cross-database replication:** Look at database-specific solutions -- **Complex transformations:** ETL tools might be simpler - -## Future of Logical Replication - -PostgreSQL continues to enhance logical replication: - -- **Row-level security:** Filter replicated data by user permissions -- **Binary protocol improvements:** Faster, more efficient encoding -- **Cross-version compatibility:** Better support for version differences -- **Performance optimizations:** Reduced overhead and increased throughput - -ETL evolves alongside these improvements, providing a stable interface while leveraging new capabilities as they become available. - -## Next Steps - -Now that you understand the foundation: - -- **See it in practice** β†’ [ETL Architecture](architecture/) -- **Compare alternatives** β†’ [ETL vs. 
Other Tools](comparisons/) -- **Build your first pipeline** β†’ [First Pipeline Tutorial](../tutorials/first-pipeline/) -- **Configure PostgreSQL** β†’ [PostgreSQL Setup](../how-to/configure-postgres/) - -## See Also - -- [PostgreSQL Logical Replication Docs](https://www.postgresql.org/docs/current/logical-replication.html) - Official documentation -- [Design decisions](design/) - Why ETL is built the way it is -- [Performance characteristics](performance/) - Understanding ETL's behavior under load \ No newline at end of file diff --git a/docs/how-to/debugging.md b/docs/how-to/debugging.md deleted file mode 100644 index 92a509e2b..000000000 --- a/docs/how-to/debugging.md +++ /dev/null @@ -1,490 +0,0 @@ ---- -type: how-to -audience: developers, operators -prerequisites: - - Basic understanding of ETL pipelines - - Access to PostgreSQL and ETL logs - - Familiarity with ETL configuration -version_last_tested: 0.1.0 -last_reviewed: 2025-01-14 -risk_level: low ---- - -# Debug Pipeline Issues - -**Diagnose and resolve common ETL pipeline problems quickly and systematically** - -This guide helps you identify, diagnose, and fix issues with ETL pipelines using a structured troubleshooting approach. - -## Goal - -Learn to systematically debug ETL issues: - -- Identify the source of pipeline problems -- Use logging and monitoring to diagnose issues -- Apply appropriate fixes for common failure patterns -- Prevent similar issues in the future - -## Prerequisites - -- Running ETL pipeline (even if failing) -- Access to PostgreSQL server and logs -- ETL application logs and configuration -- Basic SQL knowledge for diagnostic queries - -## Decision Points - -**Choose your debugging approach based on symptoms:** - -| Symptom | Most Likely Cause | Start Here | -|---------|-------------------|------------| -| Pipeline won't start | Configuration/connection issues | [Connection Problems](#connection-problems) | -| Pipeline starts but no data | Publication/replication setup | [Replication Issues](#replication-issues) | -| Pipeline stops unexpectedly | Resource/permission problems | [Runtime Failures](#runtime-failures) | -| Data missing or incorrect | Schema/destination issues | [Data Quality Problems](#data-quality-problems) | -| Slow performance | Batching/network issues | [Performance Issues](#performance-issues) | - -## Systematic Debugging Process - -### Step 1: Gather Information - -Before diving into fixes, collect diagnostic information: - -**Check ETL logs:** -```bash -# If using structured logging -grep -E "(ERROR|FATAL|PANIC)" etl.log | tail -20 - -# Look for specific patterns -grep "connection" etl.log -grep "replication slot" etl.log -grep "publication" etl.log -``` - -**Check PostgreSQL logs:** -```sql --- Recent PostgreSQL errors -SELECT pg_current_logfile(); --- Then check that file for errors around your pipeline start time -``` - -**Collect system information:** -```sql --- Check replication slots -SELECT slot_name, slot_type, active, confirmed_flush_lsn -FROM pg_replication_slots; - --- Check publications -SELECT pubname, puballtables, pubinsert, pubupdate, pubdelete -FROM pg_publication; - --- Check database connections -SELECT pid, usename, application_name, state, query_start -FROM pg_stat_activity -WHERE application_name LIKE '%etl%'; -``` - -### Step 2: Identify the Problem Category - -Use this decision tree to narrow down the issue: - -``` -Pipeline fails to start? -β”œβ”€ YES β†’ Connection Problems -└─ NO β†’ Pipeline starts but... 
- β”œβ”€ No data flowing β†’ Replication Issues - β”œβ”€ Pipeline crashes β†’ Runtime Failures - β”œβ”€ Wrong/missing data β†’ Data Quality Problems - └─ Slow performance β†’ Performance Issues -``` - -## Common Problem Categories - -### Connection Problems - -**Symptoms:** -- "Connection refused" errors -- "Authentication failed" errors -- "Database does not exist" errors -- Pipeline exits immediately on startup - -**Diagnosis:** - -```bash -# Test basic connection -psql -h your-host -p 5432 -U etl_user -d your_db -c "SELECT 1;" - -# Test from ETL server specifically -# (run this from where ETL runs) -telnet your-host 5432 -``` - -**Common causes and fixes:** - -| Error Message | Cause | Fix | -|--------------|-------|-----| -| "Connection refused" | PostgreSQL not running or firewall | Check `systemctl status postgresql` and firewall rules | -| "Authentication failed" | Wrong password/user | Verify credentials and `pg_hba.conf` | -| "Database does not exist" | Wrong database name | Check database name in connection string | -| "SSL required" | TLS configuration mismatch | Update `TlsConfig` to match server requirements | - -### Replication Issues - -**Symptoms:** -- Pipeline starts successfully but no data flows -- "Publication not found" errors -- "Replication slot already exists" errors -- Initial sync never completes - -**Diagnosis:** - -```sql --- Check if publication exists and has tables -SELECT schemaname, tablename -FROM pg_publication_tables -WHERE pubname = 'your_publication_name'; - --- Check if replication slot is active -SELECT slot_name, active, confirmed_flush_lsn -FROM pg_replication_slots -WHERE slot_name = 'your_slot_name'; - --- Check table permissions -SELECT grantee, table_schema, table_name, privilege_type -FROM information_schema.role_table_grants -WHERE grantee = 'etl_user' AND table_name = 'your_table'; -``` - -**Common fixes:** - -**Publication doesn't exist:** -```sql -CREATE PUBLICATION your_publication FOR TABLE table1, table2; -``` - -**No tables in publication:** -```sql --- Add tables to existing publication -ALTER PUBLICATION your_publication ADD TABLE missing_table; -``` - -**Permission denied on tables:** -```sql -GRANT SELECT ON TABLE your_table TO etl_user; -``` - -**Stale replication slot:** -```sql --- Drop and recreate (will lose position) -SELECT pg_drop_replication_slot('stale_slot_name'); -``` - -### Runtime Failures - -**Symptoms:** -- Pipeline runs for a while then crashes -- "Out of memory" errors -- "Too many open files" errors -- Destination write failures - -**Diagnosis:** - -```bash -# Check system resources -htop # or top -df -h # disk space -ulimit -n # file descriptor limit - -# Check ETL memory usage -ps aux | grep etl -``` - -**Common fixes:** - -**Memory issues:** -```rust -// Reduce batch sizes in configuration -BatchConfig { - max_size: 500, // Reduce from 1000+ - max_fill_ms: 2000, -} -``` - -**File descriptor limits:** -```bash -# Temporary fix -ulimit -n 10000 - -# Permanent fix (add to /etc/security/limits.conf) -etl_user soft nofile 65536 -etl_user hard nofile 65536 -``` - -**Destination timeouts:** -```rust -// Add retry configuration or connection pooling -// Check destination system health and capacity -``` - -### Data Quality Problems - -**Symptoms:** -- Some rows missing in destination -- Data appears corrupted or truncated -- Schema mismatch errors -- Timestamp/timezone issues - -**Diagnosis:** - -```sql --- Compare row counts between source and destination -SELECT COUNT(*) FROM source_table; --- vs destination count - 
--- Check for recent schema changes -SELECT schemaname, tablename, attname, atttypid -FROM pg_attribute -JOIN pg_class ON attrelid = oid -JOIN pg_namespace ON relnamespace = pg_namespace.oid -WHERE schemaname = 'public' AND tablename = 'your_table'; - --- Check for problematic data types -SELECT column_name, data_type, character_maximum_length -FROM information_schema.columns -WHERE table_name = 'your_table' - AND data_type IN ('json', 'jsonb', 'text', 'bytea'); -``` - -**Common fixes:** - -**Schema evolution:** -```sql --- Restart pipeline after schema changes --- ETL will detect and adapt to new schema -``` - -**Data type issues:** -```rust -// Enable feature flag for unknown types -etl = { git = "https://github.com/supabase/etl", features = ["unknown-types-to-bytes"] } -``` - -**Character encoding problems:** -```sql --- Check database encoding -SHOW server_encoding; -SHOW client_encoding; -``` - -### Performance Issues - -**Symptoms:** -- Very slow initial sync -- High replication lag -- High CPU/memory usage -- Destination write bottlenecks - -**Diagnosis:** - -```sql --- Monitor replication lag -SELECT slot_name, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as lag -FROM pg_replication_slots; - --- Check WAL generation rate -SELECT pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')) as total_wal; - --- Monitor long-running queries -SELECT pid, now() - pg_stat_activity.query_start AS duration, query -FROM pg_stat_activity -WHERE (now() - pg_stat_activity.query_start) > interval '5 minutes'; -``` - -**Performance tuning:** - -```rust -// Optimize batch configuration -PipelineConfig { - batch: BatchConfig { - max_size: 2000, // Increase batch size - max_fill_ms: 10000, // Allow longer batching - }, - max_table_sync_workers: 8, // Increase parallelism - // ... 
other config -} -``` - -```sql --- PostgreSQL tuning --- In postgresql.conf: --- shared_buffers = 1GB --- effective_cache_size = 4GB --- wal_buffers = 16MB --- checkpoint_completion_target = 0.9 -``` - -## Advanced Debugging Techniques - -### Enable Debug Logging - -**For ETL:** -```bash -# Set environment variable -export ETL_LOG_LEVEL=debug - -# Or in configuration -RUST_LOG=etl=debug cargo run -``` - -**For PostgreSQL:** -```sql --- Temporarily enable detailed logging -SET log_statement = 'all'; -SET log_min_duration_statement = 0; -``` - -### Monitor Replication in Real-Time - -```sql --- Create a monitoring query -WITH replication_status AS ( - SELECT - slot_name, - active, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), confirmed_flush_lsn)) as lag_size, - extract(EPOCH FROM (now() - pg_stat_replication.reply_time))::int as lag_seconds - FROM pg_replication_slots - LEFT JOIN pg_stat_replication ON slot_name = application_name - WHERE slot_name LIKE '%etl%' -) -SELECT * FROM replication_status; -``` - -### Test Individual Components - -**Test publication setup:** -```sql --- Simulate ETL's publication query -SELECT schemaname, tablename -FROM pg_publication_tables -WHERE pubname = 'your_publication'; -``` - -**Test replication slot consumption:** -```sql --- Create a test logical replication session -SELECT * FROM pg_logical_slot_get_changes('your_slot', NULL, NULL, 'pretty-print', '1'); -``` - -### Memory and Resource Analysis - -```bash -# Monitor ETL resource usage over time -while true; do - echo "$(date): $(ps -o pid,vsz,rss,pcpu -p $(pgrep etl))" - sleep 30 -done >> etl_resources.log - -# Analyze memory patterns -cat etl_resources.log | grep -E "RSS|VSZ" | tail -20 -``` - -## Prevention Best Practices - -### Configuration Validation - -```rust -// Always validate configuration before starting -impl PipelineConfig { - pub fn validate(&self) -> Result<(), ConfigError> { - if self.batch.max_size > 10000 { - return Err(ConfigError::BatchSizeTooLarge); - } - // ... other validations - } -} -``` - -### Health Checks - -```rust -// Implement health check endpoints -async fn health_check() -> Result { - // Check PostgreSQL connection - // Check replication slot status - // Check destination connectivity - // Return overall status -} -``` - -### Monitoring and Alerting - -```sql --- Set up monitoring queries to run periodically --- Alert on: --- - Replication lag > 1GB or 5 minutes --- - Inactive replication slots --- - Failed pipeline restarts --- - Unusual error rates -``` - -## Recovery Procedures - -### Recovering from WAL Position Loss - -```sql --- If replication slot is lost, you may need to recreate --- WARNING: This will cause a full resync -SELECT pg_create_logical_replication_slot('new_slot_name', 'pgoutput'); -``` - -### Handling Destination Failures - -```rust -// ETL typically handles this automatically with retries -// For manual intervention: -// 1. Fix destination issues -// 2. ETL will resume from last known WAL position -// 3. May see duplicate data (destinations should handle this) -``` - -### Schema Change Recovery - -```sql --- After schema changes, ETL usually adapts automatically --- If not, restart the pipeline to force schema refresh -``` - -## Getting Help - -When you need additional support: - -1. **Search existing issues:** Check [GitHub issues](https://github.com/supabase/etl/issues) -2. **Collect diagnostic information:** Use queries and commands from this guide -3. **Prepare a minimal reproduction:** Isolate the problem to its essential parts -4. 
**Open an issue:** Include PostgreSQL version, ETL version, configuration, and logs - -### Information to Include in Bug Reports - -- ETL version and build information -- PostgreSQL version and configuration relevant settings -- Complete error messages and stack traces -- Configuration files (with sensitive information redacted) -- Steps to reproduce the issue -- Expected vs. actual behavior - -## Next Steps - -After resolving your immediate issue: - -- **Optimize performance** β†’ [Performance Tuning](performance/) -- **Implement monitoring** β†’ [Monitoring best practices](../explanation/monitoring/) -- **Plan for schema changes** β†’ [Schema Change Handling](schema-changes/) -- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) - -## See Also - -- [PostgreSQL setup guide](configure-postgres/) - Prevent configuration issues -- [Performance optimization](performance/) - Tune for better throughput -- [ETL architecture](../explanation/architecture/) - Understand system behavior \ No newline at end of file diff --git a/docs/tutorials/custom-implementations.md b/docs/tutorials/custom-implementations.md index c84f47759..5129c9aac 100644 --- a/docs/tutorials/custom-implementations.md +++ b/docs/tutorials/custom-implementations.md @@ -4,64 +4,72 @@ audience: developers prerequisites: - Complete first pipeline tutorial - Advanced Rust knowledge (traits, async, Arc/Mutex) - - Understanding of ETL architecture + - PostgreSQL database running locally + - curl command for testing HTTP endpoints version_last_tested: 0.1.0 -last_reviewed: 2025-01-14 -estimated_time: 25 +last_reviewed: 2025-08-14 +estimated_time: 30 +risk_level: medium --- -# Build Custom Stores and Destinations +# Build Custom Stores and Destinations in 30 minutes -**Learn ETL's extension patterns by implementing simple custom components** - -This tutorial teaches you ETL's design patterns by implementing minimal custom stores and destinations. You'll understand the separation between state and schema storage, and learn the patterns needed for production extensions. +**Learn ETL's extension patterns by implementing working custom components** ## What You'll Build -Simple custom implementations to understand the patterns: +By the end of this tutorial, you'll have: -- **Custom in-memory store** with logging to see the flow -- **Custom HTTP destination** with basic retry logic +- A **working custom in-memory store** that logs all operations for debugging +- A **custom HTTP destination** that sends data with automatic retries +- A **complete pipeline** using your custom components that processes real data -**Time required:** 25 minutes -**Difficulty:** Advanced +**Time required:** 30 minutes +**Prerequisites:** Advanced Rust knowledge, running PostgreSQL, basic HTTP knowledge -## Understanding ETL's Store Design +## Safety Note -ETL is design to be a modular replication library. This means that it relies on abstractions to allow for easy extension. +This tutorial creates files in your current directory and makes HTTP requests. To clean up afterward, simply delete the generated Rust project files. -One core aspect of ETL is the store. The store is composed by two parts: +## Step 1: Create Project Structure -- **Schema Store**: Stores table schemas. -- **State Store**: Stores replication state. +Create a new Rust project for your custom ETL components: -Having the store as an extension point allows you to implement your own store for your own use case. 
For example, you might -want to store replication state in a simple text file, or you might just want the replication state to be stored in memory. It's -important to note that the implementation of the store will significantly affect the performance and safety of the pipeline. This means -that it's your responsibility to make sure that the store is designed to be performant, thread safe and durable (in case you need persistence). The pipeline -doesn't make any assumptions about the store, it just stores data and retrieves it. +```bash +cargo new etl-custom --lib +cd etl-custom +``` -One important thing about the stores, is that they both offer `load_*` and `get_*` methods. The rationale behind this design is -that `load_*` methods are used to load data into a cache implemented within the store and `get_*` exclusively read from that cache. If there is no -need to load data into the cache, because the store implementation doesn't write data anywhere, you can just implement `load_*` as a no-op.` +**Result:** You should see `Created library 'etl-custom' package` output. -### `SchemaStore`: Store for Table Schemas +## Step 2: Add Dependencies -The `SchemaStore` trait is responsible for storing and retrieving table schemas. +Replace your `Cargo.toml` with the required dependencies: -Table schemas are required by ETL since they are used to correctly parse and handle incoming data from PostgreSQL and they -also serve to correctly map tables from Postgres to the destination. +```toml +[package] +name = "etl-custom" +version = "0.1.0" +edition = "2021" -### `StateStore`: Store for Replication State +[[bin]] +name = "main" +path = "src/main.rs" -The `StateStore` trait is responsible for storing and retrieving replication state. +[dependencies] +etl = { git = "https://github.com/supabase/etl" } +tokio = { version = "1.0", features = ["full"] } +reqwest = { version = "0.11", features = ["json"] } +serde_json = "1.0" +tracing = "0.1" +tracing-subscriber = "0.3" +``` -The state is crucial for proper pipeline operation since it's used to track the progress of replication and (if persistent) -allows a pipeline to be safely paused and later resumed. +**Result:** Running `cargo check` should download dependencies without errors. -## Step 1: Create Simple Custom Store +## Step 3: Create Custom Store Implementation -Create `src/custom_store.rs`: +Create `src/custom_store.rs` with a dual-storage implementation: ```rust use std::collections::HashMap; @@ -75,98 +83,121 @@ use etl::store::schema::SchemaStore; use etl::store::state::StateStore; use etl::types::{TableId, TableSchema}; +// Represents data stored in our in-memory cache for fast access #[derive(Debug, Clone)] struct CachedEntry { - schema: Option>, - state: Option, - mapping: Option, + schema: Option>, // Table structure info + state: Option, // Current replication progress + mapping: Option, // Source -> destination table mapping } +// Represents data as it would be stored persistently (e.g., in files/database) #[derive(Debug, Clone)] struct PersistentEntry { - schema: Option, + schema: Option, // Not Arc-wrapped in "persistent" storage state: Option, mapping: Option, } #[derive(Debug, Clone)] pub struct CustomStore { - // We simulate cached entries. + // Fast in-memory cache for all read operations - this is what ETL queries cache: Arc>>, - // We simulate persistent entries. 
+ // Simulated persistent storage - in real implementation this might be files/database persistent: Arc>>, } impl CustomStore { pub fn new() -> Self { - info!("Creating custom store (2 maps: cache + persistent)"); + info!("Creating custom store with dual-layer architecture (cache + persistent)"); Self { cache: Arc::new(Mutex::new(HashMap::new())), persistent: Arc::new(Mutex::new(HashMap::new())), } } + // Helper to ensure we have a cache entry to work with - creates if missing fn ensure_cache_slot<'a>( cache: &'a mut HashMap, id: TableId, ) -> &'a mut CachedEntry { - cache - .entry(id) - .or_insert_with(|| CachedEntry { schema: None, state: None, mapping: None }) + cache.entry(id).or_insert_with(|| { + // Initialize empty entry if this table hasn't been seen before + CachedEntry { + schema: None, + state: None, + mapping: None + } + }) } + // Helper to ensure we have a persistent entry to work with - creates if missing fn ensure_persistent_slot<'a>( persistent: &'a mut HashMap, id: TableId, ) -> &'a mut PersistentEntry { - persistent - .entry(id) - .or_insert_with(|| PersistentEntry { schema: None, state: None, mapping: None }) + persistent.entry(id).or_insert_with(|| { + // Initialize empty persistent entry if this table hasn't been seen before + PersistentEntry { + schema: None, + state: None, + mapping: None + } + }) } } +// Implementation of ETL's SchemaStore trait - handles table structure information impl SchemaStore for CustomStore { + // ETL calls this frequently during data processing - must be fast (cache-only) async fn get_table_schema(&self, table_id: &TableId) -> EtlResult>> { let cache = self.cache.lock().await; let result = cache.get(table_id).and_then(|e| e.schema.clone()); - info!("Schema cache read for table {}: {}", table_id.0, result.is_some()); + info!("Schema cache read for table {}: found={}", table_id.0, result.is_some()); Ok(result) } + // Return all cached schemas - used by ETL for bulk operations async fn get_table_schemas(&self) -> EtlResult>> { let cache = self.cache.lock().await; - Ok(cache - .values() - .filter_map(|e| e.schema.clone()) - .collect()) + let schemas: Vec<_> = cache.values() + .filter_map(|e| e.schema.clone()) // Only include entries that have schemas + .collect(); + info!("Retrieved {} schemas from cache", schemas.len()); + Ok(schemas) } + // Called at startup - load persistent data into cache for fast access async fn load_table_schemas(&self) -> EtlResult { - info!("Loading schemas from 'persistent' into cache (startup)"); + info!("Loading schemas from persistent storage into cache (startup phase)"); let persistent = self.persistent.lock().await; let mut cache = self.cache.lock().await; let mut loaded = 0; for (id, pentry) in persistent.iter() { if let Some(schema) = &pentry.schema { + // Move schema from persistent storage to cache, wrapping in Arc for sharing let centry = Self::ensure_cache_slot(&mut cache, *id); centry.schema = Some(Arc::new(schema.clone())); loaded += 1; } } - info!("Loaded {} schemas into cache", loaded); + info!("Loaded {} schemas into cache from persistent storage", loaded); Ok(loaded) } + // Store new schema - implements dual-write pattern (persistent first, then cache) async fn store_table_schema(&self, table_schema: TableSchema) -> EtlResult<()> { let id = table_schema.id; - info!("Storing schema for table {} (dual-write)", id.0); + info!("Storing schema for table {} using dual-write pattern", id.0); + // First write to persistent storage (this would be a file/database in reality) { let mut persistent = 
self.persistent.lock().await; let p = Self::ensure_persistent_slot(&mut persistent, id); p.schema = Some(table_schema.clone()); } + // Then update cache for immediate availability { let mut cache = self.cache.lock().await; let c = Self::ensure_cache_slot(&mut cache, id); @@ -176,7 +207,9 @@ impl SchemaStore for CustomStore { } } +// Implementation of ETL's StateStore trait - handles replication progress tracking impl StateStore for CustomStore { + // Get current replication state for a table - cache-only for speed async fn get_table_replication_state( &self, table_id: TableId, @@ -187,45 +220,52 @@ impl StateStore for CustomStore { Ok(result) } + // Get all replication states - used by ETL to understand overall progress async fn get_table_replication_states( &self, ) -> EtlResult> { let cache = self.cache.lock().await; - Ok(cache - .iter() - .filter_map(|(id, e)| e.state.clone().map(|s| (*id, s))) - .collect()) + let states: HashMap<_, _> = cache.iter() + .filter_map(|(id, e)| e.state.clone().map(|s| (*id, s))) // Only include tables with state + .collect(); + info!("Retrieved {} table states from cache", states.len()); + Ok(states) } + // Load persistent states into cache at startup async fn load_table_replication_states(&self) -> EtlResult { - info!("Loading states from 'persistent' into cache"); + info!("Loading replication states from persistent storage into cache"); let persistent = self.persistent.lock().await; let mut cache = self.cache.lock().await; let mut loaded = 0; for (id, pentry) in persistent.iter() { if let Some(state) = pentry.state.clone() { + // Move state from persistent to cache let centry = Self::ensure_cache_slot(&mut cache, *id); centry.state = Some(state); loaded += 1; } } - info!("Loaded {} states into cache", loaded); + info!("Loaded {} replication states into cache", loaded); Ok(loaded) } + // Update replication state - critical for tracking progress, uses dual-write async fn update_table_replication_state( &self, table_id: TableId, state: TableReplicationPhase, ) -> EtlResult<()> { - info!("Updating state for table {} to {:?} (dual-write)", table_id.0, state); + info!("Updating replication state for table {} to {:?} (dual-write)", table_id.0, state); + // First persist the state (ensures durability) { let mut persistent = self.persistent.lock().await; let p = Self::ensure_persistent_slot(&mut persistent, table_id); p.state = Some(state.clone()); } + // Then update cache (ensures immediate availability) { let mut cache = self.cache.lock().await; let c = Self::ensure_cache_slot(&mut cache, table_id); @@ -234,57 +274,70 @@ impl StateStore for CustomStore { Ok(()) } + // Rollback state to previous version - not implemented in this simple example async fn rollback_table_replication_state( &self, _table_id: TableId, ) -> EtlResult { + // In a real implementation, you'd maintain state history and rollback to previous version todo!("Implement state history tracking for rollback") } + // Get table name mapping from source to destination async fn get_table_mapping(&self, source_table_id: &TableId) -> EtlResult> { let cache = self.cache.lock().await; - Ok(cache.get(source_table_id).and_then(|e| e.mapping.clone())) + let mapping = cache.get(source_table_id).and_then(|e| e.mapping.clone()); + info!("Mapping lookup for table {}: {:?}", source_table_id.0, mapping); + Ok(mapping) } + // Get all table mappings - used when ETL needs to understand all table relationships async fn get_table_mappings(&self) -> EtlResult> { let cache = self.cache.lock().await; - Ok(cache - .iter() - 
.filter_map(|(id, e)| e.mapping.clone().map(|m| (*id, m))) - .collect()) + let mappings: HashMap<_, _> = cache.iter() + .filter_map(|(id, e)| e.mapping.clone().map(|m| (*id, m))) // Only include mapped tables + .collect(); + info!("Retrieved {} table mappings from cache", mappings.len()); + Ok(mappings) } + // Load persistent mappings into cache at startup async fn load_table_mappings(&self) -> EtlResult { - info!("Loading mappings from 'persistent' into cache"); + info!("Loading table mappings from persistent storage into cache"); let persistent = self.persistent.lock().await; let mut cache = self.cache.lock().await; let mut loaded = 0; for (id, pentry) in persistent.iter() { if let Some(m) = &pentry.mapping { + // Load mapping into cache let centry = Self::ensure_cache_slot(&mut cache, *id); centry.mapping = Some(m.clone()); loaded += 1; } } + info!("Loaded {} table mappings into cache", loaded); Ok(loaded) } + // Store a new table mapping (source table -> destination table name) async fn store_table_mapping( &self, source_table_id: TableId, destination_table_id: String, ) -> EtlResult<()> { info!( - "Storing mapping: {} -> {} (dual-write)", + "Storing table mapping: {} -> {} (dual-write)", source_table_id.0, destination_table_id ); + // First persist the mapping { let mut persistent = self.persistent.lock().await; let p = Self::ensure_persistent_slot(&mut persistent, source_table_id); p.mapping = Some(destination_table_id.clone()); } + // Then update cache { let mut cache = self.cache.lock().await; let c = Self::ensure_cache_slot(&mut cache, source_table_id); @@ -295,9 +348,11 @@ impl StateStore for CustomStore { } ``` -## Step 2: Create Simple HTTP Destination +**Result:** Your file should compile without errors when you run `cargo check`. + +## Step 4: Create HTTP Destination Implementation -Create `src/http_destination.rs`: +Create `src/http_destination.rs` with retry logic and proper error handling: ```rust use reqwest::{Client, Method}; @@ -310,166 +365,199 @@ use etl::error::{ErrorKind, EtlError, EtlResult}; use etl::types::{Event, TableId, TableRow}; use etl::{bail, etl_error}; -const MAX_RETRIES: usize = 3; -const BASE_BACKOFF_MS: u64 = 500; +// Configuration constants for retry behavior +const MAX_RETRIES: usize = 3; // Try up to 3 times before giving up +const BASE_BACKOFF_MS: u64 = 500; // Start with 500ms delay, then exponential backoff pub struct HttpDestination { - client: Client, - base_url: String, + client: Client, // HTTP client for making requests + base_url: String, // Base URL for the destination API (e.g., "https://api.example.com") } impl HttpDestination { + /// Create a new HTTP destination that will send data to the specified base URL pub fn new(base_url: String) -> EtlResult { + // Configure HTTP client with reasonable timeout let client = Client::builder() - .timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(10)) // 10 second timeout for each request .build() .map_err(|e| etl_error!(ErrorKind::Unknown, "Failed to create HTTP client", e))?; + + info!("Created HTTP destination pointing to: {}", base_url); Ok(Self { client, base_url }) } + /// Helper to construct full URLs by combining base URL with endpoint paths fn url(&self, path: &str) -> String { format!( "{}/{}", - self.base_url.trim_end_matches('/'), - path.trim_start_matches('/') + self.base_url.trim_end_matches('/'), // Remove trailing slash from base + path.trim_start_matches('/') // Remove leading slash from path ) } - /// Small, generic sender with retry + backoff. 
+ /// Generic HTTP sender with automatic retry logic and exponential backoff + /// This handles all the complex retry logic so individual methods can focus on data formatting async fn send_json(&self, method: Method, path: &str, body: Option<&Value>) -> EtlResult<()> { let url = self.url(path); + info!("Attempting HTTP {} to {}", method, url); + // Retry loop with exponential backoff for attempt in 0..MAX_RETRIES { + // Build the request let mut req = self.client.request(method.clone(), &url); if let Some(b) = body { - req = req.json(b); + req = req.json(b); // Add JSON body if provided } + // Send the request and handle response match req.send().await { + // Success case - 2xx status codes Ok(resp) if resp.status().is_success() => { info!( - "HTTP {} {} succeeded (attempt {})", - method, - url, - attempt + 1 + "HTTP {} {} succeeded on attempt {}/{}", + method, url, attempt + 1, MAX_RETRIES ); return Ok(()); } + // HTTP error response (4xx/5xx) Ok(resp) => { let status = resp.status(); warn!( - "HTTP {} {} failed with {}, attempt {}", - method, - url, - status, - attempt + 1 + "HTTP {} {} returned status {}, attempt {}/{}", + method, url, status, attempt + 1, MAX_RETRIES ); - // Fail-fast on 4xx - if !status.is_server_error() { + + // Don't retry client errors (4xx) - they won't succeed on retry + if status.is_client_error() { bail!( ErrorKind::Unknown, - "HTTP client error", + "HTTP client error - not retrying", format!("Status: {}", status) ); } + // Server errors (5xx) will be retried + } + // Network/connection errors - these are worth retrying + Err(e) => { + warn!( + "HTTP {} {} network error on attempt {}/{}: {}", + method, url, attempt + 1, MAX_RETRIES, e + ); } - Err(e) => warn!( - "HTTP {} {} network error on attempt {}: {}", - method, - url, - attempt + 1, - e - ), } - // Exponential backoff: 500ms, 1s, 2s - let delay = Duration::from_millis(BASE_BACKOFF_MS * 2u64.pow(attempt as u32)); - tokio::time::sleep(delay).await; + // If this wasn't the last attempt, wait before retrying + if attempt + 1 < MAX_RETRIES { + // Exponential backoff: 500ms, 1s, 2s (attempt 0, 1, 2) + let delay = Duration::from_millis(BASE_BACKOFF_MS * 2u64.pow(attempt as u32)); + info!("Waiting {:?} before retry", delay); + tokio::time::sleep(delay).await; + } } + // All retries failed bail!( ErrorKind::Unknown, - "HTTP request failed after retries", - format!("Max retries ({MAX_RETRIES}) exceeded") + "HTTP request failed after all retries", + format!("Exhausted {} attempts to {}", MAX_RETRIES, url) ) } } +// Implementation of ETL's Destination trait - this is what ETL calls to send data impl Destination for HttpDestination { + /// Called when ETL needs to clear all data from a table (e.g., during full refresh) async fn truncate_table(&self, table_id: TableId) -> EtlResult<()> { - info!("HTTP: Truncating table {}", table_id.0); + info!("Truncating destination table: {}", table_id.0); + + // Send DELETE request to truncate endpoint self.send_json( Method::DELETE, - &format!("tables/{}/truncate", table_id.0), - None, - ) - .await + &format!("tables/{}/truncate", table_id.0), // e.g., "tables/users/truncate" + None, // No body needed for truncate + ).await } + /// Called when ETL has a batch of rows to send to the destination + /// This is the main data flow method - gets called frequently during replication async fn write_table_rows( &self, table_id: TableId, table_rows: Vec, ) -> EtlResult<()> { + // Skip empty batches - no work to do if table_rows.is_empty() { + info!("Skipping empty batch for table {}", 
table_id.0); return Ok(()); } info!( - "HTTP: Writing {} rows for table {}", + "Sending {} rows to destination table {}", table_rows.len(), table_id.0 ); - // Simple serialization β€” stringify values for demo-compat. + // Convert ETL's internal row format to JSON that our API expects + // In a real implementation, you'd format this according to your destination's schema let rows_json: Vec<_> = table_rows .iter() .map(|row| { json!({ - "values": row.values.iter().map(|v| format!("{:?}", v)).collect::>() + "values": row.values.iter() + .map(|v| format!("{:?}", v)) // Simple string conversion for demo + .collect::>() }) }) .collect(); + // Create the JSON payload our API expects let payload = json!({ "table_id": table_id.0, "rows": rows_json }); + // Send POST request with the row data self.send_json( Method::POST, - &format!("tables/{}/rows", table_id.0), + &format!("tables/{}/rows", table_id.0), // e.g., "tables/users/rows" Some(&payload), - ) - .await + ).await } + /// Called when ETL has replication events to send (e.g., transaction markers) + /// These are metadata events about the replication process itself async fn write_events(&self, events: Vec) -> EtlResult<()> { + // Skip if no events to process if events.is_empty() { return Ok(()); } - info!("HTTP: Writing {} events", events.len()); + info!("Sending {} replication events to destination", events.len()); + // Convert events to JSON format let events_json: Vec<_> = events .iter() .map(|event| { json!({ - "event_type": format!("{:?}", event), + "event_type": format!("{:?}", event), // Convert event to string for demo }) }) .collect(); let payload = json!({ "events": events_json }); + // Send events to generic events endpoint self.send_json(Method::POST, "events", Some(&payload)).await } } ``` -## Step 3: Use Your Custom Components +**Result:** Run `cargo check` again - it should compile successfully with both your store and destination implementations. 
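+
+Before wiring the destination into a pipeline, you can sanity-check the payload shape it sends. The snippet below is a standalone sketch (not part of the tutorial files) that posts the same JSON structure your `write_table_rows` method builds to `https://httpbin.org/post`, which simply echoes the request body back; the table id and row values are made-up sample data. It only uses `reqwest`, `serde_json`, and `tokio`, which are already in your `Cargo.toml`:
+
+```rust
+use serde_json::json;
+
+#[tokio::main]
+async fn main() -> Result<(), reqwest::Error> {
+    // Same shape as the payload assembled in `write_table_rows` above,
+    // with hypothetical values standing in for real table rows.
+    let payload = json!({
+        "table_id": 1,
+        "rows": [ { "values": ["1", "alice"] } ]
+    });
+
+    // httpbin.org echoes the JSON back, so the response shows exactly
+    // what a real destination endpoint would receive.
+    let echo: serde_json::Value = reqwest::Client::new()
+        .post("https://httpbin.org/post")
+        .json(&payload)
+        .send()
+        .await?
+        .json()
+        .await?;
+
+    println!("{}", echo["json"]);
+    Ok(())
+}
+```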
+ +## Step 5: Create Working Pipeline Example -Create `src/main.rs`: +Create `src/main.rs` that demonstrates your custom components in action: ```rust mod custom_store; @@ -480,90 +568,142 @@ use http_destination::HttpDestination; use etl::config::{BatchConfig, PgConnectionConfig, PipelineConfig, TlsConfig}; use etl::pipeline::Pipeline; use tracing::{info, Level}; +use std::time::Duration; #[tokio::main] async fn main() -> Result<(), Box> { - tracing_subscriber::fmt().with_max_level(Level::INFO).init(); + // Initialize logging so we can see what our custom components are doing + tracing_subscriber::fmt() + .with_max_level(Level::INFO) + .init(); - info!("Starting ETL with custom store and destination"); + info!("=== Starting ETL Pipeline with Custom Components ==="); - // Create custom components + // Step 1: Create our custom store + // This will handle both schema storage and replication state tracking + info!("Creating custom dual-layer store (cache + persistent simulation)"); let store = CustomStore::new(); - let destination = HttpDestination::new("https://httpbin.org/post".to_string())?; - // Standard PostgreSQL config + // Step 2: Create our custom HTTP destination + // Using httpbin.org which echoes back what we send - perfect for testing + info!("Creating HTTP destination with retry logic"); + let destination = HttpDestination::new( + "https://httpbin.org/post".to_string() // This endpoint will show us what we sent + )?; + + // Step 3: Configure the PostgreSQL connection + // Update these values to match your local PostgreSQL setup let pipeline_config = PipelineConfig { - id: 1, - publication_name: "my_publication".to_string(), + id: 1, // Unique pipeline identifier + publication_name: "etl_demo_pub".to_string(), // PostgreSQL publication name + + // PostgreSQL connection details - CHANGE THESE to match your setup pg_connection: PgConnectionConfig { host: "localhost".to_string(), port: 5432, - name: "postgres".to_string(), - username: "postgres".to_string(), - password: Some("your_password".to_string().into()), - tls: TlsConfig { enabled: false, trusted_root_certs: String::new() }, + name: "postgres".to_string(), // Database name + username: "postgres".to_string(), // Database user + password: Some("postgres".to_string().into()), // Update with your password + tls: TlsConfig { + enabled: false, // Disable TLS for local development + trusted_root_certs: String::new() + }, + }, + + // Batching configuration - controls how ETL groups data for efficiency + batch: BatchConfig { + max_size: 100, // Send data when we have 100 rows + max_fill_ms: 5000 // Or send data every 5 seconds, whichever comes first }, - batch: BatchConfig { max_size: 100, max_fill_ms: 5000 }, - table_error_retry_delay_ms: 10000, - max_table_sync_workers: 2, + + // Error handling configuration + table_error_retry_delay_ms: 10000, // Wait 10s before retrying failed tables + max_table_sync_workers: 2, // Use 2 workers for parallel table syncing }; - // Create pipeline with custom components + // Step 4: Create the pipeline with our custom components + // This combines your custom store and destination with ETL's core replication logic + info!("Creating ETL pipeline with custom store and HTTP destination"); let mut pipeline = Pipeline::new(pipeline_config, store, destination); + + // Step 5: Start the pipeline + // This will: + // 1. Load any existing state from your custom store + // 2. Connect to PostgreSQL and start listening for changes + // 3. 
Begin replicating data through your custom destination + info!("Starting pipeline - this will connect to PostgreSQL and begin replication"); pipeline.start().await?; - pipeline.wait().await?; + // For demo purposes, let it run for 30 seconds then gracefully shut down + info!("Pipeline running! Watch the logs to see your custom components in action."); + info!("Will run for 30 seconds then shut down gracefully..."); + + tokio::time::sleep(Duration::from_secs(30)).await; + + info!("Shutting down pipeline gracefully..."); + // pipeline.shutdown().await?; // Uncomment if available in your ETL version + + // In production, you'd typically call: + // pipeline.wait().await?; // This blocks forever until manual shutdown + + info!("=== ETL Pipeline Demo Complete ==="); Ok(()) } ``` -Add dependencies to `Cargo.toml`: +**Result:** Running `cargo run` should now start your pipeline and show detailed logs from your custom components. -```toml -[dependencies] -etl = { git = "https://github.com/supabase/etl" } -tokio = { version = "1.0", features = ["full"] } -reqwest = { version = "0.11", features = ["json"] } -serde_json = "1.0" -chrono = { version = "0.4", features = ["serde"] } -tracing = "0.1" -tracing-subscriber = "0.3" -anyhow = "1.0" +## Step 6: Test Your Implementation + +Verify your custom components work correctly: + +```bash +# Check that everything compiles +cargo check +``` + +**Result:** Should see "Finished dev [unoptimized + debuginfo] target(s)" + +```bash +# Run the pipeline (will fail without PostgreSQL setup, but shows component initialization) +cargo run ``` -## Key Patterns You've Learned +**Result:** You should see logs from your custom store being created and HTTP destination being configured. + +## Checkpoint: What You've Built -### Store Architecture -- **Cache-first reads**: Never hit persistent storage for reads -- **Dual-write updates**: Write to persistent then cache atomically -- **Startup loading**: Load persistent data into cache once -- **Thread safety**: Arc/Mutex for concurrent worker access +You now have working custom ETL components: -### Destination Patterns -- **Retry logic**: Exponential backoff for transient failures -- **Error classification**: Retry server errors, fail fast on client errors -- **Data transformation**: Convert ETL types to API-friendly formats -- **Batching awareness**: Handle empty batches gracefully +βœ… **Custom Store**: Implements dual-layer caching with detailed logging +βœ… **HTTP Destination**: Sends data via HTTP with automatic retry logic +βœ… **Complete Pipeline**: Integrates both components with ETL's core engine +βœ… **Proper Error Handling**: Follows ETL's error patterns and logging -## What You've Learned +## Key Patterns You've Mastered -You now understand ETL's extension patterns: +**Store Architecture:** +- Cache-first reads for performance +- Dual-write pattern for data consistency +- Startup loading from persistent storage +- Thread-safe concurrent access with Arc/Mutex -- **Storage separation**: Schema vs state concerns with different access patterns -- **Cache-first architecture**: Fast reads from memory, dual writes for consistency -- **Thread-safe design**: Arc/Mutex patterns for concurrent access -- **Retry patterns**: Exponential backoff with error classification -- **Trait contracts**: What ETL expects from custom implementations +**Destination Patterns:** +- Exponential backoff retry logic +- Smart error classification (retry 5xx, fail 4xx) +- Efficient batching and empty batch handling +- Clean data transformation 
from ETL to API formats ## Next Steps -- **Test your implementations** β†’ [Testing ETL Pipelines](testing-pipelines/) -- **Debug issues** β†’ [Debugging Guide](../how-to/debugging/) -- **Understand architecture** β†’ [ETL Architecture](../explanation/architecture/) -- **See production examples** β†’ [Custom Destinations Guide](../how-to/custom-destinations/) +- **Connect to real PostgreSQL** β†’ [PostgreSQL Setup Guide](../how-to/postgresql-setup/) +- **Add production-ready persistence** β†’ [Production Stores Guide](../how-to/production-stores/) +- **Test your pipeline thoroughly** β†’ [Testing ETL Pipelines](../how-to/testing-pipelines/) +- **Deploy to production** β†’ [Deployment Guide](../how-to/deployment/) ## See Also -- [State management explanation](../explanation/state-management/) - Deep dive on ETL's state handling -- [Architecture overview](../explanation/architecture/) - Understanding component relationships -- [API reference](../reference/) - Complete trait documentation \ No newline at end of file +- [ETL Store Design](../explanation/store-architecture/) - Deep dive on storage patterns +- [Destination Patterns](../explanation/destination-patterns/) - Advanced destination implementation +- [Store Trait Reference](../reference/traits/store/) - Complete API documentation +- [Destination Trait Reference](../reference/traits/destination/) - Complete API documentation \ No newline at end of file diff --git a/mkdocs.yaml b/mkdocs.yaml index be8361d57..3c9d43bce 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -15,13 +15,11 @@ nav: - How-to Guides: - Overview: how-to/index.md - Configure PostgreSQL: how-to/configure-postgres.md - - Debug Replication Issues: how-to/debugging.md - Reference: - Overview: reference/index.md - Explanation: - Overview: explanation/index.md - Architecture: explanation/architecture.md - - Replication Protocol: explanation/replication.md theme: name: "material" From 957c6aab534363824109ba08fcc3f0996174afd3 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 18 Aug 2025 11:32:57 +0200 Subject: [PATCH 5/9] Update --- docs/explanation/architecture.md | 148 ++++------ docs/explanation/index.md | 85 +----- docs/how-to/configure-postgres.md | 349 +++++++---------------- docs/how-to/index.md | 64 +---- docs/index.md | 46 +-- docs/reference/index.md | 10 +- docs/test-mermaid.md | 39 --- docs/tutorials/custom-implementations.md | 27 +- docs/tutorials/first-pipeline.md | 24 +- docs/tutorials/index.md | 32 +-- 10 files changed, 201 insertions(+), 623 deletions(-) delete mode 100644 docs/test-mermaid.md diff --git a/docs/explanation/architecture.md b/docs/explanation/architecture.md index cb465da8a..d5ec59f3c 100644 --- a/docs/explanation/architecture.md +++ b/docs/explanation/architecture.md @@ -1,14 +1,9 @@ ---- -type: explanation -title: ETL Architecture Overview -last_reviewed: 2025-08-14 ---- # ETL Architecture Overview **Understanding how ETL components work together to replicate data from PostgreSQL** -ETL's architecture centers around four core abstractions that work together to provide reliable, high-performance data replication: Pipeline, Destination, SchemaStore, and StateStore. This document explains how these components interact and coordinate data flow from PostgreSQL logical replication to target systems. +ETL's architecture centers around four core abstractions that work together to provide reliable, high-performance data replication: `Pipeline`, `Destination`, `SchemaStore`, and `StateStore`. 
This document explains how these components interact and coordinate data flow from PostgreSQL logical replication to target systems. A diagram of the overall architecture is shown below: @@ -56,19 +51,20 @@ flowchart LR ## Core Abstractions -### Pipeline: The Orchestrator +### Pipeline The Pipeline is ETL's central component that orchestrates all replication activity. It manages worker lifecycles, coordinates data flow, and handles error recovery. **Key responsibilities:** -- Establishes PostgreSQL replication connection -- Spawns and manages worker processes -- Coordinates initial table synchronization with ongoing replication -- Handles shutdown and error scenarios -### Destination: Where Data Goes +- Initializes the state of the pipeline +- Spawns the apply worker and table sync workers pool +- Tracks workers handles to wait for their termination +- Exposes the shutdown mechanism for gracefully terminating the pipeline -The Destination trait defines how replicated data is delivered to target systems: +### Destination + +The `Destination` trait defines how replicated data is delivered to target systems: ```rust pub trait Destination { @@ -84,11 +80,15 @@ pub trait Destination { } ``` -The trait provides three operations: `truncate_table` clears destination tables before bulk loading, `write_table_rows` handles bulk data insertion during initial synchronization, and `write_events` processes streaming replication changes. +The trait provides three operations: + +- `truncate_table`: clears destination tables before bulk loading. +- `write_table_rows`: handles bulk data insertion during initial synchronization. +- `write_events`: processes streaming replication changes. -### SchemaStore: Table Structure Management +### SchemaStore -The SchemaStore trait manages table schema information: +The `SchemaStore` trait manages table schema information: ```rust pub trait SchemaStore { @@ -108,11 +108,11 @@ pub trait SchemaStore { } ``` -The store follows a cache-first pattern: `load_table_schemas` populates an in-memory cache at startup, while `get_*` methods read only from cache for performance. `store_table_schema` implements dual-write to both persistent storage and cache. +The store follows a cache-first pattern: `load_table_schemas` populates an in-memory cache at startup, while `get_table_schemas` methods read only from cache for performance. `store_table_schema` implements dual-write to both persistent storage and cache. -### StateStore: Replication Progress Tracking +### StateStore -The StateStore trait manages replication state and table mappings: +The `StateStore` trait manages replication state and table mappings: ```rust pub trait StateStore { @@ -157,24 +157,28 @@ pub trait StateStore { } ``` -Like SchemaStore, StateStore uses cache-first reads with `load_*` methods for startup population and dual-write patterns for updates. The store tracks both replication progress through `TableReplicationPhase` and source-to-destination table name mappings. +Like `SchemaStore`, `StateStore` uses cache-first reads with `load_*` methods for startup population and dual-write patterns for updates. + +The store tracks both replication progress through `TableReplicationPhase` and source-to-destination table name mappings. ## Data Flow Architecture ### Worker Coordination -ETL's data flow is orchestrated through two types of workers: +ETL's data flow is orchestrated through two types of workers. 
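+
+Before looking at each worker type, the sketch below is an illustrative model only (not ETL's internal code) of the coordination pattern described in this section: a single long-lived apply task plus table sync tasks whose concurrency is bounded by a semaphore, mirroring the `max_table_sync_workers` setting. It assumes only the `tokio` runtime:
+
+```rust
+use std::sync::Arc;
+use tokio::sync::Semaphore;
+
+#[tokio::main]
+async fn main() {
+    // Bound how many "table sync workers" may run at once.
+    let permits = Arc::new(Semaphore::new(2));
+
+    // The "apply worker": one long-lived task that discovers tables and
+    // spawns a bounded sync task for each of them.
+    let apply = tokio::spawn(async move {
+        let mut handles = Vec::new();
+        for table in [1_u32, 2, 3, 4] {
+            let permit = permits.clone().acquire_owned().await.unwrap();
+            handles.push(tokio::spawn(async move {
+                println!("syncing table {table}");
+                // Dropping the permit frees a slot for the next sync worker.
+                drop(permit);
+            }));
+        }
+        for handle in handles {
+            handle.await.unwrap();
+        }
+    });
+
+    apply.await.unwrap();
+}
+```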
+ +#### Apply Worker -**Apply Worker** - The primary replication processor: - Processes PostgreSQL logical replication stream -- Spawns table sync workers as needed +- Spawns table sync workers when new table are discovered - Coordinates with table sync workers through shared state - Handles final event processing for tables in `Ready` state -**Table Sync Workers** - Initial data synchronization: +#### Table Sync Workers + - Perform bulk copying of existing table data - Coordinate handoff to apply worker when synchronization completes -- Multiple workers run in parallel, limited by configured semaphore +- Multiple table sync workers run in parallel, limited by configured semaphore to bound number of connections ### Worker Startup Sequence @@ -185,8 +189,10 @@ The Pipeline follows this startup sequence: 3. **Table Discovery**: Apply worker identifies tables requiring synchronization 4. **Table Sync Spawning**: Apply worker spawns table sync workers for tables in `Init` state 5. **Coordination**: Workers communicate through shared state store +6. **Streaming**: Apply worker starts streaming replication events of table in `Ready` state and at every commit point + checks for new tables to synchronize -The apply worker always starts first because it coordinates the overall replication process and spawns table sync workers on demand. +_The apply worker always starts first because it coordinates the overall replication process and spawns table sync workers on demand._ ### Table Replication Phases @@ -207,80 +213,26 @@ pub enum TableReplicationPhase { **Phase Ownership and Transitions:** -- **Init**: Set by pipeline when table first discovered -- **DataSync**: Table sync worker begins bulk data copying -- **FinishedCopy**: Table sync worker completes bulk copy, begins catching up with replication stream -- **SyncWait**: Table sync worker requests apply worker to pause (memory-only, not persisted) -- **Catchup**: Apply worker pauses and signals LSN position for table sync worker (memory-only) -- **SyncDone**: Table sync worker catches up to apply worker's LSN and signals completion -- **Ready**: Apply worker takes over all processing for this table -- **Errored**: Either worker encounters unrecoverable error - -### Synchronization Handoff - -The critical coordination happens during the transition from table sync worker to apply worker control: - -1. **Table sync worker** completes bulk copy (`FinishedCopy`) -2. **Table sync worker** processes replication events to catch up -3. **Table sync worker** sets state to `SyncWait` (signals apply worker to pause) -4. **Apply worker** detects `SyncWait` at transaction boundary and pauses -5. **Apply worker** sets state to `Catchup` with current LSN position -6. **Table sync worker** processes events up to the `Catchup` LSN -7. **Table sync worker** sets state to `SyncDone` with final LSN and terminates -8. **Apply worker** detects `SyncDone` and transitions table to `Ready` -9. **Apply worker** resumes processing and handles all future events for the table - -This coordination ensures no events are lost during the handoff and that the table reaches a consistent state. - -### Event Processing Flow - -**Initial Synchronization (Table Sync Worker):** -1. Truncate destination table using `Destination::truncate_table` -2. Copy existing data in batches using `Destination::write_table_rows` -3. Process replication stream events using `Destination::write_events` -4. Coordinate handoff to apply worker - -**Continuous Replication (Apply Worker):** -1. 
Read events from PostgreSQL logical replication stream -2. Filter events for tables in `Ready` state -3. Batch events for efficiency -4. Send batches to destination using `Destination::write_events` -5. Acknowledge progress to PostgreSQL - -### Concurrency and Synchronization - -ETL uses several concurrency primitives to coordinate workers: - -- **Semaphore**: Limits number of concurrent table sync workers -- **Shutdown channels**: Broadcast shutdown signals to all workers -- **Shared state**: StateStore provides atomic state transitions -- **Message passing**: Workers coordinate through state changes rather than direct communication - -The apply worker holds the semaphore permits and distributes them to table sync workers, ensuring resource bounds while allowing parallel initial synchronization. - -## Design Rationale - -### Cache-First Storage Pattern - -Both SchemaStore and StateStore separate loading from reading. This pattern provides: -- **Performance**: Fast cache-only reads during high-frequency operations -- **Consistency**: Dual-write ensures cache and persistent storage stay synchronized -- **Startup efficiency**: Bulk loading minimizes startup time +- **Init**: The table is discovered and ready to be copied +- **DataSync**: The table copy has started and is in progress +- **FinishedCopy**: The table has been fully copied and is ready to start CDC streaming +- **SyncWait**: The table is ready to start CDC streaming and is waiting for the apply worker to tell which LSN to catchup +- **Catchup**: The table is catching up to the the LSN specified by the apply worker +- **SyncDone**: The table has caught up to the LSN specified by the apply worker +- **Ready**: The table is now copied and caught up with the apply worker, now all events are processed by the apply worker for this table +- **Errored**: The table has encountered an error and is excluded from replication until a rollback is performed -### Worker Separation +## Next Steps -Separating apply workers from table sync workers enables: -- **Parallelism**: Multiple tables can synchronize concurrently -- **Resource control**: Semaphore prevents resource exhaustion -- **Clear handoff**: Explicit phase transitions ensure data consistency -- **Error isolation**: Table-level failures don't affect other tables +Now that you understand ETL's architecture: -### State-Driven Coordination +- **Build your first pipeline** β†’ [First Pipeline Tutorial](../tutorials/first-pipeline.md) +- **Implement custom components** β†’ [Custom Stores and Destinations](../tutorials/custom-implementations.md) +- **Configure PostgreSQL properly** β†’ [Configure PostgreSQL for Replication](../how-to/configure-postgres.md) -Using shared state for worker coordination provides: -- **Persistence**: State survives worker failures and restarts -- **Observability**: External systems can monitor replication progress -- **Recovery**: Workers can resume from last known state -- **Simplicity**: No complex message passing between workers +## See Also -The architecture prioritizes data consistency and operational simplicity over raw throughput, ensuring reliable replication with clear error handling and recovery patterns. 
\ No newline at end of file +- [Build Your First ETL Pipeline](../tutorials/first-pipeline.md) - Hands-on tutorial using these components +- [Custom Stores and Destinations](../tutorials/custom-implementations.md) - Implement your own stores and destinations +- [API Reference](../reference/index.md) - Complete trait documentation +- [Configure PostgreSQL for Replication](../how-to/configure-postgres.md) - Set up the source database \ No newline at end of file diff --git a/docs/explanation/index.md b/docs/explanation/index.md index 92d766c35..705e6bcb7 100644 --- a/docs/explanation/index.md +++ b/docs/explanation/index.md @@ -1,7 +1,3 @@ ---- -type: explanation -title: Understanding ETL ---- # Explanations @@ -11,91 +7,28 @@ Explanations help you build mental models of how ETL works and why it's designed ## Core Concepts -### [ETL Architecture Overview](architecture/) +### [ETL Architecture Overview](architecture.md) **The big picture of how ETL components work together** Understand the relationship between pipelines, destinations, stores, and the PostgreSQL replication protocol. Learn how data flows through the system and where extension points exist. *Topics covered:* Component architecture, data flow, extension patterns, scalability considerations. -### [Why Postgres Logical Replication?](replication/) -**The foundation technology and its trade-offs** - -Explore how PostgreSQL's logical replication works, why ETL builds on this foundation, and how it compares to other change data capture approaches. - -*Topics covered:* WAL-based replication, publications and subscriptions, alternatives like triggers or polling, performance characteristics. - -### [Design Decisions and Trade-offs](design/) -**Key choices that shape ETL's behavior** - -Learn about the major design decisions in ETL, the problems they solve, and the trade-offs they represent. Understanding these choices helps you use ETL effectively. - -*Topics covered:* Rust as implementation language, async architecture, batching strategy, error handling philosophy. - -## System Characteristics - -### [Performance and Scalability](performance/) -**How ETL behaves under different loads and configurations** - -Understand ETL's performance characteristics, bottlenecks, and scaling patterns. Learn how different configuration choices affect throughput and resource usage. - -*Topics covered:* Throughput patterns, memory usage, network considerations, scaling strategies. - -### [Crate Structure and Organization](crate-structure/) -**How ETL's modular design supports different use cases** - -Explore how ETL is organized into multiple crates, what each crate provides, and how they work together. Understand the reasoning behind this modular architecture. - -*Topics covered:* Core vs. optional crates, dependency management, feature flags, extensibility. - -## Integration Patterns - -### [Working with Destinations](destinations-explained/) -**Understanding the destination abstraction and ecosystem** - -Learn how destinations work conceptually, why they're designed as they are, and how to choose between different destination options. - -*Topics covered:* Destination trait design, batching strategy, error handling patterns, building ecosystems. - -### [State Management Philosophy](state-management/) -**How ETL tracks replication state and schema changes** - -Understand ETL's approach to managing replication state, handling schema evolution, and ensuring consistency across restarts. 
- -*Topics covered:* State storage options, schema change handling, consistency guarantees, recovery behavior. - -## Broader Context - -### [ETL vs. Other Replication Tools](comparisons/) -**How ETL fits in the data replication landscape** - -Compare ETL to other PostgreSQL replication tools, general-purpose ETL systems, and cloud-managed solutions. Understand when to choose each approach. - -*Topics covered:* Tool comparisons, use case fit, ecosystem integration, operational trade-offs. - -### [Future Directions](roadmap/) -**Where ETL is heading and how to influence its evolution** - -Learn about planned features, architectural improvements, and community priorities. Understand how to contribute to ETL's development. - -*Topics covered:* Planned features, architectural evolution, community involvement, contribution guidelines. - ## Reading Guide -**New to data replication?** Start with [Postgres Logical Replication](replication/) to understand the foundation technology. - -**Coming from other tools?** Jump to [ETL vs. Other Tools](comparisons/) to see how ETL fits in the landscape. +**New to ETL?** Start with the [ETL Architecture](architecture.md) to understand how the system works. -**Planning a production deployment?** Read [Architecture](architecture/) and [Performance](performance/) to understand system behavior. +**Planning a production deployment?** Read [Architecture](architecture.md) to understand system behavior. -**Building extensions?** Focus on [Crate Structure](crate-structure/) and [Destinations](destinations-explained/) for extension patterns. +**Building extensions?** Check out the [Custom Implementations Tutorial](../tutorials/custom-implementations.md). ## Next Steps -After building conceptual understanding: -- **Start building** β†’ [Tutorials](../tutorials/) -- **Solve specific problems** β†’ [How-To Guides](../how-to/) -- **Look up technical details** β†’ [Reference](../reference/) +After building a conceptual understanding: + +- **Start building** β†’ [Tutorials](../tutorials/index.md) +- **Solve specific problems** β†’ [How-To Guides](../how-to/index.md) +- **Look up technical details** β†’ [Reference](../reference/index.md) ## Contributing to Explanations diff --git a/docs/how-to/configure-postgres.md b/docs/how-to/configure-postgres.md index 0e42e743e..3d0d0c263 100644 --- a/docs/how-to/configure-postgres.md +++ b/docs/how-to/configure-postgres.md @@ -1,326 +1,177 @@ ---- -type: how-to -audience: developers, database administrators -prerequisites: - - PostgreSQL server access with superuser privileges - - Understanding of PostgreSQL configuration - - Knowledge of PostgreSQL user management -version_last_tested: 0.1.0 -last_reviewed: 2025-01-14 -risk_level: medium ---- - # Configure PostgreSQL for Replication **Set up PostgreSQL with the correct permissions and settings for ETL logical replication** -This guide walks you through configuring PostgreSQL to support logical replication for ETL, including WAL settings, user permissions, and publication setup. - -## Goal - -Configure PostgreSQL to: - -- Enable logical replication at the server level -- Create appropriate user accounts with minimal required permissions -- Set up publications for the tables you want to replicate -- Configure replication slots for reliable WAL consumption +This guide covers the essential PostgreSQL concepts and configuration needed for logical replication with ETL. 
## Prerequisites -- PostgreSQL 12 or later +- PostgreSQL 10 or later - Superuser access to the PostgreSQL server - Ability to restart PostgreSQL server (for configuration changes) -- Network connectivity from ETL to PostgreSQL - -## Decision Points - -**Choose your approach based on your environment:** -| Environment | Security Level | Recommended Setup | -|-------------|----------------|-------------------| -| **Development** | Low | Single superuser account | -| **Staging** | Medium | Dedicated replication user with specific permissions | -| **Production** | High | Least-privilege user with row-level security | +## Understanding WAL Logical -## Configuration Steps +PostgreSQL's Write-Ahead Log (WAL) is the foundation of logical replication. When `wal_level = logical`, PostgreSQL: -### Step 1: Enable Logical Replication - -Edit your PostgreSQL configuration file (usually `postgresql.conf`): +- Records detailed information about data changes (not just physical changes) +- Includes enough metadata to reconstruct logical changes +- Allows external tools to decode and stream these changes ```ini -# Enable logical replication +# Enable logical replication in postgresql.conf wal_level = logical - -# Increase max replication slots (default is 10) -max_replication_slots = 20 - -# Increase max WAL senders (default is 10) -max_wal_senders = 20 - -# Optional: Increase checkpoint segments for better performance -checkpoint_segments = 32 -checkpoint_completion_target = 0.9 ``` -**If using PostgreSQL 13+**, also consider: - -```ini -# Enable publication of truncate operations (optional) -wal_sender_timeout = 60s - -# Improve WAL retention for catching up -wal_keep_size = 1GB -``` - -**Restart PostgreSQL** to apply these settings: - +**Restart PostgreSQL** after changing this setting: ```bash -# On systemd systems sudo systemctl restart postgresql - -# On other systems -sudo pg_ctl restart -D /path/to/data/directory ``` -### Step 2: Create a Replication User +## Replication Slots -Create a dedicated user with appropriate permissions: +Replication slots ensure that PostgreSQL retains WAL data for logical replication consumers, even if they disconnect temporarily. -```sql --- Create replication user -CREATE USER etl_replicator WITH PASSWORD 'secure_password_here'; - --- Grant replication privileges -ALTER USER etl_replicator REPLICATION; - --- Grant connection privileges -GRANT CONNECT ON DATABASE your_database TO etl_replicator; +### What are Replication Slots? 
--- Grant schema usage (adjust schema names as needed) -GRANT USAGE ON SCHEMA public TO etl_replicator; - --- Grant select on specific tables (more secure than all tables) -GRANT SELECT ON TABLE users, orders, products TO etl_replicator; - --- Alternative: Grant select on all tables in schema (less secure but easier) --- GRANT SELECT ON ALL TABLES IN SCHEMA public TO etl_replicator; --- ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT SELECT ON TABLES TO etl_replicator; -``` +- **Persistent markers** in PostgreSQL that track replication progress +- **Prevent WAL cleanup** until the consumer catches up +- **Guarantee data consistency** across disconnections -### Step 3: Configure Connection Security - -**For development (less secure):** - -Edit `pg_hba.conf` to allow connections: - -``` -# Allow local connections with password -host your_database etl_replicator localhost md5 - -# Allow connections from specific IP range -host your_database etl_replicator 10.0.0.0/8 md5 -``` - -**For production (more secure):** - -Use SSL/TLS connections: - -``` -# Require SSL connections -hostssl your_database etl_replicator 10.0.0.0/8 md5 -``` - -Reload PostgreSQL configuration: +### Creating Replication Slots ```sql -SELECT pg_reload_conf(); +-- Create a logical replication slot +SELECT pg_create_logical_replication_slot('my_slot', 'pgoutput'); ``` -### Step 4: Create Publications - -Connect as a superuser or table owner and create publications: +### Viewing Replication Slots ```sql --- Create publication for specific tables -CREATE PUBLICATION etl_publication FOR TABLE users, orders, products; - --- Alternative: Create publication for all tables (use with caution) --- CREATE PUBLICATION etl_publication FOR ALL TABLES; - --- View existing publications -SELECT * FROM pg_publication; - --- View tables in a publication -SELECT * FROM pg_publication_tables WHERE pubname = 'etl_publication'; +-- See all replication slots +SELECT slot_name, slot_type, active, restart_lsn +FROM pg_replication_slots; ``` -### Step 5: Test the Configuration - -Verify your setup works: +### Deleting Replication Slots ```sql --- Test replication slot creation (as etl_replicator user) -SELECT pg_create_logical_replication_slot('test_slot', 'pgoutput'); - --- Verify the slot was created -SELECT * FROM pg_replication_slots WHERE slot_name = 'test_slot'; - --- Clean up test slot -SELECT pg_drop_replication_slot('test_slot'); +-- Drop a replication slot when no longer needed +SELECT pg_drop_replication_slot('my_slot'); ``` -### Step 6: Configure ETL Connection - -Update your ETL configuration to use the new setup: +**Warning:** Only delete slots when you're sure they're not in use. Deleting an active slot can cause data loss. -```rust -use etl::config::{PgConnectionConfig, TlsConfig}; - -let pg_config = PgConnectionConfig { - host: "your-postgres-server.com".to_string(), - port: 5432, - name: "your_database".to_string(), - username: "etl_replicator".to_string(), - password: Some("secure_password_here".into()), - tls: TlsConfig { - enabled: true, // Enable for production - trusted_root_certs: "/path/to/ca-certificates.crt".to_string(), - }, -}; -``` +## Max Replication Slots -## Validation +Controls how many replication slots PostgreSQL can maintain simultaneously. 
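+You can check how much headroom remains with a quick query against the standard catalog views. This is a minimal sketch that assumes nothing beyond a default PostgreSQL installation:
+
+```sql
+-- Compare the slots currently defined against the configured maximum
+SELECT
+  (SELECT count(*) FROM pg_replication_slots) AS slots_in_use,
+  current_setting('max_replication_slots') AS max_replication_slots;
+```
+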
-Verify your configuration: - -### Test 1: Connection Test - -```bash -# Test connection from ETL server -psql -h your-postgres-server.com -p 5432 -U etl_replicator -d your_database -c "SELECT 1;" +```ini +# Increase max replication slots (default is 10) +max_replication_slots = 20 ``` -### Test 2: Replication Permissions +ETL uses a **single replication slot** for its main apply worker. However, additional slots may be created for parallel table +copies when the pipeline is initialized or when a new table is added to the publication. The `max_table_sync_workers` parameter +controls the number of these parallel copies, ensuring that the total replication slots used by ETL never exceed `max_table_sync_workers + 1`. -```sql --- As etl_replicator user, verify you can: --- 1. Create replication slots -SELECT pg_create_logical_replication_slot('validation_slot', 'pgoutput'); +**When to increase:** --- 2. Read from tables in the publication -SELECT COUNT(*) FROM users; +- Running multiple ETL pipelines +- Development/testing with frequent slot creation --- 3. Access publication information -SELECT * FROM pg_publication_tables WHERE pubname = 'etl_publication'; +## WAL Keep Size --- Clean up -SELECT pg_drop_replication_slot('validation_slot'); -``` +Determines how much WAL data to retain on disk, providing a safety buffer for replication consumers. -### Test 3: ETL Pipeline Test - -Run a simple ETL pipeline to verify end-to-end functionality: +```ini +# Keep 1GB of WAL data (PostgreSQL 13+) +wal_keep_size = 1GB -```rust -// Use your configuration to create a test pipeline -// This should complete initial sync successfully +# For PostgreSQL 12 and earlier, use: +# wal_keep_segments = 256 # Each segment is typically 16MB ``` -## Troubleshooting - -### "ERROR: logical decoding requires wal_level >= logical" - -**Solution:** Update `postgresql.conf` with `wal_level = logical` and restart PostgreSQL. +**Purpose:** -### "ERROR: permission denied to create replication slot" +- Prevents WAL deletion when replication consumers fall behind +- Provides recovery time if ETL pipelines temporarily disconnect +- Balances disk usage with replication reliability -**Solutions:** -- Ensure user has `REPLICATION` privilege: `ALTER USER etl_replicator REPLICATION;` -- Check if you're connecting to the right database -- Verify `pg_hba.conf` allows the connection +## Publications -### "ERROR: publication does not exist" +Publications define which tables and operations to replicate. 
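+Each publication also records which operations it publishes as boolean flags on the `pg_publication` catalog, which is useful for double-checking a publication after you create it in the sections below. A minimal sketch, assuming a publication named `my_publication` already exists:
+
+```sql
+-- Check which operations a publication replicates
+-- (pubtruncate requires PostgreSQL 11 or later)
+SELECT pubname, puballtables, pubinsert, pubupdate, pubdelete, pubtruncate
+FROM pg_publication
+WHERE pubname = 'my_publication';
+```
+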
-**Solutions:** -- Verify publication name matches exactly: `SELECT * FROM pg_publication;` -- Ensure you're connected to the correct database -- Check if publication was created by another user +### Creating Publications -### "Connection refused" or timeout issues - -**Solutions:** -- Check `postgresql.conf` has `listen_addresses = '*'` (or specific IPs) -- Verify `pg_hba.conf` allows your connection -- Check firewall settings on PostgreSQL server -- Confirm PostgreSQL is running: `sudo systemctl status postgresql` - -### "ERROR: too many replication slots" +```sql +-- Create publication for specific tables +CREATE PUBLICATION my_publication FOR TABLE users, orders; -**Solutions:** -- Increase `max_replication_slots` in `postgresql.conf` -- Clean up unused replication slots: `SELECT pg_drop_replication_slot('unused_slot');` -- Monitor slot usage: `SELECT * FROM pg_replication_slots;` +-- Create publication for all tables (use with caution) +CREATE PUBLICATION all_tables FOR ALL TABLES; -## Security Best Practices +-- Include only specific operations +CREATE PUBLICATION inserts_only FOR TABLE users WITH (publish = 'insert'); +``` -### Principle of Least Privilege +### Managing Publications -- **Don't use superuser accounts** for ETL in production -- **Grant SELECT only on tables** that need replication -- **Use specific database names** instead of template1 or postgres -- **Limit connection sources** with specific IP ranges in pg_hba.conf +```sql +-- View existing publications +SELECT * FROM pg_publication; -### Network Security +-- See which tables are in a publication +SELECT * FROM pg_publication_tables WHERE pubname = 'my_publication'; -- **Always use SSL/TLS** in production: `hostssl` in pg_hba.conf -- **Use certificate authentication** for highest security -- **Restrict network access** with firewalls and VPCs -- **Monitor connections** with log_connections = on +-- Add tables to existing publication +ALTER PUBLICATION my_publication ADD TABLE products; -### Operational Security +-- Remove tables from publication +ALTER PUBLICATION my_publication DROP TABLE products; -- **Rotate passwords regularly** for replication users -- **Monitor replication slots** for unused or stalled slots -- **Set up alerting** for replication lag and failures -- **Audit publication changes** in your change management process +-- Drop publication +DROP PUBLICATION my_publication; +``` -## Performance Considerations +## Complete Configuration Example -### WAL Configuration +Here's a minimal `postgresql.conf` setup: ```ini -# For high-throughput systems -wal_buffers = 16MB -checkpoint_completion_target = 0.9 -wal_writer_delay = 200ms -commit_delay = 1000 -``` +# Enable logical replication +wal_level = logical -### Monitoring Queries +# Increase replication capacity +max_replication_slots = 20 +max_wal_senders = 20 -Track replication performance: +# Keep WAL data for safety +wal_keep_size = 1GB # PostgreSQL 13+ +# wal_keep_segments = 64 # PostgreSQL 12 and earlier +``` -```sql --- Monitor replication lag -SELECT - slot_name, - pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) as lag -FROM pg_replication_slots; +After editing the configuration: --- Monitor WAL generation rate -SELECT pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), '0/0')) as total_wal; -``` +1. **Restart PostgreSQL** +2. **Create your publication**: + ```sql + CREATE PUBLICATION etl_publication FOR TABLE your_table; + ``` +3. 
**Verify the setup**: + ```sql + SHOW wal_level; + SHOW max_replication_slots; + SELECT * FROM pg_publication WHERE pubname = 'etl_publication'; + ``` ## Next Steps -- **Build your first pipeline** β†’ [First ETL Pipeline](../tutorials/first-pipeline/) -- **Handle schema changes** β†’ [Schema Change Management](schema-changes/) -- **Optimize performance** β†’ [Performance Tuning](performance/) -- **Set up monitoring** β†’ [Debugging Guide](debugging/) +- **Build your first pipeline** β†’ [Build Your First ETL Pipeline](../tutorials/first-pipeline.md) +- **Build custom implementations** β†’ [Custom Stores and Destinations](../tutorials/custom-implementations.md) ## See Also -- [PostgreSQL Logical Replication Documentation](https://www.postgresql.org/docs/current/logical-replication.html) -- [ETL Architecture](../explanation/architecture/) - Understanding how ETL uses these settings -- [Connection Configuration Reference](../reference/pg-connection-config/) - All available connection options \ No newline at end of file +- [Build Your First ETL Pipeline](../tutorials/first-pipeline.md) - Hands-on tutorial using these settings +- [ETL Architecture](../explanation/architecture.md) - Understanding how ETL uses these settings +- [API Reference](../reference/index.md) - All available connection options \ No newline at end of file diff --git a/docs/how-to/index.md b/docs/how-to/index.md index 9bf8101a0..17e31e7d6 100644 --- a/docs/how-to/index.md +++ b/docs/how-to/index.md @@ -1,7 +1,3 @@ ---- -type: how-to -title: How-To Guides ---- # How-To Guides @@ -11,68 +7,22 @@ How-to guides provide step-by-step instructions for accomplishing specific goals ## Database Configuration -### [Configure PostgreSQL for Replication](configure-postgres/) -Set up PostgreSQL with the correct permissions, settings, and publications for ETL pipelines. +### [Configure PostgreSQL for Replication](configure-postgres.md) +Set up PostgreSQL with the correct settings, and publications for ETL pipelines. **When to use:** Setting up a new PostgreSQL source for replication. -## Destinations and Output - -### [Build Custom Destinations](custom-destinations/) -Create your own destination implementations for specific data warehouses or storage systems. - -**When to use:** ETL doesn't support your target system out of the box. - -### [Handle Schema Changes](schema-changes/) -Manage table schema changes without breaking your replication pipeline. - -**When to use:** Your source database schema evolves over time. - -## Operations and Monitoring - -### [Debug Pipeline Issues](debugging/) -Diagnose and resolve common pipeline problems like connection failures, data inconsistencies, and performance bottlenecks. - -**When to use:** Your pipeline isn't working as expected. - -### [Optimize Performance](performance/) -Tune your ETL pipeline for maximum throughput and minimal resource usage. - -**When to use:** Your pipeline is working but needs to handle more data or run faster. - -### [Test ETL Pipelines](testing/) -Build comprehensive test suites for your ETL applications using mocks and test utilities. - -**When to use:** Ensuring reliability before deploying to production. - -## Before You Start - -**Prerequisites:** -- Complete the [first pipeline tutorial](../tutorials/first-pipeline/) -- Have a working ETL development environment -- Understanding of your specific use case requirements - -## Guide Structure - -Each how-to guide follows this pattern: - -1. **Goal statement** - What you'll accomplish -2. 
**Prerequisites** - Required setup and knowledge -3. **Decision points** - Key choices that affect the approach -4. **Step-by-step procedure** - Actions to take -5. **Validation** - How to verify success -6. **Troubleshooting** - Common issues and solutions - ## Next Steps After solving your immediate problem: -- **Learn more concepts** β†’ [Explanations](../explanation/) -- **Look up technical details** β†’ [Reference](../reference/) -- **Build foundational knowledge** β†’ [Tutorials](../tutorials/) + +- **Learn more concepts** β†’ [Explanations](../explanation/index.md) +- **Look up technical details** β†’ [Reference](../reference/index.md) +- **Build foundational knowledge** β†’ [Tutorials](../tutorials/index.md) ## Need Help? If these guides don't cover your specific situation: -1. Check if it's addressed in [Debugging](debugging/) +1. Check the [PostgreSQL configuration guide](configure-postgres.md) 2. Search existing [GitHub issues](https://github.com/supabase/etl/issues) 3. [Open a new issue](https://github.com/supabase/etl/issues/new) with details about your use case \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 01bf0a7fa..d89ff6993 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,8 +1,3 @@ ---- -hide: - - navigation -title: ETL Documentation ---- # ETL Documentation @@ -15,35 +10,22 @@ ETL is a Rust framework by [Supabase](https://supabase.com) that enables you to Choose your path based on your needs: ### New to ETL? -Start with our **[Tutorials](tutorials/)** to learn ETL through hands-on examples: +Start with our **[Tutorials](tutorials/index.md)** to learn ETL through hands-on examples: -- [Build your first ETL pipeline](tutorials/first-pipeline/) - Complete beginner's guide (15 minutes) -- [Set up memory-based testing](tutorials/memory-destination/) - Test your pipeline locally (10 minutes) -- [Testing ETL pipelines](tutorials/testing-pipelines/) - Ensure reliability (20 minutes) +- [Build your first ETL pipeline](tutorials/first-pipeline.md) - Complete beginner's guide (15 minutes) +- [Build custom stores and destinations](tutorials/custom-implementations.md) - Advanced patterns (30 minutes) ### Ready to solve specific problems? -Jump to our **[How-To Guides](how-to/)** for practical solutions: +Jump to our **[How-To Guides](how-to/index.md)** for practical solutions: -- [Configure PostgreSQL for replication](how-to/configure-postgres/) -- [Build custom destinations](how-to/custom-destinations/) -- [Debug pipeline issues](how-to/debugging/) -- [Handle schema changes](how-to/schema-changes/) -- [Optimize performance](how-to/performance/) - -### Need detailed technical information? -Consult our **[Reference](reference/)** documentation: - -- API reference -- Configuration options -- Error codes and messages +- [Configure PostgreSQL for replication](how-to/configure-postgres.md) +- More guides coming soon ### Want to understand the bigger picture? 
-Read our **[Explanations](explanation/)** for deeper insights: +Read our **[Explanations](explanation/index.md)** for deeper insights: -- [ETL architecture overview](explanation/architecture/) -- [Why Postgres logical replication?](explanation/replication/) -- [Performance characteristics](explanation/performance/) -- [Design decisions](explanation/design/) +- [ETL architecture overview](explanation/architecture.md) +- More explanations coming soon ## Core Concepts @@ -92,7 +74,7 @@ async fn main() -> Result<(), Box> { }; // Create and start the pipeline - let mut pipeline = Pipeline::new(1, config, store, destination); + let mut pipeline = Pipeline::new(config, store, destination); pipeline.start().await?; // Pipeline will run until stopped @@ -104,7 +86,7 @@ async fn main() -> Result<(), Box> { ## Next Steps -- **First time using ETL?** β†’ Start with [Build your first pipeline](tutorials/first-pipeline/) -- **Have a specific goal?** β†’ Browse [How-To Guides](how-to/) -- **Need technical details?** β†’ Check the [Reference](reference/) -- **Want to understand ETL deeply?** β†’ Read [Explanations](explanation/) +- **First time using ETL?** β†’ Start with [Build your first pipeline](tutorials/first-pipeline.md) +- **Need PostgreSQL setup help?** β†’ Check [Configure PostgreSQL for Replication](how-to/configure-postgres.md) +- **Need technical details?** β†’ Check the [Reference](reference/index.md) +- **Want to understand the architecture?** β†’ Read [ETL Architecture](explanation/architecture.md) diff --git a/docs/reference/index.md b/docs/reference/index.md index 8340678a2..df2c3c1b4 100644 --- a/docs/reference/index.md +++ b/docs/reference/index.md @@ -1,7 +1,3 @@ ---- -type: reference -title: API Reference ---- # Reference @@ -13,6 +9,6 @@ cargo doc --workspace --all-features --no-deps --open ## See Also -- [How-to guides](../how-to/) - Task-oriented instructions -- [Tutorials](../tutorials/) - Learning-oriented lessons -- [Explanations](../explanation/) - Understanding-oriented discussions \ No newline at end of file +- [How-to guides](../how-to/index.md) - Task-oriented instructions +- [Tutorials](../tutorials/index.md) - Learning-oriented lessons +- [Explanations](../explanation/index.md) - Understanding-oriented discussions \ No newline at end of file diff --git a/docs/test-mermaid.md b/docs/test-mermaid.md deleted file mode 100644 index 4d644b200..000000000 --- a/docs/test-mermaid.md +++ /dev/null @@ -1,39 +0,0 @@ -# Mermaid Test - -This page tests Mermaid diagram rendering in MkDocs. - -## Simple Flowchart - -```mermaid -flowchart TD - A[Start] --> B{Is it?} - B -->|Yes| C[OK] - C --> D[Rethink] - D --> B - B ---->|No| E[End] -``` - -## Sequence Diagram - -```mermaid -sequenceDiagram - participant Alice - participant Bob - Alice->>John: Hello John, how are you? - loop Healthcheck - John->>John: Fight against hypochondria - end - Note right of John: Rational thoughts
prevail! - John-->>Alice: Great! - John->>Bob: How about you? - Bob-->>John: Jolly good! -``` - -## Database Schema Example - -```mermaid -erDiagram - CUSTOMER ||--o{ ORDER : places - ORDER ||--|{ LINE-ITEM : contains - CUSTOMER }|..|{ DELIVERY-ADDRESS : uses -``` \ No newline at end of file diff --git a/docs/tutorials/custom-implementations.md b/docs/tutorials/custom-implementations.md index 5129c9aac..60902055b 100644 --- a/docs/tutorials/custom-implementations.md +++ b/docs/tutorials/custom-implementations.md @@ -1,16 +1,3 @@ ---- -type: tutorial -audience: developers -prerequisites: - - Complete first pipeline tutorial - - Advanced Rust knowledge (traits, async, Arc/Mutex) - - PostgreSQL database running locally - - curl command for testing HTTP endpoints -version_last_tested: 0.1.0 -last_reviewed: 2025-08-14 -estimated_time: 30 -risk_level: medium ---- # Build Custom Stores and Destinations in 30 minutes @@ -696,14 +683,12 @@ You now have working custom ETL components: ## Next Steps -- **Connect to real PostgreSQL** β†’ [PostgreSQL Setup Guide](../how-to/postgresql-setup/) -- **Add production-ready persistence** β†’ [Production Stores Guide](../how-to/production-stores/) -- **Test your pipeline thoroughly** β†’ [Testing ETL Pipelines](../how-to/testing-pipelines/) -- **Deploy to production** β†’ [Deployment Guide](../how-to/deployment/) +- **Connect to real PostgreSQL** β†’ [Configure PostgreSQL for Replication](../how-to/configure-postgres.md) +- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture.md) +- **Contribute to ETL** β†’ [Open an issue](https://github.com/supabase/etl/issues) with your custom implementations ## See Also -- [ETL Store Design](../explanation/store-architecture/) - Deep dive on storage patterns -- [Destination Patterns](../explanation/destination-patterns/) - Advanced destination implementation -- [Store Trait Reference](../reference/traits/store/) - Complete API documentation -- [Destination Trait Reference](../reference/traits/destination/) - Complete API documentation \ No newline at end of file +- [ETL Architecture](../explanation/architecture.md) - Understanding the system design +- [API Reference](../reference/index.md) - Complete trait documentation +- [Build your first pipeline](first-pipeline.md) - Start with the basics if you haven't yet \ No newline at end of file diff --git a/docs/tutorials/first-pipeline.md b/docs/tutorials/first-pipeline.md index c5e31f8fa..0a902b08d 100644 --- a/docs/tutorials/first-pipeline.md +++ b/docs/tutorials/first-pipeline.md @@ -1,14 +1,3 @@ ---- -type: tutorial -audience: developers -prerequisites: - - Rust 1.75 or later - - PostgreSQL server (local or remote) - - Basic Rust and SQL knowledge -version_last_tested: 0.1.0 -last_reviewed: 2025-01-14 -estimated_time: 15 ---- # Build Your First ETL Pipeline @@ -245,13 +234,12 @@ DROP DATABASE etl_tutorial; Now that you understand the basics: -- **Add robust testing** β†’ [Testing ETL Pipelines](testing-pipelines/) -- **Connect to BigQuery** β†’ [How to Set Up BigQuery Destination](../how-to/custom-destinations/) -- **Handle production scenarios** β†’ [How to Debug Pipeline Issues](../how-to/debugging/) -- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) +- **Build custom implementations** β†’ [Custom Stores and Destinations](custom-implementations.md) +- **Configure PostgreSQL properly** β†’ [Configure PostgreSQL for Replication](../how-to/configure-postgres.md) +- **Understand the architecture** β†’ [ETL 
Architecture](../explanation/architecture.md) ## See Also -- [Memory Destination Tutorial](memory-destination/) - Deep dive into testing with memory -- [API Reference](../reference/) - Complete configuration options -- [Performance Guide](../how-to/performance/) - Optimize your pipelines \ No newline at end of file +- [Custom Implementation Tutorial](custom-implementations.md) - Advanced patterns +- [API Reference](../reference/index.md) - Complete configuration options +- [ETL Architecture](../explanation/architecture.md) - Understand the design \ No newline at end of file diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index b62af4cb2..9c19c52ce 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -1,7 +1,3 @@ ---- -type: tutorial -title: Tutorials ---- # Tutorials @@ -11,28 +7,22 @@ Tutorials provide step-by-step learning paths that take you from zero knowledge ## Getting Started -### [Build Your First ETL Pipeline](first-pipeline/) +### [Build Your First ETL Pipeline](first-pipeline.md) **15 minutes** β€’ **Beginner** Create a complete ETL pipeline that replicates data from PostgreSQL to a memory destination. You'll learn the core concepts of publications, replication slots, and pipeline configuration. *What you'll build:* A working pipeline that streams changes from a sample PostgreSQL table to an in-memory destination. -### [Set Up Memory-Based Testing](memory-destination/) -**10 minutes** β€’ **Beginner** - -Learn how to use ETL's built-in memory destination for rapid prototyping and testing. Perfect for development and CI environments. - -*What you'll build:* A test environment that validates your pipeline logic without external dependencies. ## Advanced Topics -### [Build Custom Stores and Destinations](custom-implementations/) +### [Build Custom Stores and Destinations](custom-implementations.md) **45 minutes** β€’ **Advanced** Implement production-ready custom stores and destinations. Learn ETL's design patterns, build persistent SQLite storage, and create HTTP-based destinations with retry logic. -*What you'll build:* Custom SQLite store for persistent state/schema storage and HTTP destination with production-ready error handling. +*What you'll build:* Custom in-memory store for state/schema storage and HTTP destination. ## Before You Start @@ -48,22 +38,12 @@ Implement production-ready custom stores and destinations. Learn ETL's design pa - Your favorite text editor - About 30-60 minutes total time -## Tutorial Structure - -Each tutorial follows the same pattern: - -1. **Clear outcome** - See exactly what you'll build -2. **Step-by-step instructions** - No guessing, just follow along -3. **Immediate feedback** - See results after each major step -4. **Clean completion** - Working code you can build upon - ## Next Steps After completing the tutorials: -- **Solve specific problems** β†’ [How-To Guides](../how-to/) -- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture/) -- **Look up technical details** β†’ [API Reference](../reference/) +- **Solve specific problems** β†’ [How-To Guides](../how-to/index.md) +- **Understand the architecture** β†’ [ETL Architecture](../explanation/architecture.md) ## Need Help? @@ -71,5 +51,5 @@ If you get stuck: 1. Double-check the prerequisites 2. Ensure your PostgreSQL setup matches the requirements -3. Check our [debugging guide](../how-to/debugging/) +3. Check the [PostgreSQL configuration guide](../how-to/configure-postgres.md) 4. 
[Open an issue](https://github.com/supabase/etl/issues) with your specific problem \ No newline at end of file From fa10bf11d87e320f6ea169f875928b9c2935aa81 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 18 Aug 2025 11:36:25 +0200 Subject: [PATCH 6/9] Update --- docs/tutorials/custom-implementations.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/tutorials/custom-implementations.md b/docs/tutorials/custom-implementations.md index 60902055b..a0ef212dd 100644 --- a/docs/tutorials/custom-implementations.md +++ b/docs/tutorials/custom-implementations.md @@ -14,10 +14,6 @@ By the end of this tutorial, you'll have: **Time required:** 30 minutes **Prerequisites:** Advanced Rust knowledge, running PostgreSQL, basic HTTP knowledge -## Safety Note - -This tutorial creates files in your current directory and makes HTTP requests. To clean up afterward, simply delete the generated Rust project files. - ## Step 1: Create Project Structure Create a new Rust project for your custom ETL components: @@ -670,12 +666,14 @@ You now have working custom ETL components: ## Key Patterns You've Mastered **Store Architecture:** + - Cache-first reads for performance - Dual-write pattern for data consistency - Startup loading from persistent storage - Thread-safe concurrent access with Arc/Mutex **Destination Patterns:** + - Exponential backoff retry logic - Smart error classification (retry 5xx, fail 4xx) - Efficient batching and empty batch handling From 4622861739169984c98235ed490289d3c9fcbd74 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 18 Aug 2025 11:50:54 +0200 Subject: [PATCH 7/9] Update --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index d4d67c0ae..c036f681c 100644 --- a/README.md +++ b/README.md @@ -85,9 +85,9 @@ For tutorials and deeper guidance, see the [Documentation](https://supabase.gith ETL is designed to be extensible. You can implement your own destinations to send data to any destination you like, however it comes with a few built in destinations: -- BigQuery (via `etl-destinations`) +- BigQuery -To add BigQuery support: +Out-of-the-box destinations are available in the `etl-destinations` crate: ```toml [dependencies] From 07d9dc819c80f915b255d4773acb761ba9092f38 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 18 Aug 2025 12:02:02 +0200 Subject: [PATCH 8/9] Update --- etl-api/README.md | 21 +++++++++++++++++++-- etl-benchmarks/README.md | 2 +- etl-examples/README.md | 2 +- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/etl-api/README.md b/etl-api/README.md index 7286aa49f..91a72bb84 100644 --- a/etl-api/README.md +++ b/etl-api/README.md @@ -26,7 +26,24 @@ This API service provides a RESTful interface for managing PostgreSQL replicatio ## Prerequisites -Before you begin, please refer to our [Database Setup Guide](../docs/guides/database-setup.md). +Before running the API, you must have: + +- A running PostgreSQL instance reachable via `DATABASE_URL`. +- The `etl-api` database schema applied (SQLx migrations). + +Quickest way: use the setup script to start Postgres (via Docker) and run migrations automatically. 
+ +```bash +# Starts a local Postgres (if needed) and applies etl-api migrations +./scripts/init_db.sh +``` + +Alternative: if you already have a Postgres database, set `DATABASE_URL` and apply migrations manually: + +```bash +export DATABASE_URL=postgres://USER:PASSWORD@HOST:PORT/DB +sqlx migrate run --source etl-api/migrations +``` ## Development @@ -45,7 +62,7 @@ sqlx migrate add To apply all pending migrations: ```bash -sqlx migrate run +sqlx migrate run --source etl-api/migrations ``` #### Resetting Database diff --git a/etl-benchmarks/README.md b/etl-benchmarks/README.md index 59d1e122a..1fe11781f 100644 --- a/etl-benchmarks/README.md +++ b/etl-benchmarks/README.md @@ -9,7 +9,7 @@ Performance benchmarks for the ETL system to measure and track replication perfo ## Prerequisites Before running benchmarks, ensure you have: -- A PostgreSQL database set up (see [Database Setup Guide](../docs/guides/database-setup.md)) +- A PostgreSQL database set up - A publication created with the tables you want to benchmark - For BigQuery benchmarks: GCP project, dataset, and service account key file diff --git a/etl-examples/README.md b/etl-examples/README.md index 11c32885c..f80a52b6d 100644 --- a/etl-examples/README.md +++ b/etl-examples/README.md @@ -34,7 +34,7 @@ In the above example, `etl` connects to a Postgres database named `postgres` run ## Prerequisites -Before running the examples, you'll need to set up a PostgreSQL database. For detailed instructions on how to use the database setup script, please refer to our [Database Setup Guide](../docs/guides/database-setup.md). +Before running the examples, you'll need to set up a PostgreSQL database with logical replication enabled. ## BigQuery Setup From d8dca452d7f3bba36cbcb41534c708f3582cdfa4 Mon Sep 17 00:00:00 2001 From: Riccardo Busetti Date: Mon, 18 Aug 2025 12:04:08 +0200 Subject: [PATCH 9/9] Update --- etl-api/README.md | 2 +- etl-examples/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/etl-api/README.md b/etl-api/README.md index 91a72bb84..721e6e5ae 100644 --- a/etl-api/README.md +++ b/etl-api/README.md @@ -1,4 +1,4 @@ -# `etl` API +# `etl` - API This API service provides a RESTful interface for managing PostgreSQL replication pipelines. It enables you to: diff --git a/etl-examples/README.md b/etl-examples/README.md index f80a52b6d..4f498a193 100644 --- a/etl-examples/README.md +++ b/etl-examples/README.md @@ -6,7 +6,7 @@ This crate contains practical examples demonstrating how to use the ETL system f - **BigQuery Integration**: Demonstrates replicating PostgreSQL data to Google BigQuery -## Quickstart +## Quick Start To quickly try out `etl`, you can run the BigQuery example. First, create a publication in Postgres which includes the tables you want to replicate: @@ -44,4 +44,4 @@ To run the BigQuery example, you'll need: 2. A service account key file with BigQuery permissions 3. A BigQuery dataset created in your project -The example will automatically create tables in the specified dataset based on your PostgreSQL schema. \ No newline at end of file +The example will automatically create tables in the specified dataset based on your PostgreSQL schema.
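+
+If you still need to create these prerequisites, the sketch below shows one way to do it with the `gcloud` and `bq` CLIs. The project ID, service account name, dataset name, and key file path are placeholders, not values the example expects:
+
+```bash
+# Create a service account for the example (names are placeholders)
+gcloud iam service-accounts create etl-example --project=my-project
+
+# Allow it to create tables and write data in BigQuery
+gcloud projects add-iam-policy-binding my-project \
+  --member="serviceAccount:etl-example@my-project.iam.gserviceaccount.com" \
+  --role="roles/bigquery.dataEditor"
+
+# Download a key file to point the example at
+gcloud iam service-accounts keys create key.json \
+  --iam-account=etl-example@my-project.iam.gserviceaccount.com
+
+# Create the dataset the replicated tables will live in
+bq mk --dataset my-project:etl_example_dataset
+```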