From 3a5a0cc04b17f50f9f1abcd3dda164f0fde51dab Mon Sep 17 00:00:00 2001 From: Anton Bobkov Date: Wed, 24 Sep 2025 15:23:31 +0300 Subject: [PATCH] EN version --- ydb/docs/en/core/analyst/olap_quickstart.md | 410 ++++++++++++++++++++ ydb/docs/en/core/analyst/toc_p.yaml | 2 + 2 files changed, 412 insertions(+) create mode 100644 ydb/docs/en/core/analyst/olap_quickstart.md diff --git a/ydb/docs/en/core/analyst/olap_quickstart.md b/ydb/docs/en/core/analyst/olap_quickstart.md new file mode 100644 index 000000000000..6be9c469cd6a --- /dev/null +++ b/ydb/docs/en/core/analyst/olap_quickstart.md @@ -0,0 +1,410 @@ +## Quickstart: Loading TPC-H data from {{ objstorage-name }} and executing analytical queries + +This quickstart describes the process of loading the TPC-H test dataset (scale factor 10, 10 GB) into {{ ydb-short-name }} from external S3-compatible storage and executing analytical queries. + +After completing the steps, the following will be created in the database: + +- External data source [`EXTERNAL DATA SOURCE`](../concepts/datamodel/external_data_source.md). +- External tables [`EXTERNAL TABLE`](../concepts/datamodel/external_table.md) for Parquet file schemas. +- {{ ydb-short-name }} [column-oriented tables](../concepts/datamodel/table.md#column-oriented-tables) optimized for analytical queries. + +Actions include: + +- Loading data from {{ objstorage-name }} into {{ ydb-short-name }} tables using [`UPSERT INTO... SELECT FROM`](../yql/reference/syntax/upsert_into). +- Executing a test query (TPC-H Q0) for verification. + + +### Prerequisites + +- An existing {{ ydb-short-name }} database. +- A tool for executing YQL/SQL queries ({{ ydb-short-name }} UI or [CLI](../reference/ydb-cli/index.md)). + +### Creating an external data source + +In this step, an [`EXTERNAL DATA SOURCE`](../concepts/datamodel/external_data_source.md) object will be created, which is a named reference to external data storage. It contains information about the data location and access method. + +**`EXTERNAL DATA SOURCE`** is an entity in {{ ydb-short-name }} that describes a connection to external storage, such as S3-compatible Object Storage. + +```sql +CREATE EXTERNAL DATA SOURCE IF NOT EXISTS `external/tpc` WITH ( + SOURCE_TYPE="ObjectStorage", + LOCATION="https://storage.yandexcloud.net/tpc/", + AUTH_METHOD="NONE" +); +``` + +Parameters: + +- `SOURCE_TYPE="ObjectStorage"`: storage type (S3-compatible). +- `LOCATION="https://storage.yandexcloud.net/tpc/"`: bucket URL. +- `AUTH_METHOD="NONE"`: public access without authentication. + +### Creating external tables + +In this step, schemas (columns and types) are defined for Parquet files in the external source. This allows accessing data through SQL queries in {{ ydb-short-name }}, linking the logical structure with physical files. + +[`EXTERNAL TABLE`](../concepts/datamodel/external_table.md) is a "virtual" table that describes the structure of the data located in an external source. This links the logical table schema with the physical files. + +```sql +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/customer` ( + c_acctbal Double NOT NULL, + c_mktsegment String NOT NULL, + c_phone String NOT NULL, + c_nationkey Int32 NOT NULL, + c_custkey Int32 NOT NULL, + c_name String NOT NULL, + c_comment String NOT NULL, + c_address String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/customer/", + FORMAT="parquet" +); +``` + +Parameters: + +- `DATA_SOURCE="external/tpc"`: reference to the source. +- `LOCATION="/h/s10/parquet/customer/"`: path to files. +- `FORMAT="parquet"`: data format. + +{% cut "Create the remaining tables" %} + +```sql +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/customer` ( + c_acctbal Double NOT NULL, + c_mktsegment String NOT NULL, + c_phone String NOT NULL, + c_nationkey Int32 NOT NULL, + c_custkey Int32 NOT NULL, + c_name String NOT NULL, + c_comment String NOT NULL, + c_address String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/customer/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/lineitem` ( + l_orderkey Int64 NOT NULL, + l_partkey Int32 NOT NULL, + l_suppkey Int32 NOT NULL, + l_linenumber Int32 NOT NULL, + l_quantity Double NOT NULL, + l_extendedprice Double NOT NULL, + l_discount Double NOT NULL, + l_tax Double NOT NULL, + l_returnflag String NOT NULL, + l_linestatus String NOT NULL, + l_shipdate Date NOT NULL, + l_commitdate Date NOT NULL, + l_receiptdate Date NOT NULL, + l_shipinstruct String NOT NULL, + l_shipmode String NOT NULL, + l_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/lineitem/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/nation` ( + n_nationkey Int32 NOT NULL, + n_name String NOT NULL, + n_regionkey Int32 NOT NULL, + n_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/nation/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/orders` ( + o_orderkey Int64 NOT NULL, + o_custkey Int32 NOT NULL, + o_orderstatus String NOT NULL, + o_totalprice Double NOT NULL, + o_orderdate Date NOT NULL, + o_orderpriority String NOT NULL, + o_clerk String NOT NULL, + o_shippriority Int32 NOT NULL, + o_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/orders/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/part` ( + p_partkey Int32 NOT NULL, + p_name String NOT NULL, + p_mfgr String NOT NULL, + p_brand String NOT NULL, + p_type String NOT NULL, + p_size Int32 NOT NULL, + p_container String NOT NULL, + p_retailprice Double NOT NULL, + p_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/part/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/partsupp` ( + ps_partkey Int32 NOT NULL, + ps_suppkey Int32 NOT NULL, + ps_availqty Int32 NOT NULL, + ps_supplycost Double NOT NULL, + ps_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/partsupp/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/region` ( + r_regionkey Int32 NOT NULL, + r_name String NOT NULL, + r_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/region/", + FORMAT="parquet" +); + +CREATE EXTERNAL TABLE IF NOT EXISTS `external/tpch/s10/supplier` ( + s_suppkey Int32 NOT NULL, + s_name String NOT NULL, + s_address String NOT NULL, + s_nationkey Int32 NOT NULL, + s_phone String NOT NULL, + s_acctbal Double NOT NULL, + s_comment String NOT NULL +) WITH ( + DATA_SOURCE="external/tpc", + LOCATION="/h/s10/parquet/supplier/", + FORMAT="parquet" +); +``` + +{% endcut %} + +### Creating column-oriented tables + +In this step, internal {{ ydb-short-name }} tables are created for data storage. They use the column-oriented format and partitioning, which provides high performance for analytical queries through parallel processing. + +```sql +CREATE TABLE IF NOT EXISTS `tpch/s10/customer` ( + c_acctbal Double NOT NULL, + c_mktsegment String NOT NULL, + c_phone String NOT NULL, + c_nationkey Int32 NOT NULL, + c_custkey Int32 NOT NULL, + c_name String NOT NULL, + c_comment String NOT NULL, + c_address String NOT NULL, + PRIMARY KEY (c_custkey) +) +PARTITION BY HASH (c_custkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); +``` + +Parameters: + +- `PRIMARY KEY (c_custkey)`: table primary key. +- `PARTITION BY HASH (c_custkey)`: hash partitioning for parallelism. +- `STORE = COLUMN`: column-oriented data storage format for accelerating analytical queries. +- `PARTITION_COUNT = 64`: number of partitions for data distribution. + +{% cut "Create the remaining tables" %} + +```sql +CREATE TABLE IF NOT EXISTS `tpch/s10/customer` ( + c_acctbal Double NOT NULL, + c_mktsegment String NOT NULL, + c_phone String NOT NULL, + c_nationkey Int32 NOT NULL, + c_custkey Int32 NOT NULL, + c_name String NOT NULL, + c_comment String NOT NULL, + c_address String NOT NULL, + PRIMARY KEY (c_custkey) +) +PARTITION BY HASH (c_custkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/lineitem` ( + l_orderkey Int64 NOT NULL, + l_partkey Int32 NOT NULL, + l_suppkey Int32 NOT NULL, + l_linenumber Int32 NOT NULL, + l_quantity Double NOT NULL, + l_extendedprice Double NOT NULL, + l_discount Double NOT NULL, + l_tax Double NOT NULL, + l_returnflag String NOT NULL, + l_linestatus String NOT NULL, + l_shipdate Date NOT NULL, + l_commitdate Date NOT NULL, + l_receiptdate Date NOT NULL, + l_shipinstruct String NOT NULL, + l_shipmode String NOT NULL, + l_comment String NOT NULL, + PRIMARY KEY (l_orderkey, l_linenumber) +) +PARTITION BY HASH (l_orderkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/nation` ( + n_nationkey Int32 NOT NULL, + n_name String NOT NULL, + n_regionkey Int32 NOT NULL, + n_comment String NOT NULL, + PRIMARY KEY (n_nationkey) +) +PARTITION BY HASH (n_nationkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/orders` ( + o_orderkey Int64 NOT NULL, + o_custkey Int32 NOT NULL, + o_orderstatus String NOT NULL, + o_totalprice Double NOT NULL, + o_orderdate Date NOT NULL, + o_orderpriority String NOT NULL, + o_clerk String NOT NULL, + o_shippriority Int32 NOT NULL, + o_comment String NOT NULL, + PRIMARY KEY (o_orderkey) +) +PARTITION BY HASH (o_orderkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/part` ( + p_partkey Int32 NOT NULL, + p_name String NOT NULL, + p_mfgr String NOT NULL, + p_brand String NOT NULL, + p_type String NOT NULL, + p_size Int32 NOT NULL, + p_container String NOT NULL, + p_retailprice Double NOT NULL, + p_comment String NOT NULL, + PRIMARY KEY (p_partkey) +) +PARTITION BY HASH (p_partkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/partsupp` ( + ps_partkey Int32 NOT NULL, + ps_suppkey Int32 NOT NULL, + ps_availqty Int32 NOT NULL, + ps_supplycost Double NOT NULL, + ps_comment String NOT NULL, + PRIMARY KEY (ps_partkey, ps_suppkey) +) +PARTITION BY HASH (ps_partkey, ps_suppkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/region` ( + r_regionkey Int32 NOT NULL, + r_name String NOT NULL, + r_comment String NOT NULL, + PRIMARY KEY (r_regionkey) +) +PARTITION BY HASH (r_regionkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); + +CREATE TABLE IF NOT EXISTS `tpch/s10/supplier` ( + s_suppkey Int32 NOT NULL, + s_name String NOT NULL, + s_address String NOT NULL, + s_nationkey Int32 NOT NULL, + s_phone String NOT NULL, + s_acctbal Double NOT NULL, + s_comment String NOT NULL, + PRIMARY KEY (s_suppkey) +) +PARTITION BY HASH (s_suppkey) +WITH ( + STORE = COLUMN, + PARTITION_COUNT = 64 +); +``` + +{% endcut %} + +### Loading data into column-oriented tables + +In this step, data is copied from the external storage into internal {{ ydb-short-name }} tables. This provides the local storage for fast analytics. The process may take time (for 10 GB — on the order of minutes, depending on resources). + +```sql +UPSERT INTO `tpch/s10/customer` SELECT * FROM `external/tpch/s10/customer`; +UPSERT INTO `tpch/s10/lineitem` SELECT * FROM `external/tpch/s10/lineitem`; +UPSERT INTO `tpch/s10/nation` SELECT * FROM `external/tpch/s10/nation`; +UPSERT INTO `tpch/s10/orders` SELECT * FROM `external/tpch/s10/orders`; +UPSERT INTO `tpch/s10/part` SELECT * FROM `external/tpch/s10/part`; +UPSERT INTO `tpch/s10/partsupp` SELECT * FROM `external/tpch/s10/partsupp`; +UPSERT INTO `tpch/s10/region` SELECT * FROM `external/tpch/s10/region`; +UPSERT INTO `tpch/s10/supplier` SELECT * FROM `external/tpch/s10/supplier`; +``` + +### Executing a test analytical query + +In this step, the TPC-H Q0 test query is executed to verify the correctness of data loading and performance. The query aggregates metrics from the `lineitem` table (about 6 million rows), with grouping and subsequent data sorting, demonstrating the efficiency of column-oriented tables. + +```sql +SELECT + l_returnflag, + l_linestatus, + SUM(l_quantity) AS sum_qty, + SUM(l_extendedprice) AS sum_base_price, + SUM(l_extendedprice * (1 - l_discount)) AS sum_disc_price, + SUM(l_extendedprice * (1 - l_discount) * (1 + l_tax)) AS sum_charge, + AVG(l_quantity) AS avg_qty, + AVG(l_extendedprice) AS avg_price, + AVG(l_discount) AS avg_disc, + COUNT(*) AS count_order +FROM + `tpch/s10/lineitem` +WHERE + l_shipdate <= Date('1998-12-01') - Interval("P90D") +GROUP BY + l_returnflag, + l_linestatus +ORDER BY + l_returnflag, + l_linestatus; +``` + + +### Next steps + +- Execute other queries from the TPC-H suite on the loaded tables using {{ ydb-short-name }} CLI and [TPC-H workload testing](../reference/ydb-cli/workload-tpch.md). +- Integration with BI systems for data [visualization](../integrations/visualization/index.md) and [analysis](../integrations/gui/index.md). \ No newline at end of file diff --git a/ydb/docs/en/core/analyst/toc_p.yaml b/ydb/docs/en/core/analyst/toc_p.yaml index 50acb0001b67..7ad0d12eeb1e 100644 --- a/ydb/docs/en/core/analyst/toc_p.yaml +++ b/ydb/docs/en/core/analyst/toc_p.yaml @@ -1,4 +1,6 @@ items: + - name: Quick start + href: olap_quickstart.md - name: Dataset import href: datasets/index.md include: