In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [None]:
# Build (or reuse) the Spark session; the ClickHouse JDBC driver and the
# ClickHouse Spark connector jars are registered through `spark.jars`.
spark = (
    SparkSession.builder
    .appName("ClickHouse ETL")
    .config(
        "spark.jars",
        "/opt/spark/jars/clickhouse-jdbc-0.6.0.jar,"
        "/opt/spark/jars/clickhouse-spark-connector_2.12-0.8.0.jar",
    )
    .getOrCreate()
)

# ClickHouse connection settings
ch_url = "jdbc:clickhouse://clickhouse:8123/default"
ch_options = {
    "host": "clickhouse",
    "port": "8123",
    "user": "default",
    "password": "",
    "database": "default",
}

# The JDBC load / verification helpers later in this notebook read
# `ch_jdbc_url` and `ch_properties["user" | "password" | "driver"]`.
# These names were never defined, so every write failed with a NameError
# that the helpers' `except Exception` silently turned into a printed
# message. Define them here from the settings above.
ch_jdbc_url = ch_url
ch_properties = {
    "user": ch_options["user"],
    "password": ch_options["password"],
    # Driver class shipped in the clickhouse-jdbc jar registered above.
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
}

Подключение к Postgres и чтение таблиц измерений и таблицы фактов

In [None]:
# Postgres connection settings shared by every JDBC read below.
pg_url = "jdbc:postgresql://postgres:5432/bober_db"
pg_properties = {
    "user": "bober",
    "password": "bober",
    "driver": "org.postgresql.Driver",
}
df = spark.read.jdbc(url=pg_url, table="mock_data", properties=pg_properties)

# Load every table of the star schema: the fact table first, then the
# dimensions, in the same order as the names on the left-hand side.
fact, dim_product, dim_customer, dim_store, dim_supplier, dim_date = (
    spark.read.jdbc(url=pg_url, table=star_table, properties=pg_properties)
    for star_table in (
        "fact_sales",
        "dim_product",
        "dim_customer",
        "dim_store",
        "dim_supplier",
        "dim_date",
    )
)
dim_date.head(1)

Создание таблиц в `ClickHouse`

In [None]:
import requests

def create_clickhouse_table(table_name, create_query, timeout=30):
    """Create a ClickHouse table by POSTing a DDL statement to the HTTP API.

    Args:
        table_name: Table name; used only in the progress messages.
        create_query: The full DDL statement, sent as the request body.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so an unresponsive
            ClickHouse would hang the whole notebook.

    Returns:
        True if the server answered with HTTP 200, False if it answered
        with any other status or the request itself failed. The outcome is
        also printed.
    """
    url = "http://clickhouse:8123/"

    # Send text as explicit UTF-8: a plain ``str`` body is encoded as
    # latin-1 by the HTTP layer, which fails on non-latin-1 characters.
    if isinstance(create_query, str):
        payload = create_query.encode("utf-8")
    else:
        payload = create_query

    try:
        response = requests.post(url, data=payload, timeout=timeout)
    except requests.RequestException as e:
        # Connection errors, timeouts, etc. are reported, not raised, so
        # the remaining tables are still attempted.
        print(f"✗ Ошибка при создании таблицы {table_name}: {e}")
        return False

    if response.status_code == 200:
        print(f"✓ Таблица {table_name} создана")
        return True

    print(f"✗ Ошибка создания {table_name}: {response.text}")
    return False

# Create the data-mart ("vitrina") tables
print("Создание таблиц в ClickHouse...")

# Table name -> DDL; dict order is the creation order.
_vitrina_tables = {
    # Sales by product
    "vitrina_product_sales": """
CREATE TABLE IF NOT EXISTS vitrina_product_sales (
    product_id UInt32,
    name String,
    category String,
    total_quantity UInt64,
    total_revenue Decimal(15,2),
    avg_rating Float32,
    review_count UInt32
) ENGINE = MergeTree()
ORDER BY (category, product_id)
""",
    # Sales by customer
    "vitrina_customer_sales": """
CREATE TABLE IF NOT EXISTS vitrina_customer_sales (
    customer_id UInt32,
    customer_name String,
    country String,
    total_spent Decimal(15,2),
    order_count UInt32,
    avg_check Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY (country, customer_id)
""",
    # Sales over time
    "vitrina_time_sales": """
CREATE TABLE IF NOT EXISTS vitrina_time_sales (
    year UInt16,
    month UInt8,
    total_revenue Decimal(15,2),
    total_quantity UInt64,
    order_count UInt32,
    avg_check Decimal(15,2),
    avg_order_size Float32
) ENGINE = MergeTree()
ORDER BY (year, month)
""",
    # Sales by store
    "vitrina_store_sales": """
CREATE TABLE IF NOT EXISTS vitrina_store_sales (
    store_id UInt32,
    store_name String,
    store_city String,
    store_country String,
    total_revenue Decimal(15,2),
    order_count UInt32,
    avg_check Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY (store_country, store_id)
""",
    # Sales by supplier
    "vitrina_supplier_sales": """
CREATE TABLE IF NOT EXISTS vitrina_supplier_sales (
    supplier_id UInt32,
    supplier_name String,
    supplier_country String,
    total_revenue Decimal(15,2),
    avg_price Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY (supplier_country, supplier_id)
""",
    # Product quality
    "vitrina_product_quality": """
CREATE TABLE IF NOT EXISTS vitrina_product_quality (
    product_id UInt32,
    name String,
    rating Float32,
    review_count UInt32,
    total_quantity UInt64,
    total_revenue Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY product_id
""",
}

for _vitrina_name, _vitrina_ddl in _vitrina_tables.items():
    create_clickhouse_table(_vitrina_name, _vitrina_ddl)

# Additional (derived / top-N) tables.
# Table name -> DDL; dict order is the creation order.
_additional_tables = {
    "top10_sold_products": """
CREATE TABLE IF NOT EXISTS top10_sold_products (
    product_id UInt32,
    name String,
    category String,
    total_quantity UInt64,
    total_revenue Decimal(15,2),
    avg_rating Float32,
    review_count UInt32
) ENGINE = MergeTree()
ORDER BY product_id
""",
    "category_revenue": """
CREATE TABLE IF NOT EXISTS category_revenue (
    category String,
    category_revenue Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY category
""",
    "top10_customers_by_spent": """
CREATE TABLE IF NOT EXISTS top10_customers_by_spent (
    customer_id UInt32,
    customer_name String,
    country String,
    total_spent Decimal(15,2),
    order_count UInt32,
    avg_check Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY customer_id
""",
    "customer_country_distribution": """
CREATE TABLE IF NOT EXISTS customer_country_distribution (
    country String,
    total_spent_by_country Decimal(15,2),
    customer_count UInt32
) ENGINE = MergeTree()
ORDER BY country
""",
    "top5_stores_by_revenue": """
CREATE TABLE IF NOT EXISTS top5_stores_by_revenue (
    store_id UInt32,
    store_name String,
    store_city String,
    store_country String,
    total_revenue Decimal(15,2),
    order_count UInt32,
    avg_check Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY store_id
""",
    "top5_suppliers_by_revenue": """
CREATE TABLE IF NOT EXISTS top5_suppliers_by_revenue (
    supplier_id UInt32,
    supplier_name String,
    supplier_country String,
    total_revenue Decimal(15,2),
    avg_price Decimal(15,2)
) ENGINE = MergeTree()
ORDER BY supplier_id
""",
    "product_quality_correlation": """
CREATE TABLE IF NOT EXISTS product_quality_correlation (
    corr_rating_revenue Float64,
    corr_rating_quantity Float64,
    description String
) ENGINE = MergeTree()
ORDER BY description
""",
}

for _extra_name, _extra_ddl in _additional_tables.items():
    create_clickhouse_table(_extra_name, _extra_ddl)

Загрузка данных в `ClickHouse`

In [None]:
def write_to_clickhouse_existing(df, table_name):
    """Append a DataFrame to an already existing ClickHouse table over JDBC.

    Connection details come from the module-level ``ch_jdbc_url`` and
    ``ch_properties`` (keys ``user``, ``password``, ``driver``).

    Args:
        df: The Spark DataFrame to write.
        table_name: Target table; rows are appended to it.

    Returns:
        True if the write completed, False if it failed. The outcome is
        also printed. Previously the function returned nothing, so callers
        could not tell a failed load from a successful one.
    """
    try:
        (
            df.write
            .format("jdbc")
            .option("url", ch_jdbc_url)
            .option("dbtable", table_name)
            .option("user", ch_properties["user"])
            .option("password", ch_properties["password"])
            .option("driver", ch_properties["driver"])
            .option("batchsize", 100000)
            .mode("append")
            .save()
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and let the
        # remaining marts load, but signal it through the return value.
        print(f"✗ Ошибка при записи в таблицу {table_name}: {e}")
        return False

    print(f"✓ Данные записаны в таблицу {table_name}")
    return True

# Product sales mart: quantity / revenue totals per product, plus the
# product's rating and review count taken from the first row of each group.
print("Создание витрины продаж по продуктам...")
product_vitrina = (
    fact.join(dim_product, fact["product_id"] == dim_product["product_id"])
    .groupBy(
        dim_product["product_id"],
        dim_product["name"],
        dim_product["category"],
    )
    .agg(
        sum("sale_quantity").alias("total_quantity"),
        sum("sale_total_price").alias("total_revenue"),
        first("rating").alias("avg_rating"),
        first("reviews").alias("review_count"),
    )
)
write_to_clickhouse_existing(product_vitrina, "vitrina_product_sales")

# Top 10 products by quantity sold
top10_products = product_vitrina.orderBy(col("total_quantity").desc()).limit(10)
write_to_clickhouse_existing(top10_products, "top10_sold_products")

# Revenue per category, rolled up from the per-product mart
category_revenue = product_vitrina.groupBy("category").agg(
    sum("total_revenue").alias("category_revenue")
)
write_to_clickhouse_existing(category_revenue, "category_revenue")


# Customer sales mart: total spent, number of fact rows and average check
# per customer; first and last name are merged into `customer_name`.
print("Создание витрины продаж по клиентам...")
customer_vitrina = (
    fact.join(dim_customer, fact["customer_id"] == dim_customer["customer_id"])
    .groupBy(
        dim_customer["customer_id"],
        dim_customer["first_name"],
        dim_customer["last_name"],
        dim_customer["country"],
    )
    .agg(
        sum("sale_total_price").alias("total_spent"),
        count("*").alias("order_count"),
        avg("sale_total_price").alias("avg_check"),
    )
    .select(
        "customer_id",
        concat_ws(" ", col("first_name"), col("last_name")).alias("customer_name"),
        "country",
        "total_spent",
        "order_count",
        "avg_check",
    )
)
write_to_clickhouse_existing(customer_vitrina, "vitrina_customer_sales")

# Top 10 customers by total spent
top10_customers = customer_vitrina.orderBy(col("total_spent").desc()).limit(10)
write_to_clickhouse_existing(top10_customers, "top10_customers_by_spent")

# Spend and customer count per country
customer_country_dist = customer_vitrina.groupBy("country").agg(
    sum("total_spent").alias("total_spent_by_country"),
    count("*").alias("customer_count"),
)
write_to_clickhouse_existing(customer_country_dist, "customer_country_distribution")

# Time sales mart: totals per (year, month), then the average check and
# average order size derived from those totals.
print("Создание витрины продаж по времени...")
time_vitrina = (
    fact.join(dim_date, fact["date_id"] == dim_date["date_id"])
    .groupBy(dim_date["year"], dim_date["month"])
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        sum("sale_quantity").alias("total_quantity"),
        count("*").alias("order_count"),
    )
    .select(
        "*",
        (col("total_revenue") / col("order_count")).alias("avg_check"),
        (col("total_quantity") / col("order_count")).alias("avg_order_size"),
    )
)

write_to_clickhouse_existing(time_vitrina, "vitrina_time_sales")

# Store sales mart: revenue, number of fact rows and average check per store
print("Создание витрины продаж по магазинам...")
store_vitrina = (
    fact.join(dim_store, fact["store_id"] == dim_store["store_id"])
    .groupBy(
        dim_store["store_id"],
        dim_store["store_name"],
        dim_store["store_city"],
        dim_store["store_country"],
    )
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        count("*").alias("order_count"),
        avg("sale_total_price").alias("avg_check"),
    )
)
write_to_clickhouse_existing(store_vitrina, "vitrina_store_sales")

# Top 5 stores by revenue
top5_stores = store_vitrina.orderBy(col("total_revenue").desc()).limit(5)
write_to_clickhouse_existing(top5_stores, "top5_stores_by_revenue")

# Supplier sales mart: revenue per supplier and a quantity-weighted average
# price (sum(price * quantity) / sum(quantity)).
print("Создание витрины продаж по поставщикам...")
supplier_vitrina = (
    fact.join(dim_product, fact["product_id"] == dim_product["product_id"])
    .join(dim_supplier, fact["supplier_id"] == dim_supplier["supplier_id"])
    .groupBy(
        dim_supplier["supplier_id"],
        dim_supplier["supplier_name"],
        dim_supplier["supplier_country"],
    )
    .agg(
        sum("sale_total_price").alias("total_revenue"),
        sum(col("price") * col("sale_quantity")).alias("weighted_price_sum"),
        sum("sale_quantity").alias("total_quantity"),
    )
    .select(
        "supplier_id",
        "supplier_name",
        "supplier_country",
        "total_revenue",
        (col("weighted_price_sum") / col("total_quantity")).alias("avg_price"),
    )
)
write_to_clickhouse_existing(supplier_vitrina, "vitrina_supplier_sales")

# Top 5 suppliers by revenue
top5_suppliers = supplier_vitrina.orderBy(col("total_revenue").desc()).limit(5)
write_to_clickhouse_existing(top5_suppliers, "top5_suppliers_by_revenue")

# Product quality mart: rating and review count next to the sales totals
print("Создание витрины качества продукции...")
quality_vitrina = (
    fact.join(dim_product, fact["product_id"] == dim_product["product_id"])
    .groupBy(dim_product["product_id"], dim_product["name"])
    .agg(
        first("rating").alias("rating"),
        first("reviews").alias("review_count"),
        sum("sale_quantity").alias("total_quantity"),
        sum("sale_total_price").alias("total_revenue"),
    )
)
write_to_clickhouse_existing(quality_vitrina, "vitrina_product_quality")

# Correlation of rating with revenue and with quantity sold (single row)
correlation = quality_vitrina.agg(
    corr("rating", "total_revenue").alias("corr_rating_revenue"),
    corr("rating", "total_quantity").alias("corr_rating_quantity"),
).select("*", lit("Correlation between rating and sales").alias("description"))
write_to_clickhouse_existing(correlation, "product_quality_correlation")

# NOTE(review): this banner is printed unconditionally; it does not check
# whether the writes above actually succeeded.
print(
    "=" * 60,
    "ВСЕ ДАННЫЕ УСПЕШНО ЗАГРУЖЕНЫ В CLICKHOUSE!",
    "=" * 60,
    sep="\n",
)

# Проверка данных
def check_table_count(table_name):
    """Print and return the number of rows in a ClickHouse table.

    Runs ``SELECT count(*)`` through the Spark JDBC reader using the
    module-level ``ch_jdbc_url`` / ``ch_properties`` settings.

    Args:
        table_name: Table to count; interpolated directly into the query.

    Returns:
        The row count, or 0 if anything went wrong (the error is printed).
    """
    try:
        jdbc_options = {
            "url": ch_jdbc_url,
            "dbtable": f"(SELECT count(*) as cnt FROM {table_name}) as t",
            "user": ch_properties["user"],
            "password": ch_properties["password"],
            "driver": ch_properties["driver"],
        }
        result = spark.read.format("jdbc").options(**jdbc_options).load()
        row_count = result.first()["cnt"]
        print(f"✓ Таблица {table_name}: {row_count} записей")
        return row_count
    except Exception as e:
        print(f"✗ Ошибка при проверке таблицы {table_name}: {e}")
        return 0

print("\nПроверка загруженных данных:")
# Every table this notebook creates and loads. `category_revenue` and
# `customer_country_distribution` were missing from the original list, so
# they were never verified.
tables_to_check = [
    "vitrina_product_sales",
    "vitrina_customer_sales",
    "vitrina_time_sales",
    "vitrina_store_sales",
    "vitrina_supplier_sales",
    "vitrina_product_quality",
    "top10_sold_products",
    "category_revenue",
    "top10_customers_by_spent",
    "customer_country_distribution",
    "top5_stores_by_revenue",
    "top5_suppliers_by_revenue",
    "product_quality_correlation",
]

for table in tables_to_check:
    check_table_count(table)