In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Both JDBC drivers (PostgreSQL source, ClickHouse sink) are put on the
# driver and the executor class paths.
jdbc_classpath = "/home/jovyan/work/postgresql-42.7.1.jar:/home/jovyan/work/clickhouse-jdbc-0.4.6-all.jar"

spark = (
    SparkSession.builder
    .appName("PetStoreDataMarts")
    .config("spark.driver.extraClassPath", jdbc_classpath)
    .config("spark.executor.extraClassPath", jdbc_classpath)
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

# JDBC connection settings for the ClickHouse sink.
clickhouse_properties = dict(
    url="jdbc:clickhouse://clickhouse-server:8123/default",
    driver="com.clickhouse.jdbc.ClickHouseDriver",
    user="default",
    password="clickhouse",
)

# JDBC connection settings for the PostgreSQL source.
postgres_properties = dict(
    url="jdbc:postgresql://postgres:5432/pet_store_lab_2",
    driver="org.postgresql.Driver",
    user="postgres",
    password="password",
)


def _load_postgres_table(table_name, view_name):
    """Read one PostgreSQL table over JDBC and register it as a Spark temp view.

    Args:
        table_name: name of the table in the PostgreSQL database.
        view_name: name under which the DataFrame is exposed to Spark SQL.

    Returns:
        The DataFrame produced by ``spark.read.jdbc``.
    """
    frame = spark.read.jdbc(
        url=postgres_properties["url"],
        table=table_name,
        properties=postgres_properties,
    )
    frame.createOrReplaceTempView(view_name)
    return frame


# Fact table first, then the dimension tables, each under a short view name.
df_facts = _load_postgres_table("fact_sales", "facts")
df_products = _load_postgres_table("dim_product", "products")
df_customers = _load_postgres_table("dim_customer", "customers")
df_dates = _load_postgres_table("dim_date", "dates")
df_stores = _load_postgres_table("dim_store", "stores")
df_suppliers = _load_postgres_table("dim_supplier", "suppliers")

print("Данные успешно загружены из PostgreSQL")

Данные успешно загружены из PostgreSQL


In [2]:
!pip install jaydebeapi
!pip install requests



In [3]:
import requests

# JDBC connection settings for ClickHouse. This repeats, value for value, the
# definition made in the first cell next to the Spark session setup.
clickhouse_properties = dict(
    url="jdbc:clickhouse://clickhouse-server:8123/default",
    driver="com.clickhouse.jdbc.ClickHouseDriver",
    user="default",
    password="clickhouse",
)

In [4]:
# Reset all data marts so the cells that create and fill them start from a
# clean state. For every mart, in this order:
#   1. DROP TABLE IF EXISTS is sent to ClickHouse over its HTTP interface and
#      the outcome is reported based on the HTTP status code;
#   2. DROP TABLE IF EXISTS is issued against the Spark catalog for the same name.
# The six marts differ only by name, so they are handled by a single loop
# rather than six copies of the same code.
DATA_MART_TABLES = [
    "product_sales",
    "customer_sales",
    "time_sales",
    "store_sales",
    "supplier_sales",
    "product_quality",
]

for mart in DATA_MART_TABLES:
    response = requests.post(
        'http://clickhouse-server:8123',
        params={'user': 'default', 'password': 'clickhouse'},
        data=f"DROP TABLE IF EXISTS default.{mart}"
    )
    if response.status_code == 200:
        print(f"Таблица {mart} успешно удалена из ClickHouse")
    else:
        # A failure is reported with the response body; the loop still moves on.
        print(f"Ошибка при удалении таблицы {mart} из ClickHouse: {response.text}")

    spark.sql(f"DROP TABLE IF EXISTS {mart}")
    print(f"Временная таблица {mart} удалена из Spark SQL")

Таблица product_sales успешно удалена из ClickHouse
Временная таблица product_sales удалена из Spark SQL
Таблица customer_sales успешно удалена из ClickHouse
Временная таблица customer_sales удалена из Spark SQL
Таблица time_sales успешно удалена из ClickHouse
Временная таблица time_sales удалена из Spark SQL
Таблица store_sales успешно удалена из ClickHouse
Временная таблица store_sales удалена из Spark SQL
Таблица supplier_sales успешно удалена из ClickHouse
Временная таблица supplier_sales удалена из Spark SQL
Таблица product_quality успешно удалена из ClickHouse
Временная таблица product_quality удалена из Spark SQL


In [5]:
# Product sales data mart: one row per product with aggregates computed from
# the "facts" view joined to the "products" view.

# DDL of the target table in ClickHouse.
create_table_sql = """
CREATE TABLE IF NOT EXISTS default.product_sales (
    product_id UInt32,
    product_name String,
    product_category String,
    product_brand String,
    total_quantity_sold UInt32,
    total_revenue Decimal64(2),
    avg_price Decimal64(2),
    avg_rating Decimal32(2),
    total_reviews UInt32,
    unique_customers UInt32,
    avg_unit_price Decimal64(2)
) ENGINE = MergeTree()
ORDER BY product_id
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_table_sql,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы: {response.text}")
else:
    print("Таблица product_sales успешно создана")

# Aggregation executed by Spark SQL.
product_sales_sql = """
SELECT 
    p.product_id,
    p.product_name,
    p.product_category,
    p.product_brand,
    SUM(f.sale_quantity) as total_quantity_sold,
    SUM(f.sale_total_price) as total_revenue,
    AVG(f.unit_price) as avg_price,
    AVG(f.product_rating) as avg_rating,
    SUM(f.product_reviews) as total_reviews,
    COUNT(DISTINCT f.customer_id) as unique_customers,
    ROUND(SUM(f.sale_total_price) / SUM(f.sale_quantity), 2) as avg_unit_price
FROM facts f
JOIN products p ON f.product_id = p.product_id
GROUP BY 
    p.product_id,
    p.product_name,
    p.product_category,
    p.product_brand
"""

product_sales_df = spark.sql(product_sales_sql)

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    product_sales_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="product_sales",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных: {err!s}")
else:
    print("Данные успешно записаны в таблицу product_sales")

# Read the mart back from ClickHouse to show the row count and a sample.
check_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="product_sales",
    properties=clickhouse_properties,
)

product_sales_rows = check_df.count()
print("\nКоличество записей в витрине product_sales:", product_sales_rows)
print("\nПример данных из витрины product_sales:")
check_df.show(5)

Таблица product_sales успешно создана
Данные успешно записаны в таблицу product_sales

Количество записей в витрине product_sales: 1000

Пример данных из витрины product_sales:
+----------+------------+----------------+-------------+-------------------+-------------+---------+----------+-------------+----------------+--------------+
|product_id|product_name|product_category|product_brand|total_quantity_sold|total_revenue|avg_price|avg_rating|total_reviews|unique_customers|avg_unit_price|
+----------+------------+----------------+-------------+-------------------+-------------+---------+----------+-------------+----------------+--------------+
|         1|    Dog Food|            Food|        Skajo|                 40|      2197.32|    46.84|      3.03|         5771|               1|         54.93|
|         2|     Cat Toy|            Food|    Wordpedia|                 45|      3213.26|    51.61|      2.98|         5175|               1|         71.41|
|         3|   Bird Cage|        

In [6]:
# Customer sales data mart: one row per customer with aggregates computed from
# the "facts" view joined to the "customers" view.

# Aggregation executed by Spark SQL. Email, country and postal code fall back
# to an empty string when they are NULL.
customer_sales_sql = """
SELECT 
    c.customer_id,
    CONCAT(c.customer_first_name, ' ', c.customer_last_name) as customer_name,
    COALESCE(c.customer_email, '') as customer_email,
    COALESCE(c.customer_country, '') as customer_country,
    COALESCE(c.customer_postal_code, '') as customer_postal,
    COUNT(*) as total_orders,
    SUM(f.sale_total_price) as total_spent,
    AVG(f.sale_total_price) as avg_check,
    COUNT(DISTINCT f.product_id) as unique_products_bought,
    MAX(f.date_id) as last_purchase_date,
    SUM(f.product_reviews) as total_reviews_left
FROM facts f
JOIN customers c ON f.customer_id = c.customer_id
GROUP BY 
    c.customer_id,
    c.customer_first_name,
    c.customer_last_name,
    c.customer_email,
    c.customer_country,
    c.customer_postal_code
"""

customer_sales_df = spark.sql(customer_sales_sql)

# DDL of the target table in ClickHouse.
create_customer_sales_table = """
CREATE TABLE IF NOT EXISTS default.customer_sales (
    customer_id UInt32,
    customer_name String,
    customer_email String,
    customer_country String,
    customer_postal String,
    total_orders UInt32,
    total_spent Decimal64(2),
    avg_check Decimal64(2),
    unique_products_bought UInt32,
    last_purchase_date Date,
    total_reviews_left UInt32
) ENGINE = MergeTree()
ORDER BY customer_id;
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_customer_sales_table,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы customer_sales: {response.text}")
else:
    print("Таблица customer_sales успешно создана в ClickHouse")

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    customer_sales_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="customer_sales",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных в customer_sales: {err!s}")
else:
    print("Данные успешно записаны в таблицу customer_sales")

# Read the mart back from ClickHouse to show the row count and a sample.
check_customer_sales_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="customer_sales",
    properties=clickhouse_properties,
)

customer_sales_rows = check_customer_sales_df.count()
print("\nКоличество записей в витрине customer_sales:", customer_sales_rows)
print("\nПример данных из витрины customer_sales:")
check_customer_sales_df.show(5)

Таблица customer_sales успешно создана в ClickHouse
Данные успешно записаны в таблицу customer_sales

Количество записей в витрине customer_sales: 1000

Пример данных из витрины customer_sales:
+-----------+----------------+--------------------+----------------+---------------+------------+-----------+---------+----------------------+------------------+------------------+
|customer_id|   customer_name|      customer_email|customer_country|customer_postal|total_orders|total_spent|avg_check|unique_products_bought|last_purchase_date|total_reviews_left|
+-----------+----------------+--------------------+----------------+---------------+------------+-----------+---------+----------------------+------------------+------------------+
|          1|  Barron Rawlyns|bmassingham0@army...|           China|               |          10|    2197.32|   219.73|                     1|        2021-11-25|              5771|
|          2|    Ham Knowller|  cscudder1@time.com|          Poland|         73-11

In [7]:
# Time sales data mart: one row per date with aggregates computed from the
# "facts" view joined to the "dates" view.

# Aggregation executed by Spark SQL.
time_sales_sql = """
SELECT 
    d.date_id,
    d.year,
    d.month,
    d.day,
    COUNT(*) as total_orders,
    SUM(f.sale_quantity) as total_items_sold,
    SUM(f.sale_total_price) as total_revenue,
    AVG(f.sale_total_price) as avg_order_value,
    COUNT(DISTINCT f.customer_id) as unique_customers,
    COUNT(DISTINCT f.product_id) as unique_products,
    SUM(f.product_reviews) as total_reviews,
    AVG(f.product_rating) as avg_rating
FROM facts f
JOIN dates d ON f.date_id = d.date_id
GROUP BY 
    d.date_id,
    d.year,
    d.month,
    d.day
"""

time_sales_df = spark.sql(time_sales_sql)

# DDL of the target table in ClickHouse.
create_time_sales_table = """
CREATE TABLE IF NOT EXISTS default.time_sales (
    date_id Date,
    year UInt16,
    month UInt8,
    day UInt8,
    total_orders UInt32,
    total_items_sold UInt32,
    total_revenue Decimal64(2),
    avg_order_value Decimal64(2),
    unique_customers UInt32,
    unique_products UInt32,
    total_reviews UInt32,
    avg_rating Decimal32(2)
) ENGINE = MergeTree()
ORDER BY date_id;
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_time_sales_table,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы time_sales: {response.text}")
else:
    print("Таблица time_sales успешно создана в ClickHouse")

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    time_sales_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="time_sales",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных в time_sales: {err!s}")
else:
    print("Данные успешно записаны в таблицу time_sales")

# Read the mart back from ClickHouse to show the row count and a sample.
check_time_sales_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="time_sales",
    properties=clickhouse_properties,
)

time_sales_rows = check_time_sales_df.count()
print("\nКоличество записей в витрине time_sales:", time_sales_rows)
print("\nПример данных из витрины time_sales:")
check_time_sales_df.show(5)

Таблица time_sales успешно создана в ClickHouse
Данные успешно записаны в таблицу time_sales

Количество записей в витрине time_sales: 364

Пример данных из витрины time_sales:
+----------+----+-----+---+------------+----------------+-------------+---------------+----------------+---------------+-------------+----------+
|   date_id|year|month|day|total_orders|total_items_sold|total_revenue|avg_order_value|unique_customers|unique_products|total_reviews|avg_rating|
+----------+----+-----+---+------------+----------------+-------------+---------------+----------------+---------------+-------------+----------+
|2021-01-01|2021|    1|  1|          32|             168|      7042.47|         220.07|              32|             32|        17001|      2.90|
|2021-01-02|2021|    1|  2|          28|             164|      8195.44|         292.69|              28|             28|        14796|      3.12|
|2021-01-03|2021|    1|  3|          26|             158|      6679.27|         256.89|      

In [8]:
# Store sales data mart: one row per store with aggregates computed from the
# "facts" view joined to the "stores" view.

# Aggregation executed by Spark SQL.
store_sales_sql = """
SELECT 
    s.store_id,
    s.store_name,
    s.store_city,
    s.store_country,
    COUNT(*) as total_orders,
    COUNT(DISTINCT f.customer_id) as unique_customers,
    COUNT(DISTINCT f.product_id) as unique_products_sold,
    SUM(f.sale_quantity) as total_items_sold,
    SUM(f.sale_total_price) as total_revenue,
    AVG(f.sale_total_price) as avg_check,
    SUM(f.product_reviews) as total_reviews,
    AVG(f.product_rating) as avg_rating
FROM facts f
JOIN stores s ON f.store_id = s.store_id
GROUP BY 
    s.store_id,
    s.store_name,
    s.store_city,
    s.store_country
"""

store_sales_df = spark.sql(store_sales_sql)

# DDL of the target table in ClickHouse.
create_store_sales_table = """
CREATE TABLE IF NOT EXISTS default.store_sales (
    store_id UInt32,
    store_name String,
    store_city String,
    store_country String,
    total_orders UInt32,
    unique_customers UInt32,
    unique_products_sold UInt32,
    total_items_sold UInt32,
    total_revenue Decimal64(2),
    avg_check Decimal64(2),
    total_reviews UInt32,
    avg_rating Decimal32(2)
) ENGINE = MergeTree()
ORDER BY store_id;
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_store_sales_table,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы store_sales: {response.text}")
else:
    print("Таблица store_sales успешно создана в ClickHouse")

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    store_sales_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="store_sales",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных в store_sales: {err!s}")
else:
    print("Данные успешно записаны в таблицу store_sales")

# Read the mart back from ClickHouse to show the row count and a sample.
check_store_sales_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="store_sales",
    properties=clickhouse_properties,
)

store_sales_rows = check_store_sales_df.count()
print("\nКоличество записей в витрине store_sales:", store_sales_rows)
print("\nПример данных из витрины store_sales:")
check_store_sales_df.show(5)

Таблица store_sales успешно создана в ClickHouse
Данные успешно записаны в таблицу store_sales

Количество записей в витрине store_sales: 383

Пример данных из витрины store_sales:
+--------+----------+----------+-------------+------------+----------------+--------------------+----------------+-------------+---------+-------------+----------+
|store_id|store_name|store_city|store_country|total_orders|unique_customers|unique_products_sold|total_items_sold|total_revenue|avg_check|total_reviews|avg_rating|
+--------+----------+----------+-------------+------------+----------------+--------------------+----------------+-------------+---------+-------------+----------+
|       0|     Ainyx| Norrtälje|       Canada|          18|              17|                  17|              86|      5640.82|   313.37|         8371|      2.74|
|       1|    Avamba|   Gambang|       Sweden|          29|              28|                  28|             168|      6716.17|   231.59|        16470|      2.74|

In [9]:
# Supplier sales data mart: one row per supplier with aggregates computed from
# the "facts" view joined to the "suppliers" view.

# Aggregation executed by Spark SQL. high_rated_ratio is the share of fact rows
# whose product_rating is at least 4.
supplier_sales_sql = """
SELECT 
    s.supplier_id,
    s.supplier_name,
    s.supplier_country,
    COUNT(DISTINCT f.product_id) as unique_products,
    COUNT(*) as total_orders,
    SUM(f.sale_quantity) as total_items_sold,
    SUM(f.sale_total_price) as total_revenue,
    AVG(f.unit_price) as avg_product_price,
    COUNT(DISTINCT f.customer_id) as unique_customers,
    SUM(f.product_reviews) as total_reviews,
    AVG(f.product_rating) as avg_product_rating,
    COALESCE(AVG(CASE WHEN f.product_rating >= 4 THEN 1 ELSE 0 END), 0) as high_rated_ratio
FROM facts f
JOIN suppliers s ON f.supplier_id = s.supplier_id
GROUP BY 
    s.supplier_id,
    s.supplier_name,
    s.supplier_country
"""

supplier_sales_df = spark.sql(supplier_sales_sql)

# DDL of the target table in ClickHouse.
create_supplier_sales_table = """
CREATE TABLE IF NOT EXISTS default.supplier_sales (
    supplier_id UInt32,
    supplier_name String,
    supplier_country String,
    unique_products UInt32,
    total_orders UInt32,
    total_items_sold UInt32,
    total_revenue Decimal64(2),
    avg_product_price Decimal64(2),
    unique_customers UInt32,
    total_reviews UInt32,
    avg_product_rating Decimal32(2),
    high_rated_ratio Decimal32(2)
) ENGINE = MergeTree()
ORDER BY supplier_id;
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_supplier_sales_table,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы supplier_sales: {response.text}")
else:
    print("Таблица supplier_sales успешно создана в ClickHouse")

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    supplier_sales_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="supplier_sales",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных в supplier_sales: {err!s}")
else:
    print("Данные успешно записаны в таблицу supplier_sales")

# Read the mart back from ClickHouse to show the row count and a sample.
check_supplier_sales_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="supplier_sales",
    properties=clickhouse_properties,
)

supplier_sales_rows = check_supplier_sales_df.count()
print("\nКоличество записей в витрине supplier_sales:", supplier_sales_rows)
print("\nПример данных из витрины supplier_sales:")
check_supplier_sales_df.show(5)

Таблица supplier_sales успешно создана в ClickHouse
Данные успешно записаны в таблицу supplier_sales

Количество записей в витрине supplier_sales: 383

Пример данных из витрины supplier_sales:
+-----------+-------------+----------------+---------------+------------+----------------+-------------+-----------------+----------------+-------------+------------------+----------------+
|supplier_id|supplier_name|supplier_country|unique_products|total_orders|total_items_sold|total_revenue|avg_product_price|unique_customers|total_reviews|avg_product_rating|high_rated_ratio|
+-----------+-------------+----------------+---------------+------------+----------------+-------------+-----------------+----------------+-------------+------------------+----------------+
|          0|        Abata|        Malaysia|             26|          26|             168|      6347.87|            47.25|              26|        13309|              3.15|            0.23|
|          1|        Abatz|          Russia|   

In [10]:
# Product quality data mart: one row per product with rating-oriented
# aggregates computed from the "facts" view joined to the "products" view.

# Aggregation executed by Spark SQL. high_rating_ratio / low_rating_ratio are
# the shares of fact rows with product_rating >= 4 and <= 2 respectively.
product_quality_sql = """
SELECT 
    p.product_id,
    p.product_name,
    p.product_category,
    p.product_brand,
    AVG(f.product_rating) as avg_rating,
    SUM(f.product_reviews) as total_reviews,
    COUNT(*) as total_orders,
    SUM(f.sale_quantity) as total_quantity_sold,
    SUM(f.sale_total_price) as total_revenue,
    COUNT(DISTINCT f.customer_id) as unique_customers,
    AVG(f.unit_price) as avg_price,
    COALESCE(AVG(CASE WHEN f.product_rating >= 4 THEN 1 ELSE 0 END), 0) as high_rating_ratio,
    COALESCE(AVG(CASE WHEN f.product_rating <= 2 THEN 1 ELSE 0 END), 0) as low_rating_ratio
FROM facts f
JOIN products p ON f.product_id = p.product_id
GROUP BY 
    p.product_id,
    p.product_name,
    p.product_category,
    p.product_brand
"""

product_quality_df = spark.sql(product_quality_sql)

# DDL of the target table in ClickHouse.
create_product_quality_table = """
CREATE TABLE IF NOT EXISTS default.product_quality (
    product_id UInt32,
    product_name String,
    product_category String,
    product_brand String,
    avg_rating Decimal32(2),
    total_reviews UInt32,
    total_orders UInt32,
    total_quantity_sold UInt32,
    total_revenue Decimal64(2),
    unique_customers UInt32,
    avg_price Decimal64(2),
    high_rating_ratio Decimal32(2),
    low_rating_ratio Decimal32(2)
) ENGINE = MergeTree()
ORDER BY product_id;
"""

# The DDL goes through the ClickHouse HTTP interface; only the status code is checked.
response = requests.post(
    'http://clickhouse-server:8123',
    params={'user': 'default', 'password': 'clickhouse'},
    data=create_product_quality_table,
)

if response.status_code != 200:
    print(f"Ошибка при создании таблицы product_quality: {response.text}")
else:
    print("Таблица product_quality успешно создана в ClickHouse")

# Append the result to the ClickHouse table; a failure is printed, not re-raised.
try:
    product_quality_df.write.jdbc(
        url=clickhouse_properties["url"],
        table="product_quality",
        mode="append",
        properties=clickhouse_properties,
    )
except Exception as err:
    print(f"Ошибка при записи данных в product_quality: {err!s}")
else:
    print("Данные успешно записаны в таблицу product_quality")

# Read the mart back from ClickHouse to show the row count and a sample.
check_product_quality_df = spark.read.jdbc(
    url=clickhouse_properties["url"],
    table="product_quality",
    properties=clickhouse_properties,
)

product_quality_rows = check_product_quality_df.count()
print("\nКоличество записей в витрине product_quality:", product_quality_rows)
print("\nПример данных из витрины product_quality:")
check_product_quality_df.show(5)

Таблица product_quality успешно создана в ClickHouse
Данные успешно записаны в таблицу product_quality

Количество записей в витрине product_quality: 1000

Пример данных из витрины product_quality:
+----------+------------+----------------+-------------+----------+-------------+------------+-------------------+-------------+----------------+---------+-----------------+----------------+
|product_id|product_name|product_category|product_brand|avg_rating|total_reviews|total_orders|total_quantity_sold|total_revenue|unique_customers|avg_price|high_rating_ratio|low_rating_ratio|
+----------+------------+----------------+-------------+----------+-------------+------------+-------------------+-------------+----------------+---------+-----------------+----------------+
|         1|    Dog Food|            Food|        Skajo|      3.03|         5771|          10|                 40|      2197.32|               1|    46.84|             0.20|            0.20|
|         2|     Cat Toy|            F

In [11]:
# Final sanity check: read every data mart back from ClickHouse over JDBC and
# print how many rows it holds.
mart_tables = (
    'product_sales',
    'customer_sales',
    'time_sales',
    'store_sales',
    'supplier_sales',
    'product_quality',
)

print("Проверка количества записей в витринах:")
for table in mart_tables:
    check_df = spark.read.jdbc(
        url=clickhouse_properties["url"],
        table=table,
        properties=clickhouse_properties,
    )
    mart_rows = check_df.count()
    print(f"{table}: {mart_rows} записей")

Проверка количества записей в витринах:
product_sales: 1000 записей
customer_sales: 1000 записей
time_sales: 364 записей
store_sales: 383 записей
supplier_sales: 383 записей
product_quality: 1000 записей


In [12]:
# Sample reports built on top of the data marts. Each query is sent to
# ClickHouse through the JDBC reader and the result is shown.

# 1. Ten best-selling products by quantity sold
top_products_sql = """
SELECT 
    product_name,
    product_category,
    total_quantity_sold,
    total_revenue,
    avg_rating
FROM product_sales
ORDER BY total_quantity_sold DESC
LIMIT 10
"""

# 2. Ten customers with the largest total spend
top_customers_sql = """
SELECT 
    customer_name,
    customer_country,
    total_spent,
    total_orders,
    avg_check
FROM customer_sales
ORDER BY total_spent DESC
LIMIT 10
"""

# 3. Monthly sales trends
monthly_trends_sql = """
SELECT 
    year,
    month,
    SUM(total_revenue) as monthly_revenue,
    AVG(avg_order_value) as avg_order_value,
    SUM(total_orders) as total_orders
FROM time_sales
GROUP BY year, month
ORDER BY year, month
"""

# 4. Five stores with the highest revenue
top_stores_sql = """
SELECT 
    store_name,
    store_city,
    store_country,
    total_revenue,
    unique_customers,
    avg_check
FROM store_sales
ORDER BY total_revenue DESC
LIMIT 5
"""

# 5. Five suppliers with the highest revenue
top_suppliers_sql = """
SELECT 
    supplier_name,
    supplier_country,
    total_revenue,
    avg_product_price,
    total_items_sold,
    avg_product_rating
FROM supplier_sales
ORDER BY total_revenue DESC
LIMIT 5
"""

# 6. Five highest-rated products among those with more than 1000 reviews
#    (this query returns only the top of the ranking, not the bottom)
product_ratings_sql = """
SELECT 
    product_name,
    product_category,
    avg_rating,
    total_reviews,
    total_quantity_sold,
    total_revenue
FROM product_quality
WHERE total_reviews > 1000  -- Фильтр для исключения продуктов с малым количеством отзывов
ORDER BY avg_rating DESC
LIMIT 5
"""

# (report title, SQL text) pairs in the order they are displayed.
report_queries = [
    ("Топ-10 продаваемых продуктов", top_products_sql),
    ("Топ-10 клиентов", top_customers_sql),
    ("Месячные тренды продаж", monthly_trends_sql),
    ("Топ-5 магазинов", top_stores_sql),
    ("Топ-5 поставщиков", top_suppliers_sql),
    ("Топ-5 продуктов по рейтингу", product_ratings_sql),
]

for query_name, query in report_queries:
    print(f"\n=== {query_name} ===")
    # The query is wrapped in parentheses and passed in place of a table name,
    # so it is used as a subquery in the statement sent over JDBC.
    subquery = f"({query})"
    result_df = spark.read.jdbc(
        url=clickhouse_properties["url"],
        table=subquery,
        properties=clickhouse_properties,
    )
    result_df.show()


=== Топ-10 продаваемых продуктов ===
+------------+----------------+-------------------+-------------+----------+
|product_name|product_category|total_quantity_sold|total_revenue|avg_rating|
+------------+----------------+-------------------+-------------+----------+
|     Cat Toy|            Food|                 84|      2116.63|      2.69|
|     Cat Toy|            Cage|                 84|      2791.52|      2.07|
|    Dog Food|            Food|                 80|      2964.14|      3.14|
|   Bird Cage|            Food|                 80|      2113.29|      2.80|
|   Bird Cage|            Food|                 78|      2809.37|      3.18|
|     Cat Toy|             Toy|                 77|      2621.96|      3.24|
|     Cat Toy|             Toy|                 77|      2185.49|      2.88|
|     Cat Toy|            Cage|                 77|      2325.62|      2.99|
|    Dog Food|            Food|                 77|      2691.13|      2.87|
|    Dog Food|             Toy|       