In [96]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id, col, row_number, lit, count, desc, regexp_replace, sum, avg, year, month
from pyspark.sql.window import Window
from pyspark.sql.types import *


In [2]:
spark = SparkSession.builder \
    .appName("Spark SQL with PostgreSQL and ClickHouse") \
    .config("spark.jars", "postgresql-42.6.0.jar,clickhouse-jdbc-0.4.6.jar") \
    .getOrCreate()

In [3]:

schema = StructType([
    # Customer fields
    StructField("id", IntegerType(), nullable=False),
    StructField("customer_first_name", StringType()),
    StructField("customer_last_name", StringType()),
    StructField("customer_age", IntegerType()),
    StructField("customer_email", StringType()),
    StructField("customer_country", StringType()),
    StructField("customer_postal_code", StringType()),
    StructField("customer_pet_type", StringType()),
    StructField("customer_pet_name", StringType()),
    StructField("customer_pet_breed", StringType()),
    
    # Seller fields
    StructField("seller_first_name", StringType()),
    StructField("seller_last_name", StringType()),
    StructField("seller_email", StringType()),
    StructField("seller_country", StringType()),
    StructField("seller_postal_code", StringType()),
    
    # Product fields
    StructField("product_name", StringType()),
    StructField("product_category", StringType()),
    StructField("product_price", DecimalType(10, 2)),  # Will map to PostgreSQL money
    StructField("product_quantity", IntegerType()),
    
    # Sale fields
    StructField("sale_date", DateType()),
    StructField("sale_customer_id", IntegerType()),
    StructField("sale_seller_id", IntegerType()),
    StructField("sale_product_id", IntegerType()),
    StructField("sale_quantity", IntegerType()),
    StructField("sale_total_price", DecimalType(10, 2)),  # PostgreSQL money
    
    # Store fields
    StructField("store_name", StringType()),
    StructField("store_location", StringType()),
    StructField("store_city", StringType()),
    StructField("store_state", StringType()),
    StructField("store_country", StringType()),
    StructField("store_phone", StringType()),
    StructField("store_email", StringType()),
    
    # Additional product details
    StructField("pet_category", StringType()),
    StructField("product_weight", FloatType()),
    StructField("product_color", StringType()),
    StructField("product_size", StringType()),
    StructField("product_brand", StringType()),
    StructField("product_material", StringType()),
    StructField("product_description", StringType()),
    StructField("product_rating", FloatType()),
    StructField("product_reviews", IntegerType()),
    StructField("product_release_date", DateType()),
    StructField("product_expiry_date", DateType()),
    
    # Supplier fields
    StructField("supplier_name", StringType()),
    StructField("supplier_contact", StringType()),
    StructField("supplier_email", StringType()),
    StructField("supplier_phone", StringType()),
    StructField("supplier_address", StringType()),
    StructField("supplier_city", StringType()),
    StructField("supplier_country", StringType())
])

In [4]:
jdbc_url = "jdbc:postgresql://postgres:5432/spark_db"
properties = {
    "user": "spark_user",
    "password": "spark_password",
    "driver": "org.postgresql.Driver"
}

In [5]:
data = spark.read.jdbc(url=jdbc_url, table="mock_data", properties=properties)

In [6]:
data.schema


StructType([StructField('id', IntegerType(), True), StructField('customer_first_name', StringType(), True), StructField('customer_last_name', StringType(), True), StructField('customer_age', IntegerType(), True), StructField('customer_email', StringType(), True), StructField('customer_country', StringType(), True), StructField('customer_postal_code', StringType(), True), StructField('customer_pet_type', StringType(), True), StructField('customer_pet_name', StringType(), True), StructField('customer_pet_breed', StringType(), True), StructField('seller_first_name', StringType(), True), StructField('seller_last_name', StringType(), True), StructField('seller_email', StringType(), True), StructField('seller_country', StringType(), True), StructField('seller_postal_code', StringType(), True), StructField('product_name', StringType(), True), StructField('product_category', StringType(), True), StructField('product_price', StringType(), True), StructField('product_quantity', IntegerType(), Tr

In [7]:
customer = data.select(
    'customer_pet_type',
    'customer_pet_name',
    'customer_pet_breed',
    'customer_country',
    'customer_postal_code',
    'customer_first_name',
    'customer_last_name',
    'customer_age',
    'customer_email'
).distinct()

window = Window.orderBy('customer_first_name')
customer = customer.withColumn("id", row_number().over(window))
customer.show()

+-----------------+-----------------+------------------+----------------+--------------------+-------------------+------------------+------------+--------------------+---+
|customer_pet_type|customer_pet_name|customer_pet_breed|customer_country|customer_postal_code|customer_first_name|customer_last_name|customer_age|      customer_email| id|
+-----------------+-----------------+------------------+----------------+--------------------+-------------------+------------------+------------+--------------------+---+
|             bird|          Mitchel|           Siamese|     Philippines|                1106|              Aaren|            Eagers|          44| wjesson3r@patch.com|  1|
|              cat|            Danit|           Siamese|            Iraq|                NULL|              Aaren|      Chipperfield|          42|jgiffordhf@wordpr...|  2|
|             bird|        Konstanze|          Parakeet|           China|                NULL|              Aaren|           Adriano|       

In [8]:
seller = data.select(
    'seller_first_name',
    'seller_last_name',
    'seller_country',
    'seller_postal_code',
    'seller_email'
).distinct()

window = Window.orderBy('seller_first_name')
seller = seller.withColumn("id", row_number().over(window))
seller.show()

+-----------------+----------------+--------------------+------------------+--------------------+---+
|seller_first_name|seller_last_name|      seller_country|seller_postal_code|        seller_email| id|
+-----------------+----------------+--------------------+------------------+--------------------+---+
|           Aarika|        Brussell|           Indonesia|              NULL|abrussellad@time.com|  1|
|            Aaron|        Hemphrey|             Iceland|               225|ahemphreym0@goodr...|  2|
|            Aaron|         Sheerin|           Indonesia|              NULL|  asheerinqc@mit.edu|  3|
|               Ab|          Copsey|         Philippines|              4217|   acopseyns@mail.ru|  4|
|          Abagael|         Charley|       New Caledonia|             98828| acharleyh3@imdb.com|  5|
|          Abagael|       McCorkell|                Peru|              NULL|amccorkellq3@adob...|  6|
|          Abagael|        Michurin|               China|              NULL|amichu

In [9]:
store = data.select(
    'store_location',
    'store_city',
    'store_state',
    'store_country',
    'store_phone',
    'store_email',
    'store_name',
).distinct()

window = Window.orderBy('store_name')
store = store.withColumn("id", row_number().over(window))
store.show()

+--------------+--------------+-----------+--------------+------------+--------------------+----------+---+
|store_location|    store_city|store_state| store_country| store_phone|         store_email|store_name| id|
+--------------+--------------+-----------+--------------+------------+--------------------+----------+---+
|      Suite 60|        Kujung|       NULL|     Indonesia|159-258-9340|  lbeasley1v@mit.edu|     Abata|  1|
|     Room 1590|       Floirac|         97|         China|953-151-1747|rgowthorperg@apac...|     Abata|  2|
|   PO Box 7914|    Skellefteå|         AC|         China|591-818-2372|byourellme@youtub...|     Abata|  3|
|    14th Floor|      Norsborg|         AB|      Portugal|155-350-7386|shanshaw8h@sfgate...|     Abata|  4|
|       Apt 814|     Terpinnya|       NULL|     Indonesia|782-681-7074|apicfordkr@cbsnew...|     Abata|  5|
|     9th Floor|        Angers|         B5|Czech Republic|563-662-9741|gelementl6@soundc...|     Abata|  6|
|      Suite 88|       Houst

In [10]:
product = data.select(
    'product_weight',
    'product_color',
    'product_size',
    'product_material',
    'product_description',
    'product_rating',
    'product_reviews',
    'product_brand',
    'product_name',
    'product_category',
    'product_price',
    'product_quantity',
    'product_release_date',
    'product_expiry_date',
).distinct()

window = Window.orderBy('product_name')
product = product.withColumn("id", row_number().over(window))
product.show()

+--------------+-------------+------------+----------------+--------------------+--------------+---------------+-------------+------------+----------------+-------------+----------------+--------------------+-------------------+---+
|product_weight|product_color|product_size|product_material| product_description|product_rating|product_reviews|product_brand|product_name|product_category|product_price|product_quantity|product_release_date|product_expiry_date| id|
+--------------+-------------+------------+----------------+--------------------+--------------+---------------+-------------+------------+----------------+-------------+----------------+--------------------+-------------------+---+
|          49.6|         Pink|      Medium|          Rubber|Integer tincidunt...|           3.0|            740|     Innotype|   Bird Cage|             Toy|        91.56|              86|          2016-12-26|         2023-10-21|  1|
|          11.3|       Indigo|       Small|      Plexiglass|Aenean l

In [75]:
sale = data.select(
    'sale_quantity',
    'sale_total_price',
    'sale_date',
    'sale_customer_id',
    'sale_seller_id',
    'sale_product_id',
).distinct()

sale = sale.withColumn("sale_total_price", regexp_replace("sale_total_price", "\\$", "").cast("double"))


window = Window.orderBy('sale_date')
sale = sale.withColumn("id", row_number().over(window))
sale.show()

+-------------+----------------+----------+----------------+--------------+---------------+---+
|sale_quantity|sale_total_price| sale_date|sale_customer_id|sale_seller_id|sale_product_id| id|
+-------------+----------------+----------+----------------+--------------+---------------+---+
|            3|          482.62|2021-01-01|             274|           274|            274|  1|
|            9|          251.54|2021-01-01|             696|           696|            696|  2|
|            7|          132.83|2021-01-01|             679|           679|            679|  3|
|            9|          402.83|2021-01-01|             513|           513|            513|  4|
|            6|          160.78|2021-01-01|             603|           603|            603|  5|
|            5|           79.85|2021-01-01|             635|           635|            635|  6|
|            9|          241.93|2021-01-01|             205|           205|            205|  7|
|            5|          488.38|2021-01-

In [67]:
supplier = data.select(
    'supplier_name',
    'supplier_contact',
    'supplier_email',
    'supplier_phone',
    'supplier_address',
    'supplier_city',
    'supplier_country'
).distinct()

window = Window.orderBy('supplier_name')
supplier = supplier.withColumn("id", row_number().over(window))
supplier.show()

+-------------+------------------+--------------------+--------------+----------------+--------------+----------------+---+
|supplier_name|  supplier_contact|      supplier_email|supplier_phone|supplier_address| supplier_city|supplier_country| id|
+-------------+------------------+--------------------+--------------+----------------+--------------+----------------+---+
|        Abata|      Zed Maxweell|zmaxweellk1@stumb...|  973-673-7617|    PO Box 40605|     Hai Riêng|         Ukraine|  1|
|        Abata|   Farris Blanking|fblankingd8@googl...|  530-854-6864|        Suite 51|       Jindong|         Armenia|  2|
|        Abata|       Ryan Vorley|   rvorleycj@bbb.org|  532-682-8788|        Room 921|   Solidaridad|          Poland|  3|
|        Abata|     Vaughan Mogra|vmograf9@moonfrui...|  188-774-8493|         Apt 646|    Notre Dame|  Czech Republic|  4|
|        Abata|       Rex Wilshaw|   rwilshaw6w@ed.gov|  252-226-8879|        Suite 80|        Nianba|    Saudi Arabia|  5|
|       

In [18]:
fact = data.join(customer,
    on=[
        data.customer_pet_type == customer.customer_pet_type,
        data.customer_pet_name == customer.customer_pet_name,
        data.customer_pet_breed == customer.customer_pet_breed,
        data.customer_country == customer.customer_country,
        data.customer_postal_code == customer.customer_postal_code,
        data.customer_first_name == customer.customer_first_name,
        data.customer_last_name == customer.customer_last_name,
        data.customer_age == customer.customer_age,
        data.customer_email == customer.customer_email
    ],
    how = 'left'
).join(seller,
      on = [
        data.seller_first_name == seller.seller_first_name,
        data.seller_last_name == seller.seller_last_name,
        data.seller_country == seller.seller_country,
        data.seller_postal_code == seller.seller_postal_code,
        data.seller_email == seller.seller_email
      ],
    how = 'left'
).join(store,
      on = [
        data.store_location == store.store_location,
        data.store_city == store.store_city,
        data.store_state == store.store_state,
        data.store_country == store.store_country,
        data.store_phone == store.store_phone,
        data.store_email == store.store_email,
        data.store_name == store.store_name
      ],
    how = 'left'
).join(product,
      on = [
        data.product_weight == product.product_weight,
        data.product_color == product.product_color,
        data.product_size == product.product_size,
        data.product_material == product.product_material,
        data.product_description == product.product_description,
        data.product_rating == product.product_rating,
        data.product_reviews == product.product_reviews,
        data.product_brand == product.product_brand,
        data.product_name == product.product_name,
        data.product_category == product.product_category,
        data.product_quantity == product.product_quantity,
        data.product_release_date == product.product_release_date,
        data.product_reviews == product.product_reviews,
        data.product_expiry_date == product.product_expiry_date,
      ],
    how = 'left'
).join(sale,
      on = [
        data.sale_quantity == sale.sale_quantity,
        data.sale_total_price == sale.sale_total_price,
        data.sale_date == sale.sale_date,
        data.sale_customer_id == sale.sale_customer_id,
        data.sale_seller_id == sale.sale_seller_id,
        data.sale_product_id == sale.sale_product_id
      ],
    how = 'left'
).join(supplier,
      on = [
        data.supplier_name == supplier.supplier_name,
        data.supplier_contact == supplier.supplier_contact,
        data.supplier_email == supplier.supplier_email,
        data.supplier_phone == supplier.supplier_phone,
        data.supplier_address == supplier.supplier_address,
        data.supplier_city == supplier.supplier_city,
        data.supplier_country == supplier.supplier_country 
      ],
    how = 'left'
).select(
    data['id'],
    customer['id'].alias('customer_id'),
    seller['id'].alias('seller_id'),
    store['id'].alias('store_id'),
    product['id'].alias('product_id'),
    sale['id'].alias('sale_id'),
    supplier['id'].alias('supplier_id')
)
fact.show()


+---+-----------+---------+--------+----------+-------+-----------+
| id|customer_id|seller_id|store_id|product_id|sale_id|supplier_id|
+---+-----------+---------+--------+----------+-------+-----------+
|  8|       6229|     NULL|    9218|      3155|   1749|       7702|
| 12|       6451|     1165|    NULL|      2614|   5253|       7399|
|  5|       NULL|     NULL|    NULL|      2572|    957|       4209|
| 21|       NULL|     NULL|    NULL|       996|   7730|       4233|
| 11|       8827|     3014|    4489|      8142|   8368|       4447|
| 16|       7918|     NULL|    4386|      6916|   3531|       1599|
|  4|       8488|     NULL|    NULL|      9043|   6085|       3454|
| 10|       5132|     NULL|    NULL|      8163|    578|       4371|
|  2|       4284|     1508|    NULL|      3440|   8759|       4512|
|  6|       3675|     2817|    NULL|      2174|   9647|       3235|
| 14|       NULL|     NULL|    NULL|      2139|   3412|       5923|
| 13|       3008|     5791|     211|      7063| 

In [20]:
fact.count()

10000

In [21]:
customer.write.jdbc(url=jdbc_url, table="customer", properties=properties)
seller.write.jdbc(url=jdbc_url, table="seller", properties=properties)
store.write.jdbc(url=jdbc_url, table="store", properties=properties)
product.write.jdbc(url=jdbc_url, table="product", properties=properties)
sale.write.jdbc(url=jdbc_url, table="sale", properties=properties)
supplier.write.jdbc(url=jdbc_url, table="supplier", properties=properties)
fact.write.jdbc(url=jdbc_url, table="fact", properties=properties)

In [79]:
print(sale.schema)

StructType([StructField('sale_quantity', IntegerType(), True), StructField('sale_total_price', DoubleType(), True), StructField('sale_date', DateType(), True), StructField('sale_customer_id', IntegerType(), True), StructField('sale_seller_id', IntegerType(), True), StructField('sale_product_id', IntegerType(), True), StructField('id', IntegerType(), False)])


In [103]:
product_report = (
    fact.join(product, on = [fact.product_id == product.id]).join(sale, on = [fact.sale_id == sale.id])
    .groupBy("product_id")
    .agg(
        (count(col('product_id')) * sum(col('sale_quantity'))).alias("quantity sold"),
        sum(col("sale_total_price")),
        sum(col('product_reviews')),
        avg(col('product_rating'))
    ).orderBy(desc('quantity sold')).limit(10)
)
product_report.show()

+----------+-------------+---------------------+--------------------+-------------------+
|product_id|quantity sold|sum(sale_total_price)|sum(product_reviews)|avg(product_rating)|
+----------+-------------+---------------------+--------------------+-------------------+
|        83|           10|                49.88|                 730|  4.199999809265137|
|       134|           10|               327.41|                 642| 2.4000000953674316|
|        85|           10|               184.99|                 689|                3.5|
|        42|           10|                42.38|                 142|  4.599999904632568|
|        95|           10|               353.07|                 669|  3.299999952316284|
|        56|           10|               376.52|                 584|                1.5|
|       100|           10|                40.74|                 435| 2.9000000953674316|
|        68|           10|               341.06|                 740|  4.599999904632568|
|       10

In [119]:
customer_report = (
    fact.join(customer, fact.customer_id == customer.id)
    .join(sale, fact.sale_id == sale.id)
    .groupBy("customer_id", "customer_country")
    .agg(
        sum("sale_total_price").alias("total_spent"),
        (sum("sale_total_price") / count("sale_id")).alias("average_check")
    )
    .orderBy(desc("total_spent"))
)
customer_report.show()

+-----------+----------------+-----------+-------------+
|customer_id|customer_country|total_spent|average_check|
+-----------+----------------+-----------+-------------+
|       4363|        Portugal|      499.8|        499.8|
|       5786|          Poland|     499.73|       499.73|
|       4716|          Russia|     499.69|       499.69|
|       7202|          Mexico|     499.62|       499.62|
|       3242|     Philippines|     499.58|       499.58|
|        483|          Russia|     499.42|       499.42|
|       7124|          Russia|     499.29|       499.29|
|       6081|     Philippines|     499.24|       499.24|
|       3326|        Portugal|     499.21|       499.21|
|       5805|           Italy|      499.2|        499.2|
|       5964|          Poland|     499.09|       499.09|
|       7409|        Portugal|     498.86|       498.86|
|       4075|        Colombia|     498.65|       498.65|
|        316|           Japan|     498.56|       498.56|
|       6382|     Philippines| 

In [115]:
product_report = (
    fact.join(customer, on = [fact.customer_id == customer.id]).join(sale, on = [fact.sale_id == sale.id])
    .groupBy("sale")
    .agg(
        sum(col("sale_total_price")).alias('check'),
        avg(col("sale_total_price")),
    ).orderBy(desc('check')).limit(10)
)
product_report.show()

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `sale` cannot be resolved. Did you mean one of the following? [`id`, `id`, `id`, `sale_id`, `sale_date`].;
'Aggregate ['sale], ['sale, sum(sale_total_price#8202) AS check#17089, avg(sale_total_price#8202) AS avg(sale_total_price)#17091]
+- Join Inner, (sale_id#4370 = id#8210)
   :- Join Inner, (customer_id#4366 = id#110)
   :  :- Project [id#0, id#110 AS customer_id#4366, id#168 AS seller_id#4367, id#208 AS store_id#4368, id#265 AS product_id#4369, id#349 AS sale_id#4370, id#394 AS supplier_id#4371]
   :  :  +- Join LeftOuter, (((((((supplier_name#43 = supplier_name#4151) AND (supplier_contact#44 = supplier_contact#4152)) AND (supplier_email#45 = supplier_email#4153)) AND (supplier_phone#46 = supplier_phone#4154)) AND (supplier_address#47 = supplier_address#4155)) AND (supplier_city#48 = supplier_city#4156)) AND (supplier_country#49 = supplier_country#4157))
   :  :     :- Join LeftOuter, ((((((sale_quantity#23 = sale_quantity#3889) AND (sale_total_price#24 = sale_total_price#3890)) AND (sale_date#19 = sale_date#3885)) AND (sale_customer_id#20 = sale_customer_id#3886)) AND (sale_seller_id#21 = sale_seller_id#3887)) AND (sale_product_id#22 = sale_product_id#3888))
   :  :     :  :- Join LeftOuter, ((((((((((((((product_weight#33 = product_weight#3671) AND (product_color#34 = product_color#3672)) AND (product_size#35 = product_size#3673)) AND (product_material#37 = product_material#3675)) AND (product_description#38 = product_description#3676)) AND (product_rating#39 = product_rating#3677)) AND (product_reviews#40 = product_reviews#3678)) AND (product_brand#36 = product_brand#3674)) AND (product_name#15 = product_name#3653)) AND (product_category#16 = product_category#3654)) AND (product_quantity#18 = product_quantity#3656)) AND (product_release_date#41 = product_release_date#3679)) AND (product_reviews#40 = product_reviews#3678)) AND (product_expiry_date#42 = product_expiry_date#3680))
   :  :     :  :  :- Join LeftOuter, (((((((store_location#26 = store_location#3466) AND (store_city#27 = store_city#3467)) AND (store_state#28 = store_state#3468)) AND (store_country#29 = store_country#3469)) AND (store_phone#30 = store_phone#3470)) AND (store_email#31 = store_email#3471)) AND (store_name#25 = store_name#3465))
   :  :     :  :  :  :- Join LeftOuter, (((((seller_first_name#10 = seller_first_name#3268) AND (seller_last_name#11 = seller_last_name#3269)) AND (seller_country#13 = seller_country#3271)) AND (seller_postal_code#14 = seller_postal_code#3272)) AND (seller_email#12 = seller_email#3270))
   :  :     :  :  :  :  :- Join LeftOuter, (((((((((customer_pet_type#7 = customer_pet_type#3095) AND (customer_pet_name#8 = customer_pet_name#3096)) AND (customer_pet_breed#9 = customer_pet_breed#3097)) AND (customer_country#5 = customer_country#3093)) AND (customer_postal_code#6 = customer_postal_code#3094)) AND (customer_first_name#1 = customer_first_name#3089)) AND (customer_last_name#2 = customer_last_name#3090)) AND (customer_age#3 = customer_age#3091)) AND (customer_email#4 = customer_email#3092))
   :  :     :  :  :  :  :  :- Relation [id#0,customer_first_name#1,customer_last_name#2,customer_age#3,customer_email#4,customer_country#5,customer_postal_code#6,customer_pet_type#7,customer_pet_name#8,customer_pet_breed#9,seller_first_name#10,seller_last_name#11,seller_email#12,seller_country#13,seller_postal_code#14,product_name#15,product_category#16,product_price#17,product_quantity#18,sale_date#19,sale_customer_id#20,sale_seller_id#21,sale_product_id#22,sale_quantity#23,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     :  :  :  :  :  +- Project [customer_pet_type#3095, customer_pet_name#3096, customer_pet_breed#3097, customer_country#3093, customer_postal_code#3094, customer_first_name#3089, customer_last_name#3090, customer_age#3091, customer_email#3092, id#110]
   :  :     :  :  :  :  :     +- Project [customer_pet_type#3095, customer_pet_name#3096, customer_pet_breed#3097, customer_country#3093, customer_postal_code#3094, customer_first_name#3089, customer_last_name#3090, customer_age#3091, customer_email#3092, id#110, id#110]
   :  :     :  :  :  :  :        +- Window [row_number() windowspecdefinition(customer_first_name#3089 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#110], [customer_first_name#3089 ASC NULLS FIRST]
   :  :     :  :  :  :  :           +- Project [customer_pet_type#3095, customer_pet_name#3096, customer_pet_breed#3097, customer_country#3093, customer_postal_code#3094, customer_first_name#3089, customer_last_name#3090, customer_age#3091, customer_email#3092]
   :  :     :  :  :  :  :              +- Deduplicate [customer_pet_type#3095, customer_email#3092, customer_postal_code#3094, customer_last_name#3090, customer_first_name#3089, customer_country#3093, customer_pet_breed#3097, customer_pet_name#3096, customer_age#3091]
   :  :     :  :  :  :  :                 +- Project [customer_pet_type#3095, customer_pet_name#3096, customer_pet_breed#3097, customer_country#3093, customer_postal_code#3094, customer_first_name#3089, customer_last_name#3090, customer_age#3091, customer_email#3092]
   :  :     :  :  :  :  :                    +- Relation [id#3088,customer_first_name#3089,customer_last_name#3090,customer_age#3091,customer_email#3092,customer_country#3093,customer_postal_code#3094,customer_pet_type#3095,customer_pet_name#3096,customer_pet_breed#3097,seller_first_name#3098,seller_last_name#3099,seller_email#3100,seller_country#3101,seller_postal_code#3102,product_name#3103,product_category#3104,product_price#3105,product_quantity#3106,sale_date#3107,sale_customer_id#3108,sale_seller_id#3109,sale_product_id#3110,sale_quantity#3111,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     :  :  :  :  +- Project [seller_first_name#3268, seller_last_name#3269, seller_country#3271, seller_postal_code#3272, seller_email#3270, id#168]
   :  :     :  :  :  :     +- Project [seller_first_name#3268, seller_last_name#3269, seller_country#3271, seller_postal_code#3272, seller_email#3270, id#168, id#168]
   :  :     :  :  :  :        +- Window [row_number() windowspecdefinition(seller_first_name#3268 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#168], [seller_first_name#3268 ASC NULLS FIRST]
   :  :     :  :  :  :           +- Project [seller_first_name#3268, seller_last_name#3269, seller_country#3271, seller_postal_code#3272, seller_email#3270]
   :  :     :  :  :  :              +- Deduplicate [seller_first_name#3268, seller_postal_code#3272, seller_country#3271, seller_last_name#3269, seller_email#3270]
   :  :     :  :  :  :                 +- Project [seller_first_name#3268, seller_last_name#3269, seller_country#3271, seller_postal_code#3272, seller_email#3270]
   :  :     :  :  :  :                    +- Relation [id#3258,customer_first_name#3259,customer_last_name#3260,customer_age#3261,customer_email#3262,customer_country#3263,customer_postal_code#3264,customer_pet_type#3265,customer_pet_name#3266,customer_pet_breed#3267,seller_first_name#3268,seller_last_name#3269,seller_email#3270,seller_country#3271,seller_postal_code#3272,product_name#3273,product_category#3274,product_price#3275,product_quantity#3276,sale_date#3277,sale_customer_id#3278,sale_seller_id#3279,sale_product_id#3280,sale_quantity#3281,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     :  :  :  +- Project [store_location#3466, store_city#3467, store_state#3468, store_country#3469, store_phone#3470, store_email#3471, store_name#3465, id#208]
   :  :     :  :  :     +- Project [store_location#3466, store_city#3467, store_state#3468, store_country#3469, store_phone#3470, store_email#3471, store_name#3465, id#208, id#208]
   :  :     :  :  :        +- Window [row_number() windowspecdefinition(store_name#3465 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#208], [store_name#3465 ASC NULLS FIRST]
   :  :     :  :  :           +- Project [store_location#3466, store_city#3467, store_state#3468, store_country#3469, store_phone#3470, store_email#3471, store_name#3465]
   :  :     :  :  :              +- Deduplicate [store_location#3466, store_name#3465, store_city#3467, store_phone#3470, store_country#3469, store_email#3471, store_state#3468]
   :  :     :  :  :                 +- Project [store_location#3466, store_city#3467, store_state#3468, store_country#3469, store_phone#3470, store_email#3471, store_name#3465]
   :  :     :  :  :                    +- Relation [id#3440,customer_first_name#3441,customer_last_name#3442,customer_age#3443,customer_email#3444,customer_country#3445,customer_postal_code#3446,customer_pet_type#3447,customer_pet_name#3448,customer_pet_breed#3449,seller_first_name#3450,seller_last_name#3451,seller_email#3452,seller_country#3453,seller_postal_code#3454,product_name#3455,product_category#3456,product_price#3457,product_quantity#3458,sale_date#3459,sale_customer_id#3460,sale_seller_id#3461,sale_product_id#3462,sale_quantity#3463,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     :  :  +- Project [product_weight#3671, product_color#3672, product_size#3673, product_material#3675, product_description#3676, product_rating#3677, product_reviews#3678, product_brand#3674, product_name#3653, product_category#3654, product_price#3655, product_quantity#3656, product_release_date#3679, product_expiry_date#3680, id#265]
   :  :     :  :     +- Project [product_weight#3671, product_color#3672, product_size#3673, product_material#3675, product_description#3676, product_rating#3677, product_reviews#3678, product_brand#3674, product_name#3653, product_category#3654, product_price#3655, product_quantity#3656, product_release_date#3679, product_expiry_date#3680, id#265, id#265]
   :  :     :  :        +- Window [row_number() windowspecdefinition(product_name#3653 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#265], [product_name#3653 ASC NULLS FIRST]
   :  :     :  :           +- Project [product_weight#3671, product_color#3672, product_size#3673, product_material#3675, product_description#3676, product_rating#3677, product_reviews#3678, product_brand#3674, product_name#3653, product_category#3654, product_price#3655, product_quantity#3656, product_release_date#3679, product_expiry_date#3680]
   :  :     :  :              +- Deduplicate [product_price#3655, product_description#3676, product_size#3673, product_reviews#3678, product_material#3675, product_quantity#3656, product_release_date#3679, product_name#3653, product_rating#3677, product_weight#3671, product_category#3654, product_color#3672, product_brand#3674, product_expiry_date#3680]
   :  :     :  :                 +- Project [product_weight#3671, product_color#3672, product_size#3673, product_material#3675, product_description#3676, product_rating#3677, product_reviews#3678, product_brand#3674, product_name#3653, product_category#3654, product_price#3655, product_quantity#3656, product_release_date#3679, product_expiry_date#3680]
   :  :     :  :                    +- Relation [id#3638,customer_first_name#3639,customer_last_name#3640,customer_age#3641,customer_email#3642,customer_country#3643,customer_postal_code#3644,customer_pet_type#3645,customer_pet_name#3646,customer_pet_breed#3647,seller_first_name#3648,seller_last_name#3649,seller_email#3650,seller_country#3651,seller_postal_code#3652,product_name#3653,product_category#3654,product_price#3655,product_quantity#3656,sale_date#3657,sale_customer_id#3658,sale_seller_id#3659,sale_product_id#3660,sale_quantity#3661,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     :  +- Project [sale_quantity#3889, sale_total_price#3890, sale_date#3885, sale_customer_id#3886, sale_seller_id#3887, sale_product_id#3888, id#349]
   :  :     :     +- Project [sale_quantity#3889, sale_total_price#3890, sale_date#3885, sale_customer_id#3886, sale_seller_id#3887, sale_product_id#3888, id#349, id#349]
   :  :     :        +- Window [row_number() windowspecdefinition(sale_date#3885 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#349], [sale_date#3885 ASC NULLS FIRST]
   :  :     :           +- Project [sale_quantity#3889, sale_total_price#3890, sale_date#3885, sale_customer_id#3886, sale_seller_id#3887, sale_product_id#3888]
   :  :     :              +- Deduplicate [sale_customer_id#3886, sale_date#3885, sale_total_price#3890, sale_quantity#3889, sale_product_id#3888, sale_seller_id#3887]
   :  :     :                 +- Project [sale_quantity#3889, sale_total_price#3890, sale_date#3885, sale_customer_id#3886, sale_seller_id#3887, sale_product_id#3888]
   :  :     :                    +- Relation [id#3866,customer_first_name#3867,customer_last_name#3868,customer_age#3869,customer_email#3870,customer_country#3871,customer_postal_code#3872,customer_pet_type#3873,customer_pet_name#3874,customer_pet_breed#3875,seller_first_name#3876,seller_last_name#3877,seller_email#3878,seller_country#3879,seller_postal_code#3880,product_name#3881,product_category#3882,product_price#3883,product_quantity#3884,sale_date#3885,sale_customer_id#3886,sale_seller_id#3887,sale_product_id#3888,sale_quantity#3889,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  :     +- Project [supplier_name#4151, supplier_contact#4152, supplier_email#4153, supplier_phone#4154, supplier_address#4155, supplier_city#4156, supplier_country#4157, id#394]
   :  :        +- Project [supplier_name#4151, supplier_contact#4152, supplier_email#4153, supplier_phone#4154, supplier_address#4155, supplier_city#4156, supplier_country#4157, id#394, id#394]
   :  :           +- Window [row_number() windowspecdefinition(supplier_name#4151 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#394], [supplier_name#4151 ASC NULLS FIRST]
   :  :              +- Project [supplier_name#4151, supplier_contact#4152, supplier_email#4153, supplier_phone#4154, supplier_address#4155, supplier_city#4156, supplier_country#4157]
   :  :                 +- Deduplicate [supplier_country#4157, supplier_phone#4154, supplier_address#4155, supplier_name#4151, supplier_contact#4152, supplier_city#4156, supplier_email#4153]
   :  :                    +- Project [supplier_name#4151, supplier_contact#4152, supplier_email#4153, supplier_phone#4154, supplier_address#4155, supplier_city#4156, supplier_country#4157]
   :  :                       +- Relation [id#4108,customer_first_name#4109,customer_last_name#4110,customer_age#4111,customer_email#4112,customer_country#4113,customer_postal_code#4114,customer_pet_type#4115,customer_pet_name#4116,customer_pet_breed#4117,seller_first_name#4118,seller_last_name#4119,seller_email#4120,seller_country#4121,seller_postal_code#4122,product_name#4123,product_category#4124,product_price#4125,product_quantity#4126,sale_date#4127,sale_customer_id#4128,sale_seller_id#4129,sale_product_id#4130,sale_quantity#4131,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   :  +- Project [customer_pet_type#16889, customer_pet_name#16890, customer_pet_breed#16891, customer_country#16887, customer_postal_code#16888, customer_first_name#16883, customer_last_name#16884, customer_age#16885, customer_email#16886, id#110]
   :     +- Project [customer_pet_type#16889, customer_pet_name#16890, customer_pet_breed#16891, customer_country#16887, customer_postal_code#16888, customer_first_name#16883, customer_last_name#16884, customer_age#16885, customer_email#16886, id#110, id#110]
   :        +- Window [row_number() windowspecdefinition(customer_first_name#16883 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#110], [customer_first_name#16883 ASC NULLS FIRST]
   :           +- Project [customer_pet_type#16889, customer_pet_name#16890, customer_pet_breed#16891, customer_country#16887, customer_postal_code#16888, customer_first_name#16883, customer_last_name#16884, customer_age#16885, customer_email#16886]
   :              +- Deduplicate [customer_pet_type#16889, customer_email#16886, customer_postal_code#16888, customer_last_name#16884, customer_first_name#16883, customer_country#16887, customer_pet_breed#16891, customer_pet_name#16890, customer_age#16885]
   :                 +- Project [customer_pet_type#16889, customer_pet_name#16890, customer_pet_breed#16891, customer_country#16887, customer_postal_code#16888, customer_first_name#16883, customer_last_name#16884, customer_age#16885, customer_email#16886]
   :                    +- Relation [id#16882,customer_first_name#16883,customer_last_name#16884,customer_age#16885,customer_email#16886,customer_country#16887,customer_postal_code#16888,customer_pet_type#16889,customer_pet_name#16890,customer_pet_breed#16891,seller_first_name#16892,seller_last_name#16893,seller_email#16894,seller_country#16895,seller_postal_code#16896,product_name#16897,product_category#16898,product_price#16899,product_quantity#16900,sale_date#16901,sale_customer_id#16902,sale_seller_id#16903,sale_product_id#16904,sale_quantity#16905,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]
   +- Project [sale_quantity#16989, sale_total_price#8202, sale_date#16985, sale_customer_id#16986, sale_seller_id#16987, sale_product_id#16988, id#8210]
      +- Project [sale_quantity#16989, sale_total_price#8202, sale_date#16985, sale_customer_id#16986, sale_seller_id#16987, sale_product_id#16988, id#8210, id#8210]
         +- Window [row_number() windowspecdefinition(sale_date#16985 ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS id#8210], [sale_date#16985 ASC NULLS FIRST]
            +- Project [sale_quantity#16989, sale_total_price#8202, sale_date#16985, sale_customer_id#16986, sale_seller_id#16987, sale_product_id#16988]
               +- Project [sale_quantity#16989, cast(regexp_replace(sale_total_price#16990, \$, , 1) as double) AS sale_total_price#8202, sale_date#16985, sale_customer_id#16986, sale_seller_id#16987, sale_product_id#16988]
                  +- Deduplicate [sale_customer_id#16986, sale_date#16985, sale_total_price#16990, sale_quantity#16989, sale_product_id#16988, sale_seller_id#16987]
                     +- Project [sale_quantity#16989, sale_total_price#16990, sale_date#16985, sale_customer_id#16986, sale_seller_id#16987, sale_product_id#16988]
                        +- Relation [id#16966,customer_first_name#16967,customer_last_name#16968,customer_age#16969,customer_email#16970,customer_country#16971,customer_postal_code#16972,customer_pet_type#16973,customer_pet_name#16974,customer_pet_breed#16975,seller_first_name#16976,seller_last_name#16977,seller_email#16978,seller_country#16979,seller_postal_code#16980,product_name#16981,product_category#16982,product_price#16983,product_quantity#16984,sale_date#16985,sale_customer_id#16986,sale_seller_id#16987,sale_product_id#16988,sale_quantity#16989,... 26 more fields] JDBCRelation(mock_data) [numPartitions=1]


In [114]:
ch_jdbc_url = "jdbc:clickhouse://clickhouse:8123/default"
properties = {
    "driver": "com.clickhouse.jdbc.ClickHouseDriver",
    "user": "custom_user",
    "password": "custom_password"
}


In [14]:
# Останавливаем SparkSession
spark.stop()