In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

Подключение к PostgreSQL и проверка данных таблицы `mock_data` (исходный файл — `mock_data.csv`).

In [2]:
# Create (or reuse) the SparkSession on the cluster at spark-master:7077.
# NOTE(review): no JDBC driver jar is configured here, so the PostgreSQL driver
# is presumably already on the cluster classpath — confirm.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ETL to Star")
    .getOrCreate()
)

# Connection settings may be overridden through environment variables so that
# real credentials never have to be stored in the notebook. The fallbacks are
# the values this notebook has always used, so the default behaviour is unchanged.
pg_url = os.environ.get("PG_URL", "jdbc:postgresql://postgres:5432/bober_db")
pg_properties = {
    "user": os.environ.get("PG_USER", "bober"),
    "password": os.environ.get("PG_PASSWORD", "bober"),
    "driver": "org.postgresql.Driver",
}

# Read the raw source table from PostgreSQL.
df = spark.read.jdbc(url=pg_url, table="mock_data", properties=pg_properties)

# Sanity check: fetch a single row to confirm the read works.
df.head(1)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/23 12:56:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/23 12:56:19 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[Row(id=1, customer_first_name='Barron', customer_last_name='Rawlyns', customer_age=61, customer_email='bmassingham0@army.mil', customer_country='China', customer_postal_code=None, customer_pet_type='cat', customer_pet_name='Priscella', customer_pet_breed='Labrador Retriever', seller_first_name='Bevan', seller_last_name='Massingham', seller_email='bmassingham0@answers.com', seller_country='Indonesia', seller_postal_code=None, product_name='Dog Food', product_category='Food', product_price=Decimal('77.97'), product_quantity=89, sale_date=datetime.date(2021, 5, 14), sale_customer_id=1, sale_seller_id=1, sale_product_id=1, sale_quantity=4, sale_total_price=Decimal('487.70'), store_name='Youopia', store_location='Suite 75', store_city='Xichehe', store_state=None, store_country='United States', store_phone='564-244-8660', store_email='bmassingham0@networkadvertising.org', pet_category='Cats', product_weight=Decimal('13.40'), product_color='Indigo', product_size='Medium', product_brand='Skaj

Создание модели данных «снежинка». Разбиваем данные на `измерения` и `факты`.

In [None]:
# Date dimension: one row per distinct, non-null sale date. The surrogate key
# date_id follows chronological order; the remaining columns are calendar
# parts derived from the date itself.
date_order = Window.orderBy("full_date")

dim_date = (
    df.select(col("sale_date").alias("full_date"))
    .distinct()
    .filter(col("full_date").isNotNull())
    .select(
        "full_date",
        row_number().over(date_order).alias("date_id"),
        year("full_date").alias("year"),
        month("full_date").alias("month"),
        dayofmonth("full_date").alias("day"),
        quarter("full_date").alias("quarter"),
    )
)

# Replace the dim_date table in PostgreSQL with the freshly built dimension.
dim_date.write.mode("overwrite").jdbc(url=pg_url, table="dim_date", properties=pg_properties)

# Customer dimension: source column -> dimension column. The key comes from
# sale_customer_id; rows are de-duplicated over the full set of attributes.
customer_columns = {
    "sale_customer_id": "customer_id",
    "customer_first_name": "first_name",
    "customer_last_name": "last_name",
    "customer_age": "age",
    "customer_email": "email",
    "customer_country": "country",
    "customer_postal_code": "postal_code",
}

dim_customer = df.select(
    [col(source).alias(target) for source, target in customer_columns.items()]
).distinct()

# Replace the dim_customer table in PostgreSQL.
dim_customer.write.mode("overwrite").jdbc(url=pg_url, table="dim_customer", properties=pg_properties)

# Seller dimension: source column -> dimension column. The key comes from
# sale_seller_id; rows are de-duplicated over the full set of attributes.
seller_columns = {
    "sale_seller_id": "seller_id",
    "seller_first_name": "first_name",
    "seller_last_name": "last_name",
    "seller_email": "email",
    "seller_country": "country",
    "seller_postal_code": "postal_code",
}

dim_seller = df.select(
    [col(source).alias(target) for source, target in seller_columns.items()]
).distinct()

# Replace the dim_seller table in PostgreSQL.
dim_seller.write.mode("overwrite").jdbc(url=pg_url, table="dim_seller", properties=pg_properties)

# Product dimension: source column -> dimension column. The key comes from
# sale_product_id; rows are de-duplicated over the full set of attributes.
product_columns = {
    "sale_product_id": "product_id",
    "product_name": "name",
    "product_category": "category",
    "product_price": "price",
    "product_weight": "weight",
    "product_color": "color",
    "product_size": "size",
    "product_brand": "brand",
    "product_material": "material",
    "product_description": "description",
    "product_rating": "rating",
    "product_reviews": "reviews",
    "product_release_date": "release_date",
    "product_expiry_date": "expiry_date",
}

dim_product = df.select(
    [col(source).alias(target) for source, target in product_columns.items()]
).distinct()

# Replace the dim_product table in PostgreSQL.
dim_product.write.mode("overwrite").jdbc(url=pg_url, table="dim_product", properties=pg_properties)

# Store dimension: one row per distinct combination of store attributes, plus
# a generated surrogate key store_id.
dim_store_raw = df.select(
    "store_name", "store_location", "store_city",
    "store_state", "store_country", "store_phone", "store_email"
).distinct()

# Order by EVERY column of the distinct rows (the original three sort keys stay
# first). With only a partial ordering, rows that tie on name/city/country get
# their row_number assigned arbitrarily. Because DataFrames are lazy, dim_store
# is re-evaluated each time it is used (for the JDBC write and again in any
# later join), so ids could differ between evaluations. A total order makes the
# ids reproducible.
store_window = Window.orderBy(
    "store_name", "store_city", "store_country",
    "store_location", "store_state", "store_phone", "store_email",
)
dim_store = dim_store_raw.withColumn("store_id", row_number().over(store_window))

# Replace the dim_store table in PostgreSQL.
dim_store.write.jdbc(url=pg_url, table="dim_store", mode="overwrite", properties=pg_properties)

# Supplier dimension: one row per distinct combination of supplier attributes,
# plus a generated surrogate key supplier_id.
#
# The window orders by EVERY column of the distinct rows (the original three
# sort keys stay first; note that supplier_contact is renamed to "contact" by
# the select below). With only a partial ordering, tied rows get arbitrary
# row numbers, and since DataFrames are lazy the ids could differ between the
# JDBC write and any later re-use of dim_supplier. A total order makes the ids
# reproducible.
supplier_window = Window.orderBy(
    "supplier_name", "supplier_city", "supplier_country",
    "contact", "supplier_email", "supplier_phone", "supplier_address",
)
dim_supplier = (
    df.select(
        "supplier_name",
        col("supplier_contact").alias("contact"),
        "supplier_email",
        "supplier_phone",
        "supplier_address",
        "supplier_city",
        "supplier_country",
    )
    .distinct()
    .withColumn("supplier_id", row_number().over(supplier_window))
)

# Replace the dim_supplier table in PostgreSQL.
dim_supplier.write.jdbc(url=pg_url, table="dim_supplier", mode="overwrite", properties=pg_properties)

# Pet dimension: one row per distinct (customer, pet attributes) combination,
# plus a generated surrogate key pet_id. customer_id links the pet to its owner.
dim_pet_raw = df.select(
    col("sale_customer_id").alias("customer_id"),
    col("customer_pet_type").alias("pet_type"),
    col("customer_pet_name").alias("pet_name"),
    col("customer_pet_breed").alias("pet_breed"),
    "pet_category",
).distinct()

# The window is defined AFTER select + alias so it can use the new column names.
# It orders by EVERY column of the distinct rows (the original three sort keys
# stay first): with a partial ordering, tied rows get arbitrary row numbers, and
# since DataFrames are lazy the ids could differ between the JDBC write and any
# later re-use of dim_pet. A total order makes the ids reproducible.
pet_window = Window.orderBy(
    "customer_id", "pet_name", "pet_type", "pet_breed", "pet_category"
)
dim_pet = dim_pet_raw.withColumn("pet_id", row_number().over(pet_window))

# Replace the dim_pet table in PostgreSQL.
dim_pet.write.jdbc(url=pg_url, table="dim_pet", mode="overwrite", properties=pg_properties)

def _null_safe_on(left_alias, right_alias, column_pairs):
    """Build null-safe equality join conditions between two aliased DataFrames.

    Args:
        left_alias: alias of the left DataFrame (as passed to ``DataFrame.alias``).
        right_alias: alias of the right DataFrame.
        column_pairs: iterable of ``(left_column, right_column)`` name pairs.

    Returns:
        A list of Column conditions; ``DataFrame.join`` ANDs a list of Columns.
        ``eqNullSafe`` treats NULL as equal to NULL, whereas ``==`` yields NULL
        (no match) as soon as either side is NULL.
    """
    return [
        col(f"{left_alias}.{left_col}").eqNullSafe(col(f"{right_alias}.{right_col}"))
        for left_col, right_col in column_pairs
    ]


# Each dimension is joined on the FULL set of columns it was de-duplicated on,
# so a sale matches exactly one dimension row. Joining on a subset could match
# several rows and duplicate the sale.
# Pairs are (column in the source data, column in the dimension).
store_keys = [
    (name, name)
    for name in (
        "store_name", "store_location", "store_city",
        "store_state", "store_country", "store_phone", "store_email",
    )
]
supplier_keys = [
    ("supplier_name", "supplier_name"),
    ("supplier_contact", "contact"),
    ("supplier_email", "supplier_email"),
    ("supplier_phone", "supplier_phone"),
    ("supplier_address", "supplier_address"),
    ("supplier_city", "supplier_city"),
    ("supplier_country", "supplier_country"),
]
pet_keys = [
    ("sale_customer_id", "customer_id"),
    ("customer_pet_type", "pet_type"),
    ("customer_pet_name", "pet_name"),
    ("customer_pet_breed", "pet_breed"),
    ("pet_category", "pet_category"),
]

# The dimensions are derived from df itself, so both sides are aliased and the
# join columns are referenced by qualified name to keep the two sides apart.
# Left joins keep every sale even when a dimension row is missing; sales with a
# NULL sale_date get a NULL date_id because dim_date excludes NULL dates.
fact_sales = (
    df.alias("sale")
    .join(dim_date.alias("dt"), col("sale.sale_date") == col("dt.full_date"), "left")
    .join(dim_store.alias("store"), _null_safe_on("sale", "store", store_keys), "left")
    .join(dim_supplier.alias("supplier"), _null_safe_on("sale", "supplier", supplier_keys), "left")
    .join(dim_pet.alias("pet"), _null_safe_on("sale", "pet", pet_keys), "left")
    .select(
        col("sale.id").alias("sale_id"),
        col("sale.sale_customer_id").alias("customer_id"),
        col("pet_id"),
        col("sale.sale_seller_id").alias("seller_id"),
        col("sale.sale_product_id").alias("product_id"),
        col("store_id"),
        col("supplier_id"),
        col("date_id"),
        col("sale.sale_quantity").alias("sale_quantity"),
        col("sale.sale_total_price").alias("sale_total_price"),
    )
)

# Replace the fact_sales table in PostgreSQL.
fact_sales.write.jdbc(url=pg_url, table="fact_sales", mode="overwrite", properties=pg_properties)
