In [3]:
from pyspark.sql import SparkSession

import os

# Spark session for the ETL job. The PostgreSQL JDBC driver is pulled in through
# spark.jars.packages so that spark.read.jdbc / DataFrame.write.jdbc can reach the database.
spark = (
    SparkSession.builder
    .appName("ETL to Star Schema")
    .config("spark.jars.packages", "org.postgresql:postgresql:42.2.27")
    .getOrCreate()
)

# JDBC parameters.
# Connection settings can be overridden through environment variables so that real
# credentials do not have to be committed with the notebook. The fallbacks are the
# values this notebook has always used, so an unconfigured environment behaves exactly
# as before.
# NOTE(review): the fallback password is still visible in source; drop the default once
# every environment that runs this notebook sets ETL_DB_PASSWORD.
url = os.environ.get("ETL_JDBC_URL", "jdbc:postgresql://postgres:5432/lab2BDA")
properties = {
    "user": os.environ.get("ETL_DB_USER", "zloyaloha"),
    "password": os.environ.get("ETL_DB_PASSWORD", "12341234"),
    "driver": "org.postgresql.Driver"
}

In [4]:
# Source table read over JDBC; the dimension and fact frames in the following cells
# are all projections of this DataFrame.
raw_df = spark.read.jdbc(url, "mock_data", properties=properties)

In [5]:
from pyspark.sql.functions import col

# Customer dimension: source column in mock_data -> column name in d_customer.
# customer_id is taken from the raw row id.
customer_columns = {
    "id": "customer_id",
    "customer_first_name": "first_name",
    "customer_last_name": "last_name",
    "customer_age": "age",
    "customer_email": "email",
    "customer_country": "country",
    "customer_postal_code": "postal_code",
}

customers_df = raw_df.select(
    [col(source).alias(target) for source, target in customer_columns.items()]
).dropDuplicates()

# Replace the contents of d_customer with the freshly projected rows.
customers_df.write.mode("overwrite").jdbc(
    url=url, table="d_customer", properties=properties
)

In [6]:
from pyspark.sql.functions import col

# Pet dimension: source column in mock_data -> column name in d_pet.
# Each pet row is keyed by customer_id, which is the raw row id.
pet_columns = {
    "id": "customer_id",
    "customer_pet_type": "pet_type",
    "customer_pet_name": "pet_name",
    "customer_pet_breed": "pet_breed",
}

pet_df = raw_df.select(
    [col(source).alias(target) for source, target in pet_columns.items()]
).dropDuplicates()

# Replace the contents of d_pet with the freshly projected rows.
pet_df.write.mode("overwrite").jdbc(
    url=url, table="d_pet", properties=properties
)

In [7]:
from pyspark.sql.functions import col, to_date

# Product dimension. Plain columns are renamed as-is; the two date columns are parsed
# from "M/d/yyyy" strings into DATE values.
product_plain_columns = {
    "id": "product_id",
    "product_name": "name",
    "product_category": "category",
    "product_price": "price",
    "product_quantity": "quantity",
    "product_weight": "weight",
    "product_color": "color",
    "product_size": "size",
    "product_brand": "brand",
    "product_material": "material",
    "product_description": "description",
}
product_date_columns = {
    "product_release_date": "release_date",
    "product_expiry_date": "expiry_date",
}

product_df = raw_df.select(
    *[col(source).alias(target) for source, target in product_plain_columns.items()],
    *[
        to_date(col(source), "M/d/yyyy").alias(target)
        for source, target in product_date_columns.items()
    ],
).dropDuplicates()

# Replace the contents of d_product with the freshly projected rows.
product_df.write.mode("overwrite").jdbc(
    url=url, table="d_product", properties=properties
)

In [8]:
from pyspark.sql.functions import col, to_date

# This cell previously rebuilt, column for column, the same projection that the
# preceding cell stores in product_df, and then overwrote d_product a second time with
# the same rows. Reuse the frame that has already been built and written instead of
# repeating the projection and the JDBC write; products_df stays defined for any later
# cell that refers to the plural name.
products_df = product_df

In [9]:
# Seller dimension: source column in mock_data -> column name in d_seller.
# seller_id is taken from the raw row id.
seller_columns = {
    "id": "seller_id",
    "seller_first_name": "first_name",
    "seller_last_name": "last_name",
    "seller_email": "email",
    "seller_country": "country",
    "seller_postal_code": "postal_code",
}

sellers_df = raw_df.select(
    [col(source).alias(target) for source, target in seller_columns.items()]
).dropDuplicates()

# Replace the contents of d_seller with the freshly projected rows.
sellers_df.write.mode("overwrite").jdbc(
    url=url, table="d_seller", properties=properties
)

In [10]:
# Store dimension: source column in mock_data -> column name in d_store.
# store_id is taken from the raw row id; name and email are reused later as the
# lookup key when the fact table is built.
store_columns = {
    "id": "store_id",
    "store_name": "name",
    "store_location": "location",
    "store_city": "city",
    "store_state": "state",
    "store_country": "country",
    "store_phone": "phone",
    "store_email": "email",
}

stores_df = raw_df.select(
    [col(source).alias(target) for source, target in store_columns.items()]
).dropDuplicates()

# Replace the contents of d_store with the freshly projected rows.
stores_df.write.mode("overwrite").jdbc(
    url=url, table="d_store", properties=properties
)

In [11]:
# Supplier dimension: source column in mock_data -> column name in d_supplier.
# supplier_id is taken from the raw row id; name, email and contact are reused later
# as the lookup key when the fact table is built.
supplier_columns = {
    "id": "supplier_id",
    "supplier_name": "name",
    "supplier_contact": "contact",
    "supplier_email": "email",
    "supplier_phone": "phone",
    "supplier_address": "address",
    "supplier_city": "city",
    "supplier_country": "country",
}

suppliers_df = raw_df.select(
    [col(source).alias(target) for source, target in supplier_columns.items()]
).dropDuplicates()

# Replace the contents of d_supplier with the freshly projected rows.
suppliers_df.write.mode("overwrite").jdbc(
    url=url, table="d_supplier", properties=properties
)

In [14]:
from pyspark.sql.functions import col, min as min_, to_date

# Fact table f_sale.
# Sale attributes come straight from mock_data; store_id and supplier_id are looked up
# from the store / supplier dimensions by their natural keys.
# NOTE(review): customer_id, seller_id and product_id are taken from the sale_* columns,
# while the dimension tables key their rows by the raw row id - confirm the two agree.
base_df = raw_df.select(
    col("id").alias("sale_id"),
    to_date(col("sale_date"), "M/d/yyyy").alias("sale_date"),
    col("sale_customer_id").alias("customer_id"),
    col("sale_seller_id").alias("seller_id"),
    col("sale_product_id").alias("product_id"),
    col("sale_quantity").alias("quantity"),
    col("sale_total_price").alias("total_price"),
    col("store_name"),
    col("store_email"),
    col("supplier_name"),
    col("supplier_email"),
    col("supplier_contact")
)

# stores_df keeps the raw row id as store_id, so the same (name, email) pair can appear
# on several rows. Joining against it directly would emit one output row per matching
# store row and multiply the sale. Collapse the lookup to a single, deterministic key
# (the smallest store_id) per natural key so each sale matches at most one store.
store_lookup = (
    stores_df.select(
        col("store_id"),
        col("name").alias("store_name_ref"),
        col("email").alias("store_email_ref")
    )
    .groupBy("store_name_ref", "store_email_ref")
    .agg(min_("store_id").alias("store_id"))
)

# Left join: a sale whose store name or email is NULL (or unmatched) keeps a NULL store_id.
stores_sale = base_df.join(
    store_lookup,
    (base_df["store_name"] == col("store_name_ref")) &
    (base_df["store_email"] == col("store_email_ref")),
    how="left"
)

# Same reasoning for suppliers: one supplier_id per (name, email, contact).
supplier_lookup = (
    suppliers_df.select(
        col("supplier_id"),
        col("name").alias("supplier_name_ref"),
        col("email").alias("supplier_email_ref"),
        col("contact").alias("supplier_contact_ref")
    )
    .groupBy("supplier_name_ref", "supplier_email_ref", "supplier_contact_ref")
    .agg(min_("supplier_id").alias("supplier_id"))
)

stores_sale_supplier = stores_sale.join(
    supplier_lookup,
    (stores_sale["supplier_name"] == col("supplier_name_ref")) &
    (stores_sale["supplier_email"] == col("supplier_email_ref")) &
    (stores_sale["supplier_contact"] == col("supplier_contact_ref")),
    how="left"
)

# Keep only the fact columns; the natural-key helper columns are dropped here.
sales_df = stores_sale_supplier.select(
    "sale_id", "sale_date", "customer_id", "seller_id", "product_id",
    "quantity", "total_price", "store_id", "supplier_id"
).dropDuplicates()

# Replace the contents of f_sale with the freshly built rows.
sales_df.write.jdbc(
    url=url,
    table="f_sale",
    mode="overwrite",
    properties=properties
)

# Product rating table: one row per (product_id, rating, reviews) combination, skipping
# rows where any of the three source values is NULL.
# The filter runs before the projection so that it refers to columns that actually exist
# at that point; filtering after the select would reference names (id, product_rating,
# product_reviews) that the projection has already renamed away.
product_rating_df = raw_df.where(
    (col("id").isNotNull()) &
    (col("product_rating").isNotNull()) &
    (col("product_reviews").isNotNull())
).select(
    col("id").alias("product_id"),
    col("product_rating").alias("rating"),
    col("product_reviews").alias("reviews")
).dropDuplicates()

# Replace the contents of d_product_rating with the freshly projected rows.
product_rating_df.write.jdbc(
    url=url,
    table="d_product_rating",
    mode="overwrite",
    properties=properties
)
