In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

In [3]:
# Create (or reuse) the Spark session used by every cell below.
# Adaptive query execution and partition coalescing are switched on explicitly.
# NOTE(review): the Neo4j bolt URL/user/password are hard-coded here; consider
# reading them from environment variables before sharing this notebook.
spark = (
    SparkSession.builder
    .appName("Neo4j Data Loading")
    .config("spark.neo4j.bolt.url", "bolt://neo4j:7687")
    .config("spark.neo4j.bolt.user", "neo4j")
    .config("spark.neo4j.bolt.password", "password")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/23 13:49:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/23 13:49:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# PostgreSQL connection settings shared by every JDBC read below.
pg_url = "jdbc:postgresql://postgres:5432/bober_db"
pg_properties = {
    "user": "bober",
    "password": "bober",
    "driver": "org.postgresql.Driver",
}


def _read_pg_table(table_name):
    """Return the PostgreSQL table ``table_name`` as a Spark DataFrame via JDBC."""
    return spark.read.jdbc(url=pg_url, table=table_name, properties=pg_properties)


# Raw source table.
df = _read_pg_table("mock_data")

# Load every table of the star schema: the fact table and its dimensions.
fact = _read_pg_table("fact_sales")
dim_product = _read_pg_table("dim_product")
dim_customer = _read_pg_table("dim_customer")
dim_store = _read_pg_table("dim_store")
dim_supplier = _read_pg_table("dim_supplier")
dim_date = _read_pg_table("dim_date")

# Peek at one row to confirm the date dimension was read.
dim_date.head(1)

[Row(full_date=datetime.date(2021, 1, 1), date_id=1, year=2021, month=1, day=1, quarter=1)]

In [8]:
def convert_decimal_to_double(df):
    """Cast every DecimalType column of ``df`` to DoubleType for Neo4j compatibility.

    All casts are applied in one ``select`` projection instead of one
    ``withColumn`` call per column, so the logical plan gains a single
    projection no matter how many columns are converted.

    Args:
        df: Spark DataFrame whose schema is inspected.

    Returns:
        ``df`` itself when it has no DecimalType columns; otherwise a new
        DataFrame with the same column names, in the same order, where each
        DecimalType column has been cast to DoubleType.
    """
    fields = df.schema.fields
    decimal_names = {
        field.name for field in fields if isinstance(field.dataType, DecimalType)
    }
    if not decimal_names:
        # Nothing to cast: hand back the input untouched.
        return df

    projection = []
    for field in fields:
        # Backtick-quote the name so a column literally called "a.b" is read
        # as a top-level column rather than as field "b" of struct "a".
        quoted = "`{}`".format(field.name.replace("`", "``"))
        if field.name in decimal_names:
            projection.append(col(quoted).cast(DoubleType()).alias(field.name))
        else:
            projection.append(col(quoted))
    return df.select(projection)

def convert_numeric_types(df):
    """Normalise the numeric column types of ``df`` for Neo4j compatibility.

    Conversions applied:
        * DecimalType         -> DoubleType
        * ByteType, ShortType -> IntegerType

    Every other type (LongType included) is passed through unchanged. All
    casts are applied in one ``select`` projection instead of one
    ``withColumn`` call per column, so the logical plan gains a single
    projection no matter how many columns are converted.

    Args:
        df: Spark DataFrame whose schema is inspected.

    Returns:
        ``df`` itself when no column needs converting; otherwise a new
        DataFrame with the same column names, in the same order, and the
        casts above applied.
    """
    def _target_type(data_type):
        """Return the type ``data_type`` should be cast to, or None to keep it."""
        if isinstance(data_type, DecimalType):
            return DoubleType()
        if isinstance(data_type, (ByteType, ShortType)):
            return IntegerType()
        return None

    projection = []
    needs_cast = False
    for field in df.schema.fields:
        # Backtick-quote the name so a column literally called "a.b" is read
        # as a top-level column rather than as field "b" of struct "a".
        quoted = "`{}`".format(field.name.replace("`", "``"))
        target = _target_type(field.dataType)
        if target is None:
            projection.append(col(quoted))
        else:
            projection.append(col(quoted).cast(target).alias(field.name))
            needs_cast = True

    # Skip the extra projection entirely when every column is already fine.
    return df.select(projection) if needs_cast else df

# 1. Sales mart by product
def create_product_sales_mart():
    """Build the product sales mart: the 10 products with the highest quantity sold.

    Joins the module-level ``fact`` and ``dim_product`` DataFrames on
    ``product_id``, aggregates per (product_id, name, category), sorts by
    ``total_quantity`` descending and keeps the first 10 rows.

    ``sum``/``avg``/``desc`` are the Spark column functions brought in by the
    star import of ``pyspark.sql.functions``, not the Python builtins.

    NOTE(review): if ``rating`` and ``reviews`` are per-product attributes,
    ``sum("reviews")`` counts them once per joined sale row — confirm that
    this is the intended meaning of ``total_reviews``.

    Returns:
        DataFrame with columns product_id, name, category, total_quantity,
        total_revenue, avg_rating and total_reviews, passed through
        ``convert_numeric_types``.
    """
    sales_with_product = fact.join(dim_product, "product_id")
    per_product = sales_with_product.groupBy("product_id", "name", "category").agg(
        sum("sale_quantity").alias("total_quantity"),
        sum("sale_total_price").alias("total_revenue"),
        avg("rating").alias("avg_rating"),
        sum("reviews").alias("total_reviews"),
    )
    top_sellers = per_product.orderBy(desc("total_quantity")).limit(10)
    return convert_numeric_types(top_sellers)

# 2. Sales mart by customer
def create_customer_sales_mart():
    """Build the customer sales mart: spend and order statistics per customer.

    Joins the module-level ``fact`` and ``dim_customer`` DataFrames on
    ``customer_id``, aggregates per (customer_id, first_name, last_name,
    country) and sorts by ``total_spent`` descending.

    Returns:
        DataFrame with columns customer_id, first_name, last_name, country,
        total_spent, total_orders and avg_order_value, passed through
        ``convert_numeric_types``.
    """
    sales_with_customer = fact.join(dim_customer, "customer_id")
    per_customer = sales_with_customer.groupBy(
        "customer_id", "first_name", "last_name", "country"
    ).agg(
        sum("sale_total_price").alias("total_spent"),
        count("sale_id").alias("total_orders"),
        avg("sale_total_price").alias("avg_order_value"),
    )
    ranked = per_customer.orderBy(desc("total_spent"))
    return convert_numeric_types(ranked)

# 3. Sales mart by time
def create_time_sales_mart():
    """Build the time sales mart: revenue and order statistics per (year, month).

    Joins the module-level ``fact`` and ``dim_date`` DataFrames on
    ``date_id``, aggregates per (year, month) and sorts chronologically.

    Returns:
        DataFrame with columns year, month, monthly_revenue, monthly_orders
        and avg_order_size, passed through ``convert_numeric_types``.
    """
    sales_with_date = fact.join(dim_date, "date_id")
    per_month = sales_with_date.groupBy("year", "month").agg(
        sum("sale_total_price").alias("monthly_revenue"),
        count("sale_id").alias("monthly_orders"),
        avg("sale_total_price").alias("avg_order_size"),
    )
    chronological = per_month.orderBy("year", "month")
    return convert_numeric_types(chronological)

# 4. Sales mart by store
def create_store_sales_mart():
    """Build the store sales mart: revenue and sale statistics per store.

    Joins the module-level ``fact`` and ``dim_store`` DataFrames on
    ``store_id``, aggregates per (store_id, store_name, store_city,
    store_country) and sorts by ``total_revenue`` descending.

    Returns:
        DataFrame with columns store_id, store_name, store_city,
        store_country, total_revenue, total_sales and avg_sale_value, passed
        through ``convert_numeric_types``.
    """
    sales_with_store = fact.join(dim_store, "store_id")
    per_store = sales_with_store.groupBy(
        "store_id", "store_name", "store_city", "store_country"
    ).agg(
        sum("sale_total_price").alias("total_revenue"),
        count("sale_id").alias("total_sales"),
        avg("sale_total_price").alias("avg_sale_value"),
    )
    ranked = per_store.orderBy(desc("total_revenue"))
    return convert_numeric_types(ranked)

# 5. Sales mart by supplier
def create_supplier_sales_mart():
    """Build the supplier sales mart: revenue, sales count and average price per supplier.

    Joins the module-level ``fact`` DataFrame with ``dim_supplier`` (on
    ``supplier_id``) and then ``dim_product`` (on ``product_id``), aggregates
    per (supplier_id, supplier_name, supplier_country) and sorts by
    ``total_revenue`` descending.

    Returns:
        DataFrame with columns supplier_id, supplier_name, supplier_country,
        total_revenue, avg_product_price and total_sales, passed through
        ``convert_numeric_types``.
    """
    sales_with_supplier = fact.join(dim_supplier, "supplier_id")
    sales_enriched = sales_with_supplier.join(dim_product, "product_id")
    per_supplier = sales_enriched.groupBy(
        "supplier_id", "supplier_name", "supplier_country"
    ).agg(
        sum("sale_total_price").alias("total_revenue"),
        avg("price").alias("avg_product_price"),
        count("sale_id").alias("total_sales"),
    )
    ranked = per_supplier.orderBy(desc("total_revenue"))
    return convert_numeric_types(ranked)

# 6. Product quality mart
def create_product_quality_mart():
    """Build the product quality mart: rating and reviews next to sales totals.

    Left-joins the module-level ``dim_product`` DataFrame to ``fact`` on
    ``product_id``, so products without any matching sale row are kept.
    Aggregates per (product_id, name, category, rating, reviews), drops rows
    whose ``rating`` is null and sorts by ``rating`` descending.

    Returns:
        DataFrame with columns product_id, name, category, rating, reviews,
        total_sold and total_revenue, passed through
        ``convert_numeric_types``.
    """
    products_with_sales = dim_product.join(fact, "product_id", "left")
    per_product = products_with_sales.groupBy(
        "product_id", "name", "category", "rating", "reviews"
    ).agg(
        sum("sale_quantity").alias("total_sold"),
        sum("sale_total_price").alias("total_revenue"),
    )
    rated_only = per_product.filter(col("rating").isNotNull())
    best_first = rated_only.orderBy(desc("rating"))
    return convert_numeric_types(best_first)

# Main process for loading the data into Neo4j
def load_all_marts_to_neo4j():
    """Write the star-schema tables and the six sales marts to Neo4j as nodes.

    Steps, in order:
        1. Run ``convert_numeric_types`` on the five dimension tables and the
           fact table (module-level DataFrames).
        2. Write each converted table as nodes labelled Product, Customer,
           Store, Supplier, Date and Sale, keyed on the table's id column.
        3. Build each mart with its ``create_*_mart`` function immediately
           before writing it, labelled ProductSalesMart, CustomerSalesMart,
           TimeSalesMart, StoreSalesMart, SupplierSalesMart and
           ProductQualityMart.

    A progress line is printed before and after every write. Nothing is
    returned; any exception raised by a write propagates and stops the load.

    NOTE(review): the Neo4j URL and credentials are hard-coded below;
    consider sourcing them from configuration or the environment.
    """
    def _write_nodes(nodes_df, label, node_keys):
        """Save ``nodes_df`` to Neo4j as ``label`` nodes keyed on ``node_keys``."""
        (
            nodes_df.write
            .format("org.neo4j.spark.DataSource")
            .mode("Overwrite")
            .option("url", "bolt://neo4j:7687")
            .option("authentication.basic.username", "neo4j")
            .option("authentication.basic.password", "password")
            .option("labels", label)
            .option("node.keys", node_keys)
            .save()
        )

    print("–ù–∞—á–∞–ª–æ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞–Ω–Ω—ã—Ö –≤ Neo4j...")

    # 1. Core entities. All tables are converted up front, before the first
    #    write. Each job: (start message, DataFrame, label, node keys, done message).
    entity_jobs = [
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–æ–¥—É–∫—Ç–æ–≤...", convert_numeric_types(dim_product),
         "Product", "product_id", "‚úÖ –ü—Ä–æ–¥—É–∫—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –∫–ª–∏–µ–Ω—Ç–æ–≤...", convert_numeric_types(dim_customer),
         "Customer", "customer_id", "‚úÖ –ö–ª–∏–µ–Ω—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –º–∞–≥–∞–∑–∏–Ω–æ–≤...", convert_numeric_types(dim_store),
         "Store", "store_id", "‚úÖ –ú–∞–≥–∞–∑–∏–Ω—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤...", convert_numeric_types(dim_supplier),
         "Supplier", "supplier_id", "‚úÖ –ü–æ—Å—Ç–∞–≤—â–∏–∫–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç...", convert_numeric_types(dim_date),
         "Date", "date_id", "‚úÖ –î–∞—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–æ–¥–∞–∂...", convert_numeric_types(fact),
         "Sale", "sale_id", "‚úÖ –ü—Ä–æ–¥–∞–∂–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã"),
    ]
    for start_msg, table_df, label, node_keys, done_msg in entity_jobs:
        print(start_msg)
        _write_nodes(table_df, label, node_keys)
        print(done_msg)

    # 2. Marts as separate nodes. The builder is stored uncalled so each mart
    #    is only built right before it is written, after its start message.
    mart_jobs = [
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –ø—Ä–æ–¥—É–∫—Ç–æ–≤...", create_product_sales_mart,
         "ProductSalesMart", "product_id", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –∫–ª–∏–µ–Ω—Ç–æ–≤...", create_customer_sales_mart,
         "CustomerSalesMart", "customer_id", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –∫–ª–∏–µ–Ω—Ç–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –≤—Ä–µ–º–µ–Ω–∏...", create_time_sales_mart,
         "TimeSalesMart", "year,month", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –≤—Ä–µ–º–µ–Ω–∏ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –º–∞–≥–∞–∑–∏–Ω–æ–≤...", create_store_sales_mart,
         "StoreSalesMart", "store_id", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –º–∞–≥–∞–∑–∏–Ω–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤...", create_supplier_sales_mart,
         "SupplierSalesMart", "supplier_id", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
        ("–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –∫–∞—á–µ—Å—Ç–≤–∞...", create_product_quality_mart,
         "ProductQualityMart", "product_id", "‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –∫–∞—á–µ—Å—Ç–≤–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞"),
    ]
    for start_msg, build_mart, label, node_keys, done_msg in mart_jobs:
        print(start_msg)
        _write_nodes(build_mart(), label, node_keys)
        print(done_msg)

    print("üéâ –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ Neo4j –∑–∞–≤–µ—Ä—à–µ–Ω–∞!")


# Run the full load: writes the star-schema tables and then the marts to Neo4j,
# printing a progress line before and after each write.
load_all_marts_to_neo4j()

–ù–∞—á–∞–ª–æ –∑–∞–≥—Ä—É–∑–∫–∏ –¥–∞–Ω–Ω—ã—Ö –≤ Neo4j...
–ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–æ–¥—É–∫—Ç–æ–≤...


                                                                                

‚úÖ –ü—Ä–æ–¥—É–∫—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –∫–ª–∏–µ–Ω—Ç–æ–≤...


                                                                                

‚úÖ –ö–ª–∏–µ–Ω—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –º–∞–≥–∞–∑–∏–Ω–æ–≤...


                                                                                

‚úÖ –ú–∞–≥–∞–∑–∏–Ω—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤...


                                                                                

‚úÖ –ü–æ—Å—Ç–∞–≤—â–∏–∫–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞—Ç...
‚úÖ –î–∞—Ç—ã –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –ø—Ä–æ–¥–∞–∂...


                                                                                

‚úÖ –ü—Ä–æ–¥–∞–∂–∏ –∑–∞–≥—Ä—É–∂–µ–Ω—ã
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –ø—Ä–æ–¥—É–∫—Ç–æ–≤...
‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –ø—Ä–æ–¥—É–∫—Ç–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –∫–ª–∏–µ–Ω—Ç–æ–≤...


                                                                                

‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –∫–ª–∏–µ–Ω—Ç–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –≤—Ä–µ–º–µ–Ω–∏...
‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –≤—Ä–µ–º–µ–Ω–∏ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –º–∞–≥–∞–∑–∏–Ω–æ–≤...
‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –º–∞–≥–∞–∑–∏–Ω–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤...


                                                                                

‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –ø–æ—Å—Ç–∞–≤—â–∏–∫–æ–≤ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
–ó–∞–≥—Ä—É–∑–∫–∞ –≤–∏—Ç—Ä–∏–Ω—ã –∫–∞—á–µ—Å—Ç–≤–∞...


                                                                                

‚úÖ –í–∏—Ç—Ä–∏–Ω–∞ –∫–∞—á–µ—Å—Ç–≤–∞ –∑–∞–≥—Ä—É–∂–µ–Ω–∞
üéâ –ó–∞–≥—Ä—É–∑–∫–∞ –¥–∞–Ω–Ω—ã—Ö –≤ Neo4j –∑–∞–≤–µ—Ä—à–µ–Ω–∞!
