In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Object-store credentials are taken from the environment when provided.
# The fallbacks are the values this notebook previously hardcoded, so a fresh
# "Restart & Run All" keeps working; export MINIO_ACCESS_KEY / MINIO_SECRET_KEY
# in any shared environment instead of relying on them.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "brew")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "brew4321")

# Build (or reuse) the session used by every cell below.
spark = (
        SparkSession.builder
            .master("spark://spark1:7077")
            .appName("bronze_to_silver")
            ## Config Fields
            .config('spark.sql.debug.maxToStringFields', 5000)
            .config('spark.debug.maxToStringFields', 5000)
            ## Optimize
            # `delta.autoOptimize.*` are table-property names and have no effect
            # as session confs; these are the session-level switches.
            .config("spark.databricks.delta.optimizeWrite.enabled", "true")
            .config("spark.databricks.delta.autoCompact.enabled", "true")
            ## Delta Table
            .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4,io.delta:delta-spark_2.12:3.2.0")
            .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
            .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
            ## MinIO
            # Point S3A straight at the MinIO service rather than routing
            # requests through it as if it were an HTTP proxy.
            .config("spark.hadoop.fs.s3a.endpoint", "http://minio1:9000")
            .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
            .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
            .config("spark.hadoop.fs.s3a.path.style.access", "true")
            .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
            .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
            # Key was previously misspelled ("estabilish") and therefore ignored.
            .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000")
            ## Hive SQL
            .enableHiveSupport()
            .getOrCreate()
    )

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [4]:
# Location of the raw brewery files: bucket "bronze", prefix "breweries".
source_bucket, prefix_bucket = "bronze", "breweries"
source_path = "s3a://{}/{}/".format(source_bucket, prefix_bucket)

In [5]:
# Load the JSON files under the bronze prefix; no schema is supplied, so Spark infers it.
bronze_data = spark.read.json(source_path)

In [6]:
# Print the schema Spark inferred for the bronze JSON data.
bronze_data.printSchema()

root
 |-- _corrupt_record: string (nullable = true)
 |-- address_1: string (nullable = true)
 |-- address_2: string (nullable = true)
 |-- address_3: string (nullable = true)
 |-- brewery_type: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- id: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- name: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- state: string (nullable = true)
 |-- state_province: string (nullable = true)
 |-- street: string (nullable = true)
 |-- website_url: string (nullable = true)



In [7]:
# Show up to five rows whose `state` and `state_province` hold the same value.
display(
    bronze_data
    .filter(bronze_data["state"] == bronze_data["state_province"])
    .take(5)
)

[Row(_corrupt_record=None, address_1='1716 Topeka St', address_2=None, address_3=None, brewery_type='micro', city='Norman', country='United States', id='5128df48-79fc-4f0f-8b52-d06be54d0cec', latitude='35.25738891', longitude='-97.46818222', name='(405) Brewing Co', phone='4058160490', postal_code='73069-8224', state='Oklahoma', state_province='Oklahoma', street='1716 Topeka St', website_url='http://www.405brewing.com'),
 Row(_corrupt_record=None, address_1='407 Radam Ln Ste F200', address_2=None, address_3=None, brewery_type='micro', city='Austin', country='United States', id='9c5a66c8-cc13-416f-a5d9-0a769c87d318', latitude=None, longitude=None, name='(512) Brewing Co', phone='5129211545', postal_code='78745-1197', state='Texas', state_province='Texas', street='407 Radam Ln Ste F200', website_url='http://www.512brewing.com'),
 Row(_corrupt_record=None, address_1='8100 Washington Ave', address_2=None, address_3=None, brewery_type='micro', city='Mount Pleasant', country='United States',

In [8]:
# Destination of the Delta table: bucket "silver", prefix "breweries".
target_bucket, prefix_bucket = "silver", "breweries"
target_path = "s3a://" + "/".join((target_bucket, prefix_bucket)) + "/"

In [9]:
# The inferred schema includes `_corrupt_record`, the column Spark's JSON reader
# fills with the raw text of records it could not parse. Those rows would land
# in the silver table (with null partition values), so keep only parsed rows
# and drop the helper column. The column exists only when inference met a bad
# record, hence the guard.
if "_corrupt_record" in bronze_data.columns:
    silver_data = (
        bronze_data
        .where(bronze_data["_corrupt_record"].isNull())
        .drop("_corrupt_record")
    )
else:
    silver_data = bronze_data

# Replace the silver Delta table, partitioned by location.
(
    silver_data.write
    .mode('overwrite')
    .format('delta')
    .partitionBy("country", "state", "city")
    .save(target_path)
)

In [10]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()