In [9]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Spark configuration.
#
# The S3A (MinIO) endpoint and credentials are read from environment variables
# so that secrets do not have to be committed with the notebook. The fallback
# values are the previous hard-coded local-development settings, which keeps
# the notebook's behaviour unchanged when the variables are not set.
# NOTE(review): the fallback credentials are still visible in source; rotate
# them and remove the defaults once the environment variables are provisioned.
s3a_endpoint = os.environ.get("S3A_ENDPOINT", "http://minio:9000")
s3a_access_key = os.environ.get("S3A_ACCESS_KEY", "chapolin")
s3a_secret_key = os.environ.get("S3A_SECRET_KEY", "mudar@123")

conf = SparkConf()
conf.setAppName("Merge Delta Table Sample")

# S3A filesystem settings used for the s3a:// paths in this notebook.
conf.set("spark.hadoop.fs.s3a.endpoint", s3a_endpoint)
conf.set("spark.hadoop.fs.s3a.access.key", s3a_access_key)
conf.set("spark.hadoop.fs.s3a.secret.key", s3a_secret_key)
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

# Delta Lake SQL extension and catalog implementation.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore Thrift endpoint.
conf.set("hive.metastore.uris", "thrift://metastore:9083")

# Spark session initialisation (reuses an already running session if there is one).
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()



In [19]:
# Schema shared by both sample DataFrames: a nullable string column and a
# nullable integer column.
schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("product_name", StringType()),
            ("price", IntegerType()),
        )
    ]
)

# Sample rows for the initial load of the table.
data_insert = [("Product A", 100), ("Product B", 150), ("Product C", 200)]

# Sample rows for the merge: "Product A" carries a new price and
# "Product D" is not part of the initial load.
data_update = [("Product A", 777), ("Product D", 300)]

# Build the two DataFrames from the sample rows.
df_insert = spark.createDataFrame(data_insert, schema)
df_update = spark.createDataFrame(data_update, schema)



In [11]:
# Print the initial-load DataFrame as a text table for inspection.
df_insert.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product A|  100|
|   Product B|  150|
|   Product C|  200|
+------------+-----+



In [13]:
# Write the initial rows as a Delta table, replacing whatever is already
# stored at this path.
(
    df_insert.write
    .format("delta")
    .mode("overwrite")
    .save("s3a://bronze/delta_products")
)

In [14]:
# Read the Delta table back from storage and print its rows.
df_loaded = (
    spark.read
    .format("delta")
    .load("s3a://bronze/delta_products")
)
df_loaded.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product C|  200|
|   Product A|  100|
|   Product B|  150|
+------------+-----+



In [20]:
# Print the DataFrame holding the rows that will be merged into the table.
df_update.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product A|  777|
|   Product D|  300|
+------------+-----+



In [21]:
from delta.tables import DeltaTable

# Handle to the Delta table previously written at this path.
delta_table = DeltaTable.forPath(spark, "s3a://bronze/delta_products")

# Rows are matched between table ("target") and updates ("source") by product name.
merge_condition = "target.product_name = source.product_name"

# Matched rows only get their price replaced; unmatched source rows are
# inserted with both columns taken from the source.
matched_update = {"price": "source.price"}
not_matched_insert = {
    "product_name": "source.product_name",
    "price": "source.price",
}

(
    delta_table.alias("target")
    .merge(df_update.alias("source"), merge_condition)
    .whenMatchedUpdate(set=matched_update)
    .whenNotMatchedInsert(values=not_matched_insert)
    .execute()
)

In [22]:
# Reload the Delta table after the merge and print its rows to check the result.
df_loaded = (
    spark.read
    .format("delta")
    .load("s3a://bronze/delta_products")
)
df_loaded.show()

+------------+-----+
|product_name|price|
+------------+-----+
|   Product C|  200|
|   Product B|  150|
|   Product A|  777|
|   Product D|  300|
+------------+-----+

