In [1]:
import os

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Connection settings.
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the fallbacks are the values
# this notebook previously hard-coded, so it keeps running unchanged when the
# variables are not set.
# NOTE(review): the fallback credentials look like local sandbox values --
# confirm, and do not reuse them outside a local environment.
S3A_ENDPOINT = os.environ.get("S3A_ENDPOINT", "http://minio:9000")
S3A_ACCESS_KEY = os.environ.get("S3A_ACCESS_KEY", "chapolin")
S3A_SECRET_KEY = os.environ.get("S3A_SECRET_KEY", "mudar@123")
HIVE_METASTORE_URIS = os.environ.get("HIVE_METASTORE_URIS", "thrift://metastore:9083")

# Spark configuration
conf = SparkConf()
conf.setAppName("Schema Evolution Sample")

# S3A filesystem settings (object store endpoint and credentials).
conf.set("spark.hadoop.fs.s3a.endpoint", S3A_ENDPOINT)
conf.set("spark.hadoop.fs.s3a.access.key", S3A_ACCESS_KEY)
conf.set("spark.hadoop.fs.s3a.secret.key", S3A_SECRET_KEY)
conf.set("spark.hadoop.fs.s3a.path.style.access", "true")  # passed as a string
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")

# Delta Lake SQL extension and catalog.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Hive metastore location.
conf.set("hive.metastore.uris", HIVE_METASTORE_URIS)

# Spark session initialisation
spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()


In [2]:
# Seed rows for the initial customer table: (name, age, email).
data = [
    ("John", 25, "john@example.com"),
    ("Alice", 30, "alice@example.com"),
]

# Explicit three-column schema; every column is declared nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("email", StringType()),
    )
])

df = spark.createDataFrame(data, schema)
df.show()


+-----+---+-----------------+
| name|age|            email|
+-----+---+-----------------+
| John| 25| john@example.com|
|Alice| 30|alice@example.com|
+-----+---+-----------------+



In [3]:
# Append the initial rows to the Delta table stored at the bronze path.
(
    df.write
    .format("delta")
    .mode("append")
    .save('s3a://bronze/delta_customer')
)

In [6]:
# Read the Delta table back and display it.
# DataFrame.show() prints the rows and returns None, so the DataFrame is bound
# to delta_df first and shown in a separate call; chaining .show() onto the
# assignment would leave delta_df set to None.
delta_df = spark.read.format("delta").load('s3a://bronze/delta_customer')
delta_df.show()

+-----+---+-----------------+
| name|age|            email|
+-----+---+-----------------+
|Alice| 30|alice@example.com|
| John| 25| john@example.com|
+-----+---+-----------------+



In [8]:
# New rows carrying an additional column: (name, age, email, city).
new_data = [
    ("Mike", 28, "mike@example.com", "New York"),
    ("Emily", 35, "emily@example.com", "San Francisco"),
]

# Same columns as the initial table plus the newly added "city" column;
# every column is declared nullable.
new_schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("name", StringType()),
        ("age", IntegerType()),
        ("email", StringType()),
        ("city", StringType()),  # newly added column
    )
])

new_df = spark.createDataFrame(new_data, new_schema)
new_df.show()

# Append the new rows to the existing Delta table; the mergeSchema option is
# set so the write can add the extra column to the table schema.
(
    new_df.write
    .format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save('s3a://bronze/delta_customer')
)

+-----+---+-----------------+-------------+
| name|age|            email|         city|
+-----+---+-----------------+-------------+
| Mike| 28| mike@example.com|     New York|
|Emily| 35|emily@example.com|San Francisco|
+-----+---+-----------------+-------------+



In [9]:
# Read the Delta table back after the schema-merging append and display it.
# DataFrame.show() prints the rows and returns None, so the DataFrame is bound
# to delta_df first and shown in a separate call; chaining .show() onto the
# assignment would leave delta_df set to None.
delta_df = spark.read.format("delta").load('s3a://bronze/delta_customer')
delta_df.show()

+-----+---+-----------------+-------------+
| name|age|            email|         city|
+-----+---+-----------------+-------------+
|Emily| 35|emily@example.com|San Francisco|
| Mike| 28| mike@example.com|     New York|
|Alice| 30|alice@example.com|         null|
| John| 25| john@example.com|         null|
+-----+---+-----------------+-------------+

