In [None]:
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, BooleanType, DateType
from pyspark.sql.types import StructType, StringType, BinaryType, IntegerType, DoubleType, TimestampType, DateType
from pyspark.storagelevel import StorageLevel
from delta.tables import *
# Static S3 credentials consumed by create_spark_session(), which copies them into
# fs.s3a.access.key / fs.s3a.secret.key. Both are blank in the notebook source;
# never commit real values here.
accessKeyId = secretAccessKey = ""

    
def create_spark_session() -> SparkSession:
    """Build (or reuse) the Hive-enabled "Landing Zone" session set up for S3A access.

    Spark-level properties (driver class path, driver/executor memory) are passed to
    the session builder: they are Spark properties, not Hadoop ones, so writing them
    into the Hadoop configuration has no effect. They only apply when this call is
    the one that actually starts the session; an already-running session is returned
    as-is by ``getOrCreate()``.

    S3A filesystem options are written to the live Hadoop configuration. Static
    credentials come from the module-level ``accessKeyId`` / ``secretAccessKey`` and
    are applied only when both are non-empty.

    Returns:
        The active SparkSession, with the SparkContext log level set to WARN.
    """
    spark = (
        SparkSession
        .builder
        .appName("Landing Zone")
        # Spark properties belong on the builder, not on the Hadoop configuration.
        .config("spark.driver.extraClassPath", "/usr/local/spark/jars/*")
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "16g")
        .enableHiveSupport()
        .getOrCreate()
    )

    spark.sparkContext.setLogLevel("WARN")

    conf = spark.sparkContext._jsc.hadoopConfiguration()
    # No fs.s3a.aws.credentials.provider is forced here. A "spark.hadoop."-prefixed
    # key is not translated when written straight into the Hadoop configuration, and
    # TemporaryAWSCredentialsProvider would additionally need fs.s3a.session.token,
    # which this notebook never supplies. S3A's default provider chain is used.
    conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    conf.set("fs.s3a.fast.upload", "true")
    # NOTE(review): "fs.s3a.bucket.<name>.*" is the per-bucket override syntax, so
    # this key presumably targets a bucket literally named "all" - confirm intent.
    conf.set("fs.s3a.bucket.all.committer.magic.enabled", "true")
    conf.set("fs.s3a.directory.marker.retention", "keep")

    # Only publish static keys when both are provided; blank values are not
    # written into the configuration.
    if accessKeyId and secretAccessKey:
        conf.set('fs.s3a.access.key', accessKeyId)
        conf.set('fs.s3a.secret.key', secretAccessKey)
    return spark

# Create the session once; the later cells in this notebook use this `spark` handle.
spark = create_spark_session()

In [4]:
# JDBC endpoint of the source PostgreSQL database.
jdbc_url = "jdbc:postgresql://dev-db-dados.cbmaakoae02z.us-east-2.rds.amazonaws.com:5432/postgres"

# Properties handed to spark.read.jdbc(). The password is taken from the PGPASSWORD
# environment variable so the secret is never stored in the notebook. When the
# variable is unset an empty password is sent, and authentication will presumably
# fail at read time - export PGPASSWORD before starting the kernel.
connection_properties = {
    "user": "postgres",
    "password": os.environ.get("PGPASSWORD", ""),
    "driver": "org.postgresql.Driver",
}

In [14]:
# Source tables to ingest. Only "cliente" is enabled; the remaining tables
# (profissional, servico, agendamento, pagamento, horario_profissional, promocao,
# servico_promocao) are currently switched off - add their names to the list to
# include them in the run.
tabelas = ["cliente"]

In [9]:
# Root S3A prefix under which each table's Parquet files are written (one
# sub-directory per table). Note the value already ends with "/", so code that
# appends "/<table>" produces a double slash.
bronze_path = "s3a://dev-lab-02-us-east-2-landing/db_barbearia/"

In [21]:
# Copy every table listed in `tabelas` from PostgreSQL to the landing area as
# Parquet, tagging each row with the ingestion timestamp and the source system.
for tabela in tabelas:
    print(f"Ingerindo tabela: {tabela}...")

    # "fetchsize" is the JDBC *read* option (rows fetched per round trip);
    # "batchsize" only applies to JDBC writes and is ignored when reading.
    df = (
        spark.read
        .option("fetchsize", "10000")
        .jdbc(
            url=jdbc_url,
            table=tabela,
            properties=connection_properties,
        )
    )

    df = (
        df
        .withColumn("ingestion_time", F.current_timestamp())
        .withColumn("origem", F.lit("postgresql"))
    )

    # Append mode: each run adds another full copy of the table, distinguishable
    # by `ingestion_time`. The root is stripped of any trailing "/" so the target
    # never contains "//".
    (
        df
        .write
        .format("parquet")
        .mode("append")
        .save(f"{bronze_path.rstrip('/')}/{tabela}")
    )

    print(f"Tabela {tabela} ingerida com sucesso!")

Ingerindo tabela: cliente...
Tabela cliente ingerida com sucesso!


In [22]:
# Expose the landed "cliente" Parquet files to Spark SQL as the temp view `cliente`.
# The location is derived from `bronze_path` (the same root the ingestion loop
# writes to) so the read and write paths cannot drift apart.
(
    spark.read
    .format("parquet")
    .load(f"{bronze_path.rstrip('/')}/cliente/")
    .createOrReplaceTempView("cliente")
)

In [23]:
# Spot check: name and ingestion timestamps for cliente_id 1, newest first.
# The rendered output contains a customer name - clear it before sharing.
(
    spark
    .sql("select nome,ingestion_time from cliente where cliente_id in (1) order by 2 desc")
    .show(truncate=False)
)

+---------------+--------------------------+
|nome           |ingestion_time            |
+---------------+--------------------------+
|Thiago Pererira|2025-05-04 21:48:15.196922|
+---------------+--------------------------+



In [24]:
# Most recent ingestion timestamp present in the `cliente` view.
(
    spark
    .sql("select max(ingestion_time) from cliente")
    .show(truncate=False)
)

+--------------------------+
|max(ingestion_time)       |
+--------------------------+
|2025-05-04 21:49:14.919492|
+--------------------------+

