In [None]:
%%pyspark project.spark.compatibility
%streaming -f
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.sql.functions import when, lit, concat, substring, year, month, day, hour, minute
from pyspark.sql import SparkSession
import logging
import boto3

# Emit INFO-and-above records through the root logger as "<time> - <LEVEL> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Logger shared by the helper functions and top-level statements in this cell.
logger = logging.getLogger(__name__)

def get_secret(parameter_name, region_name='sa-east-1'):
    """Fetch a decrypted value from AWS Systems Manager Parameter Store.

    Args:
        parameter_name: Full name/path of the parameter to read.
        region_name: AWS region used for the SSM client. Defaults to
            'sa-east-1', the region that used to be hard-coded here, so
            existing single-argument calls behave exactly as before.

    Returns:
        The parameter value, i.e. ``response['Parameter']['Value']``.

    Raises:
        Exception: Whatever the boto3 client raises. The failure is logged
            and then re-raised unchanged so the caller can abort.
    """
    try:
        ssm = boto3.client('ssm', region_name=region_name)
        response = ssm.get_parameter(
            Name=parameter_name,
            WithDecryption=True  # ask SSM to return the decrypted value
        )
        return response['Parameter']['Value']
    except Exception as e:
        logger.error(f"Error retrieving parameter {parameter_name}: {e}")
        raise

# Resolve the Kinesis stream ARN from Parameter Store. get_secret() already logs the
# underlying error; here we add context and re-raise so the cell stops on failure.
try:
    kds_arn = get_secret('/itau/kds/pix_arn')
    logger.info("Successfully retrieved Kinesis stream ARN from Parameter Store")
except Exception:
    logger.error("Failed to retrieve Kinesis stream ARN from Parameter Store")
    raise

# Spark session configured with an Iceberg catalog called "glue_catalog": Glue is the
# catalog implementation, S3FileIO handles file access, and the warehouse lives in S3.
# NOTE(review): the app name mentions PostgreSQL, but this cell reads from Kinesis —
# confirm whether the label should be updated.
# NOTE(review): getOrCreate() returns an already-running session if one exists; confirm
# that these settings (especially spark.sql.extensions) take effect in this Glue session.
spark = (
    SparkSession.builder
    .appName("PostgreSQL to S3 ETL")
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.glue_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config("spark.sql.catalog.glue_catalog.warehouse", "s3://itau-sm-demo-825765423553/iceberg_catalog/")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.glue_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.iceberg.handle-timestamp-without-timezone", True)
    .getOrCreate()
)

# Glue wrappers layered on the session's SparkContext.
sc = spark.sparkContext
glueContext = GlueContext(sc)
job = Job(glueContext)

# Layout used by process_batch() to parse each JSON payload with from_json().
# Every field is declared non-nullable (third StructField argument is False).
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("transaction_id", StringType()),
        ("customer_id", IntegerType()),
        ("customer_pix_key", StringType()),
        ("value", DecimalType(10, 2)),
        ("destination_pix_key", StringType()),
        ("timestamp", TimestampType()),
    )
])


# DataFrame over the Kinesis stream identified by kds_arn, starting at the LATEST
# position and treating records as JSON with schema inference switched on.
raw_df = glueContext.create_data_frame.from_options(
    connection_type="kinesis",
    transformation_ctx="raw_df",
    connection_options=dict(
        streamARN=kds_arn,
        initialPosition="LATEST",
        inferSchema="true",
        classification="json",
    ),
)

def _mask_pix_key(column_name):
    """Return a Column masking `column_name` as: first 2 chars + "*******" + last 4 chars."""
    key = col(column_name)
    return concat(substring(key, 1, 2), lit("*******"), substring(key, -4, 4))


def process_batch(df, epoch_id):
    """Parse one micro-batch, derive masked/time columns and append it to Iceberg.

    Args:
        df: Micro-batch DataFrame; its "$json$data_infer_schema$_temporary$"
            column is parsed with the module-level `schema`.
        epoch_id: Batch identifier supplied by foreachBatch (unused here).

    Side effects:
        Appends the transformed rows to the table
        "glue_db_aw53flfpa5qkyj.pix_transactions".
    """
    # Parse the raw JSON column and flatten the resulting struct into top-level columns.
    parsed = df.select(
        from_json(col("$json$data_infer_schema$_temporary$"), schema).alias("parsed_data")
    ).select("parsed_data.*")

    # NOTE(review): the unmasked customer_pix_key / destination_pix_key columns are still
    # part of the written rows (nothing drops them) — confirm that is intended.
    ssc_df = (
        parsed
        .withColumnRenamed("timestamp", "transaction_timestamp")
        .withColumn("customer_pix_key_masked", _mask_pix_key("customer_pix_key"))
        # Fix: the prefix used to be taken from customer_pix_key (copy-paste slip), so the
        # masked destination key mixed two different keys.
        .withColumn("destination_pix_key_masked", _mask_pix_key("destination_pix_key"))
        # Calendar parts of the transaction timestamp.
        .withColumn("t_year", year("transaction_timestamp"))
        .withColumn("t_month", month("transaction_timestamp"))
        .withColumn("t_day", day("transaction_timestamp"))
        .withColumn("t_hour", hour("transaction_timestamp"))
        .withColumn("t_minute", minute("transaction_timestamp"))
    )

    ssc_df.writeTo("glue_db_aw53flfpa5qkyj.pix_transactions") \
          .tableProperty("format-version", "2") \
          .append()

# Start the stream: every micro-batch is handed to process_batch(), with progress
# tracked in the S3 checkpoint location. The StreamingQuery handle is kept in
# `pix_stream_query` so the query can still be inspected or stopped
# (e.g. pix_stream_query.stop()) after this cell is interrupted.
# NOTE(review): a 1-second trigger means a very high commit rate on the Iceberg table;
# Iceberg's streaming guidance suggests a trigger interval of at least 1 minute — confirm.
pix_stream_query = (
    raw_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", "s3://itau-sm-demo-825765423553/streaming_pix_checkpoint")
    .trigger(processingTime="1 second")
    .start()
)

# Blocks until the query stops or fails. awaitTermination() without a timeout returns
# None, so `final_df` is None exactly as before; the name is kept only for compatibility.
final_df = pix_stream_query.awaitTermination()

Stopping session for project.spark.compatibility. Session id: 4cmygnn241c1zv-97a53b8b-062f-4399-abe2-7e9246cbd42a
Session stopped.


"The following configurations have been updated: {'session_type': 'streaming'}"

Creating Glue session...


'Session 4cmygnn241c1zv-94017b9f-d861-42c6-9cf9-bb498e078a90 has been created.'

Id,Spark UI,Driver logs
4cmygnn241c1zv-94017b9f-d861-42c6-9cf9-bb498e078a90,link,link


In [None]:
%%pyspark project.spark.compatibility
