In [1]:
%%configure -n project.spark.compatibility
{
    "--conf":"spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener --conf spark.openlineage.transport.type=amazon_datazone_api --conf spark.openlineage.transport.domainId=dzd_cjfgk2iz50fxp7 --conf spark.glue.accountId=577638381635 --conf spark.openlineage.facets.custom_environment_variables=[AWS_DEFAULT_REGION;GLUE_VERSION;GLUE_COMMAND_CRITERIA;GLUE_PYTHON_VERSION;]"
}

Executing for connection type: SPARK_GLUE, connection name: project.spark.compatibility


"The following configurations have been updated: {'--conf': 'spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener --conf spark.openlineage.transport.type=amazon_datazone_api --conf spark.openlineage.transport.domainId=dzd_cjfgk2iz50fxp7 --conf spark.glue.accountId=577638381635 --conf spark.openlineage.facets.custom_environment_variables=[AWS_DEFAULT_REGION;GLUE_VERSION;GLUE_COMMAND_CRITERIA;GLUE_PYTHON_VERSION;]'}"

In [1]:
%%sql project.spark.compatibility
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp
import logging
import boto3

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def get_secret(parameter_name, region_name='sa-east-1'):
    """Retrieve a parameter value from AWS Systems Manager Parameter Store.

    Args:
        parameter_name: Name (path) of the parameter to read.
        region_name: AWS region used to create the SSM client. Defaults to
            'sa-east-1', the region this helper previously hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        The value found at ``response['Parameter']['Value']``; the request is
        made with ``WithDecryption=True``.

    Raises:
        Exception: Any error raised while creating the client or fetching the
            parameter is logged and then re-raised unchanged.
    """
    try:
        ssm = boto3.client('ssm', region_name=region_name)
        response = ssm.get_parameter(
            Name=parameter_name,
            WithDecryption=True
        )
        return response['Parameter']['Value']
    except Exception as e:
        # Record which parameter failed, then let the caller decide what to do.
        logger.error("Error retrieving parameter %s: %s", parameter_name, e)
        raise

# Fetch the database credentials from Parameter Store; any failure aborts the cell
try:
    db_username = get_secret('/itau/rds/username')
    db_password = get_secret('/itau/rds/password')
    logger.info("Successfully retrieved database credentials from Parameter Store")
except Exception:
    # get_secret logs the underlying error itself, so the exception object is
    # not needed here; log the high-level failure and propagate.
    logger.error("Failed to retrieve credentials from Parameter Store")
    raise

# Build (or reuse) the Spark session, registering a catalog named "glue_catalog"
# that uses the Iceberg SparkCatalog with the Glue catalog implementation,
# an S3 warehouse location, and S3FileIO.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists; some of these settings may then not take effect — confirm.
spark = (
    SparkSession.builder
    .appName("PostgreSQL to S3 ETL")
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.glue_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config("spark.sql.catalog.glue_catalog.warehouse", "s3://itau-sm-demo-825765423553/iceberg_catalog/")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.glue_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.iceberg.handle-timestamp-without-timezone", True)
    .getOrCreate()
)

# Load the registration.customer table from the PostgreSQL database over JDBC,
# authenticating with the credentials fetched from Parameter Store.
df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://itau-sm-demo-database.cjsu82i00wkp.sa-east-1.rds.amazonaws.com:5432/registration")
    .option("dbtable", "registration.customer")
    .option("user", db_username)
    .option("password", db_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Write the customer rows into glue_db_aw53flfpa5qkyj.customer, overwriting
# the partitions covered by the DataFrame.
# NOTE(review): the identifier carries no "glue_catalog." prefix, so it is
# presumably resolved against the session's default catalog — confirm that is
# the intended target.
# NOTE(review): tableProperty looks like it only applies when the table is
# created or replaced, and overwritePartitions presumably needs the table to
# exist already — verify against the DataFrameWriterV2 documentation.
(
    df.writeTo("glue_db_aw53flfpa5qkyj.customer")
    .tableProperty("format-version", "2")
    .overwritePartitions()
)

Executing for connection type: SPARK_GLUE, connection name: project.spark.compatibility
Creating Glue session...


'Session 4cmygnn241c1zv-4de10be3-a10f-403a-8a78-634d8c0b51f2 has been created.'

Id,Spark UI,Driver logs
4cmygnn241c1zv-4de10be3-a10f-403a-8a78-634d8c0b51f2,link,link


Session created for connection: project.spark.compatibility.


Connection: project.spark.compatibility | Run start time: 2025-03-18 17:34:30.800105 | Run duration : 0:01:33.381870s.


In [2]:
%%pyspark project.spark.compatibility
%help


| Magic              | Example                                                                                                              | Supported Compute Types                         | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
| ------------------ | -------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| %help              | %help                                                                                                                | JupyterLab, AWS Glue, AWS EMR, ATHENA, REDSHIFT | Returns a list of descriptions and example usages for all magic commands.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
| %%spark            | %%spark<br>%%spark (compute)<br>%%spark --name (compute)<br>%%spark -n (compute)                                     | AWS Glue, AWS EMR                               | Allows pyspark code written in python in the cell and submits code to Spark Compute<br><br>Supports an optional argument to specify the Compute name. The argument can be expressed explicitly by using "-n or --name". When the Compute name is not specified, an ordered search for "project.spark.compatibility", "project.spark", "project.spark.fineGrained" will be performed, and the first one found will be used.                                                                                                                                                                                                                                                                                       |
| %%pyspark          | %%pyspark<br>%%pyspark (compute)<br>%%pyspark --name (compute)<br>%%pyspark -n (compute)                             | AWS Glue, AWS EMR                               | Allows pyspark code written in python in the cell and submits code to Spark Compute<br><br>Supports an optional argument to specify the Compute name. The argument can be expressed explicitly by using "-n or --name". When the Compute name is not specified, an ordered search for "project.spark.compatibility", "project.spark", "project.spark.fineGrained" will be performed, and the first one found will be used.                                                                                                                                                                                                                                                                                                        |
| %%scalaspark       | %%scalaspark<br>%%scalaspark (compute)<br>%%scalaspark --name (compute)<br>%%scalaspark -n (compute)                 | AWS Glue, AWS EMR                               | Allows spark code written in scala in the cell and submits code to Spark Compute<br><br>Supports an optional argument to specify the Compute name. The argument can be expressed explicitly by using "-n or --name". When the Compute name is not specified, an ordered search for "project.spark.compatibility", "project.spark", "project.spark.fineGrained" will be performed, and the first one found will be used.                                                                                                                                                                                                                                                                                                              |
| %%local            | %%local<br>%%local (compute)<br>%%local --name (compute)<br>%%local -n (compute)                                     | JupyterLab                                      | Allows code written in python in the cell and runs the code in JupyterLab Compute. When no parameters are specified, it is the equivalent of not using this cell magic.<br><br>Supports an optional argument to specify the JupyterLab Compute name. The argument can be expressed explicitly by using "-n or --name". When the Compute name is not specified, "project.python" will be used. Commonly used to change AWS profile used to run code on JupyterLab.                                                                                                                             |
| %%sql              | %%sql<br><br>select current_user                                                                                     | AWS Glue, AWS EMR, ATHENA, REDSHIFT             | Executes a SQL query against Redshift, Athena and Spark Compute.<br><br>Supports an optional argument to specify the Compute name. The argument can be expressed explicitly by using "-n or --name". When the Compute name is not specified, an ordered search for "project.spark.compatibility", "project.spark", "project.spark.fineGrained" will be performed, and the first one found will be used.                                                                                                                                                                                                                                                                                                                             |
| %disconnect        | %disconnect --name (compute)                                                                                         | AWS Glue, AWS EMR, ATHENA, REDSHIFT             | Stops a session connecting to a remote compute.<br><br>Supports an optional argument -n or --name for the connection to be stopped. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                                                                                                                                               |
| %send_to_remote    | %send_to_remote --name (compute) --language (language) --local (local variable name) --remote (remote variable name) | AWS Glue, AWS EMR                               | Sends a variable from the local kernel to remote compute. Remote compute supports both python and scala. When the remote is running in Python, it supports data type in dict, df, and str. When running in scala, df and str data types are supported.<br><br>Supports the following arguments: -l or --language for the connection language to be used, an argument -n or --name for the connection to be used, --local for the local variable name, and --remote or -r for the remote variable name.                                                                                       |
| %info              | %info --name (compute)                                                                                               | AWS Glue, AWS EMR                               | Outputs session information for the current chosen compute.<br><br>Supports optional argument -n or --name for the connection to be displayed its session information. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                                                                                                            |
| %status            | %status --name (compute)                                                                                             | AWS Glue                                        | Returns current session status. Status includes when the session was created, configuration, executing user role, etc.<br><br>Supports an optional argument -n or --name for the connection to be displayed its session status. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                                                   |
| %session_id        | %session_id —name (compute)                                                                                          | AWS Glue                                        | Returns the session ID for the running session.<br><br>Supports an optional argument -n or --name for the connection to be displayed its session ID. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                                                                                                                              |
| %list_sessions     | %list_sessions                                                                                                       | JupyterLab                                      | Displays all maintained sessions in the current kernel.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
| %display           | %display --dataframe (df name)                                                                                       | JupyterLab, AWS Glue, AWS EMR                   | Displays summary from data from dataframe and enable plotting of dataframe.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
| %matplot           | %matplot --name (compute) <plot name>                                                                                | AWS EMR                                         | Creates a visualization of plotting                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          |
| %%configure        | %%configure --name (compute) (-f)<br>{<br>"key": "value"<br>}                                                      | AWS Glue, AWS EMR                               | Configures the session by providing a JSON-formatted dictionary consisting of configuration parameters for a session.<br><br>Supports optional arguments: -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                         |
| %%tags             | %%tags --name (compute)<br>{<br>"key": "value"<br>}                                                                | AWS Glue                                        | Adds tags to a session. Tags should be specified within a curly bracket {}. Each tag name pair is enclosed in quotation marks (" ") and separated by a comma (,).<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                        |
| %spark_conf        | %spark_conf --name (compute) (-f) configuration                                                                      | AWS Glue                                        | Sets the spark configuration value of the session to be created.<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                             |
| %idle_timeout      | %idle_timeout --name (compute) timeout (-f)                                                                          | AWS Glue                                        | Sets the idle timeout value of the session to be created. Default: 2880 minutes (48 hours).<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                  |
| %session_id_prefix | %session_id_prefix --name (compute) prefix_value (-f)                                                                | AWS Glue                                        | Sets the session Id prefix as the prefix_value preceding all session IDs in the format: **[session_id_prefix]-[project_id]-[session_id].** If a session ID is not provided, a random UUID will be generated.<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied. |
| %session_type      | %session_type --name (compute) etl (-f)                                                                              | AWS Glue                                        | Sets the session type to the value specified. The acceptable values are: etl and streaming.<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                  |
| %streaming         | %streaming --name (compute) (-f)                                                                                     | AWS Glue                                        | Sets the session to streaming type.<br><br>It supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                                                       |
| %etl               | %etl --name (compute) (-f)                                                                                           | AWS Glue                                        | Sets the session to be etl type.<br><br>It supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                                                          |
| %glue_version      | %glue_version --name (compute) 3.0 (-f)                                                                              | AWS Glue                                        | Sets the version of a session to be created.<br><br>It supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                                              |
| %worker_type       | %worker_type --name (compute) G.2X (-f)                                                                              | AWS Glue                                        | Sets the worker type of a session to be created.<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                                             |
| %number_of_workers | %number_of_workers --name (compute) 15 (-f)                                                                          | AWS Glue                                        | Sets the number of workers of a Glue session to be created.<br><br>Supports an optional argument -n or --name for the connection to be configured. The default value is set to be the chosen connection in the dropdown for the current cell. If a session has been created already, and a user wants to apply the configuration, -f or --force is required to kill the current one and to create a new one with the configuration applied.                                                                                                                                                  |
| %logs              | %logs --name (compute)                                                                                               | AWS EMR                                         | Outputs the session's session logs.<br><br>Supports an optional argument -n or --name for the connection to show its session logs. The default value is set to be the chosen connection in the dropdown for the current cell.                                                                                                                                                                                                                                                                                                                                                                |


Connection: project.spark.compatibility | Run start time: 2025-03-18 17:37:40.978442 | Run duration : 0:00:03.010348s.


In [None]:
%%pyspark project.spark.compatibility
