In [1]:
import sys
from typing import Callable, Dict, Iterator, Tuple
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3
from pyspark.sql.types import Row
from pyspark.sql.functions import concat_ws, col, coalesce, to_date, split, year


In [2]:
def init() -> Tuple[GlueContext, Job]:
    """Build the Glue context and an initialised Glue job.

    ``JOB_NAME`` is resolved from ``sys.argv`` only when ``--JOB_NAME`` is
    actually present there; otherwise the job is initialised under the
    placeholder name ``"test"``.

    Returns:
        A ``(GlueContext, Job)`` tuple; ``Job.init`` has already been called.
    """
    # Only request JOB_NAME when it was passed on the command line.
    requested = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
    args = getResolvedOptions(sys.argv, requested)

    glue_context = GlueContext(SparkContext.getOrCreate())
    glue_job = Job(glue_context)

    # Fall back to a fixed name when no JOB_NAME was resolved.
    job_name = args['JOB_NAME'] if 'JOB_NAME' in args else "test"
    glue_job.init(job_name, args)
    return glue_context, glue_job


# Module-level handles used by every cell below.
context, job = init()

In [3]:
# Source: the `macro_indicators` table in the `macro_indicators` Glue Data
# Catalog database (presumably DynamoDB-backed, judging by the variable name —
# confirm against the catalog).
macro_indicators_dynamodb_node: DynamicFrame = (
    context.create_dynamic_frame.from_catalog(
        database="macro_indicators",
        table_name="macro_indicators",
        transformation_ctx="macro_indicators_dynamodb_node",
    )
)

In [4]:
# Schema as read from the catalog: `value` is still a double/long choice and
# `date` is a plain string at this point.
macro_indicators_dynamodb_node.printSchema()

# Collapse the ambiguous `value` column to a single double type.
resolved_frame = macro_indicators_dynamodb_node.resolveChoice(
    specs=[("value", "cast:double")],
)

# Switch to a Spark DataFrame to parse `date`, then derive `year` from the
# already-parsed date column.
df = (
    resolved_frame.toDF()
    .withColumn("date", to_date(col("date")))
    .withColumn("year", year(col("date")))
)

# Wrap the result back into a DynamicFrame and show the resulting schema.
transform_node = DynamicFrame.fromDF(df, context, "transformed")
transform_node.printSchema()

root
|-- description: string
|-- country_code: string
|-- date: string
|-- value: choice
|    |-- double
|    |-- long
|-- period: string
|-- country_name: string
|-- indicator: string

root
|-- description: string
|-- country_code: string
|-- date: date
|-- value: double
|-- period: string
|-- country_name: string
|-- indicator: string
|-- year: int



In [5]:
# Repartition the *transformed* frame, not the raw catalog node. The original
# cell used `macro_indicators_dynamodb_node` here, which silently discarded the
# work of the previous cell: `value` stayed a double/long choice, `date` stayed
# a string, and the derived `year` column never reached the S3 output.
#
# `.toDF().repartition(1)` yields a Spark DataFrame, so the former
# `DynamicFrame` annotation was incorrect and has been dropped.
#
# repartition(1) collapses the data to a single Spark partition — presumably so
# each `indicator` partition directory receives one Parquet file; confirm.
partitioned_dataframe = transform_node.toDF().repartition(1)

In [6]:
# Convert the repartitioned Spark DataFrame back into a DynamicFrame so it can
# be handed to the Glue writer in the next cell.
partitioned_dynamicframe: DynamicFrame = DynamicFrame.fromDF(
    partitioned_dataframe,
    context,
    "partitioned_df",
)

In [7]:
# Sink settings: write under the staging bucket root, one sub-directory per
# distinct `indicator` value.
s3_sink_options = {
    "path": "s3://tonberry-macro-indicators-staging",
    "partitionKeys": ["indicator"],
}

# Write the frame to S3 as Parquet. Kept as the cell's final expression so the
# returned object is still displayed as the cell output.
context.write_dynamic_frame.from_options(
    frame=partitioned_dynamicframe,
    connection_type="s3",
    connection_options=s3_sink_options,
    format="parquet",
    transformation_ctx="S3bucket_node3",
)

<awsglue.dynamicframe.DynamicFrame at 0xffff90f1c590>

In [8]:
# Mark the Glue job run as finished. Per the Glue documentation this is also
# the point where job-bookmark state for the `transformation_ctx` names used
# above is persisted — NOTE(review): only relevant if bookmarks are enabled
# for this job; confirm.
job.commit()
