In [1]:
import sys
from typing import Callable, Dict, Iterator, Tuple

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3
from pyspark.sql import DataFrame
from pyspark.sql.types import Row
from pyspark.sql.functions import concat_ws, col, coalesce, to_date, split, year, dayofmonth, month


In [2]:
class DDBDelete:
    """Delete the DynamoDB items that correspond to the rows of a DynamicFrame.

    Attributes:
        table: Name of the DynamoDB table the deletes are issued against.
        keyGen: Callable that turns a row into the ``Key`` mapping handed to
            ``delete_item``.
    """

    def __init__(self, table: str, keyGen: Callable[[Row], Dict[str, str]]) -> None:
        self.keyGen = keyGen
        self.table = table

    def process(self, df: DynamicFrame) -> None:
        """Convert ``df`` to a Spark DataFrame and run ``delete`` on each partition."""
        spark_df = df.toDF()
        spark_df.foreachPartition(self.delete)

    def delete(self, rows: Iterator[Row]) -> None:
        """Queue a ``delete_item`` call for every row in ``rows``.

        The boto3 resource is built inside this method (once per call) rather
        than being stored on the instance.
        """
        dynamodb = boto3.resource("dynamodb")
        # The batch writer is used as a context manager around the whole loop.
        with dynamodb.Table(self.table).batch_writer() as writer:
            for record in rows:
                writer.delete_item(Key=self.keyGen(record))

In [3]:
def init() -> Tuple[GlueContext, Job]:
    """Create the Glue context and an initialised Job.

    ``JOB_NAME`` is only resolved when ``--JOB_NAME`` is present in
    ``sys.argv``; otherwise the job is initialised under the name ``"test"``.

    Returns:
        A ``(GlueContext, Job)`` tuple.
    """
    # Only ask getResolvedOptions for JOB_NAME when it was actually passed.
    wanted = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
    args = getResolvedOptions(sys.argv, wanted)

    context = GlueContext(SparkContext.getOrCreate())
    job = Job(context)

    jobname = args['JOB_NAME'] if 'JOB_NAME' in args else "test"
    job.init(jobname, args)
    return context, job

context, job = init()

In [4]:
# Load the `bonds` table from the `bonds` Data Catalog database as a DynamicFrame.
bonds_dynamodb_node: DynamicFrame = context.create_dynamic_frame.from_catalog(
    database="bonds",
    table_name="bonds",
    transformation_ctx="bonds_dynamodb_node",
)

In [5]:
# Inspect the schema as read from the catalog; the price columns show up as double/long choice types.
bonds_dynamodb_node.printSchema()

root
|-- open: choice
|    |-- double
|    |-- long
|-- low: choice
|    |-- double
|    |-- long
|-- adjusted_close: choice
|    |-- double
|    |-- long
|-- date: string
|-- bond: string
|-- close: choice
|    |-- double
|    |-- long
|-- volume: long
|-- high: choice
|    |-- double
|    |-- long



In [6]:
# The price columns were read as a double/long choice; cast each of them to double.
resolved_frame = bonds_dynamodb_node.resolveChoice(
    specs=[
        (column, "cast:double")
        for column in ("open", "low", "adjusted_close", "close", "high")
    ]
)

In [7]:
# Check that the former choice columns are now plain doubles.
resolved_frame.printSchema()

root
|-- open: double
|-- low: double
|-- adjusted_close: double
|-- date: string
|-- bond: string
|-- close: double
|-- volume: long
|-- high: double



In [8]:
# Add a `year` column derived from `date`, then wrap the result back into a DynamicFrame.
# NOTE(review): `date` is a string column, so year() depends on Spark parsing it
# implicitly -- presumably the values are ISO-8601 dates; confirm.
df = resolved_frame.toDF()
add_year_df = df.withColumn('year', year(df['date']))
add_year_frame = DynamicFrame.fromDF(add_year_df, context, 'add_year')

In [9]:
# Check that the derived `year` column is present in the schema.
add_year_frame.printSchema()

root
|-- open: double
|-- low: double
|-- adjusted_close: double
|-- date: string
|-- bond: string
|-- close: double
|-- volume: long
|-- high: double
|-- year: int



In [10]:
# toDF() yields a Spark DataFrame (not a DynamicFrame), so it is annotated as such.
# repartition(1) collapses the data into a single Spark partition before the write --
# presumably so each bond/year output directory receives one file; confirm.
partitioned_dataframe: DataFrame = add_year_frame.toDF().repartition(1)

In [11]:
# Wrap the repartitioned Spark DataFrame back into a DynamicFrame so it can be passed to the Glue writer.
partitioned_dynamicframe: DynamicFrame = DynamicFrame.fromDF(partitioned_dataframe, context, "partitioned_df")

In [12]:
# Write the frame to the staging bucket as CSV, partitioned by `bond` and `year`.
context.write_dynamic_frame.from_options(
    frame=partitioned_dynamicframe,
    connection_type="s3",
    connection_options={
        "path": "s3://tonberry-bonds-staging",
        "partitionKeys": ["bond", "year"],
    },
    format="csv",
    transformation_ctx="S3bucket_node3",
)

<awsglue.dynamicframe.DynamicFrame at 0xffff79f85190>

In [13]:
# Commit the Glue job run.
# NOTE(review): commit() is presumably what records job-bookmark progress for the
# transformation_ctx values used in this notebook -- confirm against the Glue docs.
job.commit()
