In [10]:
import sys
from typing import Callable, Dict, Iterator, Tuple

# Star import kept ahead of the explicit imports so they take precedence over
# any same-named symbol it might export.
from awsglue.transforms import *
import boto3
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import DataFrame
from pyspark.sql.functions import concat_ws, col, coalesce, to_date, split
from pyspark.sql.types import Row



In [11]:
class DDBDelete:
    """Delete the DynamoDB items that correspond to the rows of a DynamicFrame.

    Attributes:
        table: Name handed to ``boto3.resource("dynamodb").Table``.
        keyGen: Callable that turns a row into the ``Key`` mapping passed to
            ``delete_item``.
    """

    def __init__(self, table: str, keyGen: Callable[[Row], Dict[str, str]]) -> None:
        """Remember the target table name and the row-to-key function."""
        self.keyGen = keyGen
        self.table = table

    def process(self, df: DynamicFrame) -> None:
        """Convert ``df`` to a Spark DataFrame and run ``delete`` on each partition."""
        spark_df = df.toDF()
        spark_df.foreachPartition(self.delete)

    def delete(self, rows: Iterator[Row]) -> None:
        """Queue a delete request for every row in ``rows``.

        The DynamoDB resource is created inside this method, so each call
        builds its own resource and batch writer.
        """
        dynamodb = boto3.resource("dynamodb")
        target = dynamodb.Table(self.table)
        with target.batch_writer() as writer:
            for record in rows:
                item_key = self.keyGen(record)
                writer.delete_item(Key=item_key)

In [12]:
def init() -> Tuple[GlueContext, Job]:
    """Create the Glue context and an initialised Job.

    ``JOB_NAME`` is only requested from ``getResolvedOptions`` when the
    ``--JOB_NAME`` flag is present in ``sys.argv``; when it is absent the job
    is initialised under the name ``"test"``.

    Returns:
        A ``(GlueContext, Job)`` tuple.
    """
    requested = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
    resolved = getResolvedOptions(sys.argv, requested)

    glue_context = GlueContext(SparkContext.getOrCreate())
    glue_job = Job(glue_context)

    # Fall back to a fixed name when no job name was supplied on the command line.
    job_name = resolved['JOB_NAME'] if 'JOB_NAME' in resolved else "test"
    glue_job.init(job_name, resolved)
    return glue_context, glue_job

# Build the Glue context and job once; the cells below read from `context`
# and finish by calling `job.commit()`.
context, job = init()

In [13]:
# Read table "earnings" from the "earnings" Glue Data Catalog database into a DynamicFrame.
# NOTE(review): the variable name suggests the catalog table is backed by DynamoDB — confirm.
earnings_dynamodb_node: DynamicFrame = context.create_dynamic_frame.from_catalog(
    database="earnings",
    table_name="earnings",
    transformation_ctx="earnings_dynamodb_node",
)

In [14]:
# Apply a "cast:double" resolveChoice spec to each of the listed columns.
resolved_frame = earnings_dynamodb_node.resolveChoice(
    specs=[
        (column, "cast:double")
        for column in ("estimate", "percent", "actual", "difference", "high")
    ]
)

In [15]:
# toDF() returns a Spark DataFrame (not a DynamicFrame), so it is annotated as one.
# repartition(1) collapses the data into a single Spark partition before the write.
partitioned_dataframe: DataFrame = resolved_frame.toDF().repartition(1)

In [16]:
# Wrap the repartitioned Spark DataFrame back into a DynamicFrame (named
# "partitioned_df") so it can be handed to the Glue writer.
partitioned_dynamicframe: DynamicFrame = DynamicFrame.fromDF(partitioned_dataframe, context, "partitioned_df")

In [18]:
# Write the frame to the staging bucket as parquet, one sub-directory per "symbol" value.
context.write_dynamic_frame.from_options(
    frame=partitioned_dynamicframe,
    connection_type="s3",
    format="parquet",
    connection_options={
        "path": "s3://tonberry-earnings-staging",
        "partitionKeys": ["symbol"],
        # NOTE(review): for parquet output Glue documents the codec under
        # format_options={"compression": ...}; confirm this connection-level
        # setting actually takes effect for parquet.
        "compression": "gzip"
    },
    transformation_ctx="S3bucket_node3",
)

<awsglue.dynamicframe.DynamicFrame at 0xffff6195a850>

In [19]:
# Mark the Glue job run as finished (per the Glue docs, Job.commit() is also
# what persists job-bookmark state when bookmarks are enabled).
job.commit()
