In [23]:
import sys
from typing import Callable, Dict, Iterator, Tuple

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3
from pyspark.sql import DataFrame
from pyspark.sql.types import Row
from pyspark.sql.functions import concat_ws, col, coalesce, to_date, split, year



In [24]:
class DDBDelete:
    """Delete DynamoDB items whose keys are derived from the rows of a frame.

    Attributes:
        table: Name of the DynamoDB table to delete from.
        keyGen: Callable that turns one row into the ``Key`` mapping passed to
            ``delete_item``.
    """

    def __init__(self, table: str, keyGen: Callable[[Row], Dict[str, str]]) -> None:
        """Store the target table name and the row-to-key function."""
        self.keyGen = keyGen
        self.table = table

    def process(self, df: DynamicFrame) -> None:
        """Run :meth:`delete` over every partition of ``df``.

        Args:
            df: Frame to process; it is converted with ``toDF()`` first.
        """
        spark_df = df.toDF()
        spark_df.foreachPartition(self.delete)

    def delete(self, rows: Iterator[Row]) -> None:
        """Issue one ``delete_item`` per row through the table's batch writer.

        Args:
            rows: Rows of a single partition, as handed over by
                ``foreachPartition``.
        """
        # The resource is created inside this method, i.e. once per call.
        target_table = boto3.resource("dynamodb").Table(self.table)
        with target_table.batch_writer() as writer:
            for item_key in map(self.keyGen, rows):
                writer.delete_item(Key=item_key)

In [25]:
def init() -> Tuple[GlueContext, Job]:
    """Create the Glue context and an initialised Glue job.

    ``JOB_NAME`` is resolved from ``sys.argv`` only when ``--JOB_NAME`` is
    present there; otherwise the job is initialised under the name ``"test"``.

    Returns:
        A ``(GlueContext, Job)`` tuple.
    """
    # Only ask getResolvedOptions for JOB_NAME when the flag was actually passed.
    requested = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
    resolved = getResolvedOptions(sys.argv, requested)

    glue_context = GlueContext(SparkContext.getOrCreate())
    glue_job = Job(glue_context)

    name = resolved['JOB_NAME'] if 'JOB_NAME' in resolved else "test"
    glue_job.init(name, resolved)
    return (glue_context, glue_job)

# Build the Glue context and job once; later cells use these module-level names.
context, job = init()

In [26]:
# Load the catalog table "eod_prices" (database "eod_prices") as a DynamicFrame.
# NOTE(review): the variable name suggests the catalog table is DynamoDB-backed — confirm.
eod_prices_dynamodb_node: DynamicFrame = context.create_dynamic_frame.from_catalog(
    database="eod_prices",
    table_name="eod_prices",
    transformation_ctx="eod_prices_dynamodb_node",
)

In [27]:
# Cast every price column to double via resolveChoice.
price_columns = ("open", "low", "adjusted_close", "close", "high")
resolved_frame = eod_prices_dynamodb_node.resolveChoice(
    specs=[(column_name, "cast:double") for column_name in price_columns]
)

In [28]:
# Switch to a Spark DataFrame and replace 'date' with to_date(col('date')).
# No format string is passed, so parsing relies on to_date's default behaviour.
df = resolved_frame.toDF().withColumn('date', to_date(col('date')))

In [29]:
# Add a 'year' column derived from 'date'; "year" is one of the partition keys of the S3 write.
df = df.withColumn('year', year(col('date')))

In [30]:
# Wrap the transformed Spark DataFrame back into a DynamicFrame named 'transformed'.
transform_node = DynamicFrame.fromDF(df, context, 'transformed')

In [31]:
# Collapse the data into a single Spark partition before writing.
# NOTE(review): presumably this limits the number of files written per
# symbol/year prefix; it also routes every row through one task — confirm that
# is acceptable for the data volume.
# toDF() returns a Spark DataFrame, so the variable is annotated as DataFrame
# rather than DynamicFrame.
partitioned_dataframe: DataFrame = transform_node.toDF().repartition(1)

In [32]:
# Convert the repartitioned Spark DataFrame back into a DynamicFrame
# (named "partitioned_df"), which is the frame handed to the S3 write.
partitioned_dynamicframe: DynamicFrame = DynamicFrame.fromDF(partitioned_dataframe, context, "partitioned_df")

In [33]:
# Write the frame to the staging bucket as Parquet, partitioned by symbol and year.
context.write_dynamic_frame.from_options(
    frame=partitioned_dynamicframe,
    connection_type="s3",
    format="parquet",
    connection_options={
        "path": "s3://tonberry-eod-prices-staging",
        "partitionKeys": ["symbol","year"],
        # Kept as in the original call so existing behaviour is not removed.
        "compression": "gzip"
    },
    transformation_ctx="S3bucket_node3",
    format_options={
        "useGlueParquetWriter": True,
        # For Parquet the codec is documented as a format option (default
        # "snappy"); set it here so the gzip request is honoured by the writer.
        # NOTE(review): confirm the resulting files' codec after the next run.
        "compression": "gzip",
    },
)

<awsglue.dynamicframe.DynamicFrame at 0xffff61219250>

In [34]:
# Finalise the Glue job run that init() started with job.init().
job.commit()