In [1]:
import sys
from typing import Callable, Dict, Iterator, Tuple
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
import boto3
from pyspark.sql.types import Row
from pyspark.sql.functions import concat_ws, col, coalesce, to_date, split, to_timestamp


In [2]:
def init() -> Tuple[GlueContext, Job]:
    """Create the Glue context and an initialised Job for this notebook.

    ``JOB_NAME`` is only requested from ``getResolvedOptions`` when the
    literal ``--JOB_NAME`` token appears in ``sys.argv``; otherwise no
    options are requested and the job is initialised under the fallback
    name ``"test"``.

    Returns:
        A ``(GlueContext, Job)`` pair on which ``Job.init`` has already
        been called.
    """
    # Only ask for JOB_NAME when it was actually passed on the command line.
    wanted_options = ['JOB_NAME'] if '--JOB_NAME' in sys.argv else []
    resolved = getResolvedOptions(sys.argv, wanted_options)

    glue_context = GlueContext(SparkContext.getOrCreate())
    glue_job = Job(glue_context)

    # Fall back to a fixed name when no JOB_NAME was resolved.
    name = resolved['JOB_NAME'] if 'JOB_NAME' in resolved else "test"
    glue_job.init(name, resolved)
    return (glue_context, glue_job)

# Build the Glue context and job once; the later cells use these two
# module-level handles for reading, writing and committing.
context, job = init()

In [3]:
# Load the catalog table "news" from the catalog database "news" as a
# DynamicFrame. The variable name suggests the table is DynamoDB-backed --
# TODO confirm against the catalog definition.
news_dynamodb_node: DynamicFrame = context.create_dynamic_frame.from_catalog(
    database="news",
    table_name="news",
    transformation_ctx="news_dynamodb_node",
)

In [34]:
# Flatten the raw news records into plain scalar columns.
#
# Fix: `col` is a function, so column references must be written
# `col("name")`; the previous `col["name"]` form raises
# `TypeError: 'function' object is not subscriptable`.
transform_node = (
    news_dynamodb_node.toDF()
    # Join `tags` / `symbols` into comma-separated strings (presumably string
    # arrays in the source table -- confirm) and replace nulls with ''.
    .withColumn("tags", concat_ws(",", col("tags"))).na.fill('', 'tags')
    .withColumn("symbols", concat_ws(",", col("symbols"))).na.fill('', 'symbols')
    # Each sentiment score is a struct with a `long` and a `double` field;
    # take whichever of the two is populated.
    .withColumn("neg_sentiment", coalesce(col("sentiment.neg.long"), col("sentiment.neg.double")))
    .withColumn("pos_sentiment", coalesce(col("sentiment.pos.long"), col("sentiment.pos.double")))
    # NOTE(review): "new_sentiment" looks like a typo for "neu_sentiment";
    # the name is kept because it is part of the written schema.
    .withColumn("new_sentiment", coalesce(col("sentiment.neu.long"), col("sentiment.neu.double")))
    # Keep the original `date` value before it is truncated to a date below.
    # NOTE(review): `to_timestamp` is imported but unused -- confirm whether
    # this column was meant to be cast with it.
    .withColumn("publish_timestamp", col("date"))
    .withColumn("date", to_date(col("date")))
    # `symbol:link` is split on '#'; the first piece becomes `symbol`.
    .withColumn("symbol", split(col("symbol:link"), "#").getItem(0))
)

In [36]:

# Drop the source columns that were flattened into scalar columns upstream,
# then wrap the result back into a DynamicFrame for the Glue writer.
drop_node = DynamicFrame.fromDF(
    transform_node.drop("sentiment", "symbol:link"),
    context,
    "drop_node",
)

root
 |-- tags: string (nullable = false)
 |-- sentiment: struct (nullable = true)
 |    |-- neg: struct (nullable = true)
 |    |    |-- double: double (nullable = true)
 |    |    |-- long: long (nullable = true)
 |    |-- pos: struct (nullable = true)
 |    |    |-- double: double (nullable = true)
 |    |    |-- long: long (nullable = true)
 |    |-- polarity: struct (nullable = true)
 |    |    |-- double: double (nullable = true)
 |    |    |-- long: long (nullable = true)
 |    |-- neu: struct (nullable = true)
 |    |    |-- double: double (nullable = true)
 |    |    |-- long: long (nullable = true)
 |-- symbols: string (nullable = false)
 |-- date: date (nullable = true)
 |-- link: string (nullable = true)
 |-- content: string (nullable = true)
 |-- title: string (nullable = true)
 |-- symbol:link: string (nullable = true)
 |-- neg_sentiment: double (nullable = true)
 |-- pos_sentiment: double (nullable = true)
 |-- new_sentiment: double (nullable = true)
 |-- publish_timesta

In [40]:

# Write the cleaned frame to the S3 location below in parquet format.
# NOTE(review): the AWS Glue docs describe the parquet codec as a
# `format_options={"compression": ...}` setting; confirm that "compression"
# inside `connection_options` is honoured for parquet output.
context.write_dynamic_frame.from_options(
    frame=drop_node,
    connection_type="s3",
    format="parquet",
    connection_options={
        "path": "s3://tonberry-news",
        "compression": "gzip",
    },
    transformation_ctx="S3bucket_news_datasink",
)

KeyboardInterrupt: 

In [15]:

# DDBDelete("news", lambda x: {"date": x['date'], "symbol:link": x['symbol:link'] }).process(news_dynamodb_node)

In [16]:
# Finalise the Glue job started in init().
# NOTE(review): per the AWS Glue docs, commit() is what persists job-bookmark
# state for the transformation_ctx names used above -- confirm bookmarks are
# intended for this job.
job.commit()
