In [1]:
# Setting up and starting the interactive session
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col, trim, to_date, when
  
# Reuse the active SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the Spark context; later cells use it for the
# catalog reads (create_dynamic_frame) and S3 writes (write_dynamic_frame).
glueContext = GlueContext(sc)
# SparkSession taken from the Glue context; later cells call spark.createDataFrame.
spark = glueContext.spark_session
# Glue Job object bound to this context (not referenced again in the visible cells).
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 5
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 5
Idle Timeout: 2880
Session ID: 610aa340-72cb-4df2-81b3-982cbdb3f2b6
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 610aa340-72cb-4df2-81b3-982cbdb3f2b6 to get into ready status...
Session 610aa340-72cb-4df2-81b3-982cbdb3f2b6 ha

In [24]:
# Dataset: enigma-jhu
# Load the catalog table into a Glue DynamicFrame.
dynamicFrameEnigma = glueContext.create_dynamic_frame.from_catalog(database="coviddatabase", table_name="enigma-jhu")

# Preview the first 10 raw records.
# Fix: this line used the misspelled name `dynamicFrameEngima`, which is not
# assigned in any visible cell, so a fresh-kernel run stopped here with NameError.
dynamicFrameEnigma.show(10)

# The catalog table exposes generic column names (col0, col1, ...), so map them
# to descriptive names. Every column is kept as a string at this stage.
mapped_dynamic_frame = dynamicFrameEnigma.apply_mapping([
    ('col0', 'string', 'fips', 'string'),
    ('col1', 'string', 'admin2', 'string'),
    ('col2', 'string', 'province_state', 'string'),
    ('col3', 'string', 'country_region', 'string'),
    ('col4', 'string', 'last_update', 'string'),
    ('col5', 'string', 'latitude', 'string'),
    ('col6', 'string', 'longitude', 'string'),
    ('col7', 'string', 'confirmed', 'string'),
    ('col8', 'string', 'deaths', 'string'),
    ('col9', 'string', 'recovered', 'string'),
    ('col10', 'string', 'active', 'string'),
    ('col11', 'string', 'combined_key', 'string')
])

# Pull every record back to the driver as a Python list of Rows.
# NOTE(review): collect() materializes the whole dataset in driver memory;
# confirm the table is small enough for that.
records = mapped_dynamic_frame.toDF().collect()

# Remove the first record from the data; it is used as the header below.
# Assumes the first collected record is the file's header line - TODO confirm.
header = records.pop(0)

# Rebuild a DataFrame from the remaining records, taking the column names from
# the popped header record, and wrap it as a DynamicFrame.
newdynamicFrameEnigma = DynamicFrame.fromDF(spark.createDataFrame(records, schema=header), glueContext, "newdynamicFrameEnigma")

# Convert the DynamicFrame to a Spark DataFrame for the cleaning step.
sparkDf = newdynamicFrameEnigma.toDF()

# Preview the first 10 rows of the Spark DataFrame.
sparkDf.show(10)

+----+------+--------------+--------------+-------------------+--------+---------+---------+------+---------+------+----------------+
|fips|admin2|province_state|country_region|        last_update|latitude|longitude|confirmed|deaths|recovered|active|    combined_key|
+----+------+--------------+--------------+-------------------+--------+---------+---------+------+---------+------+----------------+
|    |      |         Anhui|         China|2020-01-22T17:00:00|  31.826|  117.226|        1|      |         |      |    Anhui, China|
|    |      |       Beijing|         China|2020-01-22T17:00:00|  40.182|  116.414|       14|      |         |      |  Beijing, China|
|    |      |     Chongqing|         China|2020-01-22T17:00:00|  30.057|  107.874|        6|      |         |      |Chongqing, China|
|    |      |        Fujian|         China|2020-01-22T17:00:00|  26.079|  117.987|        1|      |         |      |   Fujian, China|
|    |      |         Gansu|         China|2020-01-22T17:00:00

In [19]:
# Data Cleaning
# Strategy: keep every row and substitute defaults for missing values instead of
# dropping records, so the dataset keeps its size and continuity.

# Step 1: turn empty strings into real nulls so the fill below treats them as missing.
sparkDf = sparkDf.select(
    *(
        when(col(name) == "", None).otherwise(col(name)).alias(name)
        for name in sparkDf.columns
    )
)

# Step 2: per-column defaults for null values.
# The displayed result shows the numeric defaults landing in the (string-mapped)
# count columns as "0".
fill_values = dict(
    fips="Unknown",
    admin2="Unknown",
    province_state="Unknown",
    country_region="Unknown",
    last_update="2020-01-22T17:00:00",
    latitude=0.0,
    longitude=0.0,
    confirmed=0,
    deaths=0,
    recovered=0,
    active=0,
    combined_key="Unknown",
)

# Step 3: apply the defaults.
sparkDf = sparkDf.fillna(fill_values)

# Inspect the first 10 rows after filling.
sparkDf.show(10)

+-------+-------+--------------+--------------+-------------------+--------+---------+---------+------+---------+------+----------------+
|   fips| admin2|province_state|country_region|        last_update|latitude|longitude|confirmed|deaths|recovered|active|    combined_key|
+-------+-------+--------------+--------------+-------------------+--------+---------+---------+------+---------+------+----------------+
|Unknown|Unknown|         Anhui|         China|2020-01-22T17:00:00|  31.826|  117.226|        1|     0|        0|     0|    Anhui, China|
|Unknown|Unknown|       Beijing|         China|2020-01-22T17:00:00|  40.182|  116.414|       14|     0|        0|     0|  Beijing, China|
|Unknown|Unknown|     Chongqing|         China|2020-01-22T17:00:00|  30.057|  107.874|        6|     0|        0|     0|Chongqing, China|
|Unknown|Unknown|        Fujian|         China|2020-01-22T17:00:00|  26.079|  117.987|        1|     0|        0|     0|   Fujian, China|
|Unknown|Unknown|         Gansu|  

In [20]:
# Destination prefix for the cleaned CSV output.
# NOTE(review): every dataset cell in this notebook writes to this same prefix;
# confirm downstream readers can tell the outputs apart.
output_path = "s3://etlemrbucket/cleandata/"

# Collapse to one partition so the write is expected to produce a single file.
sparkDf = sparkDf.coalesce(1)

# The Glue writer takes a DynamicFrame, so wrap the Spark DataFrame again.
cleaned_dynamic_frame = DynamicFrame.fromDF(
    sparkDf,
    glueContext,
    "cleaned_dynamic_frame",
)

# Write the cleaned data to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=cleaned_dynamic_frame,
    connection_type="s3",
    format="csv",
    connection_options={"path": output_path},
)

<awsglue.dynamicframe.DynamicFrame object at 0x7f0c5661e650>


In [27]:
# Dataset: us_states
# Read the catalog table as a Glue DynamicFrame.
dynamicFrameUSstates = glueContext.create_dynamic_frame.from_catalog(
    database="coviddatabase",
    table_name="us_states",
)

# The table exposes generic names (col0, col1, ...); rename them positionally.
# Every column stays a string.
us_states_column_names = ['date', 'state', 'fips', 'cases', 'deaths']
mapped_dynamic_frame = dynamicFrameUSstates.apply_mapping(
    [
        (f'col{position}', 'string', target_name, 'string')
        for position, target_name in enumerate(us_states_column_names)
    ]
)

# Bring all records to the driver as a list of Rows.
# NOTE(review): collect() loads the full dataset into driver memory.
records = mapped_dynamic_frame.toDF().collect()

# Split off the first record; it supplies the column names below.
# Assumes the first collected record is the header line - TODO confirm.
header, records = records[0], records[1:]

# Rebuild a DataFrame from the remaining records and wrap it as a DynamicFrame.
us_states_data_df = spark.createDataFrame(records, schema=header)
newdynamicFrameUSstates = DynamicFrame.fromDF(
    us_states_data_df,
    glueContext,
    "newdynamicFrameUSstates",
)

# Spark DataFrame used by the cleaning step.
sparkDf = newdynamicFrameUSstates.toDF()

# Preview the first 10 rows.
sparkDf.show(10)

+----------+----------+----+-----+------+
|      date|     state|fips|cases|deaths|
+----------+----------+----+-----+------+
|2020-01-21|Washington|  53|    1|     0|
|2020-01-22|Washington|  53|    1|     0|
|2020-01-23|Washington|  53|    1|     0|
|2020-01-24|  Illinois|  17|    1|     0|
|2020-01-24|Washington|  53|    1|     0|
|2020-01-25|California|  06|    1|     0|
|2020-01-25|  Illinois|  17|    1|     0|
|2020-01-25|Washington|  53|    1|     0|
|2020-01-26|   Arizona|  04|    1|     0|
|2020-01-26|California|  06|    2|     0|
+----------+----------+----+-----+------+
only showing top 10 rows


In [28]:
# Data Cleaning
# Step 1: turn empty strings into real nulls so the fill below treats them as missing.
sparkDf = sparkDf.select(
    *(
        when(col(name) == "", None).otherwise(col(name)).alias(name)
        for name in sparkDf.columns
    )
)

# Step 2: per-column defaults for null values.
# NOTE(review): columns were mapped as strings upstream while some defaults are
# integers - confirm the fill coerces them as intended.
fill_values = dict(
    date="1970-01-01",
    state="Unknown",
    fips="000",
    cases=0,
    deaths=0,
)

# Step 3: apply the defaults.
sparkDf = sparkDf.fillna(fill_values)

# Inspect the first 10 rows after filling.
sparkDf.show(10)

+----------+----------+----+-----+------+
|      date|     state|fips|cases|deaths|
+----------+----------+----+-----+------+
|2020-01-21|Washington|  53|    1|     0|
|2020-01-22|Washington|  53|    1|     0|
|2020-01-23|Washington|  53|    1|     0|
|2020-01-24|  Illinois|  17|    1|     0|
|2020-01-24|Washington|  53|    1|     0|
|2020-01-25|California|  06|    1|     0|
|2020-01-25|  Illinois|  17|    1|     0|
|2020-01-25|Washington|  53|    1|     0|
|2020-01-26|   Arizona|  04|    1|     0|
|2020-01-26|California|  06|    2|     0|
+----------+----------+----+-----+------+
only showing top 10 rows


In [29]:
# Destination prefix for the cleaned CSV output.
# NOTE(review): every dataset cell in this notebook writes to this same prefix;
# confirm downstream readers can tell the outputs apart.
output_path = "s3://etlemrbucket/cleandata/"

# Collapse to one partition so the write is expected to produce a single file.
sparkDf = sparkDf.coalesce(1)

# The Glue writer takes a DynamicFrame, so wrap the Spark DataFrame again.
cleaned_dynamic_frame = DynamicFrame.fromDF(
    sparkDf,
    glueContext,
    "cleaned_dynamic_frame",
)

# Write the cleaned data to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=cleaned_dynamic_frame,
    connection_type="s3",
    format="csv",
    connection_options={"path": output_path},
)

<awsglue.dynamicframe.DynamicFrame object at 0x7f0c73cf0160>


In [30]:
# Dataset: us_counties
# Read the catalog table as a Glue DynamicFrame.
dynamicFrameUScounties = glueContext.create_dynamic_frame.from_catalog(
    database="coviddatabase",
    table_name="us_counties",
)

# The table exposes generic names (col0, col1, ...); rename them positionally.
# Every column stays a string.
us_counties_column_names = ['date', 'county', 'state', 'fips', 'cases', 'deaths']
mapped_dynamic_frame = dynamicFrameUScounties.apply_mapping(
    [
        (f'col{position}', 'string', target_name, 'string')
        for position, target_name in enumerate(us_counties_column_names)
    ]
)

# Bring all records to the driver as a list of Rows.
# NOTE(review): collect() loads the full dataset into driver memory.
records = mapped_dynamic_frame.toDF().collect()

# Split off the first record; it supplies the column names below.
# Assumes the first collected record is the header line - TODO confirm.
header, records = records[0], records[1:]

# Rebuild a DataFrame from the remaining records and wrap it as a DynamicFrame.
us_counties_data_df = spark.createDataFrame(records, schema=header)
newdynamicFrameUScounties = DynamicFrame.fromDF(
    us_counties_data_df,
    glueContext,
    "newdynamicFrameUScounties",
)

# Spark DataFrame used by the cleaning step.
sparkDf = newdynamicFrameUScounties.toDF()

# Preview the first 10 rows.
sparkDf.show(10)

+----------+-----------+----------+-----+-----+------+
|      date|     county|     state| fips|cases|deaths|
+----------+-----------+----------+-----+-----+------+
|2020-01-21|  Snohomish|Washington|53061|    1|     0|
|2020-01-22|  Snohomish|Washington|53061|    1|     0|
|2020-01-23|  Snohomish|Washington|53061|    1|     0|
|2020-01-24|       Cook|  Illinois|17031|    1|     0|
|2020-01-24|  Snohomish|Washington|53061|    1|     0|
|2020-01-25|     Orange|California|06059|    1|     0|
|2020-01-25|       Cook|  Illinois|17031|    1|     0|
|2020-01-25|  Snohomish|Washington|53061|    1|     0|
|2020-01-26|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-26|Los Angeles|California|06037|    1|     0|
+----------+-----------+----------+-----+-----+------+
only showing top 10 rows


In [31]:
# Data Cleaning
# Step 1: turn empty strings into real nulls so the fill below treats them as missing.
sparkDf = sparkDf.select(
    *(
        when(col(name) == "", None).otherwise(col(name)).alias(name)
        for name in sparkDf.columns
    )
)

# Step 2: per-column defaults for null values.
# NOTE(review): columns were mapped as strings upstream while some defaults are
# integers - confirm the fill coerces them as intended.
fill_values = dict(
    date="1970-01-01",
    county="Unknown",
    state="Unknown",
    fips="000",
    cases=0,
    deaths=0,
)

# Step 3: apply the defaults.
sparkDf = sparkDf.fillna(fill_values)

# Inspect the first 10 rows after filling.
sparkDf.show(10)

+----------+-----------+----------+-----+-----+------+
|      date|     county|     state| fips|cases|deaths|
+----------+-----------+----------+-----+-----+------+
|2020-01-21|  Snohomish|Washington|53061|    1|     0|
|2020-01-22|  Snohomish|Washington|53061|    1|     0|
|2020-01-23|  Snohomish|Washington|53061|    1|     0|
|2020-01-24|       Cook|  Illinois|17031|    1|     0|
|2020-01-24|  Snohomish|Washington|53061|    1|     0|
|2020-01-25|     Orange|California|06059|    1|     0|
|2020-01-25|       Cook|  Illinois|17031|    1|     0|
|2020-01-25|  Snohomish|Washington|53061|    1|     0|
|2020-01-26|   Maricopa|   Arizona|04013|    1|     0|
|2020-01-26|Los Angeles|California|06037|    1|     0|
+----------+-----------+----------+-----+-----+------+
only showing top 10 rows


In [32]:
# Destination prefix for the cleaned CSV output.
# NOTE(review): every dataset cell in this notebook writes to this same prefix;
# confirm downstream readers can tell the outputs apart.
output_path = "s3://etlemrbucket/cleandata/"

# Collapse to one partition so the write is expected to produce a single file.
sparkDf = sparkDf.coalesce(1)

# The Glue writer takes a DynamicFrame, so wrap the Spark DataFrame again.
cleaned_dynamic_frame = DynamicFrame.fromDF(
    sparkDf,
    glueContext,
    "cleaned_dynamic_frame",
)

# Write the cleaned data to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=cleaned_dynamic_frame,
    connection_type="s3",
    format="csv",
    connection_options={"path": output_path},
)

<awsglue.dynamicframe.DynamicFrame object at 0x7f0c73d5b2e0>


In [33]:
# Dataset: us_daily
# Read the catalog table as a Glue DynamicFrame.
dynamicFrameUSdaily = glueContext.create_dynamic_frame.from_catalog(
    database="coviddatabase",
    table_name="us_daily",
)

# The table exposes generic names (col0, col1, ...); rename them positionally.
# Every column stays a string. List order defines the colN index.
us_daily_column_names = [
    'date',
    'states',
    'positive',
    'negative',
    'pending',
    'hospitalizedCurrently',
    'hospitalizedCumulative',
    'inIcuCurrently',
    'inIcuCumulative',
    'onVentilatorCurrently',
    'onVentilatorCumulative',
    'dateChecked',
    'death',
    'hospitalized',
    'totalTestResults',
    'lastModified',
    'recovered',
    'total',
    'posNeg',
    'deathIncrease',
    'hospitalizedIncrease',
    'negativeIncrease',
    'positiveIncrease',
    'totalTestResultsIncrease',
    'hash',
]
mapped_dynamic_frame = dynamicFrameUSdaily.apply_mapping(
    [
        (f'col{position}', 'string', target_name, 'string')
        for position, target_name in enumerate(us_daily_column_names)
    ]
)

# Bring all records to the driver as a list of Rows.
# NOTE(review): collect() loads the full dataset into driver memory.
records = mapped_dynamic_frame.toDF().collect()

# Split off the first record; it supplies the column names below.
# Assumes the first collected record is the header line - TODO confirm.
header, records = records[0], records[1:]

# Rebuild a DataFrame from the remaining records and wrap it as a DynamicFrame.
us_daily_data_df = spark.createDataFrame(records, schema=header)
newdynamicFrameUSdaily = DynamicFrame.fromDF(
    us_daily_data_df,
    glueContext,
    "newdynamicFrameUSdaily",
)

# Spark DataFrame used by the cleaning step.
sparkDf = newdynamicFrameUSdaily.toDF()

# Preview the first 10 rows.
sparkDf.show(10)

+--------+------+--------+--------+-------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+--------------------+------+------------+----------------+--------------------+---------+-----+------+-------------+--------------------+----------------+----------------+------------------------+--------------------+
|    date|states|positive|negative|pending|hospitalizedCurrently|hospitalizedCumulative|inIcuCurrently|inIcuCumulative|onVentilatorCurrently|onVentilatorCumulative|         dateChecked| death|hospitalized|totalTestResults|        lastModified|recovered|total|posNeg|deathIncrease|hospitalizedIncrease|negativeIncrease|positiveIncrease|totalTestResultsIncrease|                hash|
+--------+------+--------+--------+-------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+--------------------+------+------------+----------------+--------------

In [34]:
# Data Cleaning
# Step 1: turn empty strings into real nulls so the fill below treats them as missing.
sparkDf = sparkDf.select(
    *(
        when(col(name) == "", None).otherwise(col(name)).alias(name)
        for name in sparkDf.columns
    )
)

# Step 2: per-column defaults for null values.
# NOTE(review): columns were mapped as strings upstream while most defaults are
# integers - confirm the fill coerces them as intended.
fill_values = dict(
    date="1970-01-01",
    states="Unknown",
    positive=0,
    negative=0,
    pending=0,
    hospitalizedCurrently=0,
    hospitalizedCumulative=0,
    inIcuCurrently=0,
    inIcuCumulative=0,
    onVentilatorCurrently=0,
    onVentilatorCumulative=0,
    dateChecked="1970-01-01",
    death=0,
    hospitalized=0,
    totalTestResults=0,
    lastModified="1970-01-01",
    recovered=0,
    total=0,
    posNeg=0,
    deathIncrease=0,
    hospitalizedIncrease=0,
    negativeIncrease=0,
    positiveIncrease=0,
    totalTestResultsIncrease=0,
    hash="Unknown",
)

# Step 3: apply the defaults.
sparkDf = sparkDf.fillna(fill_values)

# Inspect the first 10 rows after filling.
sparkDf.show(10)

+--------+------+--------+--------+-------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+--------------------+------+------------+----------------+--------------------+---------+-----+------+-------------+--------------------+----------------+----------------+------------------------+--------------------+
|    date|states|positive|negative|pending|hospitalizedCurrently|hospitalizedCumulative|inIcuCurrently|inIcuCumulative|onVentilatorCurrently|onVentilatorCumulative|         dateChecked| death|hospitalized|totalTestResults|        lastModified|recovered|total|posNeg|deathIncrease|hospitalizedIncrease|negativeIncrease|positiveIncrease|totalTestResultsIncrease|                hash|
+--------+------+--------+--------+-------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+--------------------+------+------------+----------------+--------------

In [35]:
# Destination prefix for the cleaned CSV output.
# NOTE(review): every dataset cell in this notebook writes to this same prefix;
# confirm downstream readers can tell the outputs apart.
output_path = "s3://etlemrbucket/cleandata/"

# Collapse to one partition so the write is expected to produce a single file.
sparkDf = sparkDf.coalesce(1)

# The Glue writer takes a DynamicFrame, so wrap the Spark DataFrame again.
cleaned_dynamic_frame = DynamicFrame.fromDF(
    sparkDf,
    glueContext,
    "cleaned_dynamic_frame",
)

# Write the cleaned data to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=cleaned_dynamic_frame,
    connection_type="s3",
    format="csv",
    connection_options={"path": output_path},
)

<awsglue.dynamicframe.DynamicFrame object at 0x7f0c73dc8730>


In [36]:
# Dataset: states_daily
# Read the catalog table as a Glue DynamicFrame.
dynamicFrameStatesdaily = glueContext.create_dynamic_frame.from_catalog(
    database="coviddatabase",
    table_name="states_daily",
)

# The table exposes generic names (col0, col1, ...); rename them positionally.
# Every column stays a string. List order defines the colN index.
states_daily_column_names = [
    'date',
    'state',
    'positive',
    'probableCases',
    'negative',
    'pending',
    'totalTestResultsSource',
    'totalTestResults',
    'hospitalizedCurrently',
    'hospitalizedCumulative',
    'inIcuCurrently',
    'inIcuCumulative',
    'onVentilatorCurrently',
    'onVentilatorCumulative',
    'recovered',
    'lastUpdateEt',
    'dateModified',
    'checkTimeEt',
    'death',
    'hospitalized',
    'hospitalizedDischarged',
    'dateChecked',
    'totalTestsViral',
    'positiveTestsViral',
    'negativeTestsViral',
    'positiveCasesViral',
    'deathConfirmed',
    'deathProbable',
    'totalTestEncountersViral',
    'totalTestsPeopleViral',
    'totalTestsAntibody',
    'positiveTestsAntibody',
    'negativeTestsAntibody',
    'totalTestsPeopleAntibody',
    'positiveTestsPeopleAntibody',
    'negativeTestsPeopleAntibody',
    'totalTestsPeopleAntigen',
    'positiveTestsPeopleAntigen',
    'totalTestsAntigen',
    'positiveTestsAntigen',
    'fips',
    'positiveIncrease',
    'negativeIncrease',
    'total',
    'totalTestResultsIncrease',
    'posNeg',
    'dataQualityGrade',
    'deathIncrease',
    'hospitalizedIncrease',
    'hash',
    'commercialScore',
    'negativeRegularScore',
    'negativeScore',
    'positiveScore',
    'score',
    'grade',
]
mapped_dynamic_frame = dynamicFrameStatesdaily.apply_mapping(
    [
        (f'col{position}', 'string', target_name, 'string')
        for position, target_name in enumerate(states_daily_column_names)
    ]
)

# Bring all records to the driver as a list of Rows.
# NOTE(review): collect() loads the full dataset into driver memory.
records = mapped_dynamic_frame.toDF().collect()

# Split off the first record; it supplies the column names below.
# Assumes the first collected record is the header line - TODO confirm.
header, records = records[0], records[1:]

# Rebuild a DataFrame from the remaining records and wrap it as a DynamicFrame.
states_daily_data_df = spark.createDataFrame(records, schema=header)
newdynamicFrameStatesdaily = DynamicFrame.fromDF(
    states_daily_data_df,
    glueContext,
    "newdynamicFrameStatesdaily",
)

# Spark DataFrame used by the cleaning step.
sparkDf = newdynamicFrameStatesdaily.toDF()

# Preview the first 10 rows.
sparkDf.show(10)

+--------+-----+--------+-------------+--------+-------+----------------------+----------------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+---------+---------------+--------------------+-----------+-----+------------+----------------------+--------------------+---------------+------------------+------------------+------------------+--------------+-------------+------------------------+---------------------+------------------+---------------------+---------------------+------------------------+---------------------------+---------------------------+-----------------------+--------------------------+-----------------+--------------------+----+----------------+----------------+-------+------------------------+-------+----------------+-------------+--------------------+--------------------+---------------+--------------------+-------------+-------------+-----+-----+
|    date|state|positive|probableCases|negative

In [37]:
# Data Cleaning
# Step 1: turn empty strings into real nulls so the fill below treats them as missing.
sparkDf = sparkDf.select(
    *(
        when(col(name) == "", None).otherwise(col(name)).alias(name)
        for name in sparkDf.columns
    )
)

# Step 2: per-column defaults for null values.
# NOTE(review): columns were mapped as strings upstream while most defaults are
# integers - confirm the fill coerces them as intended.
fill_values = dict(
    date="1970-01-01",
    state="Unknown",
    positive=0,
    probableCases=0,
    negative=0,
    pending=0,
    totalTestResultsSource="Unknown",
    totalTestResults=0,
    hospitalizedCurrently=0,
    hospitalizedCumulative=0,
    inIcuCurrently=0,
    inIcuCumulative=0,
    onVentilatorCurrently=0,
    onVentilatorCumulative=0,
    recovered=0,
    lastUpdateEt="1970-01-01",
    dateModified="1970-01-01",
    checkTimeEt="1970-01-01",
    death=0,
    hospitalized=0,
    hospitalizedDischarged=0,
    dateChecked="1970-01-01",
    totalTestsViral=0,
    positiveTestsViral=0,
    negativeTestsViral=0,
    positiveCasesViral=0,
    deathConfirmed=0,
    deathProbable=0,
    totalTestEncountersViral=0,
    totalTestsPeopleViral=0,
    totalTestsAntibody=0,
    positiveTestsAntibody=0,
    negativeTestsAntibody=0,
    totalTestsPeopleAntibody=0,
    positiveTestsPeopleAntibody=0,
    negativeTestsPeopleAntibody=0,
    totalTestsPeopleAntigen=0,
    positiveTestsPeopleAntigen=0,
    totalTestsAntigen=0,
    positiveTestsAntigen=0,
    fips="Unknown",
    positiveIncrease=0,
    negativeIncrease=0,
    total=0,
    totalTestResultsIncrease=0,
    posNeg=0,
    dataQualityGrade="Unknown",
    deathIncrease=0,
    hospitalizedIncrease=0,
    hash="Unknown",
    commercialScore=0,
    negativeRegularScore=0,
    negativeScore=0,
    positiveScore=0,
    score=0,
    grade="Unknown",
)

# Step 3: apply the defaults.
sparkDf = sparkDf.fillna(fill_values)

# Inspect the first 10 rows after filling.
sparkDf.show(10)

+--------+-----+--------+-------------+--------+-------+----------------------+----------------+---------------------+----------------------+--------------+---------------+---------------------+----------------------+---------+---------------+--------------------+-----------+-----+------------+----------------------+--------------------+---------------+------------------+------------------+------------------+--------------+-------------+------------------------+---------------------+------------------+---------------------+---------------------+------------------------+---------------------------+---------------------------+-----------------------+--------------------------+-----------------+--------------------+----+----------------+----------------+-------+------------------------+-------+----------------+-------------+--------------------+--------------------+---------------+--------------------+-------------+-------------+-----+-------+
|    date|state|positive|probableCases|negati

In [38]:
# Destination prefix for the cleaned CSV output.
# NOTE(review): every dataset cell in this notebook writes to this same prefix;
# confirm downstream readers can tell the outputs apart.
output_path = "s3://etlemrbucket/cleandata/"

# Collapse to one partition so the write is expected to produce a single file.
sparkDf = sparkDf.coalesce(1)

# The Glue writer takes a DynamicFrame, so wrap the Spark DataFrame again.
cleaned_dynamic_frame = DynamicFrame.fromDF(
    sparkDf,
    glueContext,
    "cleaned_dynamic_frame",
)

# Write the cleaned data to S3 as CSV.
glueContext.write_dynamic_frame.from_options(
    frame=cleaned_dynamic_frame,
    connection_type="s3",
    format="csv",
    connection_options={"path": output_path},
)

<awsglue.dynamicframe.DynamicFrame object at 0x7f0c74097670>
