#### Optional: Run this cell to see available notebook commands ("magics").


# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [2]:
%idle_timeout 2880
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

# The magics above configure the interactive session: idle timeout in minutes,
# Glue version, worker type and number of workers.
# NOTE(review): the kernel messages recorded under this cell say that configuration
# changes only apply to newly created sessions, so run this cell before any other
# code cell; on an already-connected session these settings are not applied.
import sys
# NOTE(review): wildcard import; no name from awsglue.transforms is referenced in the
# visible cells -- consider importing only what is needed.
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Obtain (or create) the SparkContext, wrap it in a GlueContext, and expose the
# Spark session that the GlueContext provides.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
# Job object bound to the GlueContext; it is not used by any of the visible cells.
job = Job(glueContext)

You are already connected to a glueetl session ab092a2b-868f-443f-a308-c30eacfe40d7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session ab092a2b-868f-443f-a308-c30eacfe40d7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 4.0


You are already connected to a glueetl session ab092a2b-868f-443f-a308-c30eacfe40d7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: None
Setting new worker type to: G.1X


You are already connected to a glueetl session ab092a2b-868f-443f-a308-c30eacfe40d7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: None
Setting new number of workers to: 5



#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [10]:
# Read the Data Catalog table into a DynamicFrame and print its schema.
# NOTE(review): the recorded outputs show generic col0..col17 column names, with the
# real titles (VendorID, tpep_pickup_datetime, ...) arriving as the first data record --
# presumably the table was catalogued without header detection; confirm and correct
# this on the catalog side.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database="gluedbonaws",
    table_name="mygluetable",
)
dyf.printSchema()

root
|-- col0: string
|-- col1: string
|-- col2: string
|-- col3: string
|-- col4: string
|-- col5: string
|-- col6: string
|-- col7: string
|-- col8: string
|-- col9: string
|-- col10: string
|-- col11: string
|-- col12: string
|-- col13: string
|-- col14: string
|-- col15: string
|-- col16: string
|-- col17: string


#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [11]:
# Convert the DynamicFrame to a Spark DataFrame and preview it.
# The arguments below are show()'s defaults, written out explicitly.
df = dyf.toDF()
df.show(n=20, truncate=True)

+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+
|    col0|                col1|                col2|           col3|         col4|      col5|              col6|        col7|        col8|        col9|      col10|col11|  col12|     col13|       col14|               col15|       col16|               col17|
+--------+--------------------+--------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+--------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_date...|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surch...|total_a

#### Example: Visualize data with matplotlib


In [None]:
import matplotlib.pyplot as plt

# Set X-axis and Y-axis values
x = [5, 2, 8, 4, 9]
y = [10, 4, 8, 5, 2]
  
# Create a bar chart 
plt.bar(x, y)
  
# Show the plot
%matplot plt

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# Template cell: write the DynamicFrame to Amazon S3 in glueparquet format
# (snappy compression) and create/update a Data Catalog table for it.
# NOTE(review): the S3 path and the catalog database/table names look like template
# placeholders -- replace them with real values before running.
s3output = glueContext.getSink(
  path="s3://bucket_name/folder_name",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output",
)
s3output.setCatalogInfo(
  catalogDatabase="demo", catalogTableName="populations"
)
s3output.setFormat("glueparquet")
# Fixed: the DynamicFrame created earlier in this notebook is named `dyf`.
# The previous `DyF` is not defined in any visible cell and would raise NameError.
s3output.writeFrame(dyf)

In [13]:
# Write `dyf` to s3://transformedbuck/ in glueparquet format with snappy compression,
# with Data Catalog updates enabled.
# NOTE(review): the catalog database/table given below are the same ones `dyf` was read
# from, so this write updates the source table's own catalog entry -- confirm that is
# intended rather than registering a separate output table.
s3output = glueContext.getSink(
    connection_type="s3",
    path="s3://transformedbuck/",
    compression="snappy",
    partitionKeys=[],
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    transformation_ctx="s3output",
)
s3output.setCatalogInfo(
    catalogDatabase="gluedbonaws",
    catalogTableName="mygluetable",
)
s3output.setFormat("glueparquet")
# Kept as the final expression so the returned object is echoed as the cell output.
s3output.writeFrame(dyf)

<awsglue.dynamicframe.DynamicFrame object at 0x7f8029349ab0>


In [14]:
# Count the records in the DynamicFrame.
# NOTE(review): judging by the sample output, the total presumably includes the
# header line that was read as a data record -- verify.
dyf.count()


6405009


In [15]:
# Print the first 20 records of the DynamicFrame (emitted one JSON object per record).
dyf.show(20)

{"col0": "VendorID", "col1": "tpep_pickup_datetime", "col2": "tpep_dropoff_datetime", "col3": "passenger_count", "col4": "trip_distance", "col5": "RatecodeID", "col6": "store_and_fwd_flag", "col7": "PULocationID", "col8": "DOLocationID", "col9": "payment_type", "col10": "fare_amount", "col11": "extra", "col12": "mta_tax", "col13": "tip_amount", "col14": "tolls_amount", "col15": "improvement_surcharge", "col16": "total_amount", "col17": "congestion_surcharge"}
{"col0": "1", "col1": "2020-01-01 00:28:15", "col2": "2020-01-01 00:33:03", "col3": "1", "col4": "1.20", "col5": "1", "col6": "N", "col7": "238", "col8": "239", "col9": "1", "col10": "6", "col11": "3", "col12": "0.5", "col13": "1.47", "col14": "0", "col15": "0.3", "col16": "11.27", "col17": "2.5"}
{"col0": "1", "col1": "2020-01-01 00:35:39", "col2": "2020-01-01 00:43:04", "col3": "1", "col4": "1.20", "col5": "1", "col6": "N", "col7": "239", "col8": "238", "col9": "1", "col10": "7", "col11": "3", "col12": "0.5", "col13": "1.5", "co