# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Bootstrap the session objects used by every later cell.
# Reuse the running SparkContext if there is one; otherwise create it.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext, the entry point for DynamicFrame reads and sinks.
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Job handle bound to this GlueContext (not initialised or committed anywhere in this notebook).
job = Job(glueContext)

Not Empty. Output was consciously deleted!

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [2]:
# Read the raw parking-ticket table from the Glue Data Catalog into a
# DynamicFrame, then print the schema Glue resolved for it.
dyf = glueContext.create_dynamic_frame.from_catalog(
    database='infractions',
    table_name='parking_ticketsinput',
)
dyf.printSchema()

root
|-- tag_number_masked: string
|-- date_of_infraction: long
|-- infraction_code: long
|-- infraction_description: string
|-- set_fine_amount: long
|-- time_of_infraction: long
|-- location1: string
|-- location2: string
|-- location3: string
|-- location4: string
|-- province: string


#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [3]:
# Convert the Glue DynamicFrame into a Spark DataFrame.
df = dyf.toDF()

# Display the resulting Python type to confirm the conversion.
type(df)

<class 'pyspark.sql.dataframe.DataFrame'>


In [4]:
# Print a sample of rows from the full DataFrame as a text table.
df.show()

+-----------------+------------------+---------------+----------------------+---------------+------------------+---------+-----------------+---------+-------------+--------+
|tag_number_masked|date_of_infraction|infraction_code|infraction_description|set_fine_amount|time_of_infraction|location1|        location2|location3|    location4|province|
+-----------------+------------------+---------------+----------------------+---------------+------------------+---------+-----------------+---------+-------------+--------+
|         ***92517|          20180101|             16|  PARK-WITHIN 9M IN...|             50|                 0|      S/S|        PRYOR AVE|      E/O|CLOVERDALE RD|      ON|
|         ***71708|          20180101|             29|  PARK PROHIBITED T...|             30|                 2|       NR|266 DOVERCOURT RD|         |             |      ON|
|         ***92311|          20180101|             29|  PARK PROHIBITED T...|             30|                 2|       NR|  15 FAI

#### Example: Select a subset of columns and inspect the resulting DataFrame


In [5]:
# Keep only the ticket attributes; the location1-4 and province columns
# are not selected.
df_new = df.select(
    "tag_number_masked",
    "date_of_infraction",
    "infraction_code",
    "infraction_description",
    "set_fine_amount",
    "time_of_infraction",
)

# Show the (column name, Spark type) pairs of the projected frame.
df_new.dtypes

[('tag_number_masked', 'string'), ('date_of_infraction', 'bigint'), ('infraction_code', 'bigint'), ('infraction_description', 'string'), ('set_fine_amount', 'bigint'), ('time_of_infraction', 'bigint')]


In [6]:
# Display the Python type of the projected frame.
type(df_new)

<class 'pyspark.sql.dataframe.DataFrame'>


In [7]:
# Print a sample of rows from the projected DataFrame as a text table.
df_new.show()

+-----------------+------------------+---------------+----------------------+---------------+------------------+
|tag_number_masked|date_of_infraction|infraction_code|infraction_description|set_fine_amount|time_of_infraction|
+-----------------+------------------+---------------+----------------------+---------------+------------------+
|         ***92517|          20180101|             16|  PARK-WITHIN 9M IN...|             50|                 0|
|         ***71708|          20180101|             29|  PARK PROHIBITED T...|             30|                 2|
|         ***92311|          20180101|             29|  PARK PROHIBITED T...|             30|                 2|
|         ***92312|          20180101|             29|  PARK PROHIBITED T...|             30|                 2|
|         ***71709|          20180101|             29|  PARK PROHIBITED T...|             30|                 3|
|         ***73023|          20180101|              3|  PARK ON PRIVATE P...|             30|   

In [9]:
from awsglue.dynamicframe import DynamicFrame




In [10]:
# Wrap the projected Spark DataFrame back into a Glue DynamicFrame
# (named "tickets_clean_dyf") so it can be passed to a Glue sink.
dyF_clean = DynamicFrame.fromDF(df_new, glueContext, "tickets_clean_dyf")




In [15]:
# Configure an S3 sink with catalog updates enabled (UPDATE_IN_DATABASE).
s3output = glueContext.getSink(
    connection_type="s3",
    path="s3://<BUCKET_NAME>/parking_tickets/output/",
    enableUpdateCatalog=True,
    updateBehavior="UPDATE_IN_DATABASE",
    partitionKeys=[],
    compression="snappy",
    transformation_ctx="s3output",
)
# Target catalog table: infractions.clean_tickets.
s3output.setCatalogInfo(
    catalogDatabase="infractions",
    catalogTableName="clean_tickets",
)
# Write the cleaned tickets using the "glueparquet" format.
s3output.setFormat("glueparquet")
s3output.writeFrame(dyF_clean)

<awsglue.dynamicframe.DynamicFrame object at 0x7f9907354610>


#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# Template example: write a DynamicFrame to S3 and register it in the Data Catalog.
# The S3 path, catalog database and table name below are placeholders; replace
# them with your own targets before running this cell.
s3output = glueContext.getSink(
  path="s3://<BUCKET_NAME>/folder_name",
  connection_type="s3",
  updateBehavior="UPDATE_IN_DATABASE",
  partitionKeys=[],
  compression="snappy",
  enableUpdateCatalog=True,
  transformation_ctx="s3output",
)
s3output.setCatalogInfo(
  catalogDatabase="demo", catalogTableName="populations"
)
s3output.setFormat("glueparquet")
# Fix: the template wrote `DyF`, which is never defined in this notebook and
# raises NameError. Use dyF_clean, the DynamicFrame built from df_new.
s3output.writeFrame(dyF_clean)