# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [None]:
%timeout 15
%idle_timeout 15
%glue_version 4.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
  
# Reuse the running SparkContext if one exists; otherwise create it.
sc = SparkContext.getOrCreate()
# Glue wrapper around the SparkContext; later cells read and write through it.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext.
spark = glueContext.spark_session
# Job object bound to this GlueContext (no init/commit call is visible in this notebook).
job = Job(glueContext)

#### Create a new database `imba_parquet`

In [None]:
import boto3

# Initialize the Glue client
glue_client = boto3.client('glue')

# Create the catalog database that the Parquet tables are registered in.
# create_database raises AlreadyExistsException when the database is already
# present, so catch it to keep this cell safe under Restart & Run All.
try:
    response = glue_client.create_database(
        DatabaseInput={
            'Name': 'imba_parquet',
            'Description': 'a new database that points to the parquet files',
        }
    )
except glue_client.exceptions.AlreadyExistsException:
    # Nothing was created on this run; keep `response` defined for later cells.
    response = None
    print("Database 'imba_parquet' already exists; skipping creation.")

#### Leverage GlueContext to convert the newly uploaded CSV data files to Parquet

**TODO: decide on the appropriate values for `updateBehavior`, `partitionKeys`, and `enableUpdateCatalog`.**

**aisles**

In [None]:
# Read the "aisles" table through the Glue Data Catalog database "imba2".
# transformation_ctx is the key Glue uses to track per-operator state (job
# bookmarks), so the source and the sink each get their own unique value.
aisles_read = glueContext.create_dynamic_frame.from_catalog(
    database="imba2",
    table_name="aisles",
    transformation_ctx="aisles_source",
)

# Write the frame to S3 as snappy-compressed Parquet, with the catalog target
# set to imba_parquet.aisles.
aisles_parquet = glueContext.getSink(
    path="s3://imba-tgou1055/data3/aisles/",
    connection_type="s3",
    updateBehavior="LOG",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="aisles_sink",
)
aisles_parquet.setCatalogInfo(catalogDatabase="imba_parquet", catalogTableName="aisles")
aisles_parquet.setFormat("glueparquet", compression="snappy")
aisles_parquet.writeFrame(aisles_read)

**departments**

In [None]:
# Read the "departments" table through the Glue Data Catalog database "imba2".
# transformation_ctx is the key Glue uses to track per-operator state (job
# bookmarks), so the source and the sink each get their own unique value.
departments_read = glueContext.create_dynamic_frame.from_catalog(
    database="imba2",
    table_name="departments",
    transformation_ctx="departments_source",
)

# Write the frame to S3 as snappy-compressed Parquet, with the catalog target
# set to imba_parquet.departments.
departments_parquet = glueContext.getSink(
    path="s3://imba-tgou1055/data3/departments/",
    connection_type="s3",
    updateBehavior="LOG",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="departments_sink",
)
departments_parquet.setCatalogInfo(catalogDatabase="imba_parquet", catalogTableName="departments")
departments_parquet.setFormat("glueparquet", compression="snappy")
departments_parquet.writeFrame(departments_read)


**orders**

In [None]:
# Read the "orders" table through the Glue Data Catalog database "imba2".
# transformation_ctx is the key Glue uses to track per-operator state (job
# bookmarks), so the source and the sink each get their own unique value.
orders_read = glueContext.create_dynamic_frame.from_catalog(
    database="imba2",
    table_name="orders",
    transformation_ctx="orders_source",
)

# Write the frame to S3 as snappy-compressed Parquet, with the catalog target
# set to imba_parquet.orders.
orders_parquet = glueContext.getSink(
    path="s3://imba-tgou1055/data3/orders/",
    connection_type="s3",
    updateBehavior="LOG",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="orders_sink",
)
orders_parquet.setCatalogInfo(catalogDatabase="imba_parquet", catalogTableName="orders")
orders_parquet.setFormat("glueparquet", compression="snappy")
orders_parquet.writeFrame(orders_read)

**products**

In [None]:
# Read the "products" table through the Glue Data Catalog database "imba2".
# transformation_ctx is the key Glue uses to track per-operator state (job
# bookmarks), so the source and the sink each get their own unique value.
products_read = glueContext.create_dynamic_frame.from_catalog(
    database="imba2",
    table_name="products",
    transformation_ctx="products_source",
)

# Write the frame to S3 as snappy-compressed Parquet, with the catalog target
# set to imba_parquet.products.
products_parquet = glueContext.getSink(
    path="s3://imba-tgou1055/data3/products/",
    connection_type="s3",
    updateBehavior="LOG",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="products_sink",
)
products_parquet.setCatalogInfo(catalogDatabase="imba_parquet", catalogTableName="products")
products_parquet.setFormat("glueparquet", compression="snappy")
products_parquet.writeFrame(products_read)

**order_products**

In [None]:
# Read the "order_products" table through the Glue Data Catalog database "imba2".
# transformation_ctx is the key Glue uses to track per-operator state (job
# bookmarks), so the source and the sink each get their own unique value.
order_products_read = glueContext.create_dynamic_frame.from_catalog(
    database="imba2",
    table_name="order_products",
    transformation_ctx="order_products_source",
)

# Write the frame to S3 as snappy-compressed Parquet, with the catalog target
# set to imba_parquet.order_products.
order_products_parquet = glueContext.getSink(
    path="s3://imba-tgou1055/data3/order_products/",
    connection_type="s3",
    updateBehavior="LOG",
    partitionKeys=[],
    enableUpdateCatalog=True,
    transformation_ctx="order_products_sink",
)
order_products_parquet.setCatalogInfo(catalogDatabase="imba_parquet", catalogTableName="order_products")
order_products_parquet.setFormat("glueparquet", compression="snappy")
order_products_parquet.writeFrame(order_products_read)

#### Example: Create a DynamicFrame from a table in the AWS Glue Data Catalog and display its schema


In [None]:
# dyf = glueContext.create_dynamic_frame.from_catalog(database='database_name', table_name='table_name')
# dyf.printSchema()

#### Example: Convert the DynamicFrame to a Spark DataFrame and display a sample of the data


In [None]:
# df = dyf.toDF()
# df.show()

#### Example: Visualize data with matplotlib


In [None]:
# import matplotlib.pyplot as plt

# # Set X-axis and Y-axis values
# x = [5, 2, 8, 4, 9]
# y = [10, 4, 8, 5, 2]
  
# # Create a bar chart 
# plt.bar(x, y)
  
# # Show the plot
# %matplot plt

#### Example: Write the data in the DynamicFrame to a location in Amazon S3 and a table for it in the AWS Glue Data Catalog


In [None]:
# s3output = glueContext.getSink(
#   path="s3://bucket_name/folder_name",
#   connection_type="s3",
#   updateBehavior="UPDATE_IN_DATABASE",
#   partitionKeys=[],
#   compression="snappy",
#   enableUpdateCatalog=True,
#   transformation_ctx="s3output",
# )
# s3output.setCatalogInfo(
#   catalogDatabase="demo", catalogTableName="populations"
# )
# s3output.setFormat("glueparquet")
# s3output.writeFrame(dyf)