# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [1]:
# %help

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 20
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Session bootstrap: every later cell relies on the `spark` name created here.
# Obtain the SparkContext via getOrCreate() rather than constructing a new one.
sc = SparkContext.getOrCreate()
# Wrap the SparkContext in a GlueContext.
glueContext = GlueContext(sc)
# SparkSession exposed by the GlueContext; used for all reads below.
spark = glueContext.spark_session
# Glue Job handle built from the same context; not referenced again in the visible cells.
job = Job(glueContext)

Current idle_timeout is 20 minutes.
idle_timeout has been set to 20 minutes.
Setting Glue version to: 4.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 2
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 20
Session ID: f5c32082-b131-425e-a62a-0b638e6cce43
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session f5c32082-b131-425e-a62a-0b638e6cce43 to get into ready status...
Session f5c32082-b131-425e-a62a-0b638e6cce43 has been created.



In [2]:
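# Important: min, max, sum, etc. are deliberately imported from pyspark.sql.functions (shadowing the Python built-ins); the built-in min/max do not work on DataFrame columns.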
from pyspark.sql.functions import col, min, max, sum, avg, count, countDistinct, row_number
from pyspark.sql.window import Window

# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
from pyspark.sql.types import StructType, StructField, BooleanType, ByteType, ShortType, IntegerType, StringType, FloatType, DoubleType




## aisles
read as csv, save as parquet, then read from parquet

In [6]:
# Raw layer for aisles: CSV -> Parquet -> reload from Parquet.
# Explicit schema so column types are fixed instead of inferred from the CSV.
aisles_schema = (
    StructType()
    .add("aisle_id", IntegerType(), True)
    .add("aisle", StringType(), True)
)

# Read everything under the aisles prefix, treating the first line as a header.
aisles = (
    spark.read
    .schema(aisles_schema)
    .option("header", True)
    .csv("s3://imba-derek/data/aisles/")
)

# Replace any existing Parquet copy, then rebind `aisles` to the Parquet-backed frame.
aisles.write.parquet("s3://derek-raw-parquet/aisles/", mode="overwrite")
aisles = spark.read.parquet('s3://derek-raw-parquet/aisles/')

aisles.printSchema()
print(f'row count: {aisles.count()}')

Py4JJavaError: An error occurred while calling o138.parquet.
: java.io.IOException: Failed to delete key: aisles
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:417)
	at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.delete(EmrFileSystem.java:387)
	at org.apache.spark.internal.io.FileCommitProtocol.deleteWithJob(FileCommitProtocol.scala:186)
	at org.apache.spark.sql.execution.datasources.SQLEmrOptimizedCommitProtocol.deleteWithJob(SQLEmrOptimizedCommitProtocol.scala:149)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.deleteMatchingPartitions(InsertIntoHadoopFsRelationCommand.scala:257)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:133)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scal

## departments
read as csv, save as parquet, then read from parquet

In [7]:
# Raw layer for departments: CSV -> Parquet -> reload from Parquet.
# Explicit schema so column types are fixed instead of inferred from the CSV.
departments_schema = (
    StructType()
    .add("department_id", IntegerType(), True)
    .add("department", StringType(), True)
)

# Read the single departments CSV, treating the first line as a header.
departments = (
    spark.read
    .schema(departments_schema)
    .option("header", True)
    .csv("s3://imba-derek/data/departments/departments.csv")
)

# Replace any existing Parquet copy, then rebind `departments` to the Parquet-backed frame.
departments.write.parquet("s3://derek-raw-parquet/departments/", mode="overwrite")
departments = spark.read.parquet('s3://derek-raw-parquet/departments')

departments.printSchema()
print(f'row count: {departments.count()}')

Py4JJavaError: An error occurred while calling o155.parquet.
: java.io.IOException: Failed to delete key: departments
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:417)
	at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.delete(EmrFileSystem.java:387)
	at org.apache.spark.internal.io.FileCommitProtocol.deleteWithJob(FileCommitProtocol.scala:186)
	at org.apache.spark.sql.execution.datasources.SQLEmrOptimizedCommitProtocol.deleteWithJob(SQLEmrOptimizedCommitProtocol.scala:149)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.deleteMatchingPartitions(InsertIntoHadoopFsRelationCommand.scala:257)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:133)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands

## products
read as csv, save as parquet, then read from parquet

In [8]:
# Raw layer for products: CSV -> Parquet -> reload from Parquet.
# Explicit schema so column types are fixed instead of inferred from the CSV.
products_schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("aisle_id", IntegerType(), True)
    .add("department_id", IntegerType(), True)
)

# Read the single products CSV, treating the first line as a header.
products = (
    spark.read
    .schema(products_schema)
    .option("header", True)
    .csv("s3://imba-derek/data/products/products.csv")
)

# Replace any existing Parquet copy, then rebind `products` to the Parquet-backed frame.
products.write.parquet("s3://derek-raw-parquet/products/", mode="overwrite")
products = spark.read.parquet('s3://derek-raw-parquet/products')

products.printSchema()
print(f'row count: {products.count()}')

Py4JJavaError: An error occurred while calling o172.parquet.
: java.io.IOException: Failed to delete key: products
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:417)
	at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.delete(EmrFileSystem.java:387)
	at org.apache.spark.internal.io.FileCommitProtocol.deleteWithJob(FileCommitProtocol.scala:186)
	at org.apache.spark.sql.execution.datasources.SQLEmrOptimizedCommitProtocol.deleteWithJob(SQLEmrOptimizedCommitProtocol.scala:149)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.deleteMatchingPartitions(InsertIntoHadoopFsRelationCommand.scala:257)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:133)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.sc

## denorm products
join with aisles and departments, save to transformed

In [8]:
# Denormalise products: attach the aisle and department names through inner
# joins on their ids, keeping the id columns from the products side.
products_denorm = (
    products
    .join(aisles, on=products.aisle_id == aisles.aisle_id, how='inner')
    .join(departments, on=products.department_id == departments.department_id, how='inner')
    .select(
        products.product_id,
        products.product_name,
        products.aisle_id,
        aisles.aisle,
        products.department_id,
        departments.department,
    )
)

products_denorm.printSchema()
# Persist the denormalised table to the transformed layer, replacing any previous run.
products_denorm.write.parquet("s3://derek-transformed-data/products/", mode="overwrite")

root

 |-- product_id: integer (nullable = true)

 |-- product_name: string (nullable = true)

 |-- aisle_id: integer (nullable = true)

 |-- aisle: string (nullable = true)

 |-- department_id: integer (nullable = true)

 |-- department: string (nullable = true)


## orders
read as csv, partition by eval_set, save as parquet, then read from parquet

In [9]:
# Raw layer for orders: CSV -> Parquet partitioned by eval_set -> reload from Parquet.
# Explicit schema so column types are fixed instead of inferred from the CSV.
orders_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("user_id", IntegerType(), True)
    .add("eval_set", StringType(), True)
    .add("order_number", IntegerType(), True)
    .add("order_dow", ByteType(), True)
    .add("order_hour_of_day", ByteType(), True)
    .add("days_since_prior_order", FloatType(), True)
)

# Read the single orders CSV, treating the first line as a header.
orders = (
    spark.read
    .schema(orders_schema)
    .option("header", True)
    .csv("s3://imba-derek/data/orders/orders.csv")
)

# Replace any existing Parquet copy, writing one partition directory per eval_set value.
orders.write.parquet("s3://derek-raw-parquet/orders/", mode="overwrite", partitionBy="eval_set")
# Rebind `orders` to the Parquet-backed frame.
orders = spark.read.parquet('s3://derek-raw-parquet/orders')

orders.printSchema()
print(f'row count: {orders.count()}')

# Sanity check: show the min/max of each numeric column of interest, one table per column.
for stat_col in ('order_number', 'days_since_prior_order'):
    orders.agg(min(stat_col), max(stat_col)).show()

Py4JJavaError: An error occurred while calling o192.parquet.
: java.io.IOException: Failed to delete key: orders
	at com.amazon.ws.emr.hadoop.fs.s3n.S3NativeFileSystem.delete(S3NativeFileSystem.java:417)
	at com.amazon.ws.emr.hadoop.fs.EmrFileSystem.delete(EmrFileSystem.java:387)
	at org.apache.spark.internal.io.FileCommitProtocol.deleteWithJob(FileCommitProtocol.scala:186)
	at org.apache.spark.sql.execution.datasources.SQLEmrOptimizedCommitProtocol.deleteWithJob(SQLEmrOptimizedCommitProtocol.scala:149)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.deleteMatchingPartitions(InsertIntoHadoopFsRelationCommand.scala:257)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:133)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scal

In [10]:
# Keep only the rows whose eval_set is 'prior'; the column is constant after
# the filter, so it is removed from the result.
orders_prior = orders.filter(orders['eval_set'] == 'prior').drop('eval_set')
print(f'row count: {orders_prior.count()}')
# Persist to the transformed layer, replacing any previous run.
orders_prior.write.parquet("s3://derek-transformed-data/orders_prior/", mode="overwrite")

row count: 3214874


## order_products
read as csv, save as parquet, then read from parquet

In [11]:
# takes 1 minute to run
# Raw layer for order_products: CSV -> Parquet -> reload from Parquet.
order_products_schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("product_id", IntegerType(), True)
    .add("add_to_cart_order", IntegerType(), True)
    .add("reordered", IntegerType(), True)
)

order_products = (
    spark.read
    .schema(order_products_schema)
    .option("header", True)
    .csv("s3://imba-derek/data/order_products/")
    # `reordered` is parsed as an integer and then stored as a boolean flag.
    .withColumn("reordered", col("reordered").cast(BooleanType()))
)

# Replace any existing Parquet copy, then rebind `order_products` to the Parquet-backed frame.
order_products.write.parquet("s3://derek-raw-parquet/order_products/", mode="overwrite")
order_products = spark.read.parquet('s3://derek-raw-parquet/order_products')

order_products.printSchema()
print(f'row count: {order_products.count()}')

root

 |-- order_id: integer (nullable = true)

 |-- product_id: integer (nullable = true)

 |-- add_to_cart_order: integer (nullable = true)

 |-- reordered: boolean (nullable = true)



row count: 33819106


In [10]:
# takes 20 seconds to run
# One row per (prior order, product): order-level columns come from
# orders_prior, line-item columns from order_products.
order_products_prior = (
    orders_prior
    .join(order_products, on=orders_prior.order_id == order_products.order_id, how='inner')
    .select(
        orders_prior.order_id,
        orders_prior.user_id,
        orders_prior.order_number,
        orders_prior.order_dow,
        orders_prior.order_hour_of_day,
        orders_prior.days_since_prior_order,
        order_products.product_id,
        order_products.add_to_cart_order,
        order_products.reordered,
    )
)
# Persist to the transformed layer, replacing any previous run.
order_products_prior.write.parquet("s3://derek-transformed-data/order_products_prior/", mode="overwrite")

NameError: name 'orders_prior' is not defined


## Q2
```sql
select 
    user_id, 
    max(order_number) as max_order_number, 
    sum(days_since_prior_order) as sum_days_since_prior_order, 
    avg(days_since_prior_order) as avg_days_since_prior_order
from orders
group by user_id;
```

In [None]:
# orders = spark.read.parquet('s3://sam-raw-parquet/orders') # read as parquet

In [14]:
# Q2: per-user order statistics, computed over `orders` (all eval_set values),
# mirroring the SQL in the Q2 markdown cell.
user_features_1 = orders.groupBy('user_id').agg(
    max('order_number').alias('max_order_number'),
    sum('days_since_prior_order').alias('sum_days_since_prior_order'),
    avg('days_since_prior_order').alias('avg_days_since_prior_order'),
)
user_features_1.orderBy('user_id').show(5)
print(f'row count: {user_features_1.count()}')

# Save the aggregated result as one part file. coalesce(1) is what actually
# enforces this; without it Spark writes one file per partition of the
# aggregated DataFrame.
# NOTE(review): this writes to the `sam-transformed` bucket, while the earlier
# cells write to `derek-transformed-data` - confirm the intended destination.
user_features_1.coalesce(1).write.mode("overwrite").parquet("s3://sam-transformed/user_features_1/")

+-------+----------------+--------------------------+--------------------------+

|user_id|max_order_number|sum_days_since_prior_order|avg_days_since_prior_order|

+-------+----------------+--------------------------+--------------------------+

|      1|              11|                     190.0|                      19.0|

|      2|              15|                     228.0|        16.285714285714285|

|      3|              13|                     144.0|                      12.0|

|      4|               6|                      85.0|                      17.0|

|      5|               5|                      46.0|                      11.5|

+-------+----------------+--------------------------+--------------------------+

only showing top 5 rows



row count: 206209


## Q3
```sql
SELECT
    user_id,
    COUNT(product_id) AS total_products_count,
    COUNT(DISTINCT product_id) AS total_distinct_products_count, 
    SUM(CASE WHEN reordered = 1 THEN 1 ELSE 0 END) * 1.0 / 
    SUM(CASE WHEN order_number > 1 THEN 1 ELSE 0 END) AS reorder_ratio
FROM order_products_prior
GROUP BY user_id;
```

In [None]:
# order_products_prior = spark.read.parquet('s3://sam-transformed/order_products_prior') # read as parquet

In [15]:
# Q3: per-user basket statistics from the prior order lines.
# reorder_ratio = (# reordered lines) / (# lines belonging to orders after the user's first).
# NOTE(review): the output columns are named total_products / total_distinct_products,
# whereas the SQL in the Q3 markdown cell uses the *_count aliases - confirm which
# names downstream consumers expect.
reordered_lines = sum(col('reordered').cast('int'))
lines_after_first_order = sum((col('order_number') > 1).cast('int'))

user_features_2 = order_products_prior.groupBy('user_id').agg(
    count('product_id').alias('total_products'),
    countDistinct('product_id').alias('total_distinct_products'),
    (reordered_lines / lines_after_first_order).alias('reorder_ratio'),
)

user_features_2.orderBy('user_id').show(5)
print(f'row count: {user_features_2.count()}')
# Persist the feature table, replacing any previous run.
user_features_2.write.parquet("s3://sam-transformed/user_features_2/", mode="overwrite")

+-------+--------------+-----------------------+-------------------+

|user_id|total_products|total_distinct_products|      reorder_ratio|

+-------+--------------+-----------------------+-------------------+

|      1|            59|                     18| 0.7592592592592593|

|      2|           195|                    102|  0.510989010989011|

|      3|            88|                     33| 0.7051282051282052|

|      4|            18|                     17|0.07142857142857142|

|      5|            37|                     23| 0.5384615384615384|

+-------+--------------+-----------------------+-------------------+

only showing top 5 rows



row count: 206209


## Q4
```sql
SELECT
    user_id,
    product_id,
    COUNT(order_id) AS total_orders,
    MIN(order_number) AS min_order_number,
    MAX(order_number) AS max_order_number,
    AVG(add_to_cart_order) AS avg_add_to_cart_order
FROM order_products_prior
GROUP BY user_id, product_id;
```

In [16]:
# Q4: one row per (user, product) pair with order count, first/last order
# number in which the product appears, and mean add-to-cart position.
up_features = (
    order_products_prior
    .groupBy('user_id', 'product_id')
    .agg(
        count('order_id').alias('total_orders'),
        min('order_number').alias('min_order_number'),
        max('order_number').alias('max_order_number'),
        avg('add_to_cart_order').alias('avg_add_to_cart_order'),
    )
)

up_features.orderBy('user_id', 'product_id').show(5)
print(f'row count: {up_features.count()}')
# Persist the feature table, replacing any previous run.
up_features.write.parquet("s3://sam-transformed/up_features/", mode="overwrite")

+-------+----------+------------+----------------+----------------+---------------------+

|user_id|product_id|total_orders|min_order_number|max_order_number|avg_add_to_cart_order|

+-------+----------+------------+----------------+----------------+---------------------+

|      1|       196|          10|               1|              10|                  1.4|

|      1|     10258|           9|               2|              10|   3.3333333333333335|

|      1|     10326|           1|               5|               5|                  5.0|

|      1|     12427|          10|               1|              10|                  3.3|

|      1|     13032|           3|               2|              10|    6.333333333333333|

+-------+----------+------------+----------------+----------------+---------------------+

only showing top 5 rows



row count: 13307953


## Q5
```sql
SELECT 
    product_id,
    COUNT(product_id) AS total_products,
    SUM(reordered) AS total_reordered,
    SUM(CASE WHEN product_seq_time = 1 THEN 1 ELSE 0 END) AS product_seq_time_is_1,
    SUM(CASE WHEN product_seq_time = 2 THEN 1 ELSE 0 END) AS product_seq_time_is_2
FROM (
    SELECT
        product_id,
        reordered,
        ROW_NUMBER() OVER (PARTITION BY user_id, product_id ORDER BY order_number ASC) AS product_seq_time
    FROM order_products_prior
) prod_seq
GROUP BY product_id;
```

In [17]:
# Q5, step 1: number each user's purchases of a product in order_number order,
# so product_seq_time == 1 is the first time that user bought the product,
# 2 the second time, and so on.
purchase_sequence = (
    Window
    .partitionBy('user_id', 'product_id')
    .orderBy(col('order_number').asc())
)
prod_seq = (
    order_products_prior
    .withColumn('product_seq_time', row_number().over(purchase_sequence))
    .select('product_id', 'reordered', 'product_seq_time')
)

# Q5, step 2: per-product totals, plus how many purchases were a user's
# first / second purchase of that product.
prd_features = prod_seq.groupBy('product_id').agg(
    count('product_id').alias('total_products'),
    sum(col('reordered').cast('int')).alias('total_reordered'),
    sum((col('product_seq_time') == 1).cast('int')).alias('product_seq_time_is_1'),
    sum((col('product_seq_time') == 2).cast('int')).alias('product_seq_time_is_2'),
)

prd_features.orderBy('product_id').show(5)
print(f'row count: {prd_features.count()}')
# Persist the feature table, replacing any previous run.
prd_features.write.parquet("s3://sam-transformed/prd_features/", mode="overwrite")

+----------+--------------+---------------+---------------------+---------------------+

|product_id|total_products|total_reordered|product_seq_time_is_1|product_seq_time_is_2|

+----------+--------------+---------------+---------------------+---------------------+

|         1|          1852|           1136|                  716|                  276|

|         2|            90|             12|                   78|                    8|

|         3|           277|            203|                   74|                   36|

|         4|           329|            147|                  182|                   64|

|         5|            15|              9|                    6|                    4|

+----------+--------------+---------------+---------------------+---------------------+

only showing top 5 rows



row count: 49677
