# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [1]:
# %help

Welcome to the Glue Interactive Sessions Kernel



For more information on available magic commands, please type %help in any new cell.







Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html



Installed kernel version: 1.0.5 


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 20
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 20 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 20
Session ID: 93e16f81-afbb-4200-9d6f-f5ec2b94d5e2
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 93e16f81-afbb-4200-9d6f-f5ec2b94d5e2 to get into ready status...
Session 93e16f81-afbb-4200-9d6f-f5ec2b94d5e2 has be

In [2]:
# important! using python min, max won't work
from pyspark.sql.functions import col, min, max, sum, avg, count, countDistinct, row_number
from pyspark.sql.window import Window

# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
from pyspark.sql.types import StructType, StructField, BooleanType, ByteType, ShortType, IntegerType, StringType, FloatType, DoubleType




## aisles
read as csv, save as parquet, then read from parquet

In [3]:
aisles_schema = StructType([
    StructField("aisle_id", IntegerType(), True),
    StructField("aisle", StringType(), True)
])

# read from csv in s3
s3_input_path = "s3://weikaibucket/data/aisles/aisles.csv"
aisles = spark.read.csv(s3_input_path, header=True, schema=aisles_schema)

# save as parquet
s3_output_path = "s3://weikaibucket/data_parquet/aisles/"
aisles.write.mode("overwrite").parquet(s3_output_path)

# read parquet
aisles = spark.read.parquet(s3_output_path)

aisles.printSchema()
print(f'row count: {aisles.count()}')

root

 |-- aisle_id: integer (nullable = true)

 |-- aisle: string (nullable = true)



row count: 134


## departments
read as csv, save as parquet, then read from parquet

In [4]:
departments_schema = StructType([
    StructField("department_id", IntegerType(), True),
    StructField("department", StringType(), True)
])

# read from csv in s3
departments = spark.read.csv("s3://weikaibucket/data/departments/departments.csv", header=True, schema=departments_schema)

# save as parquet
departments.write.mode("overwrite").parquet("s3://weikaibucket/data_parquet/departments/")

# read parquet
departments = spark.read.parquet('s3://weikaibucket/data_parquet/departments')


departments.printSchema()

# print row number
print(f'Row count: {departments.count()}')

root

 |-- department_id: integer (nullable = true)

 |-- department: string (nullable = true)



row count: 21


## products
read as csv, save as parquet, then read from parquet

In [6]:
products_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("aisle_id", IntegerType(), True),
    StructField("department_id", IntegerType(), True)
])
# read from csv in s3
products = spark.read.csv("s3://weikaibucket/data/products/products.csv", header=True, schema=products_schema)

# save as parquet
products.write.mode("overwrite").parquet("s3://weikaibucket/data_parquet/products/")

# read parquet
products = spark.read.parquet('s3://weikaibucket/data_parquet/products/')


products.printSchema()

# print row number
print(f'Row count: {products.count()}')

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)

Row count: 49688


## orders
read as csv, partition by eval_set, save as parquet, then read from parque

In [9]:
orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("eval_set", StringType(), True),
    StructField("order_number", IntegerType(), True),
    StructField("order_dow", ByteType(), True),
    StructField("order_hour_of_day", ByteType(), True),
    StructField("days_since_prior_order", FloatType(), True)
])
# read from csv in s3
orders = spark.read.csv("s3://weikaibucket/data/orders/orders.csv", header=True, schema=orders_schema)

# save as parquet
orders.write.mode("overwrite").parquet("s3://weikaibucket/data_parquet/orders/")

# read parquet
orders = spark.read.parquet('s3://weikaibucket/data_parquet/orders')


orders.printSchema()

# print row number
print(f'Row count: {orders.count()}')


root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: byte (nullable = true)
 |-- order_hour_of_day: byte (nullable = true)
 |-- days_since_prior_order: float (nullable = true)

Row count: 3421083


## order_products
read as csv, save as parquet, then read from parque

In [10]:
# takes 1 minute to run
order_products_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("add_to_cart_order", IntegerType(), True),
    StructField("reordered", IntegerType(), True)
])
order_products = spark.read.csv("s3://weikaibucket/data/order_products/", header=True, schema=order_products_schema)

# save as parquet
order_products.write.mode("overwrite").parquet("s3://weikaibucket/data_parquet/order_products/")

# read parquet
order_products = spark.read.parquet('s3://weikaibucket/data_parquet/order_products')


order_products.printSchema()

# print row number
print(f'Row count: {order_products.count()}')

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: integer (nullable = true)

Row count: 33819106


## order_products_prior
already save as parquet, just read from parquet

In [19]:
# read parquet
order_products_prior = spark.read.parquet('s3://weikaibucket/features/order_products_prior/')


order_products_prior.printSchema()

# print row number
print(f'Row count: {order_products_prior.count()}')

root
 |-- order_id: long (nullable = true)
 |-- user_id: long (nullable = true)
 |-- eval_set: string (nullable = true)
 |-- order_number: long (nullable = true)
 |-- order_dow: long (nullable = true)
 |-- order_hour_of_day: long (nullable = true)
 |-- days_since_prior_order: double (nullable = true)
 |-- product_id: long (nullable = true)
 |-- add_to_cart_order: long (nullable = true)
 |-- reordered: long (nullable = true)

Row count: 32434489


## Q2
```sql
CREATE TABLE user_features_1 WITH (
	external_location = 's3://weikaibucket/features/user_features_1/',
	format = 'PARQUET'
) AS
SELECT user_id,
	MAX(order_number) AS max_order_number,
	SUM(days_since_prior_order) AS sum_days_since_prior_order,
	AVG(days_since_prior_order) AS avg_days_since_prior_order
FROM orders
GROUP BY user_id;
```

In [11]:
user_features_1 = spark.read.parquet('s3://weikaibucket/data_parquet/orders') # read as parquet

user_features_1 = user_features_1.groupBy('user_id').agg(
    max('order_number').alias('max_order_number'),
    sum('days_since_prior_order').alias('sum_days_since_prior_order'),
    avg('days_since_prior_order').alias('avg_days_since_prior_order')
)


user_features_1.orderBy('user_id').show(5)


print(f'row count: {user_features_1.count()}')


s3_output_path = "s3://weikaibucket/features/pyspark/user_features_1/"
user_features_1.write.mode("overwrite").parquet(s3_output_path)

+-------+----------------+--------------------------+--------------------------+
|user_id|max_order_number|sum_days_since_prior_order|avg_days_since_prior_order|
+-------+----------------+--------------------------+--------------------------+
|      1|              11|                     190.0|                      19.0|
|      2|              15|                     228.0|        16.285714285714285|
|      3|              13|                     144.0|                      12.0|
|      4|               6|                      85.0|                      17.0|
|      5|               5|                      46.0|                      11.5|
+-------+----------------+--------------------------+--------------------------+
only showing top 5 rows

row count: 206209


## Q3
```sql
Create Table user_features_2 with (
	external_location = 's3://weikaibucket/features/user_features_2/',
	format = 'parquet'
) as
select user_id,
	count(product_id) as total_number_of_products,
	count(Distinct product_id) as total_number_of_distinct_products,
	--整数除法。如果结果是一个小于 1 的小数，那么结果会被截断为 0,所以要乘以1.0变浮点数
	sum(if(reordered=1,1,0)) * 1.0 / sum(if(order_number>1,order_number,null)) as reorder_ratio
from order_products_prior
group by user_id;
```

In [21]:
order_products_prior = spark.read.parquet('s3://weikaibucket/features/order_products_prior/') # read as parquet

## Registering data as a temporary view
order_products_prior.createOrReplaceTempView("order_products_prior")

user_features_2 = spark.sql('''
select user_id,
	count(product_id) as total_number_of_products,
	count(Distinct product_id) as total_number_of_distinct_products,
	--整数除法。如果结果是一个小于 1 的小数，那么结果会被截断为 0,所以要乘以1.0变浮点数
	sum(if(reordered=1,1,0)) * 1.0 / sum(if(order_number>1,order_number,null)) as reorder_ratio
from order_products_prior
group by user_id;
''')

user_features_2.orderBy('user_id').show(5)


print(f'row count: {user_features_2.count()}')


s3_output_path = "s3://weikaibucket/features/pyspark/user_features_2/"
user_features_2.write.mode("overwrite").parquet(s3_output_path)

+-------+------------------------+---------------------------------+------------------+
|user_id|total_number_of_products|total_number_of_distinct_products|     reorder_ratio|
+-------+------------------------+---------------------------------+------------------+
|      1|                      59|                               18|0.1213017751479290|
|      2|                     195|                              102|0.0598455598455598|
|      3|                      88|                               33|0.1061776061776062|
|      4|                      18|                               17|0.0208333333333333|
|      5|                      37|                               23|0.1728395061728395|
+-------+------------------------+---------------------------------+------------------+
only showing top 5 rows

row count: 206209


## Q4
```sql
Create Table up_features with (
	external_location = 's3://weikaibucket/features/up_features/',
	format = 'parquet'
) as
select user_id,
	product_id,
	count(order_id) as total_number_of_orders,
	min(order_number) as minimum_order_number,
	max(order_number) as max_order_number,
	avg(add_to_cart_order) as avg_add_to_cart_order
from order_products_prior
group by user_id,
	product_id;
```

In [25]:
order_products_prior.createOrReplaceTempView("order_products_prior")

up_features = spark.sql('''
select user_id,
	product_id,
	count(order_id) as total_number_of_orders,
	min(order_number) as minimum_order_number,
	max(order_number) as max_order_number,
	avg(add_to_cart_order) as avg_add_to_cart_order
from order_products_prior
group by user_id,
	product_id;
''')

up_features.orderBy('user_id','product_id').show(5)


print(f'row count: {up_features.count()}')


s3_output_path = "s3://weikaibucket/features/pyspark/up_features/"
up_features.write.mode("overwrite").parquet(s3_output_path)

+-------+----------+----------------------+--------------------+----------------+---------------------+
|user_id|product_id|total_number_of_orders|minimum_order_number|max_order_number|avg_add_to_cart_order|
+-------+----------+----------------------+--------------------+----------------+---------------------+
|      1|       196|                    10|                   1|              10|                  1.4|
|      1|     10258|                     9|                   2|              10|   3.3333333333333335|
|      1|     10326|                     1|                   5|               5|                  5.0|
|      1|     12427|                    10|                   1|              10|                  3.3|
|      1|     13032|                     3|                   2|              10|    6.333333333333333|
+-------+----------+----------------------+--------------------+----------------+---------------------+
only showing top 5 rows

row count: 13307953


## Q5
```sql
Create Table prd_features with (
	external_location = 's3://weikaibucket/features/prd_features/',
	format = 'parquet'
) as
select product_id,
	count(product_id) as count_product,
	sum(reordered) as sum_of_reordered,
	count(
		if (product_seq_table.product_seq_time = 1, 1, null)
	) as seq_time1,
	count(
		if (product_seq_table.product_seq_time = 2, 1, null)
	) as seq_time2
from (
		select user_id,
			product_id,
			reordered,
			row_number() over (
				partition by user_id,
				product_id
				order by order_number Asc
			) as product_seq_time
		from order_products_prior
	) as product_seq_table
group by product_id;
```

In [29]:
order_products_prior.createOrReplaceTempView("order_products_prior")

prd_features = spark.sql('''
select product_id,
	count(product_id) as count_product,
	sum(reordered) as sum_of_reordered,
	count(
		if (product_seq_table.product_seq_time = 1, 1, null)
	) as seq_time1,
	count(
		if (product_seq_table.product_seq_time = 2, 1, null)
	) as seq_time2
from (
		select user_id,
			product_id,
			reordered,
			row_number() over (
				partition by user_id,
				product_id
				order by order_number Asc
			) as product_seq_time
		from order_products_prior
	) as product_seq_table
group by product_id;
''')

prd_features.orderBy('product_id').show(5)


print(f'row count: {prd_features.count()}')


s3_output_path = "s3://weikaibucket/features/pyspark/prd_features/"
prd_features.write.mode("overwrite").parquet(s3_output_path)

+----------+-------------+----------------+---------+---------+
|product_id|count_product|sum_of_reordered|seq_time1|seq_time2|
+----------+-------------+----------------+---------+---------+
|         1|         1852|            1136|      716|      276|
|         2|           90|              12|       78|        8|
|         3|          277|             203|       74|       36|
|         4|          329|             147|      182|       64|
|         5|           15|               9|        6|        4|
+----------+-------------+----------------+---------+---------+
only showing top 5 rows

row count: 49677
