# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [1]:
# %help

Welcome to the Glue Interactive Sessions Kernel

For more information on available magic commands, please type %help in any new cell.



Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html

Installed kernel version: 1.0.5 


####  Run this cell to set up and start your interactive session.


In [1]:
%idle_timeout 200
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Current idle_timeout is None minutes.
idle_timeout has been set to 20 minutes.
Setting Glue version to: 4.0
Previous worker type: None
Setting new worker type to: G.1X
Previous number of workers: None
Setting new number of workers to: 2
Trying to create a Glue session for the kernel.
Session Type: glueetl
Worker Type: G.1X
Number of Workers: 2
Idle Timeout: 20
Session ID: 7d9c6042-c161-42de-9b8b-60efab9ffa6e
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 7d9c6042-c161-42de-9b8b-60efab9ffa6e to get into ready status...
Session 7d9c6042-c161-42de-9b8b-60efab9ffa6e has be

In [2]:
# important! using python min, max won't work
from pyspark.sql.functions import col, min, max, sum, avg, count, countDistinct, row_number
from pyspark.sql.window import Window

# https://spark.apache.org/docs/latest/sql-ref-datatypes.html
from pyspark.sql.types import StructType, StructField, BooleanType, ByteType, ShortType, IntegerType, StringType, FloatType, DoubleType




## aisles
read as csv, save as parquet, then read from parquet

In [3]:
# Explicit schema for aisles.csv: an integer key plus the aisle name.
aisles_schema = StructType(
    [
        StructField("aisle_id", IntegerType(), True),
        StructField("aisle", StringType(), True),
    ]
)

# S3 locations: raw CSV input and Parquet output.
s3_input_path = "s3://weikaibucket/data/aisles/aisles.csv"
s3_output_path = "s3://weikaibucket/data_parquet/aisles/"

# Load the CSV with the explicit schema, then rewrite it as Parquet
# (overwriting any previous export).
aisles = spark.read.csv(s3_input_path, header=True, schema=aisles_schema)
aisles.write.mode("overwrite").parquet(s3_output_path)

# From here on, work with the Parquet copy.
aisles = spark.read.parquet(s3_output_path)

# Sanity check: column types and number of rows.
aisles.printSchema()
print(f'row count: {aisles.count()}')

root
 |-- aisle_id: integer (nullable = true)
 |-- aisle: string (nullable = true)

row count: 134


## departments
read as csv, save as parquet, then read from parquet

In [4]:
# Explicit schema for departments.csv: an integer key plus the department name.
departments_schema = StructType(
    [
        StructField("department_id", IntegerType(), True),
        StructField("department", StringType(), True),
    ]
)

# Load the raw CSV from S3 using the explicit schema.
departments = spark.read.csv(
    "s3://weikaibucket/data/departments/departments.csv",
    header=True,
    schema=departments_schema,
)

# Rewrite it as Parquet, replacing any previous export.
departments.write.mode("overwrite").parquet(
    "s3://weikaibucket/data_parquet/departments/"
)

# From here on, work with the Parquet copy.
departments = spark.read.parquet('s3://weikaibucket/data_parquet/departments')

# Sanity check: column types and number of rows.
departments.printSchema()
print(f'Row count: {departments.count()}')

root
 |-- department_id: integer (nullable = true)
 |-- department: string (nullable = true)

row count: 21


## products
read as csv, save as parquet, then read from parquet

In [5]:
# Explicit schema for products.csv.
products_schema = StructType([
    StructField("product_id", IntegerType(), True),
    StructField("product_name", StringType(), True),
    StructField("aisle_id", IntegerType(), True),
    StructField("department_id", IntegerType(), True)
])

# Parquet location for products. It follows the data_parquet/<table>/ layout
# used by the other tables; the previous "aisles_parquet/" path was a
# copy-paste leftover. One variable is used for both the write and the read so
# the two can never drift apart.
products_parquet_path = "s3://weikaibucket/data_parquet/products/"

# read from csv in s3
products = spark.read.csv("s3://weikaibucket/data/products/products.csv", header=True, schema=products_schema)

# save as parquet (replaces any previous export)
products.write.mode("overwrite").parquet(products_parquet_path)

# read parquet
products = spark.read.parquet(products_parquet_path)


products.printSchema()

# print the number of rows
print(f'Row count: {products.count()}')

root
 |-- product_id: integer (nullable = true)
 |-- product_name: string (nullable = true)
 |-- aisle_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)

row count: 49688


## orders
read as csv, partition by eval_set, save as parquet, then read from parquet

In [7]:
# Explicit schema for orders.csv.
orders_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("user_id", IntegerType(), True),
    StructField("eval_set", StringType(), True),
    StructField("order_number", IntegerType(), True),
    StructField("order_dow", ByteType(), True),
    StructField("order_hour_of_day", ByteType(), True),
    StructField("days_since_prior_order", FloatType(), True)
])

# read from csv in s3
# The orders schema must be used here: passing products_schema would apply the
# wrong column names, types and column count to the orders file.
orders = spark.read.csv("s3://weikaibucket/data/orders/orders.csv", header=True, schema=orders_schema)

# save as parquet, partitioned by eval_set as stated in this section's heading
(
    orders.write
    .mode("overwrite")
    .partitionBy("eval_set")
    .parquet("s3://weikaibucket/data_parquet/orders/")
)

# read parquet
orders = spark.read.parquet('s3://weikaibucket/data_parquet/orders')


orders.printSchema()

# print the number of rows
print(f'Row count: {orders.count()}')


root
 |-- order_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- order_dow: byte (nullable = true)
 |-- order_hour_of_day: byte (nullable = true)
 |-- days_since_prior_order: float (nullable = true)
 |-- eval_set: string (nullable = true)

row count: 3421083
+-----------------+-----------------+
|min(order_number)|max(order_number)|
+-----------------+-----------------+
|                1|              100|
+-----------------+-----------------+

+---------------------------+---------------------------+
|min(days_since_prior_order)|max(days_since_prior_order)|
+---------------------------+---------------------------+
|                        0.0|                       30.0|
+---------------------------+---------------------------+


## order_products
read as csv, save as parquet, then read from parquet

In [9]:
# takes 1 minute to run
order_products_schema = StructType([
    StructField("order_id", IntegerType(), True),
    StructField("product_id", IntegerType(), True),
    StructField("add_to_cart_order", IntegerType(), True),
    StructField("reordered", IntegerType(), True)
])
order_products = spark.read.csv("s3://weikaibucket/data/order_products/", header=True, schema=products_schema)

# save as parquet
order_products.write.mode("overwrite").parquet("s3://weikaibucket/data_parquet/order_products/")

# read parquet
order_products = spark.read.parquet('s3://weikaibucket/data_parquet/order_products')


order_products.printSchema()

# print row number
print(f'Row count: {order_products.count()}')

root
 |-- order_id: integer (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- add_to_cart_order: integer (nullable = true)
 |-- reordered: boolean (nullable = true)

row count: 33819106


## order_products_prior
already saved as parquet, just read from parquet

In [10]:
# read parquet
order_products_prior = spark.read.parquet('s3://weikaibucket/features/order_products_prior/')


order_products_prior.printSchema()

# print row number
print(f'Row count: {order_products_prior.count()}')




## Q2
```sql
CREATE TABLE user_features_1 WITH (
	external_location = 's3://weikaibucket/features/user_features_1/',
	format = 'PARQUET'
) AS
SELECT user_id,
	MAX(order_number) AS max_order_number,
	SUM(days_since_prior_order) AS sum_days_since_prior_order,
	AVG(days_since_prior_order) AS avg_days_since_prior_order
FROM orders
GROUP BY user_id;
```

In [None]:
# Load the user_features_1 table from its Parquet location.
user_features_1_path = 's3://weikaibucket/features/user_features_1/'
user_features_1 = spark.read.parquet(user_features_1_path)

# Sanity check: column types and number of rows.
user_features_1.printSchema()
print(f'Row count: {user_features_1.count()}')

## Q3
```sql
Create Table user_features_2 with (
	external_location = 's3://weikaibucket/features/user_features_2/',
	format = 'parquet'
) as
select user_id,
	count(product_id) as total_number_of_products,
	count(Distinct product_id) as total_number_of_distinct_products,
	-- Integer division: a result smaller than 1 would be truncated to 0, so multiply by 1.0 to force floating-point division
	sum(if(reordered=1,1,0)) * 1.0 / sum(if(order_number>1,order_number,null)) as reorder_ratio
from order_products_prior
group by user_id;
```

In [None]:
# Load the user_features_2 table from its Parquet location.
user_features_2_path = 's3://weikaibucket/features/user_features_2/'
user_features_2 = spark.read.parquet(user_features_2_path)

# Sanity check: column types and number of rows.
user_features_2.printSchema()
print(f'Row count: {user_features_2.count()}')

## Q4
```sql
Create Table up_features with (
	external_location = 's3://weikaibucket/features/up_features/',
	format = 'parquet'
) as
select user_id,
	product_id,
	count(order_id) as total_number_of_orders,
	min(order_number) as minimum_order_number,
	max(order_number) as max_order_number,
	avg(add_to_cart_order) as avg_add_to_cart_order
from order_products_prior
group by user_id,
	product_id;
```

In [13]:
# Load the up_features table from its Parquet location.
up_features_path = 's3://weikaibucket/features/up_features/'
up_features = spark.read.parquet(up_features_path)

# Sanity check: column types and number of rows.
up_features.printSchema()
print(f'Row count: {up_features.count()}')

+-------+----------+------------+----------------+----------------+---------------------+
|user_id|product_id|total_orders|min_order_number|max_order_number|avg_add_to_cart_order|
+-------+----------+------------+----------------+----------------+---------------------+
|      1|       196|          10|               1|              10|                  1.4|
|      1|     10258|           9|               2|              10|   3.3333333333333335|
|      1|     10326|           1|               5|               5|                  5.0|
|      1|     12427|          10|               1|              10|                  3.3|
|      1|     13032|           3|               2|              10|    6.333333333333333|
+-------+----------+------------+----------------+----------------+---------------------+
only showing top 5 rows

row count: 13307953


## Q5
```sql
Create Table prd_features with (
	external_location = 's3://weikaibucket/features/prd_features/',
	format = 'parquet'
) as
select product_id,
	count(product_id) as count_product,
	sum(reordered) as sum_of_reordered,
	count(
		if (product_seq_table.product_seq_time = 1, 1, null)
	) as seq_time1,
	count(
		if (product_seq_table.product_seq_time = 2, 1, null)
	) as seq_time2
from (
		select user_id,
			product_id,
			reordered,
			row_number() over (
				partition by user_id,
				product_id
				order by order_number Asc
			) as product_seq_time
		from order_products_prior
	) as product_seq_table
group by product_id;
```

In [None]:
# Load the prd_features table from its Parquet location.
prd_features_path = 's3://weikaibucket/features/prd_features/'
prd_features = spark.read.parquet(prd_features_path)

# Sanity check: column types and number of rows.
prd_features.printSchema()
print(f'Row count: {prd_features.count()}')