In [0]:
from pyspark.sql.functions import to_date, year, month, weekofyear
from pyspark.sql.types import DateType, DateConverter

In [0]:
%run "../includes/configurations"

#### Ingesting processed data

In [0]:
# Location of the processed orders dataset; processed_folder_path is supplied by
# the configurations notebook pulled in via %run.
orders_parquet_path = f"{processed_folder_path}/orders"

# Load the processed orders as a Spark DataFrame.
orders_df = spark.read.parquet(orders_parquet_path)

#### How many orders are processed on a daily basis?

In [0]:
# Preview the first 10 orders. limit() keeps the result a DataFrame, so display()
# renders it directly instead of first collecting a Python list of Rows onto the
# driver as take() does.
display(orders_df.limit(10))

order_id,customer_id,order_status,order_date,order_delivery_date
6ec1bea8cbcef0a1b81bc9b7fbd37ccb,e6b5e20566e5c72cbaab04f91dec9c85,delivered,2018-08-07,2018-08-27
441972a5bbd51a10459a487402076942,b79fa9dfed0c3d624b70fbd0ca2469de,delivered,2018-08-23,2018-08-29
fbecebecbe32df9dc3411b1e35b5484c,7f9f88f14a8f0dc733a5e371af98abac,delivered,2017-03-20,2017-03-27
9e22b00fd1b9f036577e8c517ea0a5d4,5a7346263920ecab0f88da703bbaf3b1,delivered,2017-12-01,2018-01-03
07d9d11b0175952638ddd53d7855f8a7,a1c779cddccf5ffe60e543d3ce0dc477,delivered,2017-11-22,2017-11-30
1c38314597f39a117548efb308041b76,39a23021e4ee2efd04ea8decaff7b328,delivered,2017-12-11,2017-12-28
06caf4b0e55d9bc015a91fa77432a1cb,e42b1319fefce9bf37acf91a0ac8c1a9,delivered,2017-03-11,2017-03-21
1330176f6500125ff6e8e10922fb714c,89479f50ebac0687b3d8d0ac9f1193b1,delivered,2017-10-18,2017-10-21
6616fa4c89b8bf2a7e17271cdc542fca,19245d7a8dfb1639e01f96c455fb05b1,delivered,2018-08-17,2018-08-28
06f544e5c7afe6215b9e2a26b4f08442,26d283dae44fbc9eee874cecca589853,delivered,2018-04-12,2018-04-24


In [0]:
# Register the orders DataFrame as a session-scoped temporary view so the
# following cells can query it with Spark SQL under the name "orders_df_view".
orders_df.createOrReplaceTempView("orders_df_view")

In [0]:
# Count orders per calendar day, busiest days first.
# ORDER BY uses positional references: 2 = the COUNT(*) column, 1 = order_date.
# The second sort key (order_date, ascending) makes the ranking deterministic
# when several days share the same count; without it the order of tied rows,
# and therefore any top-N preview, can differ between runs.
top_orders_dates = spark.sql("""
SELECT order_date, COUNT(*)
FROM orders_df_view
GROUP BY order_date
ORDER BY 2 desc, 1
""")

In [0]:
# Show the 7 busiest days. limit() keeps the result a DataFrame for display()
# rather than collecting a Python list of Rows onto the driver with take().
display(top_orders_dates.limit(7))

order_date,count(1)
2017-11-24,1176
2017-11-25,499
2017-11-27,403
2017-11-26,391
2017-11-28,380
2018-05-07,372
2018-08-06,372


In [0]:
# Count orders per calendar year, busiest years first.
# ORDER BY uses positional references: 2 = the COUNT (*) column, 1 = the year.
# The second sort key (year, ascending) keeps the row order deterministic if
# two years ever have the same count.
top_order_years = spark.sql("""
SELECT year(orders_df_view.order_date), COUNT (*)
FROM orders_df_view
GROUP BY year(orders_df_view.order_date)
ORDER BY 2 desc, 1
""")

In [0]:
# Render the per-year order counts (the full result; no row cap is applied here).
display(top_order_years)

year(order_date),count(1)
2018,54011
2017,45101
2016,329


In [0]:
# Keep only the orders whose order_date falls in calendar year 2018.
orders_2018_query = """
SELECT *
FROM orders_df_view
WHERE year(orders_df_view.order_date) = 2018
"""

orders_2018 = spark.sql(orders_2018_query)

In [0]:
# Preview 5 of the 2018 orders. limit() keeps the result a DataFrame for
# display() rather than collecting a Python list of Rows with take().
display(orders_2018.limit(5))

order_id,customer_id,order_status,order_date,order_delivery_date
6ec1bea8cbcef0a1b81bc9b7fbd37ccb,e6b5e20566e5c72cbaab04f91dec9c85,delivered,2018-08-07,2018-08-27
441972a5bbd51a10459a487402076942,b79fa9dfed0c3d624b70fbd0ca2469de,delivered,2018-08-23,2018-08-29
6616fa4c89b8bf2a7e17271cdc542fca,19245d7a8dfb1639e01f96c455fb05b1,delivered,2018-08-17,2018-08-28
06f544e5c7afe6215b9e2a26b4f08442,26d283dae44fbc9eee874cecca589853,delivered,2018-04-12,2018-04-24
8fa279b262d80217668f5711737ac83b,e620fbdf4316bb301fdce0325c0e0c2c,delivered,2018-03-27,2018-03-29


In [0]:
# Register the 2018-only orders as a session-scoped temporary view so the
# month/week breakdowns below can query it under the name "orders_2018_view".
orders_2018.createOrReplaceTempView("orders_2018_view")

In [0]:
# Count 2018 orders per month, busiest months first.
# ORDER BY uses positional references: 2 = the COUNT(*) column, 1 = the month.
# The second sort key (month, ascending) makes the ranking deterministic when
# months tie on count, so a top-N preview is reproducible between runs.
top_orders_2018_months = spark.sql("""
SELECT month(orders_2018_view.order_date), COUNT(*)
from orders_2018_view
GROUP BY month(orders_2018_view.order_date)
ORDER BY 2 desc, 1
""")

In [0]:
# Show the 5 busiest months of 2018. limit() keeps the result a DataFrame for
# display() rather than collecting a Python list of Rows with take().
display(top_orders_2018_months.limit(5))

month(order_date),count(1)
1,7269
3,7211
4,6939
5,6873
2,6728


In [0]:
# Count 2018 orders per week of the year, busiest weeks first.
# NOTE: Spark's weekofyear follows ISO-8601 numbering (weeks start on Monday and
# week 1 is the first week with more than 3 days), so dates at the very start or
# end of a calendar year can be assigned to a week of the adjacent ISO year.
# ORDER BY uses positional references: 2 = the COUNT(*) column, 1 = the week.
# The second sort key (week, ascending) makes the ranking deterministic when
# weeks tie on count, so a top-N preview is reproducible between runs.
top_orders_2018_weeks = spark.sql("""
SELECT weekofyear(orders_2018_view.order_date), COUNT(*)
from orders_2018_view
GROUP BY weekofyear(orders_2018_view.order_date)
ORDER BY 2 desc, 1
""")

In [0]:
# Show the 7 busiest weeks of 2018. limit() keeps the result a DataFrame for
# display() rather than collecting a Python list of Rows with take().
display(top_orders_2018_weeks.limit(7))

weekofyear(order_date),count(1)
31,2058
32,1988
19,1987
9,1903
33,1875
20,1837
2,1785
