In [0]:
from pyspark.sql.functions import to_date, year, month, weekofyear, desc, rank
from pyspark.sql.types import DateType, DateConverter

In [0]:
%run "../includes/configurations"

#### Ingesting processed data

In [0]:
## TABLES: ORDERS AND ORDER ITEMS, WHICH JOIN ON order_id
## (orders has no product_id column; product_id lives on order_items).
## TO GET RESULTS BASED ON PRODUCT CATEGORY RATHER THAN product_id WE CAN USE products.csv AS WELL
## NOTE(review): processed_folder_path is presumably defined by the
## "../includes/configurations" notebook run earlier -- confirm.
orders_df = spark.read.parquet(f"{processed_folder_path}/orders")
order_items_df = spark.read.parquet(f"{processed_folder_path}/order_items")

In [0]:
# Preview five rows of the orders table (head(n) returns the same list of Rows as take(n)).
display(orders_df.head(5))

order_id,customer_id,order_status,order_date,order_delivery_date
6ec1bea8cbcef0a1b81bc9b7fbd37ccb,e6b5e20566e5c72cbaab04f91dec9c85,delivered,2018-08-07,2018-08-27
441972a5bbd51a10459a487402076942,b79fa9dfed0c3d624b70fbd0ca2469de,delivered,2018-08-23,2018-08-29
fbecebecbe32df9dc3411b1e35b5484c,7f9f88f14a8f0dc733a5e371af98abac,delivered,2017-03-20,2017-03-27
9e22b00fd1b9f036577e8c517ea0a5d4,5a7346263920ecab0f88da703bbaf3b1,delivered,2017-12-01,2018-01-03
07d9d11b0175952638ddd53d7855f8a7,a1c779cddccf5ffe60e543d3ce0dc477,delivered,2017-11-22,2017-11-30


In [0]:
# Preview five rows of the order_items table.
display(order_items_df.head(5))

order_id,order_item_id,product_id,seller_id,shipping_limit_date,price
a292ffb436f475f98ec53a05994dbd94,1,5f21301936c11698db6aee5444190da0,fe2032dab1a61af8794248c8196565c9,2017-06-29T12:23:42.000+0000,149.9
a29361cea3ac5ecb55cfc2c64b6c339a,1,5a3320037d5922a7708647c81ecc1f15,b6d44737c043328708f6749c2dbe50bd,2018-03-01T15:55:27.000+0000,79.0
a29361cea3ac5ecb55cfc2c64b6c339a,2,7a6aebc4c1205818e64f9275475a73e9,8bd0f31cf0a614c658f6763bd02dea69,2018-03-01T15:55:27.000+0000,199.99
a2940f1b67316eac4f302e29554392aa,1,d0fe4295267f15ccaceac4fb233d8c9a,0db783cfcd3b73998abc6e10e59a102f,2017-06-30T16:55:11.000+0000,49.9
a2941bd059891a9336409a22149acf24,1,0aabfb375647d9738ad0f7b4ea3653b1,37515688008a7a40ac93e3b2e4ab203f,2017-11-28T03:06:59.000+0000,24.5


In [0]:
# Expose both source frames to Spark SQL under session-scoped temp view names
# so the join below can be written as a SQL query.
orders_df.createOrReplaceTempView("orders_df_view")
order_items_df.createOrReplaceTempView("order_items_df_view")

In [0]:
# Attach every order item to its parent order.
# - INNER JOIN: orders without items (and items without an order) are dropped.
# - USING (order_id): the result carries a single order_id column, followed by
#   the remaining orders columns and then the remaining order_items columns.
# The statement deliberately has no trailing ';' -- spark.sql() takes exactly one
# statement and older Spark parsers reject the terminator with a ParseException.
joined_df = spark.sql("""
SELECT *
FROM orders_df_view
INNER JOIN order_items_df_view
USING (order_id)
""")

In [0]:
# Number of rows produced by the inner join (count() is an action, so this runs the join).
joined_df.count()

In [0]:
# Spot-check seven joined rows: order columns first, then the order-item columns.
display(joined_df.head(7))

order_id,customer_id,order_status,order_date,order_delivery_date,order_item_id,product_id,seller_id,shipping_limit_date,price
a292ffb436f475f98ec53a05994dbd94,c08ef557085ca9fb0c59f6d820a0b059,delivered,2017-06-25,2017-07-18,1,5f21301936c11698db6aee5444190da0,fe2032dab1a61af8794248c8196565c9,2017-06-29T12:23:42.000+0000,149.9
a29361cea3ac5ecb55cfc2c64b6c339a,674a4d874e368f09a9760046e9e4a3e0,delivered,2018-02-25,2018-02-27,1,5a3320037d5922a7708647c81ecc1f15,b6d44737c043328708f6749c2dbe50bd,2018-03-01T15:55:27.000+0000,79.0
a29361cea3ac5ecb55cfc2c64b6c339a,674a4d874e368f09a9760046e9e4a3e0,delivered,2018-02-25,2018-02-27,2,7a6aebc4c1205818e64f9275475a73e9,8bd0f31cf0a614c658f6763bd02dea69,2018-03-01T15:55:27.000+0000,199.99
a2940f1b67316eac4f302e29554392aa,cf6e2b8a22d2788c9b3e26b2a276a065,delivered,2017-06-26,2017-07-03,1,d0fe4295267f15ccaceac4fb233d8c9a,0db783cfcd3b73998abc6e10e59a102f,2017-06-30T16:55:11.000+0000,49.9
a2941bd059891a9336409a22149acf24,4ac22b1186fc5ac2711abbb5ab167f0f,delivered,2017-11-21,2017-12-05,1,0aabfb375647d9738ad0f7b4ea3653b1,37515688008a7a40ac93e3b2e4ab203f,2017-11-28T03:06:59.000+0000,24.5
a294a9571b11828c875657e7ea6d9a9b,22adc2c76364a3db4bad7f96ff956e4c,delivered,2017-10-22,2017-10-27,1,386eebb43722ab502f04f7900bd2451b,8b28d096634035667e8263d57ba3368c,2017-10-26T13:56:22.000+0000,57.9
a294b359ceeb6b570206ae9a4e266230,339a385d7c41c652faf8a49b4e645bbf,delivered,2017-05-03,2017-06-01,1,ab9cf155f8280c3739b09fe341a185e8,92eb0f42c21942b6552362b9b114707d,2017-05-09T21:35:24.000+0000,13.98


In [0]:
# Make the joined frame queryable from SQL as joined_df_view.
joined_df.createOrReplaceTempView("joined_df_view")

In [0]:
# Count order-item rows per product for every (year, month, week-of-year) of the
# order date, most-ordered first.
# Because month AND week are both grouping keys, a week whose days fall in two
# months yields two separate rows for the same product.
# The sort names the pro_count alias instead of the positional "ORDER BY 5":
# an ordinal silently changes meaning if the select list is reordered, and is
# treated as a constant when spark.sql.orderByOrdinal is disabled.
top_products = spark.sql("""
SELECT
    product_id,
    year(order_date)       AS order_year,
    month(order_date)      AS order_month,
    weekofyear(order_date) AS order_week,
    COUNT(product_id)      AS pro_count
FROM joined_df_view
GROUP BY
    product_id,
    year(order_date),
    month(order_date),
    weekofyear(order_date)
ORDER BY pro_count DESC
""")

In [0]:
# The frame is sorted by pro_count descending, so these are the ten largest counts.
display(top_products.head(10))

product_id,order_year,order_month,order_week,pro_count
422879e10f46682990de24d770e7f83d,2017,11,47,59
53759a2ecddad2bb87a079a1f1519f73,2017,11,47,56
53b36df67ebb7c41585e8d54d6772e08,2018,5,19,53
53b36df67ebb7c41585e8d54d6772e08,2018,5,18,53
e7cc48a9daff5436f63d3aad9426f28b,2018,8,32,48
389d119b48cf3043d311335e499d9c6b,2017,11,47,45
53b36df67ebb7c41585e8d54d6772e08,2018,4,17,45
aca2eb7d00ea1a7b8ebd4e68314663af,2018,1,1,42
d285360f29ac7fd97640bf0baef03de0,2018,7,28,40
aca2eb7d00ea1a7b8ebd4e68314663af,2018,1,2,38


In [0]:
# Register the per-product weekly counts for SQL access as top_products_view.
top_products.createOrReplaceTempView("top_products_view")

In [0]:
# NOTE(review): disabled exploratory query -- a 2018-only slice of top_products_view.
# Nothing in the visible notebook defines top_products_2018; either restore this
# cell or delete it so the notebook holds no dead code.
#top_products_2018 = spark.sql("""
#SELECT product_id, order_month, order_week, pro_count
#FROM top_products_view
#WHERE order_year = 2018
#GROUP BY product_id, order_month, order_week, pro_count
#ORDER BY 4 desc
#""")

In [0]:
# NOTE(review): disabled preview of top_products_2018, whose defining query is
# also commented out; restore or delete the two cells together.
#display(top_products_2018.take(10))

In [0]:
from pyspark.sql.window import Window

In [0]:
# Rank products inside each (order_year, order_month, order_week) bucket by
# descending pro_count. rank() gives tied rows the same rank and leaves a gap
# afterwards (1, 2, 2, 4, ...).
monthlyProdRankSpec = (
    Window
    .partitionBy("order_year", "order_month", "order_week")
    .orderBy(desc("pro_count"))
)
top_products = top_products.withColumn(
    colName="rank",
    col=rank().over(monthlyProdRankSpec),
)

In [0]:
# Row count of the ranked frame; withColumn only adds a column, so this equals
# the number of (product, year, month, week) groups.
top_products.count()

In [0]:
# Peek at ten ranked rows to confirm the new rank column.
display(top_products.head(10))

product_id,order_year,order_month,order_week,pro_count,rank
f3c2d01a84c947b078e32bbef0718962,2016,9,36,1,1
bdcec8e22b04a8f4241d87bcfe9ea877,2016,10,41,3,1
6d2928252aa5ba6a5fc439571dd0d748,2016,10,41,2,2
79938780eb7ffb7289f59a9a283bc06b,2016,10,41,2,2
c1488892604e4ba5cff5b4eb4d595400,2016,10,41,1,4
f27f3e8b4b3836e4d6cd756dfb83e750,2016,10,41,1,4
f9bcf9d8e89d012d7dc24fe4de0abd80,2016,10,41,1,4
f436a5130457df6b73cdb2c12a90ea2b,2016,10,41,1,4
afad7a4420a4348d41da23ed8d0902dd,2016,10,41,1,4
44377a26f7c80dbf0a9b2b300604e6bf,2016,10,41,1,4


In [0]:
# Re-register top_products_view: createOrReplaceTempView overwrites the view of
# the same name registered earlier, so SQL now sees the rank column as well.
top_products.createOrReplaceTempView("top_products_view")

In [0]:
%sql
-- Products holding ranks 1-3 within their (year, month, week) bucket.
-- LIMIT without ORDER BY returns an arbitrary subset, so the rows are ordered
-- explicitly (product_id breaks rank ties) to keep the preview reproducible.
SELECT *
FROM top_products_view
WHERE rank < 4
ORDER BY order_year, order_month, order_week, rank, product_id
LIMIT 10

product_id,order_year,order_month,order_week,pro_count,rank
f3c2d01a84c947b078e32bbef0718962,2016,9,36,1,1
bdcec8e22b04a8f4241d87bcfe9ea877,2016,10,41,3,1
6d2928252aa5ba6a5fc439571dd0d748,2016,10,41,2,2
79938780eb7ffb7289f59a9a283bc06b,2016,10,41,2,2
f5d8f4fbc70ca2a0038b9a0010ed5cb0,2016,12,51,1,1
4a90b44b456e79e7a33435fdd36e551f,2017,1,2,9,1
c1488892604e4ba5cff5b4eb4d595400,2017,1,2,3,2
fe7976532777922f0d60261e75f7ad36,2017,1,2,3,2
16ce899c7af0c99f46948734a0d00f0f,2017,1,3,9,1
5445335704bf8a32827f3bf6b9701ed9,2017,1,3,5,2


### Identifying top sellers

In [0]:
# Count joined order-item rows per seller for every (year, month, week-of-year)
# of the order date, newest period first.
# Because month AND week are both grouping keys, a week whose days fall in two
# months yields two separate rows for the same seller.
# The sort names the aliases instead of ordinals (2, 3, 4): ordinals silently
# change meaning if the select list is reordered, and are treated as constants
# when spark.sql.orderByOrdinal is disabled.
top_sellers_yearwise = spark.sql("""
SELECT
    seller_id,
    year(order_date)       AS order_year,
    month(order_date)      AS order_month,
    weekofyear(order_date) AS order_week,
    COUNT(*)               AS sell_count
FROM joined_df_view
GROUP BY
    seller_id,
    year(order_date),
    month(order_date),
    weekofyear(order_date)
ORDER BY order_year DESC, order_month DESC, order_week DESC
""")

In [0]:
# Show the seven most recent seller/period rows, then the total number of groups.
display(top_sellers_yearwise.head(7))
top_sellers_yearwise.count()

seller_id,order_year,order_month,order_week,sell_count
25be943a321c8938947bdaabca979a90,2018,9,36,1
508808d438fe2ff972ed13bb8f4a82e2,2018,8,35,2
33a6f4b1e7cdc205511e76ba1b6e0186,2018,8,35,1
fd386aa7bed2af3c7035c65506c9b4a3,2018,8,35,3
f0b47fbbc6dee9aafe415a6e33051b3f,2018,8,35,1
2c9e548be18521d1c43cde1c582c6de8,2018,8,35,1
6560211a19b47992c3666cc44a7e94c0,2018,8,35,4


In [0]:
# Register the per-seller weekly counts for SQL access as top_sellers_yearwise_view.
top_sellers_yearwise.createOrReplaceTempView("top_sellers_yearwise_view")

In [0]:
# 2018 slice of the per-seller weekly counts, largest sell_count first.
# No GROUP BY is needed: top_sellers_yearwise_view already has exactly one row
# per (seller_id, order_year, order_month, order_week), so grouping by every
# selected column with no aggregate was a no-op that only added a shuffle.
# The sort names sell_count rather than the positional ordinal 4.
top_sellers_2018 = spark.sql("""
SELECT seller_id, order_month, order_week, sell_count
FROM top_sellers_yearwise_view
WHERE order_year = 2018
ORDER BY sell_count DESC
""")

In [0]:
# The frame is sorted by sell_count descending, so these are the ten busiest seller-weeks of 2018.
display(top_sellers_2018.head(10))

seller_id,order_month,order_week,sell_count
7d13fca15225358621be4086e1eb0964,5,19,109
955fee9216a65b617aa5c0531780ce60,1,3,86
7d13fca15225358621be4086e1eb0964,5,18,83
955fee9216a65b617aa5c0531780ce60,5,20,79
955fee9216a65b617aa5c0531780ce60,4,17,78
3d871de0142ce09b7081e2b9d1733cb1,1,2,77
955fee9216a65b617aa5c0531780ce60,1,2,75
955fee9216a65b617aa5c0531780ce60,5,19,66
1f50f920176fa81dab994f9023523100,2,7,65
6560211a19b47992c3666cc44a7e94c0,7,30,65


In [0]:
# Rank sellers inside each (order_month, order_week) bucket by descending
# sell_count. The year is not a partition key because the frame was already
# filtered to order_year = 2018. rank() gives ties the same rank and leaves a
# gap afterwards.
sellerRankSpec = (
    Window
    .partitionBy("order_month", "order_week")
    .orderBy(desc("sell_count"))
)
top_sellers_2018 = top_sellers_2018.withColumn(
    colName="rank",
    col=rank().over(sellerRankSpec),
)

In [0]:
# Peek at ten ranked rows to confirm the new rank column.
display(top_sellers_2018.head(10))

seller_id,order_month,order_week,sell_count,rank
3d871de0142ce09b7081e2b9d1733cb1,1,1,56,1
955fee9216a65b617aa5c0531780ce60,1,1,52,2
8b321bb669392f5163d04c59e235e066,1,1,38,3
da8622b14eb17ae2831f4ac5b9dab84a,1,1,38,3
ea8482cd71df3c1969d7b9473ff13abc,1,1,32,5
128639473a139ac0f3e5f5ade55873a5,1,1,25,6
4a3ca9315b744ce9f8e9374361493884,1,1,24,7
cc419e0650a3c5ba77189a1882b7556a,1,1,21,8
e9779976487b77c6d4ac45f75ec7afe9,1,1,18,9
7ddcbb64b5bc1ef36ca8c151f6ec77df,1,1,17,10


In [0]:
# Register the ranked 2018 seller frame for SQL access as top_sellers_2018_view.
top_sellers_2018.createOrReplaceTempView("top_sellers_2018_view")

In [0]:
%sql
-- Rank-1 seller(s) for each (month, week) bucket of 2018.
-- LIMIT without ORDER BY returns an arbitrary subset, so the rows are ordered
-- explicitly (seller_id breaks ties for first place) to keep the preview reproducible.
SELECT *
FROM top_sellers_2018_view
WHERE rank < 2
ORDER BY order_month, order_week, seller_id
LIMIT 10

seller_id,order_month,order_week,sell_count,rank
3d871de0142ce09b7081e2b9d1733cb1,1,1,56,1
3d871de0142ce09b7081e2b9d1733cb1,1,2,77,1
955fee9216a65b617aa5c0531780ce60,1,3,86,1
da8622b14eb17ae2831f4ac5b9dab84a,1,4,48,1
ea8482cd71df3c1969d7b9473ff13abc,1,5,24,1
8b321bb669392f5163d04c59e235e066,2,5,27,1
1f50f920176fa81dab994f9023523100,2,6,56,1
1f50f920176fa81dab994f9023523100,2,7,65,1
8b321bb669392f5163d04c59e235e066,2,8,46,1
955fee9216a65b617aa5c0531780ce60,2,9,33,1
