In [1]:
from pyspark.sql import SparkSession

import getpass

username = getpass.getuser()

spark = SparkSession.\
builder. \
config('spark.ui.port','0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

spark

In [2]:
orders_df = spark.read.format("csv")\
.option("header","true")\
.option("inferSchema","true")\
.load("/public/trendytech/orders_wh/*")

In [3]:
orders_df.show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

In [4]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [5]:
orders_df.createOrReplaceTempView("orders")

## top 15 customers who placed max number of orders

In [17]:
ans_df = spark.sql("""select customer_id,count(*) as orders from orders group by customer_id order by orders desc limit 15
""")
ans_df.show(truncate=0)

+-----------+------+
|customer_id|orders|
+-----------+------+
|5897       |16    |
|12431      |16    |
|569        |16    |
|6316       |16    |
|12284      |15    |
|4320       |15    |
|5624       |15    |
|5283       |15    |
|221        |15    |
|5654       |15    |
|6248       |14    |
|3708       |14    |
|1011       |14    |
|8652       |14    |
|4517       |14    |
+-----------+------+



In [14]:
ans_df = orders_df.groupBy("customer_id").count()
ans_df.sort("count",ascending=False).show(15)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|        569|   16|
|       5897|   16|
|      12431|   16|
|       6316|   16|
|        221|   15|
|       4320|   15|
|       5654|   15|
|      12284|   15|
|       5283|   15|
|       5624|   15|
|       3708|   14|
|       4517|   14|
|       6248|   14|
|       3710|   14|
|        791|   14|
+-----------+-----+
only showing top 15 rows



## number of orders under each order status

In [18]:
ans_df = spark.sql("""select order_status,count(*) as orders from orders group by order_status order by orders desc
""")
ans_df.show(truncate=0)

+---------------+------+
|order_status   |orders|
+---------------+------+
|COMPLETE       |22899 |
|PENDING_PAYMENT|15030 |
|PROCESSING     |8275  |
|PENDING        |7610  |
|CLOSED         |7556  |
|ON_HOLD        |3798  |
|SUSPECTED_FRAUD|1558  |
|CANCELED       |1428  |
|PAYMENT_REVIEW |729   |
+---------------+------+



In [16]:
ans_df = orders_df.groupBy("order_status").count()
ans_df.sort("count",ascending=False).show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8275|
|        PENDING| 7610|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



## no. of active customers (with atleast one order)

In [23]:
ans_df = orders_df.select("customer_id").distinct().count()
ans_df

12405

In [24]:
ans_df = spark.sql("""select distinct customer_id from orders""")
ans_df.count()

12405

## customers with most number of closed orders

In [29]:
ans_df = orders_df.filter("order_status='CLOSED'").groupBy("customer_id").count()
ans_df.sort("count",ascending=False).show(1)

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
+-----------+-----+
only showing top 1 row



In [30]:
ans_df = spark.sql("""select customer_id,count(*) as cnt from orders where order_status='CLOSED' group by customer_id
order by cnt desc limit 1
""")
ans_df.show(truncate=0)

+-----------+---+
|customer_id|cnt|
+-----------+---+
|1833       |6  |
+-----------+---+

