In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
orders_df = spark.read \
.format("csv") \
.option("header", "true") \
.option("inferSchema", "true") \
.load("/public/trendytech/orders_wh/*")

In [3]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



### Higher Level API's Demo

In [4]:
orders_df.createOrReplaceTempView("orders")

### 1.Top 15 customers who placed most number of orders

In [5]:
result = orders_df.groupBy("customer_id").count().sort("count",ascending = False).limit(15)

In [6]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



In [7]:
result = spark.sql("select customer_id,count(order_id) as count from orders group by customer_id order by count desc limit 15")

In [8]:
result.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       5897|   16|
|      12431|   16|
|        569|   16|
|       6316|   16|
|      12284|   15|
|       4320|   15|
|       5624|   15|
|       5283|   15|
|        221|   15|
|       5654|   15|
|       6248|   14|
|       3708|   14|
|       1011|   14|
|       8652|   14|
|       4517|   14|
+-----------+-----+



### Find the number of orders under each order status

In [9]:
result = orders_df.groupBy("order_status").count()

In [10]:
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



In [11]:
result = spark.sql("select order_status,count(order_id) as count from orders group by order_status")

In [12]:
result.show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|PENDING_PAYMENT|15030|
|       COMPLETE|22899|
|        ON_HOLD| 3798|
| PAYMENT_REVIEW|  729|
|     PROCESSING| 8275|
|         CLOSED| 7556|
|SUSPECTED_FRAUD| 1558|
|        PENDING| 7610|
|       CANCELED| 1428|
+---------------+-----+



### 3.Number of active customers(customers who have placed atleast one order)

In [13]:
results = orders_df.select("customer_id").distinct().count()

In [14]:
print(results)

12405


In [15]:
results = spark.sql("select count(distinct(customer_id)) as active_customers from orders") 

In [16]:
results.show()

+----------------+
|active_customers|
+----------------+
|           12405|
+----------------+



### 4.Customers with most number of orders

In [17]:
results = orders_df.filter("order_status='CLOSED'").groupBy("customer_id").count().sort("count",ascending=False)

In [18]:
results.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       3631|    4|
|       2403|    4|
|       5319|    4|
|       4573|    4|
|       7850|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|        437|    4|
|      10018|    4|
|       7879|    4|
|       2236|    4|
|       2774|    4|
+-----------+-----+
only showing top 20 rows



In [19]:
results = spark.sql("select customer_id,count(order_id) as count from orders where order_status='CLOSED' group by customer_id order by count desc")

In [20]:
results.show()

+-----------+-----+
|customer_id|count|
+-----------+-----+
|       1833|    6|
|       1363|    5|
|       1687|    5|
|       5493|    5|
|       7948|    4|
|       2768|    4|
|      10263|    4|
|       2236|    4|
|       2403|    4|
|       7879|    4|
|       4573|    4|
|       7850|    4|
|      12431|    4|
|       1521|    4|
|      10111|    4|
|        437|    4|
|      10018|    4|
|       5319|    4|
|       2774|    4|
|       3631|    4|
+-----------+-----+
only showing top 20 rows

