In [1]:
from pyspark.sql import SparkSession

import getpass

# OS-level login name of whoever runs the notebook; it is interpolated into
# the warehouse directory below so the path is per-user.
username = getpass.getuser()

# Build a new session or reuse an existing one (getOrCreate), with Hive
# support enabled and YARN as the master.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [2]:
# Read every file matching the orders glob into an RDD of text lines
# (one RDD element per line of input).
orders_rdd = spark.sparkContext.textFile("/public/trendytech/retail_db/orders/*")

In [5]:
# Preview the first 10 raw lines to see the comma-separated record layout.
orders_rdd.take(10)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [10]:
# Turn each order line into a (status, 1) pair, where the status is the
# last comma-separated field of the line.
rdd1 = orders_rdd.map(
    lambda line: (line.split(",")[-1], 1)
)

In [11]:
# Preview the first 10 (status, 1) pairs.
rdd1.take(10)

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('COMPLETE', 1),
 ('PROCESSING', 1),
 ('PENDING_PAYMENT', 1),
 ('PENDING_PAYMENT', 1)]

In [12]:
# Add up the 1s per key to get the number of orders for each status.
rdd2 = rdd1.reduceByKey(lambda left, right: left + right)

In [13]:
# Show up to 10 (status, count) pairs, in whatever order the RDD yields them.
rdd2.take(10)

[('CLOSED', 7556),
 ('CANCELED', 1428),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899),
 ('PROCESSING', 8275),
 ('PAYMENT_REVIEW', 729),
 ('PENDING', 7610),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558)]

In [17]:
# Order the (status, count) pairs by key, i.e. by status name, descending.
rdd3 = rdd2.sortByKey(ascending=False)

In [18]:
# Show up to 10 pairs from the key-sorted result.
rdd3.take(10)

[('SUSPECTED_FRAUD', 1558),
 ('PROCESSING', 8275),
 ('PENDING_PAYMENT', 15030),
 ('PENDING', 7610),
 ('PAYMENT_REVIEW', 729),
 ('ON_HOLD', 3798),
 ('COMPLETE', 22899),
 ('CLOSED', 7556),
 ('CANCELED', 1428)]

In [19]:
# Order the (status, count) pairs by the count (last tuple element),
# largest first.
rdd4 = rdd2.sortBy(
    lambda status_count: status_count[-1],
    ascending=False,
)

In [20]:
# Show up to 10 pairs from the count-sorted result.
rdd4.take(10)

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8275),
 ('PENDING', 7610),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

# Top 10 customer_ids by number of orders placed

In [23]:
# Look at the raw order lines again; the customer id is the
# second-to-last comma-separated field.
orders_rdd.take(10)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE',
 '6,2013-07-25 00:00:00.0,7130,COMPLETE',
 '7,2013-07-25 00:00:00.0,4530,COMPLETE',
 '8,2013-07-25 00:00:00.0,2911,PROCESSING',
 '9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT',
 '10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT']

In [25]:
# Emit a (customer_id, 1) pair per order line; the customer id is the
# second-to-last comma-separated field and is kept as a string.
customer_id_cnt_rdd = orders_rdd.map(
    lambda line: (line.split(",")[-2], 1)
)

In [26]:
# Preview the first 10 (customer_id, 1) pairs.
customer_id_cnt_rdd.take(10)

[('11599', 1),
 ('256', 1),
 ('12111', 1),
 ('8827', 1),
 ('11318', 1),
 ('7130', 1),
 ('4530', 1),
 ('2911', 1),
 ('5657', 1),
 ('5648', 1)]

In [27]:
# Sum the per-order 1s for each customer id to get orders-per-customer.
customer_id_sum_rdd = customer_id_cnt_rdd.reduceByKey(
    lambda left, right: left + right
)

In [28]:
# Preview 10 (customer_id, order_count) pairs (not sorted at this point).
customer_id_sum_rdd.take(10)

[('3066', 6),
 ('3159', 7),
 ('8135', 11),
 ('2248', 4),
 ('6117', 6),
 ('7733', 7),
 ('6540', 3),
 ('4882', 8),
 ('6060', 7),
 ('10436', 8)]

In [29]:
# Rank customers by their order count (last tuple element), highest first.
ordered_customers = customer_id_sum_rdd.sortBy(
    lambda id_count: id_count[-1],
    ascending=False,
)

In [30]:
# The first 10 entries of the descending sort are the top 10 customers
# by order count.
ordered_customers.take(10)

[('5897', 16),
 ('6316', 16),
 ('12431', 16),
 ('569', 16),
 ('4320', 15),
 ('221', 15),
 ('5624', 15),
 ('5283', 15),
 ('12284', 15),
 ('5654', 15)]

In [31]:
# Keep customers whose order count is at least 1.
# NOTE(review): customer_id_sum_rdd is built by summing a 1 per order line,
# so every count is already >= 1 and this filter keeps every row.
atleast_1_order = customer_id_sum_rdd.filter(
    lambda id_count: id_count[-1] >= 1
)

In [32]:
# Preview 10 of the (customer_id, order_count) pairs that passed the filter.
atleast_1_order.take(10)

[('256', 10),
 ('12111', 6),
 ('11318', 6),
 ('7130', 7),
 ('2911', 6),
 ('5657', 12),
 ('9149', 4),
 ('9842', 7),
 ('7276', 5),
 ('9488', 7)]

In [35]:
# Alternative approach: pull the customer id (second-to-last CSV field)
# straight from the raw order lines, de-duplicate, and count the ids.

cust_atleast_1_order = (
    orders_rdd
    .map(lambda line: line.split(",")[-2])
    .distinct()
)
cust_atleast_1_order.count()

12405

# Customer with the most closed orders

In [42]:
# Keep only the order lines whose last comma-separated field (the status)
# is exactly "CLOSED".
closed_orders = orders_rdd.filter(
    lambda line: line.split(",")[-1] == "CLOSED"
)

In [43]:
# Preview the first 10 order lines that passed the CLOSED filter.
closed_orders.take(10)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '12,2013-07-25 00:00:00.0,1837,CLOSED',
 '18,2013-07-25 00:00:00.0,1205,CLOSED',
 '24,2013-07-25 00:00:00.0,11441,CLOSED',
 '25,2013-07-25 00:00:00.0,9503,CLOSED',
 '37,2013-07-25 00:00:00.0,5863,CLOSED',
 '51,2013-07-25 00:00:00.0,12271,CLOSED',
 '57,2013-07-25 00:00:00.0,7073,CLOSED',
 '61,2013-07-25 00:00:00.0,4791,CLOSED']

In [50]:
# Emit a (customer_id, 1) pair for every closed order line.
cust_cnt = closed_orders.map(
    lambda line: (line.split(",")[-2], 1)
)

In [55]:
# Closed-order count per customer id.
order_sum = cust_cnt.reduceByKey(lambda left, right: left + right)

# Rank customers by that count, highest first, and show the first entry.
top_cust = order_sum.sortBy(
    lambda id_count: id_count[-1],
    ascending=False,
)
top_cust.take(1)

[('1833', 6)]