In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or reuse, via getOrCreate) the SparkSession for this notebook:
# YARN master, Hive support enabled, and an explicit warehouse directory.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("sort/hash aggregate")
    .enableHiveSupport()
    # NOTE(review): presumably needed for compatibility with this cluster's
    # shuffle service — confirm before removing.
    .config("spark.shuffle.useOldFetchProtocol", 'true')
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare last expression so the notebook displays the session object.
spark

In [3]:
# Explicit DDL-style schema for the orders CSV; order_date is kept as a string.
order_schema = 'order_id  long, order_date string, customer_id long, order_status string '

# Read the orders dataset as CSV with the schema above applied.
orders_df = (
    spark.read
    .format("csv")
    .schema(order_schema)
    .load("/public/trendytech/retail_db/ordersnew")
)

In [4]:
# Print the column names and types of orders_df to confirm order_schema was applied.
orders_df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [5]:
# Preview orders_df; show() with no arguments prints the first 20 rows and
# truncates long cell values (visible in the order_date column).
orders_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [6]:
# Check how many partitions the CSV read produced for orders_df.
orders_df.rdd.getNumPartitions()

23

In [7]:
# Register orders_df as the temporary view "orders" so the following cells can
# query it through spark.sql.
orders_df.createOrReplaceTempView("orders")

In [9]:
# Count orders per customer per calendar month, sorted by month number.
# NOTE(review): the grouping uses month(order_date) only, so the same month
# from different years lands in one group — confirm that is intended.
orders_by_month_query = """select customer_id,month(order_date) as month, count(order_id) as orders from orders group by customer_id,
month(order_date)
order by month(order_date) """
orders_by_month_df = spark.sql(orders_by_month_query)
orders_by_month_df.show()

+-----------+-----+------+
|customer_id|month|orders|
+-----------+-----+------+
|       2125|    1|  2010|
|      11530|    1|  2385|
|       6680|    1|  2010|
|        212|    1|  2010|
|       2429|    1|  2010|
|       1200|    1|  2010|
|       6640|    1|  2010|
|       8510|    1|  2385|
|       7765|    1|  2010|
|       9053|    1|  2010|
|       9567|    1|  2010|
|       7174|    1|   750|
|       2643|    1|   750|
|      10820|    1|   375|
|       6330|    1|   375|
|       5829|    1|   375|
|       9703|    1|   375|
|       8282|    1|   375|
|       2476|    1|   750|
|       3486|    1|   375|
+-----------+-----+------+
only showing top 20 rows



In [14]:
# Count orders per customer per month, showing the month by name. The
# two-digit month number (month_no) is picked up with first() and used as the
# sort key, since ordering by the month name itself would be alphabetical.
# NOTE(review): month_no is a string in this variant; presumably this is the
# case meant to show a sort-based aggregate in the physical plan (the app is
# named "sort/hash aggregate") — confirm with explain() or the Spark UI.
month_name_string_key_query = """select customer_id,date_format(order_date,'MMMM') as month, count(order_id) as orders,
first(date_format(order_date,'MM')) as month_no
from orders group by customer_id,
date_format(order_date,'MMMM')
order by month_no """
month_name_string_key_df = spark.sql(month_name_string_key_query)
month_name_string_key_df.show()

+-----------+-------+------+--------+
|customer_id|  month|orders|month_no|
+-----------+-------+------+--------+
|       1498|January|  2010|      01|
|       1683|January|  2385|      01|
|       2724|January|  2010|      01|
|       3529|January|   750|      01|
|       3948|January|   375|      01|
|       4040|January|  2385|      01|
|       4048|January|  2010|      01|
|       5039|January|   375|      01|
|       6862|January|   375|      01|
|       3538|January|  2010|      01|
|       6001|January|  2010|      01|
|       7115|January|   375|      01|
|       7536|January|   375|      01|
|       7994|January|   375|      01|
|       8471|January|   375|      01|
|       8525|January|  2385|      01|
|       8648|January|   375|      01|
|      10172|January|  2385|      01|
|       5548|January|  2010|      01|
|       5850|January|   375|      01|
+-----------+-------+------+--------+
only showing top 20 rows



In [15]:
# Run the string-keyed month_no query through a "noop" sink instead of show().
# NOTE(review): the noop format is understood to consume the whole result
# without persisting anything, so the full job runs and can be inspected in
# the Spark UI — confirm against the Spark version in use.
string_key_result = spark.sql("""select customer_id,date_format(order_date,'MMMM') as month, count(order_id) as orders,
first(date_format(order_date,'MM')) as month_no
from orders group by customer_id,
date_format(order_date,'MMMM')
order by month_no """)
(
    string_key_result.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [17]:
# Count orders per customer per month by name, with month_no cast to int
# before first() so the sort key is numeric rather than a string.
# NOTE(review): presumably the int-typed aggregate is the case meant to show a
# hash-based aggregate in the physical plan (the app is named
# "sort/hash aggregate") — confirm with explain() or the Spark UI.
month_name_int_key_query = """select customer_id,date_format(order_date,'MMMM') as month, count(order_id) as orders,
first(int(date_format(order_date,'MM'))) as month_no
from orders group by customer_id,
date_format(order_date,'MMMM')
order by month_no """
month_name_int_key_df = spark.sql(month_name_int_key_query)
month_name_int_key_df.show()

+-----------+-------+------+--------+
|customer_id|  month|orders|month_no|
+-----------+-------+------+--------+
|       8763|January|  2010|       1|
|       1852|January|   375|       1|
|       5235|January|  2010|       1|
|        332|January|  2010|       1|
|       3275|January|  2385|       1|
|       4618|January|  4770|       1|
|       5291|January|  2010|       1|
|       3699|January|  2385|       1|
|      12232|January|  2010|       1|
|      12198|January|  4395|       1|
|       5633|January|  2010|       1|
|        255|January|  2010|       1|
|       3080|January|   375|       1|
|      10275|January|   375|       1|
|       2150|January|   375|       1|
|       9300|January|   375|       1|
|       6022|January|   375|       1|
|       9435|January|   750|       1|
|       6951|January|   375|       1|
|       6012|January|   375|       1|
+-----------+-------+------+--------+
only showing top 20 rows



In [18]:
# Run the int-keyed month_no query through a "noop" sink instead of show().
# NOTE(review): the noop format is understood to consume the whole result
# without persisting anything, so the full job runs and can be inspected in
# the Spark UI — confirm against the Spark version in use.
int_key_result = spark.sql("""select customer_id,date_format(order_date,'MMMM') as month, count(order_id) as orders,
first(int(date_format(order_date,'MM'))) as month_no
from orders group by customer_id,
date_format(order_date,'MMMM')
order by month_no """)
(
    int_key_result.write
    .format("noop")
    .mode("overwrite")
    .save()
)