In [1]:
from pyspark.sql import SparkSession
import getpass
# Login name of the user running the notebook; it selects the per-user
# warehouse directory configured below.
username = getpass.getuser()

# Create (or reuse, via getOrCreate) a Hive-enabled SparkSession whose master
# is 'yarn'. The three config entries are passed to the builder as-is.
spark = (
    SparkSession.builder
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master("yarn")
    .getOrCreate()
)

In [2]:
# DDL-style schema string handed to the CSV reader: one "name type" entry per
# column. Adjacent literals are concatenated, so the resulting string (including
# its original spacing) is unchanged.
orders_schema = (
    "order_id long , "
    "order_date string, "
    "customer_id long,"
    "order_status string"
)

In [3]:
# Load the orders data in CSV format, applying the explicit orders_schema
# (order_date is therefore read as a string column).
orders_df = (
    spark.read
    .format("csv")
    .schema(orders_schema)
    .load("/public/trendytech/retail_db/ordersnew")
)

In [4]:
# Print a preview of the loaded orders to check that the schema was applied.
orders_df.show()

+--------+--------------------+-----------+------------+
|order_id|          order_date|customer_id|order_status|
+--------+--------------------+-----------+------------+
|    2480|2013-08-07 00:00:...|       3807|    COMPLETE|
|   30479|2014-01-30 00:00:...|       9265|    COMPLETE|
|    2481|2013-08-07 00:00:...|       2476|    COMPLETE|
|   30481|2014-01-30 00:00:...|       9240|    COMPLETE|
|    2483|2013-08-07 00:00:...|      10453|    COMPLETE|
|   30484|2014-01-30 00:00:...|       2876|    COMPLETE|
|    2484|2013-08-07 00:00:...|       9256|    COMPLETE|
|   30485|2014-01-30 00:00:...|       1069|    COMPLETE|
|    2488|2013-08-07 00:00:...|       1255|    COMPLETE|
|   30486|2014-01-30 00:00:...|       1151|    COMPLETE|
|    2491|2013-08-07 00:00:...|        247|    COMPLETE|
|   30487|2014-01-30 00:00:...|       6772|    COMPLETE|
|    2495|2013-08-07 00:00:...|       9011|    COMPLETE|
|   30489|2014-01-30 00:00:...|       5717|    COMPLETE|
|    2498|2013-08-07 00:00:...|

In [5]:
# Register orders_df as the temporary view "orders" so that the spark.sql
# queries in the following cells can refer to it by name.
orders_df.createOrReplaceTempView("orders")

In [6]:
# Order count per customer and month name. Sorting on order_month sorts the
# month names as text, so the result is alphabetical (April first in the
# captured output), not calendar order.
month_name_sort_sql = """select customer_id, date_format(order_date, 'MMMM') as order_month, 
count(1) as total_count from orders
group by customer_id, order_month order by order_month"""

spark.sql(month_name_sort_sql).show()

+-----------+-----------+-----------+
|customer_id|order_month|total_count|
+-----------+-----------+-----------+
|      11163|      April|       2010|
|       4083|      April|        375|
|       3035|      April|        375|
|       3832|      April|       2007|
|       1433|      April|        375|
|       7996|      April|       2007|
|       2913|      April|        375|
|      12068|      April|       2007|
|       5520|      April|        375|
|       4961|      April|       2007|
|      12179|      April|        375|
|       2247|      April|       2007|
|       7535|      April|        750|
|       9555|      April|       2382|
|       2668|      April|        375|
|       5575|      April|       2007|
|         12|      April|        375|
|       2057|      April|       2007|
|       9730|      April|        375|
|       6868|      April|       2007|
+-----------+-----------+-----------+
only showing top 20 rows



In [7]:
# Same aggregation, with an extra month_num column: the 'MM'-formatted month
# (shown as 01, 02, ... in the captured output) taken via first() within each
# (customer_id, order_month) group. The result is sorted on month_num instead
# of on the month name.
month_num_text_sql = """select customer_id, date_format(order_date, 'MMMM') as order_month, 
count(1) as tot_count, first(date_format(order_date, 'MM')) as month_num from orders
group by customer_id, order_month order by month_num
"""

spark.sql(month_num_text_sql).show()

+-----------+-----------+---------+---------+
|customer_id|order_month|tot_count|month_num|
+-----------+-----------+---------+---------+
|       1902|    January|      375|       01|
|       4124|    January|      375|       01|
|       3648|    January|      375|       01|
|       3692|    January|     2010|       01|
|       3764|    January|     4395|       01|
|       4473|    January|     6030|       01|
|       4711|    January|      750|       01|
|       4871|    January|      375|       01|
|       5186|    January|     2010|       01|
|       6122|    January|     2010|       01|
|       6128|    January|      375|       01|
|       7396|    January|     2010|       01|
|       8156|    January|     2760|       01|
|       8159|    January|     2010|       01|
|       8554|    January|      375|       01|
|       8864|    January|      375|       01|
|       9463|    January|     2010|       01|
|       9567|    January|     2010|       01|
|       9855|    January|      375

In [8]:
# Same query as the month_num (text) cell, but written out through the "noop"
# format instead of being displayed.
# NOTE(review): this looks like a way to execute the whole query (not just the
# rows show() would need) for timing / plan inspection — confirm intent.
month_num_text_sql = """select customer_id, date_format(order_date, 'MMMM') as order_month, 
count(1) as tot_count, first(date_format(order_date, 'MM')) as month_num from orders
group by customer_id, order_month order by month_num
"""

noop_writer = spark.sql(month_num_text_sql).write.format("noop").mode("overwrite")
noop_writer.save()

In [11]:
# Variant of the month_num query in which the 'MM' text is wrapped in int(...)
# before first() is applied, so month_num is shown as 1, 2, ... and the sort
# runs on that integer value.
month_num_int_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month, 
count(1) as tot_count, first(int(date_format(order_date, 'MM'))) as month_num from orders
group by customer_id, order_month order by month_num
"""

spark.sql(month_num_int_sql).show()

+-----------+-----------+---------+---------+
|customer_id|order_month|tot_count|month_num|
+-----------+-----------+---------+---------+
|       8763|    January|     2010|        1|
|       1852|    January|      375|        1|
|       5235|    January|     2010|        1|
|        332|    January|     2010|        1|
|       3275|    January|     2385|        1|
|       4618|    January|     4770|        1|
|       5291|    January|     2010|        1|
|       3699|    January|     2385|        1|
|      12232|    January|     2010|        1|
|      12198|    January|     4395|        1|
|       5633|    January|     2010|        1|
|        255|    January|     2010|        1|
|      11418|    January|      375|        1|
|       3080|    January|      375|        1|
|      10275|    January|      375|        1|
|       2150|    January|      375|        1|
|       9300|    January|      375|        1|
|       9435|    January|      750|        1|
|       6951|    January|      375

In [12]:
# Integer-month_num variant of the query, written out through the "noop"
# format instead of being displayed.
# NOTE(review): presumably paired with the text-month_num noop run so the two
# executions can be compared — confirm intent.
month_num_int_sql = """
select customer_id, date_format(order_date, 'MMMM') as order_month, 
count(1) as tot_count, first(int(date_format(order_date, 'MM'))) as month_num from orders
group by customer_id, order_month order by month_num
"""

noop_writer = spark.sql(month_num_int_sql).write.format("noop").mode("overwrite")
noop_writer.save()