In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
orders_schema = 'order_id long, order_date date, customer_id long, order_status string'

In [3]:
orders_df = spark.read. \
format('csv'). \
schema(orders_schema). \
load('/public/trendytech/orders/orders_1gb.csv')

In [None]:
spark.sql("create database itv013010_cachingdemo_db")

### How to Write df into spark table.

In [None]:
orders_df.write.format("csv").saveAsTable("itv013010_cachingdemo_db.itv013010_orders1")

In [13]:
!hadoop fs -ls /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1

Found 10 items
-rw-r--r--   3 itv013010 supergroup          0 2024-07-14 13:46 /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/_SUCCESS
-rw-r--r--   3 itv013010 supergroup  100316856 2024-07-14 13:46 /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
-rw-r--r--   3 itv013010 supergroup  100316948 2024-07-14 13:46 /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00001-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
-rw-r--r--   3 itv013010 supergroup  100316813 2024-07-14 13:46 /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00002-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
-rw-r--r--   3 itv013010 supergroup  100319652 2024-07-14 13:46 /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00003-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
-rw-r--r--   3 itv013010 supergroup  100316854 2024-07-14 13:46 /use

In [5]:
spark.sql("describe extended itv013010_cachingdemo_db.itv013010_orders1 ").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|              bigint|   null|
|          order_date|                date|   null|
|         customer_id|              bigint|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|itv013010_caching...|       |
|               Table|   itv013010_orders1|       |
|               Owner|           itv013010|       |
|        Created Time|Sun Jul 14 13:46:...|       |
|         Last Access|Wed Dec 31 19:00:...|       |
|          Created By|         Spark 2.4.7|       |
|                Type|             MANAGED|       |
|            Provider|                 csv|       |
|    Table Properties|[transient_lastDd...|       |
|          Statistics|     840836656 bytes|       |
|           

In [6]:
spark.sql("select count(*) from itv013010_cachingdemo_db.itv013010_orders1").show()

+--------+
|count(1)|
+--------+
|25831126|
+--------+



## Caching Table in Spark is NOT LAZY.
### [Add lazy keyword to make it LAZY]

In [7]:
spark.sql("cache table itv013010_cachingdemo_db.itv013010_orders1")

In [8]:
spark.sql("select count(*) from itv013010_cachingdemo_db.itv013010_orders1").show()

+--------+
|count(1)|
+--------+
|25831126|
+--------+



In [9]:
spark.sql("select distinct(order_status) from itv013010_cachingdemo_db.itv013010_orders1").show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|         BOOKED|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



In [10]:
spark.sql("select count(distinct(order_status)) from itv013010_cachingdemo_db.itv013010_orders1").show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                          10|
+----------------------------+



## Now how to Uncache?

In [11]:
spark.sql("uncache table itv013010_cachingdemo_db.itv013010_orders1")

## How to make cache LAZY in spark.sql

In [12]:
spark.sql("cache lazy table itv013010_cachingdemo_db.itv013010_orders1")

In [13]:
spark.sql("select count(*) from itv013010_cachingdemo_db.itv013010_orders1")

count(1)
25831126


## How to see the no of Paritions of our table on storage[hadoop]?

In [14]:
!hadoop fs -du -h /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1

0       0        /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/_SUCCESS
31      93       /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-6aace557-58ad-4a8d-a23c-8cccbd8b5d9c-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00001-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00002-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00003-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00004-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv

In [15]:
spark.sql("select order_status, count(*) from itv013010_cachingdemo_db.itv013010_orders1 group by order_status").show()

+---------------+--------+
|   order_status|count(1)|
+---------------+--------+
|PENDING_PAYMENT| 5636250|
|       COMPLETE| 8587125|
|        ON_HOLD| 1424250|
| PAYMENT_REVIEW|  273375|
|         BOOKED|       1|
|     PROCESSING| 3103125|
|         CLOSED| 2833500|
|SUSPECTED_FRAUD|  584250|
|        PENDING| 2853750|
|       CANCELED|  535500|
+---------------+--------+



In [16]:
spark.sql("insert into itv013010_cachingdemo_db.itv013010_orders1 values(11111, '2023-05-29', 102455, 'BOOKED')")

In [17]:
!hadoop fs -du -h /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1

0       0        /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/_SUCCESS
31      93       /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-2bd4f886-6c9f-4705-8845-6ef86fc12f4c-c000.csv
31      93       /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-6aace557-58ad-4a8d-a23c-8cccbd8b5d9c-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00001-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00002-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00003-8101ec11-bb45-4666-a9c8-01b563ea6baf-c000.csv
95.7 M  287.0 M  /user/itv

In [18]:
! hadoop fs -cat /user/itv013010/warehouse/itv013010_cachingdemo_db.db/itv013010_orders1/part-00000-6aace557-58ad-4a8d-a23c-8cccbd8b5d9c-c000.csv

11111,2023-05-29,102455,BOOKED


In [19]:
spark.sql("select count(distinct(order_status)) from itv013010_cachingdemo_db.itv013010_orders1").show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                          10|
+----------------------------+



### How to uncache everything in a single command?

In [20]:
spark.sql("clear cache")