In [16]:
from pyspark.sql import SparkSession

spark = SparkSession\
.builder\
.master("yarn")\
.appName("caching demo-3")\
.enableHiveSupport()\
.config("spark.sql.warehouse.dir","/user/itv009490/warehouse")\
.getOrCreate()

spark

In [2]:
orders_schema = 'order_id long, order_date date, customer_id long, order_status string'

In [3]:
df = spark.read.format("csv").\
schema(orders_schema).\
load("/public/trendytech/orders/orders_1gb.csv")

In [4]:
df.show()

+--------+----------+-----------+---------------+
|order_id|order_date|customer_id|   order_status|
+--------+----------+-----------+---------------+
|       1|2013-07-25|      11599|         CLOSED|
|       2|2013-07-25|        256|PENDING_PAYMENT|
|       3|2013-07-25|      12111|       COMPLETE|
|       4|2013-07-25|       8827|         CLOSED|
|       5|2013-07-25|      11318|       COMPLETE|
|       6|2013-07-25|       7130|       COMPLETE|
|       7|2013-07-25|       4530|       COMPLETE|
|       8|2013-07-25|       2911|     PROCESSING|
|       9|2013-07-25|       5657|PENDING_PAYMENT|
|      10|2013-07-25|       5648|PENDING_PAYMENT|
|      11|2013-07-25|        918| PAYMENT_REVIEW|
|      12|2013-07-25|       1837|         CLOSED|
|      13|2013-07-25|       9149|PENDING_PAYMENT|
|      14|2013-07-25|       9842|     PROCESSING|
|      15|2013-07-25|       2568|       COMPLETE|
|      16|2013-07-25|       7276|PENDING_PAYMENT|
|      17|2013-07-25|       2667|       COMPLETE|


In [12]:
spark.sql("""create database itv009490_cachingdemo_db""")

In [13]:
df.write.format("csv").saveAsTable("itv009490_cachingdemo_db.itv009490_orders1")

In [5]:
spark.sql("desc extended  itv009490_cachingdemo_db.itv009490_orders1 ").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|              bigint|   null|
|          order_date|                date|   null|
|         customer_id|              bigint|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|itv009490_caching...|       |
|               Table|   itv009490_orders1|       |
|               Owner|           itv009490|       |
|        Created Time|Fri Nov 24 13:57:...|       |
|         Last Access|Wed Dec 31 19:00:...|       |
|          Created By|         Spark 2.4.7|       |
|                Type|             MANAGED|       |
|            Provider|                 csv|       |
|    Table Properties|[transient_lastDd...|       |
|          Statistics|     840836655 bytes|       |
|           

In [18]:
spark.sql("""select count(*) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+--------+
|count(1)|
+--------+
|25831125|
+--------+



In [6]:
spark.sql("cache table itv009490_cachingdemo_db.itv009490_orders1")

In [21]:
spark.sql("""select count(*) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+--------+
|count(1)|
+--------+
|25831125|
+--------+



In [22]:
spark.sql("""select distinct order_status from itv009490_cachingdemo_db.itv009490_orders1""").show()

+---------------+
|   order_status|
+---------------+
|PENDING_PAYMENT|
|       COMPLETE|
|        ON_HOLD|
| PAYMENT_REVIEW|
|     PROCESSING|
|         CLOSED|
|SUSPECTED_FRAUD|
|        PENDING|
|       CANCELED|
+---------------+



In [26]:
spark.sql("""select count(distinct order_status) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                           9|
+----------------------------+



In [25]:
spark.sql("""uncache table itv009490_cachingdemo_db.itv009490_orders1""")

In [27]:
spark.sql("cache lazy table itv009490_cachingdemo_db.itv009490_orders1")

In [28]:
spark.sql("""select count(distinct order_status) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                           9|
+----------------------------+



In [29]:
spark.sql("""select count(*) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+--------+
|count(1)|
+--------+
|25831125|
+--------+



In [33]:
spark.sql("""select count(distinct order_status) from itv009490_cachingdemo_db.itv009490_orders1""").show()

+----------------------------+
|count(DISTINCT order_status)|
+----------------------------+
|                          10|
+----------------------------+



In [31]:
spark.sql("""insert into itv009490_cachingdemo_db.itv009490_orders1 values (11111,'2023-01-31',22222,'Booked') """).show()

++
||
++
++



In [34]:
spark.sql("clear cache")

In [7]:
spark.catalog.currentDatabase()

'default'

In [8]:
spark.sql("""use itv009490_cachingdemo_db """)

In [9]:
spark.catalog.currentDatabase()

'itv009490_cachingdemo_db'

In [10]:
spark.catalog.isCached("itv009490_cachingdemo_db.itv009490_orders1")

True

In [11]:
spark.sql("""create database itv009490_caching_demo_ext""")

In [None]:
spark.sql("""create table itv009490_caching_demo_ext.itv009490_orders_ext (order_id long, order_date string,
customer_id long, order_status string)
using csv "/public/trendytech/orders/"
""")

In [15]:
spark.stop()

In [17]:
orders_schema = 'order_id long, order_date date, customer_id long, order_status string'

In [18]:
df = spark.read.format("csv").\
schema(orders_schema).\
load("/public/trendytech/orders/orders_1gb.csv")

In [19]:
df.write.saveAsTable("itv009490_cachingdemo_db.itv009490_orders2")

In [24]:
spark.sql("""select count(*) from itv009490_cachingdemo_db.itv009490_orders2""").show()

+--------+
|count(1)|
+--------+
|25831125|
+--------+



In [27]:
spark.sql("""clear cache""")

In [25]:
spark.sql("""cache table itv009490_cachingdemo_db.itv009490_orders2""")

In [28]:
spark.sql("""select count(*) from itv009490_cachingdemo_db.itv009490_orders2""").show()

+--------+
|count(1)|
+--------+
|25831125|
+--------+

