In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port','0'). \
config('spark.sql.warehouse.dir',f'/user/{username}/warehouse'). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark.sql("use itv013010_caching_ext")

In [3]:
spark.sql("drop table itv013010_orders_ext")

In [4]:
spark.sql("drop database itv013010_caching_ext")

### Creating Database for External table

In [5]:
spark.sql("create database itv013010_caching_ext")

# Creating External table using:
### spark.sql("create table db_name.tb_name(schema) using csv location'path of csv file'")

In [6]:
spark.sql("create table itv013010_caching_ext.itv013010_orders_ext(order_id long, order_date string, customer_id long, order_status string)using csv location '/user/itv013010/orders_demo/'")

### checking if external table has loaded

In [7]:
spark.sql("select * from itv013010_caching_ext.itv013010_orders_ext").show()

+--------+--------------------+-----------+---------------+
|order_id|          order_date|customer_id|   order_status|
+--------+--------------------+-----------+---------------+
|       1|2013-07-25 00:00:...|      11599|         CLOSED|
|       2|2013-07-25 00:00:...|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:...|      12111|       COMPLETE|
|       4|2013-07-25 00:00:...|       8827|         CLOSED|
|       5|2013-07-25 00:00:...|      11318|       COMPLETE|
|       6|2013-07-25 00:00:...|       7130|       COMPLETE|
|       7|2013-07-25 00:00:...|       4530|       COMPLETE|
|       8|2013-07-25 00:00:...|       2911|     PROCESSING|
|       9|2013-07-25 00:00:...|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:...|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:...|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:...|       1837|         CLOSED|
|      13|2013-07-25 00:00:...|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:...|       98

### Performing a count query.

In [8]:
spark.sql("select count(*) from itv013010_caching_ext.itv013010_orders_ext").show()

+--------+
|count(1)|
+--------+
|   68883|
+--------+



### Describing a table to see its type i.e EXTERNAL TABLE or NOT

In [9]:
spark.sql("describe extended itv013010_caching_ext.itv013010_orders_ext").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|            order_id|              bigint|   null|
|          order_date|              string|   null|
|         customer_id|              bigint|   null|
|        order_status|              string|   null|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|            Database|itv013010_caching...|       |
|               Table|itv013010_orders_ext|       |
|               Owner|           itv013010|       |
|        Created Time|Wed Jul 17 03:43:...|       |
|         Last Access|Wed Dec 31 19:00:...|       |
|          Created By|         Spark 2.4.7|       |
|                Type|            EXTERNAL|       |
|            Provider|                 csv|       |
|    Table Properties|[transient_lastDd...|       |
|            Location|hdfs://m01.itvers...|       |
|       Serd

In [10]:
spark.sql("cache table itv013010_caching_ext.itv013010_orders_ext")

In [11]:
spark.sql("insert into itv013010_caching_ext.itv013010_orders_ext values (111111,'2023-05-29',222222,'BOOKED')")

### Now after adding the new data into the, will it hit the cache or not?
### No it wont at first

In [12]:
spark.sql("select count(*) from itv013010_caching_ext.itv013010_orders_ext").show()

+--------+
|count(1)|
+--------+
|   68884|
+--------+



### In case if you added say file of 20 records into the path, your table data won't be updated instantly.
### To update it we use
## 1) spark.catalog.refreshTable("db_name.tb_name")
## 2) spark.sql("refresh table db_name.tb_name")

In [15]:
spark.sql("refresh table itv013010_caching_ext.itv013010_orders_ext")

In [17]:
spark.sql("select count(*) from itv013010_caching_ext.itv013010_orders_ext").show()

+--------+
|count(1)|
+--------+
|   68909|
+--------+

