In [None]:
%run "./Setup.ipynb"

In [None]:
# Location of the users JSON file that is loaded into df1 below.
# NOTE(review): absolute, machine-specific Windows path — adjust it (or make it
# configurable) before running the notebook elsewhere.
inputPath = "E:\\PySpark\\data\\users.json"

In [None]:
from pyspark.sql.functions import *

In [None]:
# Load the JSON file at inputPath into the DataFrame df1 and print its first rows.
df1 = spark.read.json(inputPath)
df1.show()

In [None]:
# Register df1 as the temporary view "users" so it can be queried with spark.sql().
df1.createOrReplaceTempView("users")

In [None]:
# List the tables/views the catalog reports for this session.
spark.catalog.listTables()

In [None]:
# Count the users per non-null age from the "users" view, order by that count
# (ascending, the SQL default) and keep only the first five rows.
qry = """select age, count(1) as count
        from users
        where age is not null
        group by age
        order by count
        limit 5"""

# Run the query on the current session and display the result.
df2 = spark.sql(qry)

df2.show()

#### Global temporary views

In [None]:
# Register df1 as the global temporary view "gusers"; the queries below reference it
# through the global_temp database as global_temp.gusers.
df1.createOrReplaceGlobalTempView("gusers")

In [None]:
# List what the catalog reports for the "global_temp" database.
spark.catalog.listTables("global_temp")

In [None]:
# Show which database is currently selected for this session.
spark.catalog.currentDatabase()

In [None]:
# Same per-age count as before, but reading from the global temporary view
# global_temp.gusers instead of the session view "users".
qry2 = """select age, count(1) as count
        from global_temp.gusers
        where age is not null
        group by age
        order by count
        limit 5"""

# Run the query on the current session and display the result.
df3 = spark.sql(qry2)

df3.show()

In [None]:
# Open a second session derived from `spark`.
spark2 = spark.newSession()

# Per-age count over the global temporary view (same text as the previous cell).
qry2 = """select age, count(1) as count
        from global_temp.gusers
        where age is not null
        group by age
        order by count
        limit 5"""

# Execute the query through the new session: global_temp.gusers is expected to be
# reachable from it even though the view was registered through `spark`.
df3 = spark2.sql(qry2)

df3.show()

#### Create a new session

In [None]:
# Create a new session from `spark` and bind it to spark2
# (this rebinds the name if an earlier cell already created one).
spark2 = spark.newSession()

In [None]:
# Tables/views visible from the original session.
spark.catalog.listTables()

In [None]:
# Tables/views visible from the new session spark2, for comparison with `spark`.
spark2.catalog.listTables()

In [None]:
# Temporary views are scoped to the session that registered them: "users" was
# created through `spark`, so it is not visible from the separate session `spark2`
# and this query is expected to fail. The failure is the point of the cell, so it is
# caught and displayed instead of aborting a "Run All".
from pyspark.sql.utils import AnalysisException

qry = """select age, count(1) as count
        from users
        where age is not null
        group by age
        order by count
        limit 5"""

try:
    # spark.sql() resolves the view name eagerly, so the error surfaces here.
    df2 = spark2.sql(qry)
    df2.show()
except AnalysisException as err:
    print(f"Expected failure in the new session: {err}")

In [None]:
# Show the SQL text held in qry2 (the query against global_temp.gusers) before re-running it.
print(qry2)

In [None]:
# Run the global-temp-view query through the second session and display the result.
df3 = spark2.sql(qry2)

df3.show()

#### Saving to tables

In [None]:
# Show the value configured for spark.sql.warehouse.dir in this session.
spark.conf.get("spark.sql.warehouse.dir")

In [None]:
# Disabled example of pointing the warehouse at another directory.
# NOTE(review): spark.sql.warehouse.dir is presumably a static setting that has to be
# supplied when the session is built rather than changed at runtime — confirm before
# re-enabling this line.
#spark.conf.set("spark.sql.warehouse.dir", "E:\\PySpark\\Warehouse\\spark-warehouse")

In [None]:
# Show which database saveAsTable() calls without a qualifier will target.
spark.catalog.currentDatabase()

In [None]:
# Describe the "default" database; truncate=False keeps long values fully visible.
spark.sql("desc database default").show(truncate=False)

In [None]:
# Keep only the rows of df1 with a non-null age and gender 'Male', then display them.
df2 = df1.where("age is not null and gender = 'Male'")
df2.show()

In [None]:
# Persist df2 as the table "users_json", stored as JSON files.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
df2.write.format("json").mode("overwrite").saveAsTable("users_json")

In [None]:
# List the catalog entries again to check that the table just saved shows up.
spark.catalog.listTables()

In [None]:
# Query the saved table by name: rows of users_json with age greater than 20.
spark.sql("select * from users_json where age > 20").show()

In [None]:
# Persist df2 as the table "users_csv", stored as CSV files with a header row.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
(
    df2.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .saveAsTable("users_csv")
)

In [None]:
# Age threshold used by the query below.
age = 20
# The threshold is interpolated into the SQL text with an f-string. That is fine for
# this fixed literal, but must not be done with untrusted input.
spark.sql(f"select * from users_csv where age > {age}").show()

#### External Tables

In [None]:
# Save df2 as the table "users_parquet" in Parquet format, writing the data files to
# the explicit location given by the "path" option.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
(
    df2.write
    .format("parquet")
    .option("path", "E:\\PySpark\\external\\users_parquet")
    .mode("overwrite")
    .saveAsTable("users_parquet")
)

In [None]:
# List the catalog entries again; the tableType field shows how each table is registered.
spark.catalog.listTables()

In [None]:
# Age threshold used by the query below (rebinds the notebook-level name `age`).
age = 20
# The threshold is interpolated into the SQL text with an f-string. That is fine for
# this fixed literal, but must not be done with untrusted input.
spark.sql(f"select * from users_parquet where age > {age}").show()

In [None]:
# Keep only the rows of df1 with a non-null age and gender 'Female', then display them.
df3 = df1.where("age is not null and gender = 'Female'")
df3.show()

#### Appending data to an existing table

In [None]:
# Add the rows of df3 to the existing "users_json" table, keeping the JSON format.
# Note: every execution of this cell appends the same rows again.
(
    df3.write
    .format("json")
    .mode("append")
    .saveAsTable("users_json")
)

In [None]:
# Display users_json to check the rows that were appended.
spark.sql("select * from users_json").show()

#### Dropping managed and external tables

In [49]:
# List the tables of the current database to see what the drop statements leave behind.
spark.catalog.listTables()

[]

In [None]:
# Drop the users_csv table from the default database.
# IF EXISTS makes the cell safe to re-run once the table is already gone.
spark.sql("drop table if exists default.users_csv")

In [46]:
# Drop the users_json table from the default database.
# IF EXISTS makes the cell safe to re-run once the table is already gone.
spark.sql("drop table if exists default.users_json")

++
||
++
++



In [48]:
# Drop the users_parquet table (created with an explicit path) from the default database.
# IF EXISTS makes the cell safe to re-run once the table is already gone.
spark.sql("drop table if exists default.users_parquet")

DataFrame[]

#### Create a schema/database

In [50]:
# Create the demodb database used by the following cells.
# IF NOT EXISTS makes the cell safe to re-run when the database is already present.
spark.sql("create database if not exists demodb")

DataFrame[]

In [53]:
# Describe demodb: show up to 10 rows without truncating long values such as the location.
spark.sql("describe database demodb").show(n=10, truncate=False)

+-------------------------+----------------------------------------------------------------------------------+
|database_description_item|database_description_value                                                        |
+-------------------------+----------------------------------------------------------------------------------+
|Database Name            |demodb                                                                            |
|Description              |                                                                                  |
|Location                 |file:/E:/PySpark/jupyter-notebooks/july-2025-3/spark-sql/spark-warehouse/demodb.db|
|Owner Name               |                                                                                  |
|Owner Type               |                                                                                  |
+-------------------------+----------------------------------------------------------------------------------+



In [54]:
# Make demodb the current database so that unqualified table names resolve to it.
spark.sql("use demodb")

DataFrame[]

In [55]:
# Confirm that the current database is now demodb.
spark.catalog.currentDatabase()

'demodb'

In [56]:
# Save df2 as the CSV table "users_csv" (with a header row) in the current database.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
(
    df2.write
    .format("csv")
    .option("header", True)
    .mode("overwrite")
    .saveAsTable("users_csv")
)

In [59]:
# List the tables of the current database; each entry reports its database and tableType.
spark.catalog.listTables()

[Table(name='users_csv', database='demodb', description=None, tableType='MANAGED', isTemporary=False),
 Table(name='users_json', database='demodb', description=None, tableType='MANAGED', isTemporary=False)]

In [58]:
# Save df2 as the JSON table "users_json" in the current database.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
df2.write.format("json").mode("overwrite").saveAsTable("users_json")

In [61]:
# Display the contents of users_json as resolved in the current database.
spark.sql("select * from users_json").show()

+---+------+-------+----------+------+
|age|gender|   name|     phone|userid|
+---+------+-------+----------+------+
| 25|  Male|  Satya|8501099876|     1|
| 25|  Male|  Rahim|      null|     3|
| 13|  Male| Sundar|8522233456|     6|
| 28|  Male|  Steve|8501085009|     7|
| 13|  Male| Samuel|8522235624|    11|
| 28|  Male|  Raghu|8501082222|    12|
| 13|  Male|Murugan|8522209563|    16|
| 31|  Male| Mukesh|8597638421|    17|
+---+------+-------+----------+------+



In [60]:
# Select the rows of df1 with gender 'Female' (no age filter here) and display them.
df3 = df1.where("gender = 'Female'")
df3.show()

+---+------+------+----------+------+
|age|gender|  name|     phone|userid|
+---+------+------+----------+------+
| 28|Female|Sandra|8508899001|     4|
| 15|Female|Keerti|      null|     5|
| 55|Female|Smriti|9246655498|     8|
| 36|Female| Veena|8508888888|     9|
| 15|Female| Kriti|      null|    10|
| 55|Female| Ramya|9246654658|    13|
| 31|Female|Hasina|8345213235|    14|
| 37|Female| Lasya|      null|    15|
+---+------+------+----------+------+



In [62]:
# Write df3 as Parquet files to the output directory.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the path already exists.
df3.write.mode("overwrite").parquet("E:\\PySpark\\output\\parquet")

In [65]:
# Read the Parquet output back and append its rows to the "users_json" table as JSON.
# Note: every execution of this cell appends the same rows again.
(
    spark.read.parquet("E:\\PySpark\\output\\parquet")
    .write
    .format("json")
    .mode("append")
    .saveAsTable("users_json")
)


In [66]:
# Display users_json to check the rows appended from the Parquet data.
spark.sql("select * from users_json").show()

+---+------+-------+----------+------+
|age|gender|   name|     phone|userid|
+---+------+-------+----------+------+
| 25|  Male|  Satya|8501099876|     1|
| 25|  Male|  Rahim|      null|     3|
| 13|  Male| Sundar|8522233456|     6|
| 28|  Male|  Steve|8501085009|     7|
| 13|  Male| Samuel|8522235624|    11|
| 28|  Male|  Raghu|8501082222|    12|
| 13|  Male|Murugan|8522209563|    16|
| 31|  Male| Mukesh|8597638421|    17|
| 28|Female| Sandra|8508899001|     4|
| 15|Female| Keerti|      null|     5|
| 55|Female| Smriti|9246655498|     8|
| 36|Female|  Veena|8508888888|     9|
| 15|Female|  Kriti|      null|    10|
| 55|Female|  Ramya|9246654658|    13|
| 31|Female| Hasina|8345213235|    14|
| 37|Female|  Lasya|      null|    15|
+---+------+-------+----------+------+



In [70]:
# Read the Parquet files directly from the location that was given as the "path"
# option when the users_parquet table was saved, bypassing the catalog.
spark.read.parquet("E:\\PySpark\\external\\users_parquet").show()

+---+------+-------+----------+------+
|age|gender|   name|     phone|userid|
+---+------+-------+----------+------+
| 25|  Male|  Satya|8501099876|     1|
| 25|  Male|  Rahim|      null|     3|
| 13|  Male| Sundar|8522233456|     6|
| 28|  Male|  Steve|8501085009|     7|
| 13|  Male| Samuel|8522235624|    11|
| 28|  Male|  Raghu|8501082222|    12|
| 13|  Male|Murugan|8522209563|    16|
| 31|  Male| Mukesh|8597638421|    17|
+---+------+-------+----------+------+



#### Partitioned Tables

In [74]:
# Display the source DataFrame that is written as a partitioned table next.
df1.show()

+----+------+-------+----------+------+
| age|gender|   name|     phone|userid|
+----+------+-------+----------+------+
|  25|  Male|  Satya|8501099876|     1|
|null|  Male|   null|5676599876|     2|
|  25|  Male|  Rahim|      null|     3|
|  28|Female| Sandra|8508899001|     4|
|  15|Female| Keerti|      null|     5|
|  13|  Male| Sundar|8522233456|     6|
|  28|  Male|  Steve|8501085009|     7|
|  55|Female| Smriti|9246655498|     8|
|  36|Female|  Veena|8508888888|     9|
|  15|Female|  Kriti|      null|    10|
|  13|  Male| Samuel|8522235624|    11|
|  28|  Male|  Raghu|8501082222|    12|
|  55|Female|  Ramya|9246654658|    13|
|  31|Female| Hasina|8345213235|    14|
|  37|Female|  Lasya|      null|    15|
|  13|  Male|Murugan|8522209563|    16|
|  31|  Male| Mukesh|8597638421|    17|
+----+------+-------+----------+------+



In [75]:
# Save df1 as the CSV table "users_partitioned" (with a header row), partitioned by
# the gender column.
# mode("overwrite") keeps this cell re-runnable: with the default save mode
# (error if exists) a second run fails because the table already exists.
(
    df1.write
    .format("csv")
    .option("header", True)
    .partitionBy("gender")
    .mode("overwrite")
    .saveAsTable("users_partitioned")
)

In [77]:
# Select the rows whose partition column gender equals 'Male' and display them.
spark.sql("select * from users_partitioned where gender = 'Male'").show()

+----+-------+----------+------+------+
| age|   name|     phone|userid|gender|
+----+-------+----------+------+------+
|  25|  Satya|8501099876|     1|  Male|
|null|   null|5676599876|     2|  Male|
|  25|  Rahim|      null|     3|  Male|
|  13| Sundar|8522233456|     6|  Male|
|  28|  Steve|8501085009|     7|  Male|
|  13| Samuel|8522235624|    11|  Male|
|  28|  Raghu|8501082222|    12|  Male|
|  13|Murugan|8522209563|    16|  Male|
|  31| Mukesh|8597638421|    17|  Male|
+----+-------+----------+------+------+



In [78]:
# Same partition-filtered query, this time keeping the result in df4 before displaying it.
df4 = spark.sql("select * from users_partitioned where gender = 'Male'")
df4.show()

+----+-------+----------+------+------+
| age|   name|     phone|userid|gender|
+----+-------+----------+------+------+
|  25|  Satya|8501099876|     1|  Male|
|null|   null|5676599876|     2|  Male|
|  25|  Rahim|      null|     3|  Male|
|  13| Sundar|8522233456|     6|  Male|
|  28|  Steve|8501085009|     7|  Male|
|  13| Samuel|8522235624|    11|  Male|
|  28|  Raghu|8501082222|    12|  Male|
|  13|Murugan|8522209563|    16|  Male|
|  31| Mukesh|8597638421|    17|  Male|
+----+-------+----------+------+------+

