In [1]:
%%writefile people.json
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

Writing people.json


In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get (or reuse) the SparkSession named "people", configuring the builder one
# option at a time. The driver host and its bind address are both pinned to
# the loopback address.
session_builder = SparkSession.builder.appName("people")
session_builder = session_builder.config("spark.driver.host", "127.0.0.1")
session_builder = session_builder.config("spark.driver.bindAddress", "127.0.0.1")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/07 15:10:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load people.json (written by the first cell, one JSON object per line) into
# a DataFrame. The first record has no "age" key; it shows up as null when the
# frame is displayed further down.
df = spark.read.json("./people.json")

In [11]:
# Make `default` the current database for the SQL statements that follow.
# The statement returns an empty DataFrame, which is what the cell displays.
spark.sql("use default")

DataFrame[]

In [16]:
# Drop the table that the `saveAsTable` cell writes, so that cell starts from
# a clean slate. The name has to match the table the notebook actually creates
# and queries ("my_people1"); the previous value "my_people" referred to a
# table that is never created here, so the drop was a no-op.
table_name = "my_people1"
spark.sql("DROP TABLE IF EXISTS {}".format(table_name))

DataFrame[]

In [17]:
# Persist `df` as the table `my_people1` in the current database.
# The writer's default save mode raises an error when the table already
# exists, which made this cell fail the second time it was run. "overwrite"
# replaces the existing table instead, so the cell can be re-executed safely.
df.write.mode("overwrite").saveAsTable("my_people1")

                                                                                

In [19]:
# List the databases (shown under the `namespace` column) known to this session.
spark.sql("show databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [23]:
# List the tables in the current database, with a flag telling whether each
# entry is temporary.
spark.sql("show tables").show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|my_people1|      false|
+---------+----------+-----------+



In [20]:
# Same listing, but naming the database explicitly with FROM instead of
# relying on the current database.
spark.sql("show tables from default ").show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|my_people1|      false|
+---------+----------+-----------+



In [21]:
# List all tables and views through the catalog API instead of SQL.
# The result is kept in `tables` so the next cell can display it.
tables = spark.catalog.listTables()

In [22]:
# Display the catalog listing: a Python list of Table records
# (name, catalog, namespace, description, tableType, isTemporary).
tables

[Table(name='my_people1', catalog='spark_catalog', namespace=['default'], description=None, tableType='MANAGED', isTemporary=False)]

In [18]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [28]:
# Describe the saved table: one row per column with its name, data type and comment.
spark.sql("desc my_people1").show()

+--------+---------+-------+
|col_name|data_type|comment|
+--------+---------+-------+
|     age|   bigint|   null|
|    name|   string|   null|
+--------+---------+-------+



In [33]:
# `spark` and `df` come from the earlier cells of this notebook.
# Print the DataFrame's schema as a tree: each column with its type and nullability.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [29]:
# Query the saved table with SQL, projecting only the "name" column.
spark.sql("select name from my_people1").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [8]:
# Select only the "name" column, this time through the DataFrame API
# (the column is referenced by its name as a string).
df.select("name").show()

+-------+
|   name|
+-------+
|Michael|
|   Andy|
| Justin|
+-------+



In [9]:
# Select everybody, but increment the age by 1.
# Columns are referenced as df['col'] so they can be used in an expression;
# the derived column is displayed as "(age + 1)" and a null age stays null.
df.select(df['name'], df['age'] + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [10]:
# Select people older than 21, with the predicate written as a column expression.
# Rows whose age is null are not part of the result.
df.filter(df['age'] > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [13]:
# Same filter (people older than 21), with the predicate given as a SQL
# expression string instead of a column expression.
df.filter("age > 21").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [12]:
# Select people older than 21 using `where` with a column expression;
# it returns the same rows as the `filter` cells.
df.where(df['age'] > 21).show()


+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [14]:
# `where` with the predicate given as a SQL expression string.
df.where("age > 21").show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [30]:
# Count people by age: one output row per distinct age value.
# The null age forms its own group.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [31]:
# Register the DataFrame as a SQL temporary view named "people" (replacing any
# existing view of that name), so it can be queried with SQL.
df.createOrReplaceTempView("people")

# Read everything back through the view and print it.
sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [32]:
# List tables again: the temporary view "people" now appears next to the saved
# table, with isTemporary = true and an empty namespace.
spark.sql("show tables").show()

+---------+----------+-----------+
|namespace| tableName|isTemporary|
+---------+----------+-----------+
|  default|my_people1|      false|
|         |    people|       true|
+---------+----------+-----------+



In [38]:
# Stop the SparkSession created at the top of the notebook.
spark.stop()