this chapter is best to use [databricks community cluster](https://community.cloud.databricks.com) to practise

see `../SparkSQL.sql`  and `../SparkSQL.html`


one can use `spark-sql` CLI to enter `SQL` command directly



- [Table Types in Spark: External or Managed?](http://www.gatorsmile.io/table-types-in-spark-external-or-managed/)

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

spark = SparkSession\
    .builder\
    .appName("chapter-10-data-src")\
    .enableHiveSupport()\
    .getOrCreate()

import os
SPARK_BOOK_DATA_PATH = os.environ['SPARK_BOOK_DATA_PATH']

In [2]:
spark

In [3]:
file_path = SPARK_BOOK_DATA_PATH + "/data/flight-data/json/2015-summary.json"

spark.read.json(file_path)\
  .createOrReplaceTempView("flight_data") # DF => SQL

In [4]:
file_path

'/home/wengong/spark_data//data/flight-data/json/2015-summary.json'

In [5]:
df = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count)
FROM flight_data GROUP BY DEST_COUNTRY_NAME
""")\
  .where("DEST_COUNTRY_NAME like 'S%'")\
  .where("`sum(count)` > 10")
# SQL => DF


# COMMAND ----------

In [6]:
df.show(5)

+--------------------+----------+
|   DEST_COUNTRY_NAME|sum(count)|
+--------------------+----------+
|             Senegal|        40|
|              Sweden|       118|
|               Spain|       420|
|    Saint Barthelemy|        39|
|Saint Kitts and N...|       139|
+--------------------+----------+
only showing top 5 rows



In [7]:
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [37]:
spark.sql("SELECT current_database()").show() 

+------------------+
|current_database()|
+------------------+
|           default|
+------------------+



In [36]:
spark.sql("show tables;").show()

+--------+-------------------+-----------+
|database|          tableName|isTemporary|
+--------+-------------------+-----------+
| default|            flights|      false|
| default|        flights_csv|      false|
| default|flights_from_select|      false|
| default|       hive_flights|      false|
| default|     hive_flights_2|      false|
| default|      just_usa_view|      false|
| default|partitioned_flights|      false|
|        |        flight_data|       true|
+--------+-------------------+-----------+



In [9]:
spark.sql("drop table flights")

DataFrame[]

In [10]:
spark.sql("drop table hive_flights")

DataFrame[]

In [11]:
spark.sql("drop table hive_flights_2")

DataFrame[]

In [13]:
spark.sql("""
CREATE TABLE flights (
  DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)
USING JSON OPTIONS (path '/home/wengong/spark_data//data/flight-data/json/2015-summary.json')
""").show()

++
||
++
++



In [15]:
spark.sql("""
    select * from flights limit 5
""").show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



In [16]:
sql_stmt = f"""CREATE TABLE flights_csv (
  DEST_COUNTRY_NAME STRING,
  ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent",
  count LONG)
USING csv OPTIONS (header true, path '{SPARK_BOOK_DATA_PATH}data/flight-data/csv/2015-summary.csv')
"""

In [17]:
sql_stmt

'CREATE TABLE flights_csv (\n  DEST_COUNTRY_NAME STRING,\n  ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent",\n  count LONG)\nUSING csv OPTIONS (header true, path \'/home/wengong/spark_data/data/flight-data/csv/2015-summary.csv\')\n'

In [18]:
spark.sql(sql_stmt).show()

++
||
++
++



In [19]:
spark.sql("CREATE TABLE if not exists flights_from_select USING parquet AS SELECT * FROM flights")

DataFrame[]

In [30]:
spark.sql("CREATE TABLE if not exists flights_from_select2 USING parquet AS SELECT * FROM flights")

DataFrame[]

In [20]:
spark.sql("""
CREATE TABLE partitioned_flights USING parquet PARTITIONED BY (DEST_COUNTRY_NAME)
AS SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 5
""")

DataFrame[]

In [22]:
spark.sql("describe table partitioned_flights").show()

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
| ORIGIN_COUNTRY_NAME|   string|   null|
|               count|   bigint|   null|
|   DEST_COUNTRY_NAME|   string|   null|
|# Partition Infor...|         |       |
|          # col_name|data_type|comment|
|   DEST_COUNTRY_NAME|   string|   null|
+--------------------+---------+-------+



In [23]:
spark.sql("describe table flights").show()

+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|  DEST_COUNTRY_NAME|   string|   null|
|ORIGIN_COUNTRY_NAME|   string|   null|
|              count|   bigint|   null|
+-------------------+---------+-------+



In [25]:
spark.sql("select * from partitioned_flights").show()

+-------------------+-----+-----------------+
|ORIGIN_COUNTRY_NAME|count|DEST_COUNTRY_NAME|
+-------------------+-----+-----------------+
|      United States|   15|            Egypt|
|            Romania|   15|    United States|
|            Croatia|    1|    United States|
|            Ireland|  344|    United States|
|              India|   62|    United States|
+-------------------+-----+-----------------+



In [26]:
spark.sql("SHOW PARTITIONS partitioned_flights").show(truncate=False)

+-------------------------------+
|partition                      |
+-------------------------------+
|DEST_COUNTRY_NAME=Egypt        |
|DEST_COUNTRY_NAME=United States|
+-------------------------------+



In [27]:
sql_stmt=f"""
CREATE EXTERNAL TABLE hive_flights (
  DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{SPARK_BOOK_DATA_PATH}data/flight-data-hive/'
"""
sql_stmt

"\nCREATE EXTERNAL TABLE hive_flights (\n  DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)\nROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/home/wengong/spark_data/data/flight-data-hive/'\n"

In [28]:
spark.sql(sql_stmt)

DataFrame[]

In [29]:
df = spark.sql("select * from hive_flights")
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [30]:
spark.sql("DESCRIBE TABLE hive_flights").show()

+-------------------+---------+-------+
|           col_name|data_type|comment|
+-------------------+---------+-------+
|  DEST_COUNTRY_NAME|   string|   null|
|ORIGIN_COUNTRY_NAME|   string|   null|
|              count|   bigint|   null|
+-------------------+---------+-------+



In [31]:
sql_stmt = f"""
CREATE EXTERNAL TABLE hive_flights_2
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '{SPARK_BOOK_DATA_PATH}/data/flight-data-hive/' AS SELECT * FROM flights
"""
spark.sql(sql_stmt)

DataFrame[]

In [32]:
df = spark.sql("select * from hive_flights_2 limit 5")
df.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



In [33]:
spark.sql("select * from flights_from_select").show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [34]:
spark.sql("REFRESH table partitioned_flights")

DataFrame[]

In [None]:
-- COMMAND ----------

MSCK REPAIR TABLE partitioned_flights


-- COMMAND ----------

DROP TABLE flights_csv;


-- COMMAND ----------

DROP TABLE IF EXISTS flights_csv;


-- COMMAND ----------

CACHE TABLE flights


-- COMMAND ----------

UNCACHE TABLE FLIGHTS


-- COMMAND ----------

In [35]:
spark.sql("""CREATE VIEW just_usa_view AS 
  SELECT * FROM flights WHERE dest_country_name = 'United States'
""")
df = spark.sql("select * from just_usa_view limit 5")
df.show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|    United States|              India|   62|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+



In [None]:
-- COMMAND ----------

CREATE TEMP VIEW just_usa_view_temp AS
  SELECT * FROM flights WHERE dest_country_name = 'United States'


-- COMMAND ----------

CREATE GLOBAL TEMP VIEW just_usa_global_view_temp AS
  SELECT * FROM flights WHERE dest_country_name = 'United States'


-- COMMAND ----------

SHOW TABLES


-- COMMAND ----------

CREATE OR REPLACE TEMP VIEW just_usa_view_temp AS
  SELECT * FROM flights WHERE dest_country_name = 'United States'


-- COMMAND ----------

SELECT * FROM just_usa_view_temp


-- COMMAND ----------

EXPLAIN SELECT * FROM just_usa_view


-- COMMAND ----------

EXPLAIN SELECT * FROM flights WHERE dest_country_name = 'United States'


-- COMMAND ----------

DROP VIEW IF EXISTS just_usa_view;


-- COMMAND ----------

SHOW DATABASES


-- COMMAND ----------

CREATE DATABASE some_db


-- COMMAND ----------

USE some_db


-- COMMAND ----------

SHOW tables

SELECT * FROM flights 
-- fails with table/view not found


-- COMMAND ----------

SELECT * FROM default.flights


-- COMMAND ----------




-- COMMAND ----------

USE default;


-- COMMAND ----------

DROP DATABASE IF EXISTS some_db;


-- COMMAND ----------

In [None]:
-- COMMAND ----------

SELECT
  CASE WHEN DEST_COUNTRY_NAME = 'UNITED STATES' THEN 1
       WHEN DEST_COUNTRY_NAME = 'Egypt' THEN 0
       ELSE -1 END
FROM partitioned_flights


-- COMMAND ----------

CREATE VIEW IF NOT EXISTS nested_data AS
  SELECT (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) as country, count FROM flights


-- COMMAND ----------

SELECT * FROM nested_data


-- COMMAND ----------

SELECT country.DEST_COUNTRY_NAME, count FROM nested_data


-- COMMAND ----------

SELECT country.*, count FROM nested_data


-- COMMAND ----------

SELECT DEST_COUNTRY_NAME as new_name, collect_list(count) as flight_counts,
  collect_set(ORIGIN_COUNTRY_NAME) as origin_set
FROM flights GROUP BY DEST_COUNTRY_NAME


-- COMMAND ----------

SELECT DEST_COUNTRY_NAME, ARRAY(1, 2, 3) FROM flights


-- COMMAND ----------

SELECT DEST_COUNTRY_NAME as new_name, collect_list(count)[0]
FROM flights GROUP BY DEST_COUNTRY_NAME


-- COMMAND ----------

CREATE OR REPLACE TEMP VIEW flights_agg AS
  SELECT DEST_COUNTRY_NAME, collect_list(count) as collected_counts
  FROM flights GROUP BY DEST_COUNTRY_NAME


-- COMMAND ----------

SELECT explode(collected_counts), DEST_COUNTRY_NAME FROM flights_agg


-- COMMAND ----------

SHOW FUNCTIONS


-- COMMAND ----------

SHOW SYSTEM FUNCTIONS


-- COMMAND ----------

SHOW USER FUNCTIONS


-- COMMAND ----------

SHOW FUNCTIONS "s*";


-- COMMAND ----------

SHOW FUNCTIONS LIKE "collect*";


-- COMMAND ----------

SELECT count, power3(count) FROM flights


-- COMMAND ----------

SELECT dest_country_name FROM flights
GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5


-- COMMAND ----------

SELECT * FROM flights
WHERE origin_country_name IN (SELECT dest_country_name FROM flights
      GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5)


-- COMMAND ----------

SELECT * FROM flights f1
WHERE EXISTS (SELECT 1 FROM flights f2
            WHERE f1.dest_country_name = f2.origin_country_name)
AND EXISTS (SELECT 1 FROM flights f2
            WHERE f2.dest_country_name = f1.origin_country_name)


-- COMMAND ----------

SELECT *, (SELECT max(count) FROM flights) AS maximum FROM flights