In [1]:
%scala
// Smoke test: run a trivial SQL expression through the session and collect the result.
// The next cell repeats the same call without a magic line, so it runs in the
// notebook's default language rather than explicitly in Scala.
spark.sql("SELECT 1 + 1").collect()

In [2]:
spark.sql("SELECT 1 + 1").collect()

In [3]:
%scala
// Multi-line SQL passed as a triple-quoted string: selects professors whose
// department name appears among the departments created on or after 2016-01-01.
// NOTE(review): neither `professors` nor `department` is created in the visible
// part of this notebook — confirm they exist before running this cell.
spark.sql("""
SELECT
user_id,
department,
first_name
FROM
professors
WHERE
department IN
(SELECT name
FROM department
WHERE created_date >= '2016-01-01')
""")

In [4]:
%sql
-- List what is currently registered in the active database before cleaning up.
SHOW TABLES


In [5]:
%scala
// Reset step: drop every entry reported by SHOW TABLES so the CREATE statements
// in the following cells start from a clean slate.
// NOTE(review): `.as[Table]` relies on SHOW TABLES returning columns named
// database / tableName / isTemporary — confirm for the Spark version in use.
case class Table(database:String, tableName:String, isTemporary:Boolean)
// foreach (not map): the loop exists only for its side effects, so no result
// collection needs to be built.
spark.sql("SHOW TABLES").as[Table].collect().foreach { t =>
  // Backtick-quote the identifier (doubling any embedded backtick) so names
  // containing spaces, dashes or reserved words still parse.
  val quotedName = "`" + t.tableName.replace("`", "``") + "`"
  try {
    spark.sql(s"DROP TABLE IF EXISTS $quotedName").collect()
  } catch {
    // Presumably raised when the entry is a view rather than a table; retry as a view.
    case _: org.apache.spark.sql.AnalysisException =>
      spark.sql(s"DROP VIEW IF EXISTS $quotedName").collect()
  }
}

In [6]:
%sql
-- Table backed by the JSON flight summary file, with an explicitly declared schema.
CREATE TABLE flights (
  DEST_COUNTRY_NAME STRING,
  ORIGIN_COUNTRY_NAME STRING,
  count LONG
)
USING JSON
OPTIONS (
  path '/FileStore/tables/8su3wraj1497632771405/2015_summary-ebaee.json'
)

In [7]:
%sql
-- CSV-backed table with the same three columns; the first line of the file is
-- treated as a header, and one column carries a COMMENT.
-- NOTE(review): inferSchema is presumably redundant next to an explicit column list — confirm.
CREATE TABLE flights_csv (
  DEST_COUNTRY_NAME STRING,
  ORIGIN_COUNTRY_NAME STRING COMMENT "remember that the most prevalent will be the US",
  count LONG
)
USING csv
OPTIONS (
  inferSchema true,
  header true,
  path '/FileStore/tables/baae5ww01497633203998/2015_summary-ebaee.csv'
)

In [8]:
%sql
-- CTAS: materialize the full contents of flights into a new table.
CREATE TABLE flights_from_select AS
SELECT *
FROM flights

In [9]:
%sql
-- Guarded CTAS: only runs the SELECT when flights_from_select does not exist yet.
CREATE TABLE IF NOT EXISTS flights_from_select AS
SELECT *
FROM flights
LIMIT 5

In [10]:
%sql
-- Parquet table partitioned by destination country.
CREATE TABLE partitioned_flights
USING parquet
PARTITIONED BY (DEST_COUNTRY_NAME)
AS
SELECT
  DEST_COUNTRY_NAME,
  ORIGIN_COUNTRY_NAME,
  count
FROM flights
LIMIT 5 -- so we don't create a ton of files

In [11]:
%sql
-- Append up to 20 rows from flights to the table created by CTAS.
INSERT INTO flights_from_select
SELECT
  DEST_COUNTRY_NAME,
  ORIGIN_COUNTRY_NAME,
  count
FROM flights
LIMIT 20

In [12]:
%sql
-- Append rows into the static partition DEST_COUNTRY_NAME = 'UNITED STATES'.
--
-- With a static PARTITION clause the SELECT supplies only the non-partition
-- columns, and they are matched by position. partitioned_flights was created
-- from (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count) partitioned by
-- DEST_COUNTRY_NAME, so its remaining columns are (ORIGIN_COUNTRY_NAME, count)
-- and must be selected in that order (the previous order swapped them).
--
-- NOTE(review): the filter uses 'United States', the casing every other filter
-- on flights in this notebook uses; string equality is case-sensitive, so the
-- upper-case literal would presumably match nothing — confirm against the data.
-- LIMIT without ORDER BY picks an arbitrary 12 matching rows.
INSERT INTO partitioned_flights
  PARTITION (DEST_COUNTRY_NAME = 'UNITED STATES')
SELECT
  ORIGIN_COUNTRY_NAME,
  count
FROM flights
WHERE DEST_COUNTRY_NAME = 'United States'
LIMIT 12

In [13]:
%sql
-- Read back the CSV-backed table.
SELECT *
FROM flights_csv

In [14]:
%sql
-- Read back the JSON-backed table.
SELECT *
FROM flights

In [15]:
%sql
-- Show column names, types and comments for flights_csv.
DESCRIBE TABLE flights_csv

In [16]:
%sql
-- List the partitions currently registered for the partitioned table.
SHOW PARTITIONS partitioned_flights

In [17]:
%sql
-- Refresh cached information about the table.
REFRESH TABLE partitioned_flights

In [18]:
%sql
-- Re-sync the partitions recorded for the table.
MSCK REPAIR TABLE partitioned_flights

In [19]:
%sql
-- Unconditional drop: errors if the table is missing.
DROP TABLE flights_csv;

In [20]:
%sql
-- Guarded drop: a no-op when the table is already gone.
DROP TABLE IF EXISTS flights_csv;

In [21]:
%sql
-- Persistent view over flights restricted to one destination.
CREATE VIEW just_usa_view AS
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States'

In [22]:
%sql
-- Second persistent view with the same definition under a different name.
CREATE VIEW just_usa_global AS
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States'

In [23]:
%sql
-- Temporary view with the same definition.
CREATE TEMP VIEW just_usa_view_temp AS
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States'

In [24]:
%sql
-- Global temporary view with the same definition as the other just_usa views.
CREATE GLOBAL TEMP VIEW just_usa_global_view_temp AS
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States';

-- Follow-up listing. A language magic is only valid as the first line of a
-- cell, so this runs as a second ';'-separated statement of the same %sql cell
-- instead of carrying its own "%sql" prefix.
SHOW TABLES

In [25]:
%sql
-- Redefine the temporary view in place, creating it if it does not exist.
CREATE OR REPLACE TEMP VIEW just_usa_view_temp AS
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States'

In [26]:
%sql
-- A view is queried exactly like a table.
SELECT *
FROM just_usa_view

In [27]:
%scala
// DataFrame counterpart of the just_usa view: load the JSON file, filter on the
// destination country, select every column and print the resulting plan.
val flights = spark.read
  .format("json")
  .load("/mnt/defg/chapter-1-data/json/2015-summary.json")
val just_usa_df = flights.where("dest_country_name = 'United States'")
just_usa_df.selectExpr("*").explain()

In [28]:
%sql
-- Plan for a query that goes through the view.
EXPLAIN
SELECT *
FROM just_usa_view

In [29]:
%sql
-- Plan for the equivalent query written directly against the table.
EXPLAIN
  SELECT *
  FROM flights
  WHERE dest_country_name = 'United States'

In [30]:
%sql
-- Remove the view; a no-op if it is already gone.
DROP VIEW IF EXISTS just_usa_view;

In [31]:
%sql
-- List the databases that exist.
SHOW DATABASES

In [32]:
%sql
-- Create a scratch database.
CREATE DATABASE some_db

In [33]:
%sql
-- Make the scratch database the current one.
USE some_db

In [34]:
%sql
-- Tables visible from the scratch database.
SHOW TABLES

In [35]:
%sql
-- Unqualified name, resolved against the current database (some_db).
-- NOTE(review): flights was created before USE some_db, so this lookup is
-- presumably expected to fail — confirm that this is intentional.
SELECT *
FROM flights

In [36]:
%sql
-- Database-qualified name.
SELECT *
FROM default.flights

In [37]:
%sql
-- Report which database is current.
SELECT current_database()

In [38]:
%sql
-- Switch back to the default database.
USE default;

In [39]:
%sql
-- Remove the scratch database; a no-op if it is already gone.
DROP DATABASE IF EXISTS some_db;

In [40]:
%sql
-- Map the destination to a numeric flag: 1 for 'UNITED STATES', 0 for 'Egypt',
-- -1 for everything else.
SELECT
  CASE
    WHEN DEST_COUNTRY_NAME = 'UNITED STATES' THEN 1
    WHEN DEST_COUNTRY_NAME = 'Egypt' THEN 0
    ELSE -1
  END
FROM partitioned_flights

In [41]:
%sql
-- View that packs the two country columns into a single struct column, `country`.
CREATE VIEW IF NOT EXISTS nested_data AS
  SELECT
    (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) AS country,
    count
  FROM flights

In [42]:
%sql
-- Return the struct column as-is.
SELECT *
FROM nested_data

In [43]:
%sql
-- Pull one field out of the struct with dot syntax.
SELECT
  country.DEST_COUNTRY_NAME,
  count
FROM nested_data

In [44]:
%sql
-- Expand every field of the struct back into top-level columns.
SELECT
  country.*,
  count
FROM nested_data

In [45]:
%sql
-- Per destination: every count as a list, and the origins as a set.
SELECT
  DEST_COUNTRY_NAME AS new_name,
  collect_list(count) AS flight_counts,
  collect_set(ORIGIN_COUNTRY_NAME) AS origin_set
FROM flights
GROUP BY DEST_COUNTRY_NAME

In [46]:
%sql
-- Index into the collected list to take its first element.
SELECT
  DEST_COUNTRY_NAME AS new_name,
  collect_list(count)[0]
FROM flights
GROUP BY DEST_COUNTRY_NAME

In [47]:
%sql
-- Temporary view holding one row per destination with its list of counts.
CREATE OR REPLACE TEMP VIEW flights_agg AS
  SELECT
    DEST_COUNTRY_NAME,
    collect_list(count) AS collected_counts
  FROM flights
  GROUP BY DEST_COUNTRY_NAME

In [48]:
%sql
-- Inverse of collect_list: one output row per element of the array.
SELECT
  explode(collected_counts),
  DEST_COUNTRY_NAME
FROM flights_agg

In [49]:
%sql
-- Every function available in the session.
SHOW FUNCTIONS

In [50]:
%sql
-- Built-in functions only.
SHOW SYSTEM FUNCTIONS

In [51]:
%sql
-- User-defined functions only.
SHOW USER FUNCTIONS

In [52]:
%sql
-- Functions whose name matches the pattern "s*".
SHOW FUNCTIONS "s*";

In [53]:
%sql
-- Same kind of filter, using the optional LIKE keyword.
SHOW FUNCTIONS LIKE "collect*";

In [54]:
%scala
// Cube of a Double, computed as (number * number) * number.
def power3(number:Double):Double = {
  val squared = number * number
  squared * number
}
// Register the helper under the name "power3" so SQL cells can call it.
spark.udf.register("power3", (value: Double) => power3(value))

In [55]:
%sql
-- Call the registered function from SQL next to the raw value.
SELECT
  count,
  power3(count)
FROM flights

In [56]:
%sql
-- External table over comma-delimited text files at a fixed location.
CREATE EXTERNAL TABLE hive_flights (
  DEST_COUNTRY_NAME STRING,
  ORIGIN_COUNTRY_NAME STRING,
  count LONG
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/mnt/defg/flight-data-hive/'

In [57]:
%sql
-- External table populated from a SELECT instead of a declared column list.
-- NOTE(review): this uses the same LOCATION as hive_flights — confirm that
-- sharing the directory between the two tables is intended.
CREATE EXTERNAL TABLE hive_flights_2
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
LOCATION '/mnt/defg/flight-data-hive/'
AS
SELECT *
FROM flights


In [58]:
%sql
-- Read back through the external table.
SELECT *
FROM hive_flights

In [59]:
%sql
-- The five destinations with the largest total count.
SELECT dest_country_name
FROM flights
GROUP BY dest_country_name
ORDER BY sum(count) DESC
LIMIT 5

In [60]:
%sql
-- Uncorrelated subquery as a filter: flights whose origin is one of the five
-- destinations with the largest total count.
SELECT *
FROM flights
WHERE origin_country_name IN (
  SELECT dest_country_name
  FROM flights
  GROUP BY dest_country_name
  ORDER BY sum(count) DESC
  LIMIT 5
)

In [61]:
%sql
-- Correlated subqueries: keep a row only when its destination also occurs as
-- an origin somewhere, and its origin also occurs as a destination somewhere.
SELECT *
FROM flights leg
WHERE EXISTS (
    SELECT 1
    FROM flights onward
    WHERE leg.dest_country_name = onward.origin_country_name
  )
  AND EXISTS (
    SELECT 1
    FROM flights inbound
    WHERE inbound.dest_country_name = leg.origin_country_name
  )

In [62]:
%sql
-- Uncorrelated scalar subquery: attach the overall maximum count to every row.
SELECT
  *,
  (SELECT max(count) FROM flights) AS maximum
FROM flights