Select data from a file or directory of files. Databricks supports these locations:

Unity Catalog volumes

Workspace files

Cloud object storage

DBFS mounts and DBFS root

Ephemeral storage attached to the driver node of the cluster

Spark SQL and Databricks SQL
https://docs.databricks.com/en/files/index.html

In [None]:
-- Spark SQL & Databricks SQL
-- Files are queried in place with <format>.`<path>`; the path is wrapped in backticks,
-- while LIST takes an ordinary quoted string.

-- Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage)
SELECT * FROM csv.`/Volumes/my_catalog/my_schema/my_volume/data.csv`;
LIST '/Volumes/my_catalog/my_schema/my_volume/';

-- Single file, workspace files (files in a workspace that aren't notebooks; 500MB limit)
SELECT * FROM json.`file:/Workspace/Users/<user-folder>/file.json`;

-- Single file, cloud object store through Unity Catalog
-- (the reader format matches the file extension: json for file.json)
SELECT * FROM json.`abfss://container-name@storage-account-name.dfs.core.windows.net/path/file.json`;
LIST 'abfss://container-name@storage-account-name.dfs.core.windows.net/path';

-- Single file, from a DBFS mount (not securable by Unity Catalog)
SELECT * FROM json.`/mnt/path/to/data.json`;

-- From a directory
SELECT * FROM json.`path/to/directory`;

-- Wildcard match
SELECT * FROM csv.`dir/path/*.csv`;

Apache Spark

In [None]:
# Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage).
# Volume paths use the /Volumes/<catalog>/<schema>/<volume>/ form, as in the SQL examples.
spark.read.format("json").load("/Volumes/my_catalog/my_schema/my_volume/data.json").show()

# Single file, workspace files (files in a workspace that aren't notebooks; 500MB limit)
spark.read.format("json").load("file:/Workspace/Users/<user-folder>/data.json").show()

# Single file, cloud object store through Unity Catalog
spark.read.format("json").load("abfss://container-name@storage-account-name.dfs.core.windows.net/path/file.json").show()

# Single file, from a DBFS mount (not securable by Unity Catalog)
spark.read.format("json").load("/mnt/path/to/data.json").show()

# From a directory: pass the directory itself
spark.read.format("json").load("path/to/directory").show()

# Wildcard match: only files matching the glob are read
spark.read.format("json").load("path/*.json").show()



Python/Pandas

In [None]:
import pandas as pd  # the cell uses pd.read_csv, so pandas must be in scope

# Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage)
# pandas
df = pd.read_csv('/Volumes/my_catalog/my_schema/my_volume/data.csv')
# PySpark (this returns a Spark DataFrame, not a pandas one)
df = spark.read.format("csv").load("/Volumes/catalog_name/schema_name/volume_name/data.csv")

# Single file, workspace files (files in a workspace that aren't notebooks; 500MB limit)
df = pd.read_csv('/Workspace/Users/<user-folder>/data.csv')

# Cloud object storage:
# not supported

# DBFS (note the /dbfs prefix in front of the mount path)
df = pd.read_csv('/dbfs/mnt/path/to/data.csv')

##

Create a view, a temporary view, and a CTE as a reference to a file

See DE 4.1 Notebook

standard view = persists
temp view = notebook session scope
global temp view = cluster session scope

In [None]:
/* Global temporary view defined directly over a JSON file.
   The file path goes in backticks after the format name. */
CREATE GLOBAL TEMP VIEW my_temp_vw
AS
SELECT * FROM json.`my/file/path.json`;

/* Now select. Global temp views are registered in the global_temp schema,
   so the name must be qualified. */
SELECT * FROM global_temp.my_temp_vw;

In [None]:
/* Temporary view over a set of CSV exports, with an explicit column schema */
CREATE TEMP VIEW books_tmp_vw (
  book_id  STRING,
  title    STRING,
  author   STRING,
  category STRING,
  price    DOUBLE
)
USING CSV
OPTIONS (
  path = "${dataset.bookstore}/books-csv/export_*.csv",
  header = "true",
  delimiter = ";"
);

/* Alternative: a table defined directly over a CSV file */
CREATE OR REPLACE TABLE csv_table
USING CSV
OPTIONS (
  path = '/path/to/your/csv/file.csv',
  header = 'true',
  delimiter = ','
);


In [None]:
/* ${...} is Hive-variable substitution syntax */
USE ${da.db_name}_default_location;

/* Temporary view over the raw CSV file */
CREATE OR REPLACE TEMPORARY VIEW temp_delays
USING CSV
OPTIONS (
  path = '${da.paths.working_dir}/flights/departuredelays.csv',
  header = "true",
  -- FAILFAST: abort file parsing with a RuntimeException if any malformed lines are encountered
  mode = "FAILFAST"
);

/* CTAS from the view into a table with an explicit LOCATION */
CREATE OR REPLACE TABLE external_table
LOCATION '${da.paths.working_dir}/external_table'
AS
  SELECT * FROM temp_delays;

SELECT * FROM external_table;

Identify that tables from external sources are not Delta Lake tables.

In [None]:
/* Show the table's extended metadata; inspect it to see which format backs the table */
DESCRIBE TABLE EXTENDED my_external_table_name;

Create a table from a JDBC connection or an external database

See DE 4.2 notebook

In [None]:
CREATE DATABASE IF NOT EXISTS jdbc_db;

/* Template: replace each {placeholder} with the connection details */
CREATE TABLE jdbc_table
USING jdbc
OPTIONS (
  url = "jdbc:{databaseServerType}://{jdbcHostname}:{jdbcPort}",
  dbtable = "{jdbcDatabase}.table",
  user = "{jdbcUsername}",
  password = "{jdbcPassword}"
);

/* Full example with SQL Server.
   The option key is dbtable, and options are comma-separated.
   Host, user and password are placeholders: never commit real connection
   details or credentials to a notebook; keep them in a secret store. */
CREATE TABLE my_table_name
USING jdbc
OPTIONS (
  url 'jdbc:sqlserver://<server-host>:3342',
  dbtable 'mpp.AR_2',
  user '<jdbc-username>',
  password '<jdbc-password>'
);

/* Full example with SQLite */
DROP TABLE IF EXISTS users_jdbc;

CREATE TABLE users_jdbc
USING JDBC
OPTIONS (
  url = "jdbc:sqlite:/${da.username}_ecommerce.db",
  dbtable = "users"
);

Create a table from an external CSV file

DE 4.3

In [None]:
/* Step 1: temporary view over the pipe-delimited CSV directory, with an explicit schema */
CREATE OR REPLACE TEMP VIEW sales_tmp_vw (
  order_id                LONG,
  email                   STRING,
  transactions_timestamp  LONG,
  total_item_quantity     INTEGER,
  purchase_revenue_in_usd DOUBLE,
  unique_items            INTEGER,
  items                   STRING
)
USING CSV
OPTIONS (
  path "${da.paths.datasets}/raw/sales-csv",
  header "true",
  delimiter "|"
);

/* Step 2: CTAS from the view into a table */
CREATE TABLE sales_delta
AS
  SELECT * FROM sales_tmp_vw;

SELECT * FROM sales_delta;

Identify a table location using the extended description

In [None]:
 /* Extended metadata for the table; look for the location entry in the output */
 DESCRIBE TABLE EXTENDED managed_table_in_db_with_custom_location;

Create a basic Delta Table and insert values

In [None]:
/* Basic table with three typed columns */
CREATE TABLE my_table_name (
  id    INT,
  name  STRING,
  value DOUBLE
);

In [None]:
/* Option 1: one INSERT statement per row */
INSERT INTO my_table_name VALUES (1, "steve", 1.0);
INSERT INTO my_table_name VALUES (2, "leve", 2.0);
INSERT INTO my_table_name VALUES (3, "keve", 2.0);

/* Option 2: a single INSERT carrying several rows */
INSERT INTO my_table_name VALUES
  (4, "bro", 22.0),
  (5, "bre", 33.0),
  (6, "mot", 324.0),
  (7, "lem", 24.0);

Deduplicate rows from an existing Delta Lake table
DE 4.6

In [None]:
# Read the Delta table, drop fully identical rows, and overwrite the table in place.
# table_path must already be defined (the storage path of the Delta table).
# All three statements sit at the same indentation level; indenting the last two
# under the first raises an IndentationError.
df = spark.read.format("delta").load(table_path)
deduplicated_df = df.dropDuplicates()
deduplicated_df.write.format("delta").mode("overwrite").save(table_path)


In [None]:
/* View holding one copy of each fully identical row of users_dirty */
CREATE OR REPLACE TEMP VIEW users_deduped
AS
  SELECT DISTINCT(*)
  FROM users_dirty;

SELECT * FROM users_deduped;

Create a new table from an existing table while removing duplicate rows.

In [None]:
/* CTAS that copies some_other_place while removing duplicate rows:
   DISTINCT keeps a single copy of each fully identical row. */
CREATE TABLE my_table AS
SELECT DISTINCT
  *
FROM
  some_other_place;

Deduplicate a row based on specific columns.

In [None]:
 # DataFrame API: keep one row per distinct (column1, column2) combination.
deduplicated_df = df.dropDuplicates(["column1", "column2"])

# SQL counterpart: count the distinct (user_id, user_first_touch_timestamp) pairs.
# Raw SQL cannot sit in a Python cell, so it is run through spark.sql().
spark.sql("""
    SELECT COUNT(DISTINCT(user_id, user_first_touch_timestamp))
    FROM users_dirty
    WHERE user_id IS NOT NULL
""").show()

Extract a calendar date from a timestamp



In [None]:
/* Timestamp column -> calendar date.
   my_table is a placeholder for the table that holds my_timestamp. */
SELECT CAST(my_timestamp AS DATE) AS my_date_field
FROM my_table;

/* Alternative for Unix epoch values stored in microseconds: dividing by 1e6 gets rid of
   the microseconds (yielding seconds), the inner cast turns that into a timestamp,
   and the outer cast extracts the calendar date. */
SELECT CAST(CAST(my_timestamp / 1e6 AS TIMESTAMP) AS DATE) AS my_date_field
FROM my_table;

Extract a specific pattern from an existing string column

DE4.6

In [None]:
/* Convert the microsecond epoch column to a timestamp first, then derive a
   formatted date, a formatted time, and the e-mail domain (everything after the @). */
WITH users_with_touch AS (
  SELECT
    *,
    CAST(user_first_touch_timestamp / 1e6 AS timestamp) AS first_touch
  FROM deduped_users
)
SELECT
  *,
  date_format(first_touch, "MMM d, yyyy") AS first_touch_date,
  date_format(first_touch, "HH:mm:ss") AS first_touch_time,
  regexp_extract(email, "(?<=@).+", 0) AS email_domain
FROM users_with_touch

Utilize the colon ":" syntax to extract nested data fields

Remember that JSON is key value pairs

DE4.7

In [None]:
/*In most cases, Kafka data will be binary-encoded JSON values. We'll cast the key and value as strings below to look at these in a human-readable format.*/
/* Cast the key and value columns of events_raw to strings so they can be read
   (and queried as JSON text) in a human-readable form */
CREATE OR REPLACE TEMP VIEW events_strings
AS
  SELECT
    string(key),
    string(value)
  FROM events_raw;

SELECT * FROM events_strings;

In [None]:
/*Spark SQL has built-in functionality to directly interact with JSON data stored as strings. We can use the : syntax to traverse nested data structures. Remember to create the view up top first*/

/* The : operator walks into JSON held in a string column; requires the events_strings view */
SELECT
  value:device,
  value:geo:city
FROM events_strings;

Parse JSON strings into structs.

A JSON struct is a nested data structure that can include other JSON objects or arrays. It’s similar to a dictionary or map in programming languages, where each key is associated with a value. Values can be strings, numbers, arrays, or other JSON objects.

Databricks provides various functions and methods for working with JSON data, especially when you need to handle nested or semi-structured data. You can read JSON data into a DataFrame and then use Spark SQL functions to query and manipulate it.

DE4.7

In [None]:
##Example of JSON

{
    "name": "John Doe",
    "age": 30,
    "address": {
      "street": "123 Elm St",
      "city": "Somewhere"
    },
    "phones": ["123-456-7890", "987-654-3210"]
  }


##Explanation
In this JSON object:

"name" is a string.
"age" is a number.
"address" is another JSON object (struct) with "street" and "city".
"phones" is an array of strings.


Parsing 

Spark SQL also has a schema_of_json function to derive the JSON schema from an example. Here, we copy and paste an example JSON to the function and chain it into the from_json function to cast our value field to a struct type.

DE 4.7

In [None]:
/* Derive a schema from one sample JSON document with schema_of_json, then use it in
   from_json to turn the profile string column into a struct column */
CREATE OR REPLACE TEMP VIEW parsed_customers
AS
  SELECT
    customer_id,
    from_json(
      profile,
      schema_of_json('{"first_name":"Thomas",
                              "last_name":"Lane","gender":"Male",
                              "address":{"street":"06 Boulevard Victor Hugo",
                              "city":"Paris","country":"France"}}')
    ) AS profile_struct
  FROM customers;

SELECT * FROM parsed_customers;


Once a JSON string is unpacked to a struct type, Spark supports * (star) unpacking to flatten fields into columns.

In [None]:
/* Star-expand the json struct column so each of its fields becomes its own column */
CREATE OR REPLACE TEMP VIEW new_events_final
AS
  SELECT json.*
  FROM parsed_events;

SELECT * FROM new_events_final;

Utilize the dot "." syntax to extract nested data fields

This can be used to Explore Data Structures
Spark SQL has robust syntax for working with complex and nested data types.

The ecommerce field is a struct that contains a double and 2 longs.

We can interact with the subfields in this field using standard . syntax similar to how we might traverse nested data in JSON.

In [None]:
/* Dot syntax reaches a subfield of the ecommerce struct, both in SELECT and in WHERE */
SELECT
  ecommerce.purchase_revenue_in_usd
FROM events
WHERE
  ecommerce.purchase_revenue_in_usd IS NOT NULL;

Identify which result will be returned based on a join query.

This is just standard joins: INNER, OUTER, LEFT, RIGHT, ANTI, CROSS, SEMI

CROSS = cartesian product
SEMI = everything in left matching right, but only show left table columns
ANTI = everything in left not matching right, but only show left table columns

SET OPERATORS: UNION, MINUS, INTERSECT

INTERSECT = only matching records

Identify the PIVOT clause as a way to convert data from a long format to a wide format
DE4.7

Purpose = flatten information and aggregate it

In [None]:
/* Long -> wide: one row per customer_id, one column per listed book_id,
   each cell holding the summed quantity for that customer and book */
CREATE OR REPLACE TABLE transactions AS
SELECT *
FROM (
  SELECT
    customer_id,
    book.book_id  AS book_id,
    book.quantity AS quantity
  FROM orders_enriched
)
PIVOT (
  sum(quantity)
  FOR book_id IN (
    'B01', 'B02', 'B03', 'B04', 'B05', 'B06',
    'B07', 'B08', 'B09', 'B10', 'B11', 'B12'
  )
);


Identify a scenario to use the explode function versus the flatten function

Explode = puts each element in an array on its own row

Flatten = allows multiple arrays to be combined into a single array

Array_distinct = removes duplicate elements from an array

Collect_Set = collect unique values for a field, including fields within arrays

DE4.7

In [None]:
/* Explode: each element of the items array field (a struct) gets its own row */
SELECT
  user_id,
  event_timestamp,
  event_name,
  explode(items) AS item
FROM events;

/* collect and flatten */


Define a SQL UDF

Basic syntax  = function name, optional parameters, type returned, and custom logic

DE4.8

In [None]:
/* SQL UDF: takes a string, returns it upper-cased with "!!!" appended.
   The definition ends with a semicolon so the SELECT below is parsed as a
   separate statement. */
CREATE OR REPLACE FUNCTION yelling(text STRING)
    RETURNS STRING
    RETURN concat(upper(text), "!!!");

/* Now apply the function to a column */
SELECT yelling(food) FROM foods;

/* Output is APPLE!!!, BANANA!!!, CARROT!!! */

Simple Control Flow Functions

In [None]:
/* SQL UDF with CASE/WHEN control flow: branches are tested top to bottom and the
   first match wins, so only "beef" falls through to the ELSE branch */
CREATE FUNCTION foods_i_like(food STRING)
RETURNS STRING
RETURN
  CASE
    WHEN food = "beans"
      THEN "I love beans"
    WHEN food = "potatoes"
      THEN "My favorite vegetable is potatoes"
    WHEN food <> "beef"
      THEN concat("Do you have any good recipes for ", food ,"?")
    ELSE
      concat("I don't eat ", food)
  END;

Identify the location of a function.

In [None]:
 /* Extended description of the get_url function */
 DESC FUNCTION EXTENDED get_url;


Higher Order Functions DE4.7

FILTER filters an array using the given lambda function.
EXISTS tests whether a statement is true for one or more elements in an array.
TRANSFORM uses the given lambda function to transform all elements in an array.
REDUCE takes two lambda functions to reduce the elements of an array to a single value by merging the elements into a buffer, and then applies a finishing function on the final buffer.

AUTO LOADER IS BELOW 

DE 6.0 series

readStream

In [None]:
#auto loader syntax
df_read = (spark.readStream
    .format('cloudFiles')
    .option('cloudFiles.format', 'csv')
    .option('cloudFiles.schemaLocation', {path})
    .option('header', 'true')
    .load('path')
    .schema(schema)
)

In [None]:
#for a table
(spark.readStream
    .table("books")
    .createOrReplaceTempView("my_streaming_temp_view"))

writeStream

In [None]:
#auto loader syntax
df_read.writeStream(
    .format('delta')
    .option('cloudFiles.checkpointLocation')
    .outputMode('append')
    .query('myQuery_name')
    .trigger(availableNow=True)
    .toTable('my_table_name')
)

In [None]:
#table syntax
(spark.table("my_spark_table_vw")
    .writeStream
    .trigger(processingTime='4 seconds')
    .outputMode("complete") #complete = full re-write. append = incremental load. Aggregations always require the complete mode
    .option("checkpointLocation", "dbfs:/mnt/demo/author_counts_checkpoint")
    .table("my_table_name")
)