Select data from a file or directory of files. Databricks supports these locations:

Unity Catalog volumes

Workspace files

Cloud object storage

DBFS mounts and DBFS root

Ephemeral storage attached to the driver node of the cluster

Spark SQL and Databricks SQL
https://docs.databricks.com/en/files/index.html

In [None]:
-- Spark SQL & Databricks SQL
-- Files are queried in place with the <format>.`<path>` syntax; the path must be wrapped in backticks.

-- Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage)
SELECT * FROM csv.`/Volumes/my_catalog/my_schema/my_volume/data.csv`;
LIST '/Volumes/my_catalog/my_schema/my_volume/';

-- Single file, workspace files (files in a workspace that aren't notebooks; 500 MB limit)
SELECT * FROM json.`file:/Workspace/Users/<user-folder>/file.json`;

-- Single file, cloud object storage through Unity Catalog
SELECT * FROM json.`abfss://container-name@storage-account-name.dfs.core.windows.net/path/file.json`;
LIST 'abfss://container-name@storage-account-name.dfs.core.windows.net/path';

-- Single file, from a DBFS mount (not securable by Unity Catalog)
SELECT * FROM json.`/mnt/path/to/data.json`;

-- All files in a directory (point at the directory, not at a single file)
SELECT * FROM json.`path/to/directory`;

-- Wildcard match
SELECT * FROM csv.`dir/path/*.csv`;

Apache Spark

In [None]:
# Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage).
# Volume paths are rooted at /Volumes (capital V).
spark.read.format("json").load("/Volumes/my_catalog/my_schema/my_volume/data.json").show()

# Single file, workspace files (files in a workspace that aren't notebooks; 500 MB limit)
spark.read.format("json").load("file:/Workspace/Users/<user-folder>/data.json").show()

# Single file, cloud object storage through Unity Catalog
spark.read.format("json").load("abfss://container-name@storage-account-name.dfs.core.windows.net/path/file.json").show()

# Single file, from a DBFS mount (not securable by Unity Catalog)
spark.read.format("json").load("/mnt/path/to/data.json").show()

# All files in a directory
spark.read.format("json").load("path/to/directory").show()

# Wildcard match
spark.read.format("json").load("path/*.json").show()



Python/Pandas

In [None]:
# pandas must be imported before pd.read_csv can be used.
import pandas as pd

# Single file, Unity Catalog volume (for non-tabular data files stored in cloud object storage)
# pandas
df = pd.read_csv('/Volumes/my_catalog/my_schema/my_volume/data.csv')
# PySpark (same volume path style, read through the Spark session)
df = spark.read.format("csv").load("/Volumes/catalog_name/schema_name/volume_name/data.csv")

# Single file, workspace files (files in a workspace that aren't notebooks; 500 MB limit)
df = pd.read_csv('/Workspace/Users/<user-folder>/data.csv')

# Cloud object storage
# Not supported

# DBFS (mount accessed through the local /dbfs path)
df = pd.read_csv('/dbfs/mnt/path/to/data.csv')

##

Create a view, a temporary view, and a CTE as a reference to a file

Note: `IF NOT EXISTS` is not supported when creating a temporary view; use `CREATE OR REPLACE TEMP VIEW` instead.


In [None]:
-- Temporary view over a set of semicolon-delimited CSV exports with a header row.
-- OR REPLACE keeps the cell re-runnable within the same session.
CREATE OR REPLACE TEMP VIEW books_tmp_vw
      (book_id STRING, title STRING, author STRING, category STRING, price DOUBLE)
    USING CSV
    OPTIONS (
      path = "${dataset.bookstore}/books-csv/export_*.csv",
      header = "true",
      delimiter = ";"
    );

/*Alternative: an external table registered over a CSV file*/

    -- CREATE OR REPLACE TABLE is only supported for Delta tables,
    -- so a CSV-backed table uses IF NOT EXISTS instead.
    CREATE TABLE IF NOT EXISTS csv_table
    USING CSV
    OPTIONS (
      path '/path/to/your/csv/file.csv',
      header 'true',
      delimiter ','
    );


Create a table from a JDBC connection and from an external CSV file

In [None]:
%sql
-- The %sql magic must be the first line of the cell to switch the whole cell to SQL.
CREATE DATABASE IF NOT EXISTS jdbc_db;

-- Table backed by a JDBC connection; every value below is a placeholder.
-- Do not commit real credentials to a notebook: read them from a secret scope instead.
CREATE TABLE jdbc_table
USING jdbc
OPTIONS (
  url 'jdbc:mysql://your_database_hostname:3306/your_database_name',
  user 'your_username',
  password 'your_password',
  -- A subquery used as dbtable needs an alias, otherwise MySQL rejects the derived table.
  dbtable '(SELECT * FROM your_table_name) AS src'
);



Identify a table location using the extended description

In [None]:
 -- Show the extended description of the table; look for the Location entry in the output.
 DESCRIBE EXTENDED managed_table_in_db_with_custom_location;

Deduplicate rows from an existing Delta Lake table

In [None]:
# Read the Delta table at table_path, drop fully identical rows, and overwrite
# the table in place with the deduplicated result.
# table_path must be defined beforehand.
df = spark.read.format("delta").load(table_path)
deduplicated_df = df.dropDuplicates()
deduplicated_df.write.format("delta").mode("overwrite").save(table_path)


Create a new table from an existing table while removing duplicate rows.

In [None]:
-- Create a new table holding only the distinct rows of an existing table.
-- my_source_table is a placeholder: replace it with the table to deduplicate.
CREATE TABLE my_table AS
SELECT DISTINCT
  *
FROM my_source_table;

Deduplicate a row based on specific columns.

In [None]:
 # PySpark: keep one row for each distinct (column1, column2) combination.
 deduplicated_df = df.dropDuplicates(["column1", "column2"])

 # Spark SQL: count the distinct (user_id, user_first_touch_timestamp) pairs.
 # The query is wrapped in spark.sql so that it can run in a Python cell.
 distinct_pairs = spark.sql("""
     SELECT COUNT(DISTINCT(user_id, user_first_touch_timestamp))
     FROM users_dirty
     WHERE user_id IS NOT NULL
 """)
 distinct_pairs.show()

Parse JSON strings into structs.

In [None]:
-- Temp view that exposes the JSON text in `profile` as a struct column.
CREATE OR REPLACE TEMP VIEW parsed_customers AS
SELECT
  customer_id,
  -- schema_of_json derives the schema from one sample record;
  -- from_json then applies that schema to the profile column.
  from_json(
    profile,
    schema_of_json('{"first_name":"Thomas",
                              "last_name":"Lane","gender":"Male",
                              "address":{"street":"06 Boulevard Victor Hugo",
                              "city":"Paris","country":"France"}}')
  ) AS profile_struct
FROM customers;

SELECT *
FROM parsed_customers

Identify the PIVOT clause as a way to convert data from a long format to a wide format

In [None]:
-- Reshape order lines from long format (one row per customer and book)
-- to wide format (one column per listed book id, holding the summed quantity).
CREATE OR REPLACE TABLE transactions AS
SELECT *
FROM (
  SELECT customer_id,
         book.book_id  AS book_id,
         book.quantity AS quantity
  FROM orders_enriched
)
PIVOT (
  sum(quantity)
  FOR book_id IN (
    'B01', 'B02', 'B03', 'B04',
    'B05', 'B06', 'B07', 'B08',
    'B09', 'B10', 'B11', 'B12'
  )
);


Define a SQL UDF

In [None]:
-- SQL UDF: returns the input text in upper case followed by three exclamation marks.
-- The literal is single-quoted so it is never parsed as an identifier,
-- even when ANSI double-quoted identifiers are enabled.
CREATE OR REPLACE FUNCTION yelling(text STRING)
    RETURNS STRING
    RETURN concat(upper(text), '!!!');

Identify the location of a function.

In [None]:
 -- Show the extended description of the function get_url (defined outside this section).
 DESCRIBE FUNCTION EXTENDED get_url


readStream

In [None]:
# Placeholder locations: replace with real paths before running.
source_path = '/path/to/source/files'
schema_location = '/path/to/schema/location'

# Auto Loader (cloudFiles) stream over CSV files with a header row.
# `schema` must be defined beforehand (a StructType or a DDL string).
df_read = (spark.readStream
    .format('cloudFiles')
    .option('cloudFiles.format', 'csv')
    .option('cloudFiles.schemaLocation', schema_location)  # a path string, not a set literal
    .option('header', 'true')
    .schema(schema)  # set on the reader, before load(); load() returns a DataFrame
    .load(source_path)
)

In [None]:
# Placeholder location: replace with a real checkpoint path before running.
checkpoint_path = '/path/to/checkpoint'

# writeStream is a property (not a method), so the builder chain is wrapped
# in parentheses instead of being passed as call arguments.
streaming_query = (df_read.writeStream
    .format('delta')
    .option('checkpointLocation', checkpoint_path)  # writer option key has no cloudFiles. prefix
    .outputMode('append')
    .queryName('myQuery_name')    # queryName, not query, names the stream
    .trigger(availableNow=True)   # process all data available now, then stop
    .toTable('my_table_name')     # starts the stream and returns the StreamingQuery
)