# Spark SQL Quick Reference Notebook

> **Note:** All examples in this notebook use the TPCH dataset stored in `samples.tpch`.

Welcome to the **SAS-to-Databricks Quick Reference** notebook! This guide is designed for SAS users who are getting started with Databricks SQL and Spark, and it walks you through the most common data-manipulation tasks you already know from PROC SQL and Data Step—now in the Databricks environment. Inside, you’ll find:

- **Aggregations**: COUNT, SUM, AVG and more  
- **Basic Data Cleansing**: null handling, trimming, case conversion  
- **Common Date Functions**: date_add, datediff, date_trunc, and parsing/formatting  
- **Common Table Expressions (CTEs)**: break complex logic into modular steps
- **Joins**: INNER, LEFT, RIGHT, FULL, CROSS, LEFT-ANTI, and self-joins  
- **Temporary Views**: create temporary views for interactive analysis and intermediate steps
- **Table Management**: CREATE, ALTER, and DROP managed tables  
- **Window Functions**: ROW_NUMBER, RANK, LAG/LEAD 

# Example Aggregations

In [0]:
%sql
-- Row count of the whole orders table.
SELECT
  COUNT(*)
FROM samples.tpch.orders;

In [0]:
%sql

-- Sum of order totals for every customer key.
SELECT o.o_custkey,
       SUM(o.o_totalprice) AS total_revenue
FROM samples.tpch.orders AS o
GROUP BY o.o_custkey;

In [0]:
%sql

-- Mean quantity across the line items of each order.
SELECT li.l_orderkey,
       AVG(li.l_quantity) AS avg_quantity
FROM samples.tpch.lineitem AS li
GROUP BY li.l_orderkey;

In [0]:
%sql

-- Number of orders placed in each calendar month (formatted as yyyy-MM).
-- GROUP BY ALL groups by every non-aggregated expression in the SELECT list.
SELECT date_format(o_orderdate, 'yyyy-MM') AS month,
       COUNT(*) AS order_count
FROM samples.tpch.orders
GROUP BY ALL;

In [0]:
%sql

-- How many different parts each customer has ordered, across all of their orders.
SELECT
  ord.o_custkey,
  COUNT(DISTINCT li.l_partkey) AS unique_parts_ordered
FROM samples.tpch.lineitem AS li
INNER JOIN samples.tpch.orders AS ord
  ON li.l_orderkey = ord.o_orderkey
GROUP BY ord.o_custkey;

In [0]:
%sql
-- Total and mean order value per nation (orders -> customer -> nation).
SELECT
  nat.n_name,
  SUM(ord.o_totalprice) AS total_rev,
  AVG(ord.o_totalprice) AS avg_rev
FROM samples.tpch.nation AS nat
INNER JOIN samples.tpch.customer AS cust
  ON cust.c_nationkey = nat.n_nationkey
INNER JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
GROUP BY nat.n_name;

In [0]:
%sql

-- Keep only the nations whose customers placed more than 100 orders.
-- HAVING filters the groups after aggregation (WHERE would filter rows before it).
SELECT
  nat.n_name,
  COUNT(*) AS order_count
FROM samples.tpch.nation AS nat
INNER JOIN samples.tpch.customer AS cust
  ON cust.c_nationkey = nat.n_nationkey
INNER JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
GROUP BY nat.n_name
HAVING COUNT(*) > 100;

In [0]:
%sql

-- Revenue with hierarchical subtotals: per (region, nation), per region, and a grand total.
SELECT
  reg.r_name,
  nat.n_name,
  SUM(ord.o_totalprice) AS revenue
FROM samples.tpch.orders AS ord
INNER JOIN samples.tpch.customer AS cust
  ON ord.o_custkey = cust.c_custkey
INNER JOIN samples.tpch.nation AS nat
  ON cust.c_nationkey = nat.n_nationkey
INNER JOIN samples.tpch.region AS reg
  ON nat.n_regionkey = reg.r_regionkey
GROUP BY ROLLUP(reg.r_name, nat.n_name);



In [0]:
%sql
-- Revenue cube by region, nation, and customer.
-- CUBE computes the aggregate for every combination of the three grouping columns.
SELECT
  reg.r_name,
  nat.n_name,
  cust.c_custkey,
  SUM(ord.o_totalprice) AS revenue
FROM samples.tpch.orders AS ord
INNER JOIN samples.tpch.customer AS cust
  ON ord.o_custkey = cust.c_custkey
INNER JOIN samples.tpch.nation AS nat
  ON cust.c_nationkey = nat.n_nationkey
INNER JOIN samples.tpch.region AS reg
  ON nat.n_regionkey = reg.r_regionkey
GROUP BY CUBE(reg.r_name, nat.n_name, cust.c_custkey);

In [0]:
%sql

-- Approximate median (50th percentile) of order value for each customer.
SELECT o.o_custkey,
       percentile_approx(o.o_totalprice, 0.5) AS median_order_value
FROM samples.tpch.orders AS o
GROUP BY o.o_custkey;

In [0]:
%sql

-- Cumulative revenue by day: total each day first, then accumulate in date order.
WITH daily AS (
  SELECT
    date_format(o_orderdate, 'yyyy-MM-dd') AS order_day,
    SUM(o_totalprice) AS daily_revenue
  FROM samples.tpch.orders
  GROUP BY date_format(o_orderdate, 'yyyy-MM-dd')
)
SELECT
  order_day,
  SUM(daily_revenue) OVER (
    ORDER BY order_day
    ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW  -- everything up to and including this day
  ) AS running_total
FROM daily
ORDER BY order_day;

# Example Data Cleansing

In [0]:
%sql
-- Return every part row once; rows identical in all columns collapse into one.
SELECT DISTINCT
  *
FROM samples.tpch.part;

In [0]:
%sql

-- Substitute a default text wherever the customer comment is NULL.
SELECT
  COALESCE(c_comment, 'No comment') AS cleaned_comment
FROM samples.tpch.customer
LIMIT 100;

In [0]:
%sql
-- Keep only orders that have a total price recorded.
SELECT
  *
FROM samples.tpch.orders
WHERE o_totalprice IS NOT NULL
LIMIT 100;

In [0]:
%sql
-- Strip leading and trailing whitespace from the customer name.
SELECT
  TRIM(c_name) AS cleaned_name
FROM samples.tpch.customer
LIMIT 100;

In [0]:
%sql
-- Turn blank comments into NULL: trim first, then NULLIF maps '' to NULL.
SELECT
  NULLIF(TRIM(c_comment), '') AS normalized_comment
FROM samples.tpch.customer
LIMIT 100;

In [0]:
%sql
-- Standardize casing for part name.
-- Both variants are returned by ONE statement: when a notebook SQL cell holds several
-- statements, only the last statement's result is displayed, so a separate UPPER query
-- placed before the LOWER query would never be shown.
SELECT
  UPPER(p_name) AS part_name_upper,
  LOWER(p_name) AS part_name_lower
FROM samples.tpch.part
LIMIT 100;

In [0]:
%sql
-- Normalize phone numbers by deleting every character that is not a digit 0-9.
SELECT
  REGEXP_REPLACE(c_phone, '[^0-9]', '') AS digits_only_phone
FROM samples.tpch.customer
LIMIT 100;

In [0]:
%sql
-- Remove future-dated rows: keep only line items shipped on or before today.
-- The full date is compared (not just YEAR) so that ship dates later in the
-- current year are excluded as well.
SELECT *
FROM samples.tpch.lineitem
WHERE l_shipdate <= CURRENT_DATE()
LIMIT 100;

In [0]:
%sql
-- Keep orders whose total price is present and strictly positive.
SELECT
  *
FROM samples.tpch.orders
WHERE o_totalprice IS NOT NULL
  AND o_totalprice > 0
LIMIT 100;

In [0]:
%sql
-- Split full name into first and last tokens (space-delimited).
-- Both tokens are taken from the SAME trimmed array. element_at(array, -1) returns the
-- last element, so the position can never fall outside the array when c_name carries
-- leading or trailing spaces.
SELECT
  SPLIT(TRIM(c_name), ' ')[0] AS first_name,
  ELEMENT_AT(SPLIT(TRIM(c_name), ' '), -1) AS last_name,
  c_custkey
FROM samples.tpch.customer
LIMIT 100;

In [0]:
%sql
-- Deduplicate orders: keep only the earliest order per customer.
WITH ranked_orders AS (
  SELECT
    o_custkey,
    o_orderkey,
    ROW_NUMBER() OVER (
      PARTITION BY o_custkey
      ORDER BY o_orderdate,   -- oldest order first
               o_orderkey     -- tie-breaker so the pick is deterministic for same-day orders
    ) AS rn
  FROM samples.tpch.orders
)
SELECT
  o_custkey,
  o_orderkey
FROM ranked_orders
WHERE rn = 1
LIMIT 100;

In [0]:
%sql
-- Latest order for each customer.
-- Step 1: number each customer's orders from newest to oldest.
WITH orders_newest_first AS (
  SELECT
    o_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    ROW_NUMBER() OVER (
      PARTITION BY o_custkey                      -- restart numbering per customer
      ORDER BY o_orderdate DESC, o_orderkey DESC  -- newest first; order key breaks same-day ties
    ) AS rn
  FROM samples.tpch.orders
)
-- Step 2: row number 1 is the most recent order.
SELECT *
FROM orders_newest_first
WHERE rn = 1
LIMIT 100;


# Common Date Functions

In [0]:
%sql
-- Pull the year and month numbers out of the order date.
SELECT
  o_orderdate,
  EXTRACT(YEAR FROM o_orderdate) AS year,
  EXTRACT(MONTH FROM o_orderdate) AS month
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Interval arithmetic: shift the order date forward by seven days.
SELECT
  o_orderdate,
  o_orderdate + INTERVAL 7 DAYS AS week_later
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Today's date and the current timestamp.
SELECT
  CURRENT_DATE(),
  CURRENT_TIMESTAMP();

In [0]:
%sql
-- Truncate each order date down to the first instant of its month.
SELECT
  o_orderdate,
  DATE_TRUNC('MONTH', o_orderdate) AS month_start
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Format a date as a 'yyyy-MM' string.
SELECT
  DATE_FORMAT(o_orderdate, 'yyyy-MM') AS order_month
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Days between shipment and receipt of each line item.
-- DATEDIFF(endDate, startDate) returns endDate - startDate, so the receipt date goes first
-- and the ship date second; the reverse order yields negative counts whenever the
-- receipt follows the shipment.
SELECT
  DATEDIFF(l_receiptdate, l_shipdate) AS days_to_ship
FROM samples.tpch.lineitem
LIMIT 100;

In [0]:
%sql
-- Add/subtract whole days with DATE_ADD / DATE_SUB.
-- Both results come from ONE statement: when a notebook SQL cell holds several
-- statements, only the last statement's result is displayed, so a separate DATE_ADD
-- query placed before the DATE_SUB query would never be shown.
SELECT
  o_orderdate,
  DATE_ADD(o_orderdate, 10) AS plus_10_days,
  DATE_SUB(o_orderdate, 5) AS minus_5_days
FROM samples.tpch.orders
LIMIT 100;

# Example Joins

In [0]:
%sql
-- Inner Join: Get customer names and their orders placed in January 1994.
-- The date filter uses typed DATE literals and a half-open range
-- (>= first day of the month, < first day of the next month), which avoids implicit
-- string-to-date casts and stays correct if the column ever carries a time component.
SELECT
  c.c_name,
  o.o_orderkey,
  o.o_orderdate,
  o.o_totalprice
FROM samples.tpch.customer AS c
  INNER JOIN samples.tpch.orders AS o
    ON c.c_custkey = o.o_custkey
WHERE o.o_orderdate >= DATE '1994-01-01'
  AND o.o_orderdate < DATE '1994-02-01'
LIMIT 100;

In [0]:
%sql
-- Left Outer Join: every customer appears; the order columns are NULL for customers
-- that have no matching order.
SELECT
  cust.c_custkey,
  cust.c_name,
  ord.o_orderkey,
  ord.o_orderdate
FROM samples.tpch.customer AS cust
LEFT JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
LIMIT 100;

In [0]:
%sql

-- Right Outer Join: every order appears; the customer columns are NULL when an order
-- has no matching customer record.
SELECT
  ord.o_orderkey,
  ord.o_orderdate,
  cust.c_custkey,
  cust.c_name
FROM samples.tpch.customer AS cust
RIGHT JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
LIMIT 100;

In [0]:
%sql
-- Full Outer Join: rows from both sides are kept; whichever side has no match is
-- filled with NULLs.
SELECT
  cust.c_custkey,
  cust.c_name,
  ord.o_orderkey,
  ord.o_orderdate
FROM samples.tpch.customer AS cust
FULL JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
LIMIT 100;

In [0]:
%sql
-- Left Anti Join: return only the customers for which NO order row matches.
SELECT
  cust.c_custkey,
  cust.c_name
FROM samples.tpch.customer AS cust
LEFT ANTI JOIN samples.tpch.orders AS ord
  ON ord.o_custkey = cust.c_custkey
LIMIT 100;

In [0]:
%sql
-- Cross Join: pairs every customer with every order (Cartesian product).
-- The row count is customers x orders, so keep a small LIMIT and use sparingly.
SELECT
  cust.c_name,
  ord.o_orderkey
FROM samples.tpch.customer AS cust
CROSS JOIN samples.tpch.orders AS ord
LIMIT 10;

In [0]:
%sql
-- Self Join: pairs of distinct orders placed by the same customer on the same date.
SELECT
  first_ord.o_orderkey AS order_a,
  second_ord.o_orderkey AS order_b,
  first_ord.o_orderdate
FROM samples.tpch.orders AS first_ord
INNER JOIN samples.tpch.orders AS second_ord
  ON first_ord.o_custkey = second_ord.o_custkey
  AND first_ord.o_orderdate = second_ord.o_orderdate
  AND first_ord.o_orderkey < second_ord.o_orderkey  -- report each pair once, never an order with itself
LIMIT 10;

# Window Functions
Learn more about [window functions here](https://www.geeksforgeeks.org/window-functions-in-sql/)

Think of a window function as a “calculator” that looks at a moving frame—or “window”—of rows around the current row, computes something (like a running total, rank, or average), and then sticks the result back on that same row. Unlike a traditional GROUP BY that collapses many rows into one, window functions let you:

- **Partition** your data into groups (e.g., per customer)

- **Order** rows within each group (e.g., by date)

- **Compute** an aggregate or ranking over that window (e.g., cumulative sum, row number, lag/lead values)

### Example
The table below shows how the `ROW_NUMBER()` window function assigns a sequential number to each of a customer's orders, based on the order date.
- **custkey:** the customer identifier (here, all rows are for customer 1).
- **orderdate:** the date of each order.
- **order_seq:** the sequence number assigned by `ROW_NUMBER()`, starting at 1 for the customer's earliest order.

**SQL Syntax:**
```
-- Number each customer's orders by date; the aliases match the output table's column names.
SELECT
  o_custkey AS custkey
  ,o_orderdate AS orderdate
  ,ROW_NUMBER() OVER (PARTITION BY o_custkey ORDER BY o_orderdate) AS order_seq
FROM samples.tpch.orders
```

**Output**

| custkey | orderdate  | order_seq |
|---------|------------|-----------|
| 1       | 1994-01-05 | 1         |
| 1       | 1994-01-10 | 2         |
| 1       | 1994-01-20 | 3         |

In [0]:
%sql
-- Sequence number of each order within its customer, oldest order first.
SELECT
  o_custkey,
  o_orderkey,
  o_orderdate,
  ROW_NUMBER() OVER (
    PARTITION BY o_custkey
    ORDER BY o_orderdate
  ) AS order_seq
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Rank each customer's orders by total price, most expensive first.
-- RANK gives tied prices the same rank and leaves a gap after them.
SELECT
  o_custkey,
  o_orderkey,
  o_totalprice,
  RANK() OVER (
    PARTITION BY o_custkey
    ORDER BY o_totalprice DESC
  ) AS price_rank
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Price tiers for parts: DENSE_RANK gives equal prices the same tier with no gaps
-- between consecutive tiers (tier 1 = highest retail price).
SELECT
  p_partkey,
  p_name,
  p_retailprice,
  DENSE_RANK() OVER (
    ORDER BY p_retailprice DESC
  ) AS price_tier
FROM samples.tpch.part
LIMIT 100;

In [0]:
%sql
-- Moving average of daily revenue over the current row and the two rows before it.
-- The frame is row-based: it spans three calendar days only when every day has a row.
WITH daily_totals AS (
  SELECT
    date_format(o_orderdate, 'yyyy-MM-dd') AS day,
    SUM(o_totalprice) AS revenue
  FROM samples.tpch.orders
  GROUP BY date_format(o_orderdate, 'yyyy-MM-dd')
)
SELECT
  day,
  revenue,
  ROUND(
    AVG(revenue) OVER (
      ORDER BY day
      ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ),
    2
  ) AS rev_3day_avg
FROM daily_totals
ORDER BY day;

In [0]:
%sql
-- For each order, show the same customer's previous and next order dates.
-- LAG/LEAD return NULL when there is no earlier/later order in the partition.
SELECT
  o_custkey,
  o_orderkey,
  o_orderdate,
  LAG(o_orderdate) OVER by_customer_date AS prev_order_date,
  LEAD(o_orderdate) OVER by_customer_date AS next_order_date
FROM samples.tpch.orders
WINDOW by_customer_date AS (PARTITION BY o_custkey ORDER BY o_orderdate)
LIMIT 100;

In [0]:
%sql
-- Relative standing (0 to 1) of each order's total price among all orders.
SELECT
  o_orderkey,
  o_totalprice,
  PERCENT_RANK() OVER (
    ORDER BY o_totalprice
  ) AS price_percent_rank
FROM samples.tpch.orders
LIMIT 100;

In [0]:
%sql
-- Bucket customers into four equal-sized groups by total revenue (quartile 1 = top spenders).
WITH revenue_by_customer AS (
  SELECT
    o_custkey,
    SUM(o_totalprice) AS total_rev
  FROM samples.tpch.orders
  GROUP BY o_custkey
)
SELECT
  o_custkey,
  total_rev,
  NTILE(4) OVER (
    ORDER BY total_rev DESC
  ) AS revenue_quartile
FROM revenue_by_customer
ORDER BY
  revenue_quartile,
  total_rev DESC
LIMIT 100;

# CTEs

A Common Table Expression (CTE) is a way to define a temporary, named result set that you can reference within a single SQL statement. It’s essentially an inline view with a name, scoped to the query where it’s declared. CTEs were introduced in the SQL-99 standard and since then have become widely supported (e.g., in SQL Server, PostgreSQL, Oracle, MySQL, and Spark SQL).

[Learn more here](https://www.geeksforgeeks.org/cte-in-sql/)

In [0]:
%sql
-- Latest order for each customer, written as a CTE.

-- Step 1: number each customer's orders from newest to oldest.
WITH orders_newest_first AS (
  SELECT
    o_custkey,
    o_orderkey,
    o_orderdate,
    o_totalprice,
    ROW_NUMBER() OVER (
      PARTITION BY o_custkey                      -- restart numbering per customer
      ORDER BY o_orderdate DESC, o_orderkey DESC  -- newest first; order key breaks same-day ties
    ) AS rn
  FROM samples.tpch.orders
)

-- Step 2: row number 1 is the most recent order.
SELECT *
FROM orders_newest_first
WHERE rn = 1
LIMIT 100;

In [0]:
%sql
-- Top 5 Customers by Total Spend

WITH customer_spend AS (
  -- sum up the total price of all orders per customer
  SELECT
    o_custkey,
    SUM(o_totalprice) AS total_spent
  FROM samples.tpch.orders
  GROUP BY o_custkey
),
ranked_customers AS (
  -- join back to get customer names and rank by spend
  SELECT
    c.c_custkey,
    c.c_name,
    cs.total_spent,
    ROW_NUMBER() OVER (
      ORDER BY cs.total_spent DESC,
               c.c_custkey          -- tie-breaker so equal spenders rank deterministically
    ) AS rn
  FROM samples.tpch.customer c
  JOIN customer_spend cs
    ON c.c_custkey = cs.o_custkey
)
SELECT
  c_name      AS customer,
  total_spent
FROM ranked_customers
WHERE rn <= 5
ORDER BY rn   -- rank order = spend descending, ties by customer key
LIMIT 100;

# Temporary Views 
Temporary views are the foundation for creating a multi-step program/process in a notebook when using SQL!

Temporary views are virtual tables that are based on the result of a SQL query. They are not stored in the catalog and are only visible to the current session, similar to a `temporary table` in other databases. They are automatically dropped when the session ends or when you explicitly drop them using `DROP VIEW`. Temporary views allow you to reuse the result of a complex query without creating a permanent table!

- _Note: A temporary view created in one notebook isn't accessible from other notebooks!_ Temporary views are visible only to the session that created them and are dropped when the session ends.

### Temporary Views or CTEs?
Temporary views can be an excellent complement—or even substitute—for CTEs when you need to break complex logic into reusable, inspectable building blocks:

**Session-wide reuse**
- CTEs exist only within the single SQL statement that defines them.
- Temp views persist for your entire session, so you can reference the same intermediate result across multiple independent queries without redefining it each time.

**Improved readability & modularity**
- Instead of a single gigantic `WITH … SELECT …` block, you can layer logic:
  1. Create step-by-step temp views (`step1_view`, `step2_view`, …)
  2. Then `SELECT` or join them in your final analysis.
- This makes each piece easier to develop, test, and debug in isolation.

**Performance tuning & caching**
- You can cache a temp view in memory (`CACHE TABLE`) so that expensive transformations run once and serve many downstream queries.
- CTEs may be re-evaluated each time you reference them in the same statement, which can waste work if your CTE is used multiple times.

### How do you create a temporary view?
**Syntax:**
<br>`CREATE OR REPLACE TEMPORARY VIEW view_name AS query`

<br>**Example of a temporary view:**

```
-- Session-scoped view holding total order revenue per day.
CREATE OR REPLACE TEMP VIEW daily_revenue AS
SELECT
  date_format(o_orderdate, 'yyyy-MM-dd') AS day,
  SUM(o_totalprice) AS revenue
FROM samples.tpch.orders
GROUP BY ALL;
```

### Temp View - SQL Example 1 (see below)

In [0]:
%sql
-- Latest order per customer, step 1 of 2.
-- Temp view that numbers each customer's orders from newest to oldest.
CREATE OR REPLACE TEMP VIEW ranked_orders AS
SELECT
  o_custkey,
  o_orderkey,
  o_orderdate,
  o_totalprice,
  ROW_NUMBER() OVER (
    PARTITION BY o_custkey                      -- restart numbering per customer
    ORDER BY o_orderdate DESC, o_orderkey DESC  -- newest first; order key breaks same-day ties
  ) AS rn
FROM samples.tpch.orders;

In [0]:
%sql
-- Latest order per customer, step 2 of 2: read the ranked_orders temp view and keep
-- row number 1 (the most recent order) for each customer.
SELECT
  *
FROM ranked_orders
WHERE rn = 1
LIMIT 100;

### Temp View - SQL Example 2 (see below)

In [0]:
%sql
-- Temp view: total order value per customer.
CREATE OR REPLACE TEMP VIEW customer_spend AS
SELECT o.o_custkey,
       SUM(o.o_totalprice) AS total_spent
FROM samples.tpch.orders AS o
GROUP BY o.o_custkey;

In [0]:
%sql

-- Create a temporary view for ranked customers, built on the customer_spend temp view.
-- Customers are ranked by total spend (highest first); the customer key breaks ties so
-- equal spenders always receive the same rank from run to run.
CREATE OR REPLACE TEMP VIEW ranked_customers AS
SELECT
  c.c_custkey,
  c.c_name,
  cs.total_spent,
  ROW_NUMBER() OVER (
    ORDER BY cs.total_spent DESC,
             c.c_custkey
  ) AS rn
FROM samples.tpch.customer c
JOIN customer_spend cs
  ON c.c_custkey = cs.o_custkey;

In [0]:
%sql
-- Top 5 customers by total spend, read from the ranked_customers temp view.
SELECT c_name AS customer,
       total_spent
FROM ranked_customers
WHERE rn <= 5
ORDER BY total_spent DESC
LIMIT 100;

# Table Management
**Please note**: The following table management commands (`CREATE`, `REPLACE`, `DROP`, `TRUNCATE`, `INSERT`,
`MERGE`, `UPDATE`, `DELETE`, `ALTER`, etc.) require proper permissions in your Databricks environment.
If you don’t have the necessary privileges, these operations may fail.

In [0]:
%sql
-- Describe Table: Inspect the schema (column names and types) of the orders table.
DESCRIBE TABLE samples.tpch.orders;

In [0]:
%sql
-- Describe Table Extended: Inspect the schema (column names and types) of a table
-- together with additional table-level metadata.
DESCRIBE TABLE EXTENDED samples.tpch.orders;

In [0]:
%sql
-- Describe Detail: Get extended metadata (location, file count, size) for a Delta table.
-- Unlike DESCRIBE TABLE, this reports on the table's storage, not on its columns.
DESCRIBE DETAIL samples.tpch.orders;

In [0]:
%sql
-- Show Tables In: List all tables in the samples.tpch schema.
SHOW TABLES IN samples.tpch;

In [0]:
%sql
-- Create or Replace Table (CTAS): build a bronze table holding all 1994 line items.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified table name before running.
CREATE OR REPLACE TABLE <catalog>.<schema>.<table_name> AS
SELECT
  *
FROM samples.tpch.lineitem
WHERE l_shipdate BETWEEN '1994-01-01' AND '1994-12-31';

In [0]:
%sql
-- Create or Replace View: Define a view showing each customer's monthly revenue.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the view.
-- The customer key in the orders table is o_custkey; every column of orders carries the
-- o_ prefix used throughout this notebook.
CREATE OR REPLACE VIEW <catalog>.<schema>.<table_name>
AS
(SELECT
  o_custkey,
  date_format(o_orderdate, 'yyyy-MM') AS order_month,
  SUM(o_totalprice)         AS revenue
FROM samples.tpch.orders
GROUP BY o_custkey, date_format(o_orderdate, 'yyyy-MM')
);

In [0]:
%sql
-- Drop Table If Exists: Clean up an old staging table before recreating it.
-- IF EXISTS makes the statement a no-op (instead of an error) when the table is absent.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
DROP TABLE IF EXISTS <catalog>.<schema>.<table_name>;

In [0]:
%sql
-- Truncate Table: Quickly remove all data from a table while keeping its schema.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
TRUNCATE TABLE <catalog>.<schema>.<table_name>;

In [0]:
%sql
-- Insert Into: Append today's new orders into a historical orders table.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
-- Columns are listed explicitly on both sides so the insert keeps working (and keeps
-- mapping values to the right columns) if the target later gains extra columns or a
-- different column order.
INSERT INTO <catalog>.<schema>.<table_name> (
  o_orderkey,
  o_custkey,
  o_orderstatus,
  o_totalprice,
  o_orderdate,
  o_orderpriority,
  o_clerk,
  o_shippriority,
  o_comment
)
SELECT
  o_orderkey,
  o_custkey,
  o_orderstatus,
  o_totalprice,
  o_orderdate,
  o_orderpriority,
  o_clerk,
  o_shippriority,
  o_comment
FROM samples.tpch.orders
WHERE o_orderdate = CURRENT_DATE();

In [0]:
%sql
-- Merge Into (upsert): rows of daily_shipments that match on ship_id update the target;
-- rows without a match are inserted.
-- NOTE(review): daily_shipments is not defined in this notebook — presumably an
-- illustrative source table/view; substitute your own.
MERGE INTO <catalog>.<schema>.<table_name> AS tgt
USING daily_shipments AS src
  ON tgt.ship_id = src.ship_id
WHEN MATCHED THEN UPDATE SET
  tgt.status       = src.status,
  tgt.last_updated = src.last_updated
WHEN NOT MATCHED THEN INSERT (
  ship_id,
  order_id,
  status,
  last_updated
) VALUES (
  src.ship_id,
  src.order_id,
  src.status,
  src.last_updated
);

In [0]:
%sql
-- Update: Correct customer nation key where it's missing by setting it to 1.
-- The WHERE clause limits the change to rows whose c_nationkey is NULL.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
UPDATE <catalog>.<schema>.<table_name>
SET c_nationkey = 1
WHERE c_nationkey IS NULL;

In [0]:
%sql
-- Delete: Remove orders that have zero total price (treated here as cancelled orders).
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
DELETE FROM <catalog>.<schema>.<table_name>
WHERE o_totalprice = 0;

In [0]:
%sql
-- Alter Table – Add Column: Add a load_date column (type DATE) to track the ingestion date.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
ALTER TABLE <catalog>.<schema>.<table_name>
ADD COLUMNS (load_date DATE);

In [0]:
%sql
-- Alter Table – Drop Column: Remove an obsolete column (here: revenue) from a table.
-- Replace <catalog>.<schema>.<table_name> with your fully qualified name for the table.
-- NOTE(review): on Delta tables DROP COLUMN may require column mapping to be enabled
-- on the table — confirm for your environment.
ALTER TABLE <catalog>.<schema>.<table_name>
DROP COLUMN revenue;