In [0]:
-- =====================================================
-- DAY 9 POC - SLOWLY CHANGING DIMENSIONS
-- My Learning Notes
-- Date: November 5, 2025
-- 
-- What I'm learning today:
-- 1. What are dimensions vs facts
-- 2. Why dimension data changes (the "slowly changing" part)
-- 3. Different SCD types (0, 1, 2, 3)
-- 4. How to implement SCD Type 2 (most important!)
-- 5. Querying historical data
-- =====================================================

-- Clean up if I ran this before
DROP TABLE IF EXISTS dim_customer_type1;
DROP TABLE IF EXISTS dim_customer_type2;
DROP TABLE IF EXISTS dim_customer_type3;
DROP TABLE IF EXISTS fact_orders;

-- =====================================================
-- SECTION 1: Understanding the Problem
-- Why do we need SCD?
-- =====================================================

SELECT '=== Understanding the Problem ===' AS section;

-- Imagine: Simple customer table
CREATE TABLE dim_customer_type1 (
    customer_id INT PRIMARY KEY,
    name STRING,
    email STRING,
    city STRING,
    phone STRING
);

-- Insert initial customer
INSERT INTO dim_customer_type1 VALUES
(101, 'Alice Johnson', 'alice@email.com', 'Boston', '555-0001');

SELECT 'Initial customer data' AS status;
SELECT * FROM dim_customer_type1;

-- Customer Alice moves to New York
-- Question: What do I do?
-- Option 1: Just UPDATE (simple but lose history)
-- Option 2: Keep old record + add new (preserve history)

-- Let me show both approaches...

-- =====================================================
-- SECTION 2: SCD TYPE 1 - Overwrite (No History)
-- =====================================================

SELECT '=== SCD TYPE 1: Simple Overwrite ===' AS section;

-- Alice moved to New York, let's update
UPDATE dim_customer_type1
SET city = 'New York',
    phone = '555-9999'
WHERE customer_id = 101;

SELECT 'After Type 1 update (overwrote old data)' AS status;
SELECT * FROM dim_customer_type1;

-- What happened:
-- - Old city 'Boston' is GONE
-- - New city 'New York' replaced it
-- - Cannot answer: "Where did Alice live in 2023?"

-- When to use Type 1:
-- - Don't need history
-- - Fixing typos
-- - Current status only matters
-- Example: Current address, current status

SELECT 'Type 1 Lesson' AS lesson,
       'Simple but loses history. Good for current-only data.' AS summary;

-- =====================================================
-- SECTION 3: SCD TYPE 2 - Track Full History
-- This is the IMPORTANT one!
-- =====================================================

SELECT '=== SCD TYPE 2: Keep Full History ===' AS section;

-- Create table with SCD Type 2 columns
CREATE TABLE dim_customer_type2 (
    customer_key INT,           -- Surrogate key (unique for each row)
    customer_id INT,            -- Business key (same customer_id for same person)
    name STRING,
    email STRING,
    city STRING,
    phone STRING,
    -- SCD Type 2 specific columns:
    is_current BOOLEAN,         -- TRUE = current record, FALSE = old record
    valid_from DATE,            -- When this record became valid
    valid_to DATE,              -- When this record expired (9999-12-31 = current)
    version INT                 -- Version number (1, 2, 3...)
);

-- Insert initial record for Alice
INSERT INTO dim_customer_type2 VALUES
(1, 101, 'Alice Johnson', 'alice@email.com', 'Boston', '555-0001',
 TRUE, '2020-01-01', '9999-12-31', 1);

SELECT 'Initial record - Alice in Boston' AS status;
SELECT * FROM dim_customer_type2;

-- Now Alice moves to New York on 2024-06-15
-- I need to:
-- Step 1: Close old record (mark as not current)
-- Step 2: Insert new record (mark as current)

SELECT 'Alice is moving to New York...' AS status;

-- Step 1: Close old record
UPDATE dim_customer_type2
SET is_current = FALSE,
    valid_to = '2024-06-14'  -- Day before move
WHERE customer_id = 101 AND is_current = TRUE;

-- Step 2: Insert new record
INSERT INTO dim_customer_type2 VALUES
(2, 101, 'Alice Johnson', 'alice@email.com', 'New York', '555-9999',
 TRUE, '2024-06-15', '9999-12-31', 2);

SELECT 'After Type 2 update (kept history!)' AS status;
SELECT * FROM dim_customer_type2 ORDER BY customer_key;

-- Now I have TWO rows for Alice:
-- Row 1: customer_key=1, Boston, is_current=FALSE (historical)
-- Row 2: customer_key=2, New York, is_current=TRUE (current)

-- This is the power of Type 2!
-- I can answer: "Where did Alice live in 2023?" → Boston
-- I can answer: "Where does Alice live now?" → New York

SELECT 'Type 2 Lesson' AS lesson,
       'Keeps full history. Most common in data warehouses.' AS summary;

-- =====================================================
-- SECTION 4: Querying SCD Type 2 Data
-- =====================================================

SELECT '=== Querying Type 2: Current Data ===' AS section;

-- To get CURRENT data only, filter is_current = TRUE
SELECT 
    customer_id,
    name,
    city,
    phone,
    'Current Record' AS record_type
FROM dim_customer_type2
WHERE is_current = TRUE;

-- Result: Only shows New York (current)

SELECT '=== Querying Type 2: Historical Data ===' AS section;

-- To get data as of a specific date, use date range
-- Question: Where did Alice live on 2023-05-01?
SELECT 
    customer_id,
    name,
    city,
    phone,
    valid_from,
    valid_to,
    'Record valid on 2023-05-01' AS record_type
FROM dim_customer_type2
WHERE customer_id = 101
  AND '2023-05-01' BETWEEN valid_from AND valid_to;

-- Result: Shows Boston (was valid in 2023)

SELECT '=== Querying Type 2: All History ===' AS section;

-- To see complete history, just select all
SELECT 
    version,
    city,
    valid_from,
    valid_to,
    is_current,
    CASE WHEN is_current THEN 'CURRENT' ELSE 'HISTORICAL' END AS status
FROM dim_customer_type2
WHERE customer_id = 101
ORDER BY version;

-- Result: Shows progression Boston → New York

-- =====================================================
-- SECTION 5: SCD TYPE 3 - Limited History
-- =====================================================

SELECT '=== SCD TYPE 3: Keep Current + Previous ===' AS section;

-- Type 3 keeps current value + one previous value
CREATE TABLE dim_customer_type3 (
    customer_id INT PRIMARY KEY,
    name STRING,
    email STRING,
    current_city STRING,
    previous_city STRING,
    city_changed_date DATE,
    current_phone STRING,
    previous_phone STRING
);

-- Insert initial data
INSERT INTO dim_customer_type3 VALUES
(101, 'Alice Johnson', 'alice@email.com', 
 'Boston', NULL, NULL,
 '555-0001', NULL);

SELECT 'Initial Type 3 record' AS status;
SELECT * FROM dim_customer_type3;

-- Alice moves to New York
-- Type 3: Move current to previous, update current
UPDATE dim_customer_type3
SET previous_city = current_city,
    current_city = 'New York',
    city_changed_date = '2024-06-15',
    previous_phone = current_phone,
    current_phone = '555-9999'
WHERE customer_id = 101;

SELECT 'After Type 3 update' AS status;
SELECT * FROM dim_customer_type3;

-- Now I can see:
-- - Current city: New York
-- - Previous city: Boston
-- - When it changed: 2024-06-15

-- But if Alice moves again to Miami:
UPDATE dim_customer_type3
SET previous_city = current_city,
    current_city = 'Miami',
    city_changed_date = '2024-12-01',
    previous_phone = current_phone,
    current_phone = '555-8888'
WHERE customer_id = 101;

SELECT 'After second move (Boston is lost!)' AS status;
SELECT * FROM dim_customer_type3;

-- Notice: Boston is GONE
-- Type 3 only keeps ONE previous value
-- Current = Miami, Previous = New York, Boston = lost

SELECT 'Type 3 Lesson' AS lesson,
       'Simple, keeps current + 1 previous. But loses older history.' AS summary;

-- =====================================================
-- SECTION 6: Real-World Example - Orders with SCD Type 2
-- =====================================================

SELECT '=== Real-World: Orders with Historical Customer Info ===' AS section;

-- Create fact table (orders)
CREATE TABLE fact_orders (
    order_id INT PRIMARY KEY,
    customer_key INT,  -- Links to dim_customer_type2.customer_key (not customer_id!)
    order_date DATE,
    amount DECIMAL(10, 2)
);

-- Insert orders
-- Order 1: Made when Alice lived in Boston (2023-03-15)
-- Order 2: Made when Alice lived in New York (2024-09-20)

INSERT INTO fact_orders VALUES
(1001, 1, '2023-03-15', 150.00),  -- customer_key=1 (Boston version)
(1002, 2, '2024-09-20', 200.00);  -- customer_key=2 (New York version)

-- Now I can see orders with historical customer info
SELECT 
    o.order_id,
    o.order_date,
    o.amount,
    c.name,
    c.city AS customer_city_at_order_time,
    CASE 
        WHEN c.is_current THEN 'Current Address'
        ELSE 'Historical Address'
    END AS address_type
FROM fact_orders o
JOIN dim_customer_type2 c ON o.customer_key = c.customer_key
ORDER BY o.order_date;

-- Result shows:
-- Order 1001 → Alice was in Boston (historical accuracy!)
-- Order 1002 → Alice was in New York

-- This is WHY we use SCD Type 2!
-- Historical orders show where customer lived WHEN they ordered

SELECT 'SCD Type 2 Benefit' AS benefit,
       'Orders linked to correct historical customer data!' AS explanation;

-- =====================================================
-- SECTION 7: Handling Changes - MERGE Pattern
-- =====================================================

SELECT '=== Using MERGE for SCD Type 2 Updates ===' AS section;

-- Scenario: New customer data arrives
-- Bob is new customer
-- Alice changed email

-- Step 1: Close old records that are changing
MERGE INTO dim_customer_type2 AS target
USING (
    -- New/changed data
    SELECT 101 AS customer_id, 'Alice Johnson' AS name, 
           'alice.new@email.com' AS email, 'New York' AS city, '555-9999' AS phone
    UNION ALL
    SELECT 102, 'Bob Smith', 'bob@email.com', 'Chicago', '555-2001'
) AS source
ON target.customer_id = source.customer_id 
   AND target.is_current = TRUE
   AND (target.email != source.email)  -- Email changed
WHEN MATCHED THEN
    UPDATE SET 
        is_current = FALSE,
        valid_to = CURRENT_DATE() - INTERVAL 1 DAY;

-- Step 2: Insert new versions for changes
INSERT INTO dim_customer_type2
SELECT 
    (SELECT COALESCE(MAX(customer_key), 0) + 1 FROM dim_customer_type2) + 
        ROW_NUMBER() OVER (ORDER BY customer_id) - 1 AS customer_key,
    customer_id,
    name,
    email,
    city,
    phone,
    TRUE AS is_current,
    CURRENT_DATE() AS valid_from,
    DATE('9999-12-31') AS valid_to,
    ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY customer_id) + 
        (SELECT COALESCE(MAX(version), 0) FROM dim_customer_type2 WHERE customer_id = source.customer_id) AS version
FROM (
    SELECT 101 AS customer_id, 'Alice Johnson' AS name, 
           'alice.new@email.com' AS email, 'New York' AS city, '555-9999' AS phone
    UNION ALL
    SELECT 102, 'Bob Smith', 'bob@email.com', 'Chicago', '555-2001'
) AS source
WHERE NOT EXISTS (
    SELECT 1 FROM dim_customer_type2 t
    WHERE t.customer_id = source.customer_id
      AND t.is_current = TRUE
      AND t.email = source.email
);

SELECT 'After MERGE - Current records only' AS status;
SELECT * FROM dim_customer_type2 WHERE is_current = TRUE ORDER BY customer_id;

SELECT 'After MERGE - Alice full history' AS status;
SELECT 
    customer_key,
    customer_id,
    name,
    email,
    city,
    version,
    is_current,
    valid_from,
    valid_to
FROM dim_customer_type2 
WHERE customer_id = 101
ORDER BY version;

-- Now Alice has 3 versions:
-- Version 1: Boston, old email (historical)
-- Version 2: New York, old email (historical)
-- Version 3: New York, new email (current)

-- Bob is newly inserted with version 1

SELECT 'MERGE Lesson' AS lesson,
       'MERGE handles both new inserts and updates. Makes SCD Type 2 idempotent!' AS summary;

-- =====================================================
-- SECTION 8: Common Queries - My Practice
-- =====================================================

SELECT '=== Practice Query 1: Current Snapshot ===' AS query;

-- Show all current customers (what we see "today")
SELECT 
    customer_id,
    name,
    city,
    email,
    phone
FROM dim_customer_type2
WHERE is_current = TRUE
ORDER BY customer_id;

-- This is what application users see - current state

SELECT '=== Practice Query 2: Point-in-Time Query ===' AS query;

-- What did our customer list look like on 2024-01-01?
SELECT 
    customer_id,
    name,
    city,
    email,
    'Snapshot as of 2024-01-01' AS note
FROM dim_customer_type2
WHERE '2024-01-01' BETWEEN valid_from AND valid_to
ORDER BY customer_id;

-- This shows historical state

SELECT '=== Practice Query 3: Track Changes ===' AS query;

-- Show all changes for a specific customer
SELECT 
    customer_id,
    version,
    city,
    email,
    valid_from,
    valid_to,
    CASE 
        WHEN is_current THEN '→ CURRENT'
        ELSE ''
    END AS current_flag
FROM dim_customer_type2
WHERE customer_id = 101
ORDER BY version;

-- Shows progression: Boston → New York, email changes

SELECT '=== Practice Query 4: Find Recent Changes ===' AS query;

-- Which customers changed in last 30 days?
SELECT 
    customer_id,
    name,
    city,
    valid_from AS changed_on
FROM dim_customer_type2
WHERE valid_from >= CURRENT_DATE() - INTERVAL 30 DAY
  AND version > 1  -- Not initial record
ORDER BY valid_from DESC;

-- =====================================================
-- SECTION 9: Comparison Summary
-- =====================================================

SELECT '=== COMPARISON: All SCD Types ===' AS section;

-- Create comparison table
CREATE TABLE scd_comparison (
    scd_type STRING,
    description STRING,
    keeps_history STRING,
    complexity STRING,
    use_case STRING
);

INSERT INTO scd_comparison VALUES
('Type 0', 'Fixed - never changes', 'N/A', 'Simple', 'Birth date, SSN'),
('Type 1', 'Overwrite old value', 'NO', 'Simple', 'Current status, typo fixes'),
('Type 2', 'Insert new row, keep old', 'YES - Full', 'Complex', 'Most common - addresses, departments'),
('Type 3', 'Current + previous columns', 'YES - Limited', 'Medium', 'Simple before/after comparison'),
('Type 4', 'History in separate table', 'YES - Full', 'Medium', 'Rarely used'),
('Type 6', 'Hybrid 1+2+3', 'YES - Full', 'Very Complex', 'Rarely used');

SELECT * FROM scd_comparison ORDER BY scd_type;

-- =====================================================
-- SECTION 10: What I Learned - Summary
-- =====================================================

SELECT '=== MY LEARNING SUMMARY ===' AS summary;

SELECT 
    'Key Concept' AS topic,
    'What I Learned' AS lesson
UNION ALL
SELECT 
    'Dimensions',
    'Tables with descriptive attributes (who, what, where). Facts reference them.'
UNION ALL
SELECT 
    'Slowly Changing',
    'Dimension data changes over time (address, salary, department)'
UNION ALL
SELECT 
    'SCD Type 1',
    'Simple UPDATE - overwrites, no history. Good for current-only data.'
UNION ALL
SELECT 
    'SCD Type 2',
    'Keep full history. Insert new row, mark old as not current. MOST COMMON!'
UNION ALL
SELECT 
    'Type 2 Keys',
    'Surrogate key (unique per row) + Natural key (business ID)'
UNION ALL
SELECT 
    'Type 2 Flags',
    'is_current (TRUE/FALSE), valid_from, valid_to dates'
UNION ALL
SELECT 
    'Type 2 Queries',
    'Current: WHERE is_current=TRUE. Historical: WHERE date BETWEEN valid_from AND valid_to'
UNION ALL
SELECT 
    'SCD Type 3',
    'Keep current + 1 previous value. Simple but limited history.'
UNION ALL
SELECT 
    'MERGE for SCD',
    'Makes SCD loads idempotent. Can run multiple times safely.'
UNION ALL
SELECT 
    'Real World',
    'Facts join to dimension using surrogate key for historical accuracy';

-- =====================================================
-- SECTION 11: Important Patterns I Should Remember
-- =====================================================

SELECT '=== PATTERNS TO REMEMBER ===' AS patterns;

-- Pattern 1: Always use surrogate key
SELECT 
    'Pattern 1' AS pattern_num,
    'Use surrogate key (customer_key) not business key (customer_id) as PRIMARY KEY' AS pattern,
    'Allows multiple versions of same customer' AS reason;

-- Pattern 2: Always filter is_current for current data
SELECT 
    'Pattern 2' AS pattern_num,
    'Always add WHERE is_current = TRUE when querying current state' AS pattern,
    'Avoids showing duplicate customers (all versions)' AS reason;

-- Pattern 3: Use 9999-12-31 for current records
SELECT 
    'Pattern 3' AS pattern_num,
    'Set valid_to = 9999-12-31 for current records' AS pattern,
    'BETWEEN works correctly, avoids NULL complications' AS reason;

-- Pattern 4: Close old before inserting new
SELECT 
    'Pattern 4' AS pattern_num,
    'UPDATE old record (set is_current=FALSE, valid_to=yesterday) then INSERT new' AS pattern,
    'Prevents gaps or overlaps in date ranges' AS reason;

-- =====================================================
-- SECTION 12: Interview Preparation Notes
-- =====================================================

SELECT '=== INTERVIEW Q&A - What I Would Say ===' AS interview;

CREATE TABLE interview_qa (
    question STRING,
    my_answer STRING
);

INSERT INTO interview_qa VALUES
('What is SCD Type 2?',
 'SCD Type 2 keeps full history by inserting new rows when data changes. Each version has is_current flag and valid_from/valid_to dates. Uses surrogate key to allow multiple versions of same business entity.'),

('Why use surrogate key?',
 'Natural business key (customer_id) repeats across versions. Surrogate key (customer_key) is unique per row. Facts join using surrogate key to link to correct historical version.'),

('How to query current data?',
 'Filter WHERE is_current = TRUE to get latest version only.'),

('How to query historical data?',
 'Use WHERE date BETWEEN valid_from AND valid_to to get version active on specific date.'),

('Type 1 vs Type 2?',
 'Type 1 overwrites (no history, simple, fast). Type 2 inserts new row (keeps history, complex, more storage). Choose based on whether history matters.'),

('When to use Type 3?',
 'When need simple current vs previous comparison. Example: current vs previous phone number. Limited to one previous value.'),

('How to make SCD load idempotent?',
 'Use MERGE to check if change already exists. Only close old and insert new if data actually changed. Safe to rerun.');

SELECT * FROM interview_qa;

-- =====================================================
-- CLEANUP NOTE
-- =====================================================

-- Keeping tables so I can review results
-- In real project, I would clean up:
-- DROP TABLE IF EXISTS dim_customer_type1;
-- DROP TABLE IF EXISTS dim_customer_type2;
-- DROP TABLE IF EXISTS dim_customer_type3;
-- DROP TABLE IF EXISTS fact_orders;
-- DROP TABLE IF EXISTS scd_comparison;
-- DROP TABLE IF EXISTS interview_qa;

-- =====================================================
-- FINAL VERIFICATION
-- =====================================================

SELECT '=== FINAL CHECK: What Tables I Created ===' AS final_check;

-- Show all my practice tables
SELECT 'dim_customer_type1' AS table_name, 
       'SCD Type 1 demo' AS purpose,
       (SELECT COUNT(*) FROM dim_customer_type1) AS row_count
UNION ALL
SELECT 'dim_customer_type2',
       'SCD Type 2 demo - MAIN LEARNING',
       (SELECT COUNT(*) FROM dim_customer_type2)
UNION ALL
SELECT 'dim_customer_type3',
       'SCD Type 3 demo',
       (SELECT COUNT(*) FROM dim_customer_type3)
UNION ALL
SELECT 'fact_orders',
       'Fact table linking to Type 2',
       (SELECT COUNT(*) FROM fact_orders);

-- =====================================================
-- PERSONAL NOTES FOR REVIEW
-- =====================================================

--SELECT '=== MY PERSONAL NOTES ===' AS notes;

--SELECT 
    'Note 1' AS note_num,
    'Type 2 is most important - practice this the most!' AS note
--UNION ALL
--SELECT 
    'Note 2',
    'Always filter is_current=TRUE for current data queries'
UNION ALL
SELECT 
    'Note 3',
    'Surrogate key is essential - customer_key not customer_id'
UNION ALL
SELECT 
    'Note 4',
    'Date ranges with BETWEEN work perfectly with 9999-12-31 for current'
UNION ALL
SELECT 
    'Note 5',
    'MERGE makes it idempotent - can rerun safely'
UNION ALL
SELECT 
    'Note 6',
    'Facts join to dimensions using surrogate key for historical accuracy'
UNION ALL
SELECT 
    'Note 7',
    'Real benefit: Orders show customer address at time of order, not current address';

-- =====================================================
-- END OF DAY 9 POC
-- =====================================================

--SELECT '=== ✅ DAY 9 COMPLETE ===' AS done;
--SELECT 'I learned: Dimension basics, SCD Types 0/1/2/3, Type 2 implementation, Historical queries, MERGE patterns' AS summary;

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-6302352287244069>, line 1[0m

File [0;32m/databricks/python/lib/python3.12/site-packages/IPython/core/interactiveshell.py:2541[0m, in [0;36mInteractiveShell.run_cell_magic[0;34m(self, magic_name, line, cell)[0m
[1;32m   2539[0m [38;5;28;01mwith[39;00m [38;5;28mself[39m[38;5;241m.[39mbuiltin_trap:
[1;32m   2540[0m     args [38;5;241m=[39m (magic_arg_s, cell)
[0;32m-> 2541[0m     result [38;5;241m=[39m fn([38;5;241m*[39margs, [38;5;241m*[39m[38;5;241m*[39mkwargs)
[1;32m   2543[0m [38;5;66;03m# The code below prevents the output from being displayed[39;00m
[1;32m   2544[0m [38;5;66;03m# when using magics with decorator @output_can_be_silenced[39;00m
[1;32m   2545[0m [38;5;66;03m# when the last Python token in the expression is a ';'.[39;00m
[1;32m   2546[0