### Setup and Imports

In [1]:
import duckdb
import pandas as pd
from pathlib import Path

def run_query(sql_query: str) -> pd.DataFrame:
    """
    Connects to the dbt warehouse, runs a SQL query,
    prints the results beautifully, and returns a DataFrame.
    """
    db_path = Path('../data_lake/dbt.duckdb')
    df = pd.DataFrame() # Initialize an empty DataFrame
    try:
        with duckdb.connect(database=str(db_path), read_only=True) as con:
            df = con.execute(sql_query).df()
    except Exception as e:
        print(f"An error occurred: {e}")
    return df

print("Helper function `run_query` is defined.")

Helper function `run_query` is defined.


### List Available Models

In [2]:
print("\n--- Available Tables/Models in 'main' schema ---")
run_query("SHOW TABLES")


--- Available Tables/Models in 'main' schema ---


Unnamed: 0,name
0,all_transactions_by_customer
1,dim_calendar
2,fct_credit_metrics_by_customer
3,fct_daily_transactions_by_customer
4,statements


### Load the all_transactions_by_customer Model

In [3]:
print("\n--- Loading 'all_transactions_by_customer' model ---")
transactions_df = run_query("SELECT * FROM main.all_transactions_by_customer")
print("Schema:")
transactions_df.info()
print("\nFirst 5 rows:")
transactions_df.head()


--- Loading 'all_transactions_by_customer' model ---
Schema:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217 entries, 0 to 1216
Data columns (total 30 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   username                                   1217 non-null   object        
 1   email                                      1217 non-null   object        
 2   address                                    1217 non-null   object        
 3   financial_institution                      1217 non-null   object        
 4   employer_name                              1217 non-null   object        
 5   login_id                                   1217 non-null   object        
 6   request_id                                 1217 non-null   object        
 7   request_datetime                           1217 non-null   object        
 8   request_status                      

Unnamed: 0,username,email,address,financial_institution,employer_name,login_id,request_id,request_datetime,request_status,days_detected,...,deposits,balance,is_revenue,is_debit,most_recent_statement_date,most_recent_statement_date_minus_30_days,most_recent_statement_date_minus_60_days,most_recent_statement_date_minus_90_days,most_recent_statement_date_minus_180_days,most_recent_statement_date_minus_365_days
0,Joel Schaubel,JOELSCHAUBEL@GMAIL.COM,"36 HOLKHAM AVE, ANCASTER, ON, L9K1P1",Simplii,,5eff116b-d0d9-4924-4b37-08dc29c779f9,727DAE61-63E9-4121-801E-F11CA8FF32FD,2024-02-11 19:26:39,Get Statements Completed,,...,,2181.74,,True,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
1,Joel Schaubel,JOELSCHAUBEL@GMAIL.COM,"36 HOLKHAM AVE, ANCASTER, ON, L9K1P1",Simplii,,5eff116b-d0d9-4924-4b37-08dc29c779f9,727DAE61-63E9-4121-801E-F11CA8FF32FD,2024-02-11 19:26:39,Get Statements Completed,,...,2000.0,3403.64,True,,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
2,Joel Schaubel,JOELSCHAUBEL@GMAIL.COM,"36 HOLKHAM AVE, ANCASTER, ON, L9K1P1",Simplii,,5eff116b-d0d9-4924-4b37-08dc29c779f9,727DAE61-63E9-4121-801E-F11CA8FF32FD,2024-02-11 19:26:39,Get Statements Completed,,...,0.02,3403.66,True,,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
3,Joel Schaubel,JOELSCHAUBEL@GMAIL.COM,"36 HOLKHAM AVE, ANCASTER, ON, L9K1P1",Simplii,,5eff116b-d0d9-4924-4b37-08dc29c779f9,727DAE61-63E9-4121-801E-F11CA8FF32FD,2024-02-11 19:26:39,Get Statements Completed,,...,,3103.66,,True,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
4,Joel Schaubel,JOELSCHAUBEL@GMAIL.COM,"36 HOLKHAM AVE, ANCASTER, ON, L9K1P1",Simplii,,5eff116b-d0d9-4924-4b37-08dc29c779f9,727DAE61-63E9-4121-801E-F11CA8FF32FD,2024-02-11 19:26:39,Get Statements Completed,,...,,3081.89,,True,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09


### Model Development

Modify below or copy and paste to a new cell. Remember that, in dbt, you reference an input model as `{{ ref('model_name') }}`, while, in this notebook, you should use `main.model_name`.

In [5]:
query = """
    -- This model creates a complete, daily time series for each customer over the
    -- last 180 days, filling in any missing dates with the last known balance.
    
    WITH all_daily_transactions AS (
        SELECT
            username,
            email,
            request_id,
            request_datetime,
            date,
            withdrawals,
            deposits,
            balance,
            is_revenue,
            is_debit,
            most_recent_statement_date,
            most_recent_statement_date_minus_30_days,
            most_recent_statement_date_minus_60_days,
            most_recent_statement_date_minus_90_days,
            most_recent_statement_date_minus_180_days,
            most_recent_statement_date_minus_365_days
        FROM main.all_transactions_by_customer
    )
    
    ,dim_calendar AS (
        SELECT *
        FROM main.dim_calendar
    )
    
    ,customer_date_range AS (
        SELECT
            email,
            request_id,
            date,
            
            -- Use 180 days as default for scaffolding, but this can be easily changed
            most_recent_statement_date_minus_365_days AS start_date,
            most_recent_statement_date AS end_date,
            
            -- Include all auxiliary date columns for reference
            most_recent_statement_date,
            most_recent_statement_date_minus_30_days,
            most_recent_statement_date_minus_60_days,
            most_recent_statement_date_minus_90_days,
            most_recent_statement_date_minus_180_days,
            most_recent_statement_date_minus_365_days
        FROM all_daily_transactions
        GROUP BY ALL
    )
    
    ,customer_scaffold AS (
        SELECT
            cdr.email,
            cdr.request_id,
            cal.date_day as date
        FROM customer_date_range AS cdr
        CROSS JOIN dim_calendar AS cal
        WHERE cal.date_day > cdr.start_date
            AND cal.date_day <= cdr.end_date
        ORDER By date DESC
    )
    
    ,padded_transactions AS (
        SELECT
            scf.email,
            scf.request_id,
            scf.date,
            -- Average balance for days with multiple transactions
            AVG(trn.balance) AS average_balance
        FROM customer_scaffold AS scf
        LEFT JOIN all_daily_transactions AS trn ON scf.email = trn.email
            AND scf.request_id = trn.request_id
            AND scf.date = trn.date
        GROUP BY ALL
    )
    
    ,daily_balances AS (
        SELECT
            email,
            request_id,
            date,
            
            -- Fill forward the last known balance for days without transactions
            LAST_VALUE(average_balance IGNORE NULLS) OVER(
                PARTITION BY email, request_id
                ORDER BY date
                ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
            ) AS revised_average_balance
        FROM padded_transactions AS trn
    )
    
    ,daily_revenue AS (
        SELECT
            email,
            request_id,
            date,
            
            -- Sum of all deposits for the day. This is the "Day Rev" from the sheet.
           SUM(deposits) OVER(PARTITION BY email, request_id, DAYOFYEAR(date)) AS daily_revenue,
        FROM all_daily_transactions
        WHERE is_revenue
        QUALIFY ROW_NUMBER() OVER(PARTITION BY email, request_id, date) = 1
    )
    
    ,weekly_revenue AS (
        SELECT
            email,
            request_id,
            date,
            
            -- Sum of all deposits for the day. This is the "Weekly revenue" from the sheet.
           SUM(deposits) OVER(PARTITION BY email, request_id, WEEKOFYEAR(date)) AS weekly_revenue,
        FROM all_daily_transactions
        WHERE is_revenue
        QUALIFY ROW_NUMBER() OVER(PARTITION BY email, request_id, date) = 1
    )
    
    ,customer_daily_metrics AS (
        SELECT
            db.email,
            db.request_id,
            db.date,
            ROUND(db.revised_average_balance, 2) AS revised_average_balance,
    
           -- Daily and weekly revenues
            drv.daily_revenue,
            wrv.weekly_revenue,
            
            -- Include auxiliary date columns for reference
            cdr.*
        FROM daily_balances AS db
        LEFT JOIN daily_revenue AS drv ON db.email = drv.email
            AND db.request_id = drv.request_id
            AND db.date = drv.date
        LEFT JOIN weekly_revenue AS wrv ON db.email = wrv.email
            AND db.request_id = wrv.request_id
            AND db.date = wrv.date
        LEFT JOIN customer_date_range AS cdr ON db.email = cdr.email
            AND db.request_id = cdr.request_id
            AND db.date = cdr.date
    )
    
    SELECT *
    FROM customer_daily_metrics

"""
query_result_dataframe = run_query(query)
query_result_dataframe.head()

Unnamed: 0,email,request_id,date,revised_average_balance,daily_revenue,weekly_revenue,email_1,request_id_1,date_1,start_date,end_date,most_recent_statement_date,most_recent_statement_date_minus_30_days,most_recent_statement_date_minus_60_days,most_recent_statement_date_minus_90_days,most_recent_statement_date_minus_180_days,most_recent_statement_date_minus_365_days
0,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-04-19,2108.1,,,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-04-19,2023-02-09,2024-02-09,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
1,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-06-30,2337.11,0.02,3114.94,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-06-30,2023-02-09,2024-02-09,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
2,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-07-06,3408.12,2045.0,6245.72,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-07-06,2023-02-09,2024-02-09,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
3,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-08-14,4025.68,947.09,2381.54,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-08-14,2023-02-09,2024-02-09,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09
4,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-08-22,1450.16,3678.4,31192.08,JOELSCHAUBEL@GMAIL.COM,727DAE61-63E9-4121-801E-F11CA8FF32FD,2023-08-22,2023-02-09,2024-02-09,2024-02-09,2024-01-10,2023-12-11,2023-11-11,2023-08-13,2023-02-09


### Closing the Connection



In [33]:
con.close()
print("\nDatabase connection closed.") 


Database connection closed.
