# ETL Class Example Notebook

This notebook demonstrates the three main steps of the ETL pipeline: **Extract**, **Transform**, and **Load**. It mirrors the structure of `etl_notebook.ipynb` but provides a concise class‑based example for quick reference.


## Extract

Set up the project root on `sys.path` so that package imports work from any working directory. Import the extraction utilities required for this example.


In [None]:
import os
import sys
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session

In [None]:

from ca_biositing.datamodels.config import settings
from ca_biositing.pipeline.etl.extract import proximate, sample_ids, ultimate, cmpana, feedstock_collection_info

# Extract data
prox_raw = proximate.extract()
ult_raw = ultimate.extract()
cmpana_raw = cmpana.extract()

feedstock_collector_info_raw = feedstock_collection_info.extract()

sampleids_raw = sample_ids.extract()

analysis_data = [prox_raw, ult_raw, cmpana_raw]
print(f"Extracted {len(analysis_data)} dataframes.")

print(sampleids_raw.head())

## Transform


In [4]:
from ca_biositing.pipeline.utils.cleaning_functions import cleaning as cleaning_mod

cleaned_data = []
for df in analysis_data:
    df['dataset'] = 'biocirv'
    cleaned_df = cleaning_mod.standard_clean(df)
    cleaned_data.append(cleaned_df)

print(f"Cleaned {len(cleaned_data)} dataframes.")

Cleaned 3 dataframes.


  return df.astype("object").replace(regex, np.nan, regex=True)
  return df.astype("object").replace(regex, np.nan, regex=True)
  return df.astype("object").replace(regex, np.nan, regex=True)


In [5]:
from ca_biositing.pipeline.utils.cleaning_functions import coercion as coercion_mod

coerced_data = []
for df in cleaned_data:
    # Example: coerce columns into the designated data types (int, float, datetime, geom, etc)
    coerced_df = coercion_mod.coerce_columns(df,
                                             int_cols=['repl_no'], 
                                             float_cols=['value'],
                                             datetime_cols=['created_at', 'updated_at'])
    coerced_data.append(coerced_df)

print(f"Coerced {len(coerced_data)} dataframes.")

Coerced 3 dataframes.


In [6]:
from ca_biositing.pipeline.utils.name_id_swap import normalize_dataframes
from ca_biositing.datamodels.schemas.generated.ca_biositing import *

normalize_columns = {
    'resource': (Resource, 'name'),
    'prepared_sample': (PreparedSample, 'name'),
    'preparation_method': (Method, 'name'),
    'parameter': (Parameter, 'name'),
    'unit': (Unit, 'name'),
    'sample_unit': (Unit, 'name'),
    'analyst_email': (Contact, 'email'),
    'primary_ag_product': (PrimaryAgProduct, 'name'),
    'provider_code': (Provider, 'codename'),
    'dataset': (Dataset, 'name')
}

normalized_data = []
for df in coerced_data:
    normalized_df = normalize_dataframes(df, normalize_columns)
    normalized_data.append(normalized_df)

print(f"Normalized {len(normalized_data)} dataframes.")

DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


Normalized 3 dataframes.


In [7]:
## Prepare Record Information DataFrames

observation_data = []
for df in normalized_data:
    obs_df = df[[
        'dataset_id',
        'analysis_type', 
        'record_id',
        'parameter_id',
        'value',
        'unit_id', 
        'note'
    ]].copy().rename(columns={'analysis_type': 'record_type'})
    obs_df = obs_df.dropna(subset=['record_id', 'parameter_id', 'value'])
    observation_data.append(obs_df)

print(f"Prepared {len(observation_data)} observation dataframes.")

Prepared 3 observation dataframes.


In [8]:
## Prepare Record Information DataFrames

record_data = []
for df in normalized_data:
    # 1. Define explicit mappings for non-normalized columns
    rename_map = {
        'record_id': 'record_id',
        'repl_no': 'technical_replication_no',
        'qc_result': 'qc_pass',
        'note': 'note'
    }
    
    # 2. Dynamically add normalized columns from the normalize_columns dictionary
    for col in normalize_columns.keys():
        norm_col = f"{col}_id"
        if norm_col in df.columns:
            # Special case: rename to match target record table schema
            target_name = 'analyst_id' if col == 'analyst_email' else \
                          'method_id' if col == 'preparation_method' else norm_col
            rename_map[norm_col] = target_name

    # 3. Only select columns that actually exist in this specific dataframe
    available_cols = [c for c in rename_map.keys() if c in df.columns]
    final_rename = {k: v for k, v in rename_map.items() if k in available_cols}

    record_df = df[available_cols].copy().rename(columns=final_rename)

    # 4. Drop rows where critical identifiers are missing (NaN)
    if 'record_id' in record_df.columns:
        record_df = record_df.dropna(subset=['record_id'])
    
    record_data.append(record_df)

print(f"Prepared {len(record_data)} record dataframes.")

Prepared 3 record dataframes.


## Load


In [9]:
db_url = settings.database_url
if "@db:" in db_url:
    db_url = db_url.replace("@db:", "@localhost:")
elif "db:5432" in db_url:
    db_url = db_url.replace("db:5432", "localhost:5432")

engine = create_engine(db_url)

def upsert_observations(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        record['updated_at'] = now
        if record.get('created_at') is None:
            record['created_at'] = now
        stmt = insert(Observation).values(record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in Observation.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    for obs_df in observation_data:
        upsert_observations(obs_df, session)
    session.commit()
print('Upsert of all observations completed.')

Upsert of all observations completed.


In [10]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import ProximateRecord

def upsert_proximate_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Filter record dictionary to only include columns that exist in the table
    table_columns = {c.name for c in ProximateRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(ProximateRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in ProximateRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the first dataframe in record_data is Proximate
    upsert_proximate_records(record_data[0], session)
    session.commit()
print('Upsert of Proximate records completed.')

Upsert of Proximate records completed.


In [22]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import UltimateRecord

def upsert_ultimate_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    table_columns = {c.name for c in UltimateRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(UltimateRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in UltimateRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the second dataframe in record_data is Ultimate
    upsert_ultimate_records(record_data[1], session)
    session.commit()
print('Upsert of Ultimate records completed.')

Upsert of Ultimate records completed.


In [24]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import CompositionalRecord

def upsert_compositional_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    table_columns = {c.name for c in CompositionalRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(CompositionalRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in CompositionalRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the third dataframe in record_data is Compositional
    upsert_compositional_records(record_data[2], session)
    session.commit()
print('Upsert of Compositional records completed.')

Upsert of Compositional records completed.


In [4]:
feedstock_collector_info_raw

Unnamed: 0,Organization UUID,Assigned codename,ProviderName,Provider_text,Provider type,Facility type,Primary product(s),Street address,City,County,...,Provider contact,Provider title,Provider email,Provider phone,Provider website,Description,Process capability,Unnamed: 19,Available Codenames,DO NOT EDIT - THIS IS AN IMPORTRANGE FUNCTION
0,A64D4877-20E2-BC3E-FAD1-E6FD388231,vibrant,Salida Hulling Association,Salida Hulling Association,Processor,Processing,Almonds,350 N. Dakota Avenue,Modesto,Stanislaus,...,,,,,,Almond processer with a processing capacity of...,,,willow,
1,C8751C56-A5C0-BBB6-5C04-2B6C785712,tiny,Travaille & Phippen,Travaille & Phippen,Processor,Processing,Almonds,12700 Graves Road,Manteca,San Joaquin,...,David Phippen,Owner,dsphip@msn.com,(209) 531-7380,,Almond processer with a processing capacity of...,,,birch,
2,3288EBF1-3B30-5CD0-5363-D214236CDD,humorous,Cortez Growers Association,Cortez Growers Association,Processor,Processing,Almonds,12714 Cortez Avenue,Turlock,Stanislaus,...,,,,,,Almond processer based in Cortez (suburb of Tu...,,,magnolia,
3,B0C8A27D-7AA7-AE5C-8968-579BDC5E44,closeup,Hilltop Ranch Inc.,Hilltop Ranch Inc.,Processor,"Growing, Processing",Almonds,3890 Looney Rood,Ballico,Merced,...,,,,,,"Huller/sheller, they may also grow some crops ...",,,elm,
4,1FF78CF9-6889-C004-DFD2-B2BAC1EAA1,crane,Stewart & Jasper Orchards,Stewart & Jasper Orchards,Processor,"Growing, Processing, Warehouse",Almonds,3500 Shiells Road,Newman,Merced,...,,,,,,"Grower, huller/sheller, seller, and whole sell...",,,olive,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,EB215E12-4987-11A5-7CE3-3E1321A79E,lavalier,Wriggly (Gerardo to provide details),Wriggly (Gerardo to provide details),,,,,,,...,,,,,,,,,,
57,A222F8F0-7E5C-DDD6-8375-1EB184E9B6,schooner,Bowles Farming Company,Bowles Farming Company,Farmer,Growing,Cotton,11609 Hereford Rd,Los Banos,Merced,...,Wayne Pricolo,Farm Manager,wayne@bfarm.com,(209) 827-3000,www.bfarm.com,,,,,
58,F7B17AEC-B6FA-4BB2-B414-10FDBC31A3,frigate,Petrini Land Company,Petrini Land Company,Farmer,Growing,Rice,P.O. Box 4547,Stockton,San Joaquin,...,,,,,,Rice grower located on Rindge Tract Island. G...,,,,
59,42C78625-79A4-DC1A-C6DB-0E40F9429B,galleon,Brocchini Farms,Brocchini Farms,Farmer,Growing,Olives - processing,27011 S. Austin Road,Ripon,San Joaquin,...,,,,,,Grower identified by oil processor as producer...,,,,


In [5]:
from ca_biositing.pipeline.etl.extract.biodiesel_plants import extract as biodiesel_extract
biodiesel_plants_raw = biodiesel_extract()

In [7]:
biodiesel_plants_raw.head()

Unnamed: 0,company,bbi_index,city,state,capacity_mmg_per_y,feedstock,status,address,coordinates,latitude,longitude,source
0,American GreenFuels,,New Haven,Connecticut,35,,,,,41.2901,-72.9029,https://atlas.eia.gov/datasets/79dad60ce89c475...
1,Down To Earth Energy LLC,,Monroe,Georgia,2,,,,,33.75717,-83.7277,https://atlas.eia.gov/datasets/79dad60ce89c475...
2,Maine Bio-Fuel Inc,,Portland,Maine,1,,,,,43.6914,-70.3281,https://atlas.eia.gov/datasets/79dad60ce89c475...
3,Cape Cod Biofuels Inc,,Sandwich,Massachusetts,1,,,,,41.7177,-70.4845,https://atlas.eia.gov/datasets/79dad60ce89c475...
4,Renewable Fuels by Peterson,,North Haverhill,New Hampshire,8,,,,,44.077,-72.0047,https://atlas.eia.gov/datasets/79dad60ce89c475...


In [2]:
from typing import List
from ca_biositing.pipeline.etl.extract.basic_sample_info import extract as basic_sample_info_extract
from ca_biositing.pipeline.etl.transform.resource import transform as resource_transform

basic_sample_info = basic_sample_info_extract()

EXTRACT_SOURCES: List[str] = ["basic_sample_info"]

cleaned_data = resource_transform.fn({"basic_sample_info": basic_sample_info})

cleaned_data.head()

DEBUG: gsheet_to_df called for Aim 1-Feedstock Collection and Processing Data-BioCirV / 01-BasicSampleInfo
DEBUG: Authenticating with credentials.json
DEBUG: Opening spreadsheet Aim 1-Feedstock Collection and Processing Data-BioCirV
DEBUG: Opening worksheet 01-BasicSampleInfo
DEBUG: Fetching all values from 01-BasicSampleInfo
DEBUG: Successfully fetched 246 rows


DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


  return df.astype("object").replace(regex, np.nan, regex=True)


Unnamed: 0,name,primary_ag_product_id,resource_class_id,resource_subclass_id,note,etl_run_id,lineage_group_id
0,tomato pomace,1464.0,1.0,3.0,,,
1,tomato pomace,1464.0,1.0,3.0,,,
2,tomato pomace,1464.0,1.0,3.0,,,
3,grape pomace,1413.0,1.0,3.0,,,
4,grape stem,1413.0,1.0,2.0,,,
