# ETL Class Example Notebook

This notebook demonstrates the three main steps of the ETL pipeline: **Extract**, **Transform**, and **Load**. It mirrors the structure of `etl_notebook.ipynb` but provides a concise class‑based example for quick reference.


## Extract

Add the project root to `sys.path` so that package imports work from any working directory inside the project tree, then import the extraction utilities required for this example.


In [9]:
import os
import sys
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session


# Find the project root: the nearest directory, starting at the current
# working directory and walking upwards, that contains 'pixi.toml'.
path = os.getcwd()
project_root = None
while True:
    # Probe for the marker directly instead of listing the directory: this
    # avoids reading every entry and does not raise on unreadable ancestors.
    if os.path.exists(os.path.join(path, 'pixi.toml')):
        project_root = path
        break
    parent = os.path.dirname(path)
    if parent == path:
        # `path` is the filesystem root and has already been checked.
        break
    path = parent

if project_root is None:
    raise FileNotFoundError('Could not locate project root')

# Ensure the root is on the Python path
if project_root not in sys.path:
    sys.path.insert(0, project_root)




In [13]:

from ca_biositing.datamodels.config import settings
from ca_biositing.pipeline.etl.extract import (
    provider_info as provider_info_extractor,
    samplemetadata as samplemetadata_extractor,
)

# Extract data
# The extractors are imported under `*_extractor` aliases so that the results
# assigned below do not overwrite the objects they were produced by.
samplemetadata = samplemetadata_extractor.extract()
provider_info = provider_info_extractor.extract()

# Order matters downstream: index 0 is the sample metadata, index 1 the provider info.
sampling_data = [samplemetadata, provider_info]

samplemetadata

DEBUG: gsheet_to_df called for Sampling_data_redacted / samplemetadata
DEBUG: Authenticating with credentials.json
DEBUG: Opening spreadsheet Sampling_data_redacted
DEBUG: Opening worksheet samplemetadata
DEBUG: Fetching all values from samplemetadata
DEBUG: Successfully fetched 106 rows


DEBUG: gsheet_to_df called for Sampling_data_redacted / provider_info
DEBUG: Authenticating with credentials.json
DEBUG: Opening spreadsheet Sampling_data_redacted
DEBUG: Opening worksheet provider_info
DEBUG: Fetching all values from provider_info
DEBUG: Successfully fetched 64 rows


Unnamed: 0,Index,Field_Sample_Name,Resource,Provider_codename,FV_Date_Time,Sampling_Location,Sampling_Street,Sampling_City,Sampling_Zip,Sampling_LatLong,...,Last_Application_Month,Treatment_Amt,Treatment_Units,Treatment_Notes,Soil_Type,Crop_Variety,Crop_Cultivar,Production_Notes,Field_Storage_Location,Field_Storage_Conditions
0,EBD7B1F2,Pos-Alf033,Alfalfa,possessive,6/30/2025 10:30,,,,95206,,...,,,,,,,,Prod_Date is approximate. Crop was baled in J...,,
1,309299A1,Pos-Alf033,Alfalfa,possessive,6/30/2025 10:30,,,,95206,,...,,,,,,,,Prod_Date is approximate. Crop was baled in J...,,
2,64AA3698,,Wheat hay,possessive,6/30/2025 10:30,,,,95206,,...,,,,,,,,Prod_Date is approximate. Crop was baled in J...,,
3,B05F116C,,Wheat hay,possessive,6/30/2025 10:30,,,,95206,,...,,,,,,,,Prod_Date is approximate. Crop was baled in J...,,
4,21C2B270,Pos-WSt034,Wheat straw,possessive,6/30/2025 10:30,,,,95206,,...,,,,,,,,Prod_Date is approximate. Crop was baled in J...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100,D2C472E5,,,,,,,,,,...,,,,,,,,,,
101,4C4D0C95,,,,,,,,,,...,,,,,,,,,,
102,F091228E,,,,,,,,,,...,,,,,,,,,,
103,8328DB46,,,,,,,,,,...,,,,,,,,,,


## Transform


In [14]:
from ca_biositing.pipeline.utils.cleaning_functions import cleaning as cleaning_mod

cleaned_data = []
for df in sampling_data:
    # Tag a copy instead of assigning into `df`: the frames in `sampling_data`
    # are the raw extracts and should not gain columns as a side effect.
    tagged_df = df.assign(dataset='biocirv')
    cleaned_df = cleaning_mod.standard_clean(tagged_df)
    cleaned_data.append(cleaned_df)

print(f"Cleaned {len(cleaned_data)} dataframes.")

Cleaned 2 dataframes.


  return df.astype("object").replace(regex, np.nan, regex=True)


In [15]:
from ca_biositing.pipeline.utils.cleaning_functions import coercion as coercion_mod

coerced_data = []
for cleaned_frame in cleaned_data:
    # Columns grouped by the dtype they are coerced to; passed as keyword
    # arguments to coerce_columns.
    column_types = {
        'int_cols': ['repl_no', 'qty'],
        'float_cols': ['value', 'particle_width', 'particle_length', 'particle_height'],
        'datetime_cols': ['created_at', 'updated_at', 'fv_date_time', 'sample_ts', 'prod_date'],
    }
    coerced_data.append(coercion_mod.coerce_columns(cleaned_frame, **column_types))

print(f"Coerced {len(coerced_data)} dataframes.")

Coerced 2 dataframes.


In [None]:
# Left-join the second coerced frame onto the first, matching rows on
# 'provider_codename'; every row of the first frame is kept.
left_frame = coerced_data[0]
right_frame = coerced_data[1]
joined_data = left_frame.merge(right_frame, how='left', on='provider_codename')

joined_data.head()

Unnamed: 0,index,field_sample_name,resource,provider_codename,fv_date_time,sampling_location,sampling_street,sampling_city,sampling_zip,sampling_latlong,...,crop_cultivar,production_notes,field_storage_location,field_storage_conditions,dataset_x,county,provider_type,facility_type,primary_ag_product,dataset_y
0,ebd7b1f2,pos-alf033,alfalfa,possessive,2025-06-30 10:30:00,,,,95206,,...,,prod_date is approximate. crop was baled in j...,,,biocirv,san joaquin,farmer,"growing, hay production","hay - other, winter wheat, hay - alfalfa",biocirv
1,309299a1,pos-alf033,alfalfa,possessive,2025-06-30 10:30:00,,,,95206,,...,,prod_date is approximate. crop was baled in j...,,,biocirv,san joaquin,farmer,"growing, hay production","hay - other, winter wheat, hay - alfalfa",biocirv
2,64aa3698,,wheat hay,possessive,2025-06-30 10:30:00,,,,95206,,...,,prod_date is approximate. crop was baled in j...,,,biocirv,san joaquin,farmer,"growing, hay production","hay - other, winter wheat, hay - alfalfa",biocirv
3,b05f116c,,wheat hay,possessive,2025-06-30 10:30:00,,,,95206,,...,,prod_date is approximate. crop was baled in j...,,,biocirv,san joaquin,farmer,"growing, hay production","hay - other, winter wheat, hay - alfalfa",biocirv
4,21c2b270,pos-wst034,wheat straw,possessive,2025-06-30 10:30:00,,,,95206,,...,,prod_date is approximate. crop was baled in j...,,,biocirv,san joaquin,farmer,"growing, hay production","hay - other, winter wheat, hay - alfalfa",biocirv


In [None]:
from ca_biositing.pipeline.utils.name_id_swap import normalize_dataframes
# Explicit imports instead of `import *`, so each model name has a visible
# origin. `Observation` is not used in this cell; it is imported here because
# the Load section uses it without importing it itself.
from ca_biositing.datamodels.schemas.generated.ca_biositing import (
    Contact,
    Dataset,
    LocationAddress,
    Observation,
    PreparedSample,
    PrimaryAgProduct,
    Provider,
    Resource,
    SoilType,
    Unit,
)

# DataFrame column -> (model class, attribute name), passed to normalize_dataframes.
# NOTE(review): presumably the attribute is the lookup field on the model — confirm
# against normalize_dataframes.
normalize_columns = {
    'resource': (Resource, 'name'),
    'provider_codename': (Provider, 'codename'),
    'primary_collector': (Contact, 'name'),
    'storage_dur_units': (Unit, 'name'),
    'particle_units': (Unit, 'name'),
    'prepared_sample': (PreparedSample, 'name'),
    'soil_type': (SoilType, 'name'),
    'county': (LocationAddress, 'county'),
    'primary_ag_product': (PrimaryAgProduct, 'name'),
    'dataset_x': (Dataset, 'name'),
}

# NOTE(review): 'dataset_x' is a column name produced by the merge that builds
# `joined_data`, but this loop normalizes `coerced_data` — confirm which input
# is intended.
normalized_data = []
for df in coerced_data:
    normalized_df = normalize_dataframes(df, normalize_columns)
    normalized_data.append(normalized_df)

print(f"Normalized {len(normalized_data)} dataframes.")

In [None]:
## Prepare Observation DataFrames

# Columns taken from each normalized frame; 'analysis_type' is stored as 'record_type'.
observation_columns = [
    'dataset_id',
    'analysis_type',
    'record_id',
    'parameter_id',
    'value',
    'unit_id',
    'note',
]
# A row is kept only when all of these are present.
required_columns = ['record_id', 'parameter_id', 'value']

observation_data = []
for normalized_frame in normalized_data:
    # Selecting with a list raises KeyError if any listed column is missing.
    selected = normalized_frame[observation_columns].copy()
    renamed = selected.rename(columns={'analysis_type': 'record_type'})
    observation_data.append(renamed.dropna(subset=required_columns))

print(f"Prepared {len(observation_data)} observation dataframes.")

In [None]:
## Prepare Record Information DataFrames

# Source column -> record-table column, for the columns that are not normalized.
base_columns = {
    'record_id': 'record_id',
    'repl_no': 'technical_replication_no',
    'qc_result': 'qc_pass',
    'note': 'note',
}
# Normalized source columns whose "<column>_id" is stored under a different name.
id_overrides = {
    'analyst_email': 'analyst_id',
    'preparation_method': 'method_id',
}

record_data = []
for normalized_frame in normalized_data:
    # Start from the fixed mapping, then add every "<column>_id" produced by
    # normalization that this frame actually contains.
    column_map = dict(base_columns)
    for source_col in normalize_columns:
        id_col = f"{source_col}_id"
        if id_col in normalized_frame.columns:
            column_map[id_col] = id_overrides.get(source_col, id_col)

    # Keep only mapped columns present in this frame, in mapping order.
    present = {
        source: target
        for source, target in column_map.items()
        if source in normalized_frame.columns
    }
    record_df = normalized_frame[list(present)].copy().rename(columns=present)

    # Rows with a missing record_id are dropped.
    if 'record_id' in record_df.columns:
        record_df = record_df.dropna(subset=['record_id'])

    record_data.append(record_df)

print(f"Prepared {len(record_data)} record dataframes.")

## Load


In [None]:
# Rewrite the configured URL so the host "db" becomes "localhost"
# (presumably a container host name — confirm). Only the first pattern that
# matches is applied.
db_url = settings.database_url
host_rewrites = (
    ("@db:", "@localhost:"),
    ("db:5432", "localhost:5432"),
)
for configured_host, local_host in host_rewrites:
    if configured_host in db_url:
        db_url = db_url.replace(configured_host, local_host)
        break

engine = create_engine(db_url)

def upsert_observations(df, session):
    """Insert each row of ``df`` into ``Observation``, updating on ``record_id`` conflict.

    Args:
        df: DataFrame whose columns are named after ``Observation`` table
            columns. NaN cells are sent as NULL; columns the table does not
            have are ignored.
        session: SQLAlchemy session used to execute the statements. Committing
            is left to the caller.

    Returns:
        None. Nothing is executed when ``df`` is empty.
    """
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Computed once: the table definition does not change between rows.
    table_columns = {c.name for c in Observation.__table__.columns}
    # Columns that an update of an existing row must never overwrite.
    protected = {'id', 'created_at', 'record_id'}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        # Same filtering as the other upsert helpers in this notebook: keys
        # that are not table columns would make insert().values() fail.
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(Observation).values(clean_record)
        # Update only the columns this row supplies. Setting every table column
        # from EXCLUDED would overwrite columns missing from `df` with the
        # value proposed for insertion (NULL or the column default).
        update_dict = {
            name: stmt.excluded[name]
            for name in clean_record
            if name not in protected
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

# One transaction for all observation frames: session.begin() commits when the
# block exits normally and rolls back if an upsert raises.
with Session(engine) as session, session.begin():
    for observation_frame in observation_data:
        upsert_observations(observation_frame, session)
print('Upsert of all observations completed.')

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import ProximateRecord

def upsert_proximate_records(df, session):
    """Insert each row of ``df`` into ``ProximateRecord``, updating on ``record_id`` conflict.

    Args:
        df: DataFrame of record rows. NaN cells are sent as NULL; columns the
            table does not have are ignored.
        session: SQLAlchemy session used to execute the statements. Committing
            is left to the caller.

    Returns:
        None. Nothing is executed when ``df`` is empty.
    """
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Filter record dictionary to only include columns that exist in the table
    table_columns = {c.name for c in ProximateRecord.__table__.columns}
    # Columns that an update of an existing row must never overwrite.
    protected = {'id', 'created_at', 'record_id'}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(ProximateRecord).values(clean_record)
        # Update only the columns this row supplies. Setting every table column
        # from EXCLUDED would overwrite columns missing from `df` with the
        # value proposed for insertion (NULL or the column default).
        update_dict = {
            name: stmt.excluded[name]
            for name in clean_record
            if name not in protected
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

# Assuming the first dataframe in record_data is Proximate
# NOTE(review): record_data is built from `normalized_data` in order; confirm
# that index 0 really holds Proximate records.
proximate_frame = record_data[0]
# session.begin() commits on normal exit and rolls back if the upsert raises.
with Session(engine) as session, session.begin():
    upsert_proximate_records(proximate_frame, session)
print('Upsert of Proximate records completed.')

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import UltimateRecord

def upsert_ultimate_records(df, session):
    """Insert each row of ``df`` into ``UltimateRecord``, updating on ``record_id`` conflict.

    Args:
        df: DataFrame of record rows. NaN cells are sent as NULL; columns the
            table does not have are ignored.
        session: SQLAlchemy session used to execute the statements. Committing
            is left to the caller.

    Returns:
        None. Nothing is executed when ``df`` is empty.
    """
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Only keys that are real table columns are sent to the database.
    table_columns = {c.name for c in UltimateRecord.__table__.columns}
    # Columns that an update of an existing row must never overwrite.
    protected = {'id', 'created_at', 'record_id'}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(UltimateRecord).values(clean_record)
        # Update only the columns this row supplies. Setting every table column
        # from EXCLUDED would overwrite columns missing from `df` with the
        # value proposed for insertion (NULL or the column default).
        update_dict = {
            name: stmt.excluded[name]
            for name in clean_record
            if name not in protected
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

# Assuming the second dataframe in record_data is Ultimate
# NOTE(review): record_data is built from `normalized_data` in order; confirm
# that index 1 really holds Ultimate records.
ultimate_frame = record_data[1]
# session.begin() commits on normal exit and rolls back if the upsert raises.
with Session(engine) as session, session.begin():
    upsert_ultimate_records(ultimate_frame, session)
print('Upsert of Ultimate records completed.')

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import CompositionalRecord

def upsert_compositional_records(df, session):
    """Insert each row of ``df`` into ``CompositionalRecord``, updating on ``record_id`` conflict.

    Args:
        df: DataFrame of record rows. NaN cells are sent as NULL; columns the
            table does not have are ignored.
        session: SQLAlchemy session used to execute the statements. Committing
            is left to the caller.

    Returns:
        None. Nothing is executed when ``df`` is empty.
    """
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Only keys that are real table columns are sent to the database.
    table_columns = {c.name for c in CompositionalRecord.__table__.columns}
    # Columns that an update of an existing row must never overwrite.
    protected = {'id', 'created_at', 'record_id'}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(CompositionalRecord).values(clean_record)
        # Update only the columns this row supplies. Setting every table column
        # from EXCLUDED would overwrite columns missing from `df` with the
        # value proposed for insertion (NULL or the column default).
        update_dict = {
            name: stmt.excluded[name]
            for name in clean_record
            if name not in protected
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

# Assuming the third dataframe in record_data is Compositional
COMPOSITIONAL_INDEX = 2
if len(record_data) > COMPOSITIONAL_INDEX:
    with Session(engine) as session:
        upsert_compositional_records(record_data[COMPOSITIONAL_INDEX], session)
        session.commit()
    print('Upsert of Compositional records completed.')
else:
    # record_data holds one frame per entry of normalized_data; with fewer than
    # three entries an unconditional record_data[2] raises IndexError.
    print(f'Skipped Compositional upsert: record_data has only {len(record_data)} dataframe(s).')

In [None]:
# NOTE(review): `feedstock_collector_info_raw` is not assigned in any cell visible in this
# notebook; on a fresh kernel this cell would raise NameError — confirm where it is defined.
feedstock_collector_info_raw

In [None]:
# Run the biodiesel_plants extractor and keep its result for inspection below.
from ca_biositing.pipeline.etl.extract.biodiesel_plants import extract as biodiesel_extract
biodiesel_plants_raw = biodiesel_extract()

In [None]:
# Preview the first rows of the extracted result.
biodiesel_plants_raw.head()

In [None]:
from typing import List
from ca_biositing.pipeline.etl.extract.basic_sample_info import extract as basic_sample_info_extract
from ca_biositing.pipeline.etl.transform.resource import transform as resource_transform

basic_sample_info = basic_sample_info_extract()

# NOTE(review): not referenced again in this cell; presumably mirrors the source
# list declared by the transform module — confirm it is still needed.
EXTRACT_SOURCES: List[str] = ["basic_sample_info"]

# Call the transform through `.fn` with a mapping keyed by source name.
# The result gets its own name: `cleaned_data` already holds the list of
# cleaned DataFrames from the Transform section, and rebinding it here would
# break re-running the coercion cell that iterates over that list.
resource_transformed = resource_transform.fn({"basic_sample_info": basic_sample_info})

resource_transformed.head()