# ETL Class Example Notebook

This notebook demonstrates the three main steps of the ETL pipeline: **Extract**, **Transform**, and **Load**. It mirrors the structure of `etl_notebook.ipynb` but provides a concise class‑based example for quick reference.


## Extract

Set up the project root on `sys.path` so that package imports work from any working directory. Import the extraction utilities required for this example.


In [None]:
import os
import sys
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session


# Find the project root (directory containing 'pixi.toml') 
path = os.getcwd()
project_root = None
while path != os.path.dirname(path):
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = os.path.dirname(path)

if project_root is None:
    raise FileNotFoundError('Could not locate project root')

# Ensure the root is on the Python path
if project_root not in sys.path:
    sys.path.insert(0, project_root)




In [None]:

from ca_biositing.datamodels.config import settings
from ca_biositing.pipeline.etl.extract import samplemetadata, provider_info

# Extract data
samplemetadata = samplemetadata.extract()
provider_info = provider_info.extract()

sampling_data = [samplemetadata, provider_info]

samplemetadata

## Transform


In [None]:
from ca_biositing.pipeline.utils.cleaning_functions import cleaning as cleaning_mod

cleaned_data = []
for df in sampling_data:
    df['dataset'] = 'biocirv'
    cleaned_df = cleaning_mod.standard_clean(df)
    cleaned_data.append(cleaned_df)

print(f"Cleaned {len(cleaned_data)} dataframes.")

In [None]:
    if etl_run_id:
        cleaned_df['etl_run_id'] = etl_run_id
    if lineage_group_id:
        cleaned_df['lineage_group_id'] = lineage_group_id

In [None]:
from ca_biositing.pipeline.utils.cleaning_functions import coercion as coercion_mod

coerced_data = []
for df in cleaned_data:
    # Example: coerce columns into the designated data types (int, float, datetime, geom, etc)
    coerced_df = coercion_mod.coerce_columns(df,
                                             int_cols=['repl_no', 'qty'], 
                                             float_cols=['value', 'particle_width', 'particle_length', 'particle_height'],
                                             datetime_cols=['created_at', 'updated_at', 'fv_date_time', 'sample_ts', 'prod_date'])
    coerced_data.append(coerced_df)

print(f"Coerced {len(coerced_data)} dataframes.")

In [None]:
from ca_biositing.pipeline.utils.name_id_swap import normalize_dataframes
from ca_biositing.datamodels.schemas.generated.ca_biositing import *

normalize_columns = {
    'resource': (Resource, 'name'),
    'provider_codename': (Provider, 'codename'),
    'primary_collector': (Contact, 'name'),
    'storage_dur_units': (Unit, 'name'),
    'particle_units': (Unit, 'name'),
    'sample_unit': (Unit, 'name'),
    'prepared_sample': (PreparedSample, 'name'),
    'soil_type': (SoilType, 'name'),
    'storage_mode': (Method, 'name'),
    'county': (LocationAddress, 'county'),
    'primary_ag_product': (PrimaryAgProduct, 'name'),
    'provider_type': (Provider, 'type'),
    'dataset': (Dataset, 'name'),
    'field_storage_location' : (LocationAddress, 'full_address'),
}

normalized_data = []
for df in coerced_data:
    normalized_df = normalize_dataframes(df, normalize_columns)
    normalized_data.append(normalized_df)

normalized_data[0].head()

In [None]:
joined_data = normalized_data[0].merge(
    normalized_data[1], 
    on='provider_codename_id', 
    how='left'
)

joined_data.dtypes

In [None]:

##Select and Rename Columns

# 1. Define the mapping of existing columns to new names
rename_map = {
    'field_sample_name': 'name',
    'resource_id': 'resource_id',
    'provider_codename_id': 'provider_id',
    'primary_collector_id': 'collector_id',
    'sample_source': 'sample_collection_source',
    'qty': 'qty',
    'sample_unit_id': 'amount_collected_unit_id',
    'county_id': 'sampling_location_id',
    'storage_mode_id': 'field_storage_method_id',
    'storage_dur_value': 'field_storage_duration_value',
    'storage_dur_units_id': 'field_storage_duration_unit_id',
    'field_storage_location_id': 'field_storage_location_id',
    'sample_ts': 'collection_timestamp',
    'sample_notes': 'note'
}

# 2. Select existing columns, rename them, and assign new empty columns
field_sample = joined_data[list(rename_map.keys())].rename(columns=rename_map).assign(
    collection_method=None,
    harvest_datemethod=None,
    harvest_date=None,
    field_sample_storage_location_id_2=None
)

field_sample.head()

In [None]:
## Prepare Record Information DataFrames

record_data = []
for df in normalized_data:
    # 1. Define explicit mappings for non-normalized columns
    rename_map = {
        'record_id': 'record_id',
        'repl_no': 'technical_replication_no',
        'qc_result': 'qc_pass',
        'note': 'note'
    }
    
    # 2. Dynamically add normalized columns from the normalize_columns dictionary
    for col in normalize_columns.keys():
        norm_col = f"{col}_id"
        if norm_col in df.columns:
            # Special case: rename to match target record table schema
            target_name = 'analyst_id' if col == 'analyst_email' else \
                          'method_id' if col == 'preparation_method' else norm_col
            rename_map[norm_col] = target_name

    # 3. Only select columns that actually exist in this specific dataframe
    available_cols = [c for c in rename_map.keys() if c in df.columns]
    final_rename = {k: v for k, v in rename_map.items() if k in available_cols}

    record_df = df[available_cols].copy().rename(columns=final_rename)

    # 4. Drop rows where critical identifiers are missing (NaN)
    if 'record_id' in record_df.columns:
        record_df = record_df.dropna(subset=['record_id'])
    
    record_data.append(record_df)

print(f"Prepared {len(record_data)} record dataframes.")

## Load


In [1]:
db_url = settings.database_url
if "@db:" in db_url:
    db_url = db_url.replace("@db:", "@localhost:")
elif "db:5432" in db_url:
    db_url = db_url.replace("db:5432", "localhost:5432")

engine = create_engine(db_url)

def upsert_observations(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        record['updated_at'] = now
        if record.get('created_at') is None:
            record['created_at'] = now
        stmt = insert(Observation).values(record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in Observation.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    for obs_df in observation_data:
        upsert_observations(obs_df, session)
    session.commit()
print('Upsert of all observations completed.')

NameError: name 'settings' is not defined

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import ProximateRecord

def upsert_proximate_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    # Filter record dictionary to only include columns that exist in the table
    table_columns = {c.name for c in ProximateRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(ProximateRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in ProximateRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the first dataframe in record_data is Proximate
    upsert_proximate_records(record_data[0], session)
    session.commit()
print('Upsert of Proximate records completed.')

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import UltimateRecord

def upsert_ultimate_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    table_columns = {c.name for c in UltimateRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(UltimateRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in UltimateRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the second dataframe in record_data is Ultimate
    upsert_ultimate_records(record_data[1], session)
    session.commit()
print('Upsert of Ultimate records completed.')

In [None]:
from ca_biositing.datamodels.schemas.generated.ca_biositing import CompositionalRecord

def upsert_compositional_records(df, session):
    if df.empty:
        return
    now = datetime.now(timezone.utc)
    table_columns = {c.name for c in CompositionalRecord.__table__.columns}
    records = df.replace({np.nan: None}).to_dict(orient='records')
    for record in records:
        clean_record = {k: v for k, v in record.items() if k in table_columns}
        clean_record['updated_at'] = now
        if clean_record.get('created_at') is None:
            clean_record['created_at'] = now
        stmt = insert(CompositionalRecord).values(clean_record)
        update_dict = {
            c.name: stmt.excluded[c.name]
            for c in CompositionalRecord.__table__.columns
            if c.name not in ['id', 'created_at', 'record_id']
        }
        upsert_stmt = stmt.on_conflict_do_update(
            index_elements=['record_id'],
            set_=update_dict
        )
        session.execute(upsert_stmt)

with Session(engine) as session:
    # Assuming the third dataframe in record_data is Compositional
    upsert_compositional_records(record_data[2], session)
    session.commit()
print('Upsert of Compositional records completed.')

In [None]:
feedstock_collector_info_raw

In [None]:
from ca_biositing.pipeline.etl.extract.biodiesel_plants import extract as biodiesel_extract
biodiesel_plants_raw = biodiesel_extract()

In [None]:
biodiesel_plants_raw.head()

In [None]:
from typing import List
from ca_biositing.pipeline.etl.extract.basic_sample_info import extract as basic_sample_info_extract
from ca_biositing.pipeline.etl.transform.resource import transform as resource_transform

basic_sample_info = basic_sample_info_extract()

EXTRACT_SOURCES: List[str] = ["basic_sample_info"]

cleaned_data = resource_transform.fn({"basic_sample_info": basic_sample_info})

cleaned_data.head()

## PREPARED SAMPLE ETL

### Extract

In [3]:
import os
import sys
from datetime import datetime, timezone
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.dialects.postgresql import insert
from sqlalchemy.orm import Session


# Find the project root (directory containing 'pixi.toml') 
path = os.getcwd()
project_root = None
while path != os.path.dirname(path):
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = os.path.dirname(path)

if project_root is None:
    raise FileNotFoundError('Could not locate project root')

# Ensure the root is on the Python path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [9]:
from ca_biositing.datamodels.config import settings
from ca_biositing.pipeline.etl.extract import preparation

# Extract data
preparation_raw = preparation.extract()

print(f"Extracted {len([preparation_raw])} dataframes.")

preparation_raw.head()

DEBUG: gsheet_to_df called for Aim 1-Feedstock Collection and Processing Data-BioCirV / 02-Preparation
DEBUG: Authenticating with credentials.json
DEBUG: Opening spreadsheet Aim 1-Feedstock Collection and Processing Data-BioCirV
DEBUG: Opening worksheet 02-Preparation
DEBUG: Fetching all values from 02-Preparation
DEBUG: Successfully fetched 507 rows


Extracted 1 dataframes.


Unnamed: 0,UUID,Record_ID,Resource,Sample_name,Source_codename,Preparation_method,Prepared_sample,Storage_cond,Prep_temp_C,Amount_before_drying_g,...,Analyze_status,Prox_prepro_count,XRF_prepro_count,Cmp_prepro_count,XRD_prepro_count,ICP_prepro_count,Cal_prepro_count,Ult_prepro_count,FTNIR_prepro_count,RGB_prepro_count
0,F30C220E-637C-CF3B-C39D-D95DE57164,PreP_01,Tomato pomace,Oak-TmPm01,Oakleaf,Freeze Dry,Oak-TmPm01FD(64),Room Temp,-46,471.2,...,wait,0,0,0,1,0,1,0,1,0
1,CF86943B-BD98-CC2D-D337-B30CB1AA85,PreP_02,Tomato pomace,Oak-TmPm01,Oakleaf,Oven dry,Oak-TmPm01O(85),RT vacuum sealed,40,200.5,...,yes,0,0,15,2,0,0,0,0,1
2,4D584CFD-AF8D-0F97-BFE6-682E1A3308,PreP_03,Tomato pomace,Oak-TmPm01,Oakleaf,Freeze,Oak-TmPm01F(08),Frozen vac sealed,-20,525.3,...,wait,0,0,0,0,0,0,0,0,0
3,00D6D902-B934-5D99-F8C2-1EC6D59973,PreP_04,Tomato pomace,Oak-TmPm01,Oakleaf,As Is,Oak-TmPm01A(73),4C,4,698.5,...,yes,12,0,0,0,0,0,0,0,0
4,8FE4BD54-C63E-CB01-A003-931E7467D2,PreP_05,Tomato pomace,Pin-TmPm02,Pinecrest,Freeze Dry,Pin-TmPm02FD(D2),RT vacuum sealed,-46,1901.7,...,wait,0,0,0,0,0,0,0,0,0


### Transform

In [13]:
from ca_biositing.pipeline.utils.cleaning_functions import cleaning as cleaning_mod

preparation_clean = cleaning_mod.standard_clean(preparation_raw)

preparation_clean.head()

print(f"Cleaned {len(preparation_clean)} rows in preparation_clean dataframe.")

preparation_clean

Cleaned 506 rows in preparation_clean dataframe.


  return df.astype("object").replace(regex, np.nan, regex=True)


Unnamed: 0,uuid,record_id,resource,sample_name,source_codename,preparation_method,prepared_sample,storage_cond,prep_temp_c,amount_before_drying_g,...,analyze_status,prox_prepro_count,xrf_prepro_count,cmp_prepro_count,xrd_prepro_count,icp_prepro_count,cal_prepro_count,ult_prepro_count,ftnir_prepro_count,rgb_prepro_count
0,f30c220e-637c-cf3b-c39d-d95de57164,prep_01,tomato pomace,oak-tmpm01,oakleaf,freeze dry,oak-tmpm01fd(64),room temp,-46,471.20,...,wait,0,0,0,1,0,1,0,1,0
1,cf86943b-bd98-cc2d-d337-b30cb1aa85,prep_02,tomato pomace,oak-tmpm01,oakleaf,oven dry,oak-tmpm01o(85),rt vacuum sealed,40,200.50,...,yes,0,0,15,2,0,0,0,0,1
2,4d584cfd-af8d-0f97-bfe6-682e1a3308,prep_03,tomato pomace,oak-tmpm01,oakleaf,freeze,oak-tmpm01f(08),frozen vac sealed,-20,525.30,...,wait,0,0,0,0,0,0,0,0,0
3,00d6d902-b934-5d99-f8c2-1ec6d59973,prep_04,tomato pomace,oak-tmpm01,oakleaf,as is,oak-tmpm01a(73),4c,4,698.50,...,yes,12,0,0,0,0,0,0,0,0
4,8fe4bd54-c63e-cb01-a003-931e7467d2,prep_05,tomato pomace,pin-tmpm02,pinecrest,freeze dry,pin-tmpm02fd(d2),rt vacuum sealed,-46,1901.70,...,wait,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
502,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
503,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
504,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0


##### Coerce datatypes

In [None]:
from ca_biositing.pipeline.utils.cleaning_functions import coercion as coercion_mod

####ISSUE HERE WITH FLOAT COLUMNS NOT RECOGNIZED PROPERLY###, PARTICULARLY THE AMOUNT BEFORE DRYING G COLUMNS!!! I THINK THIS IS BECAUSE OF COMMAS IN THE VALUE!!!

### ALSO WANT TO COERCE THE YES/NO TO BOOL FOR DRYING STEP COLUMN ###

preparation_coerced  = coercion_mod.coerce_columns(preparation_clean,
                                             int_cols=[], 
                                             float_cols=['prep_temp_c', 'amount_before_drying_g', 'amount_after_drying_g', 'amount_remaining_g'], 
                                             datetime_cols=['preparation_date', 'amount_as_of_date'],
                                             bool_cols=['drying_step']
                                             )

print(f"Coerced {len(preparation_coerced)} rows in preparation_coerced dataframe.")

preparation_coerced

Coerced 506 rows in preparation_coerced dataframe.


Unnamed: 0,uuid,record_id,resource,sample_name,source_codename,preparation_method,prepared_sample,storage_cond,prep_temp_c,amount_before_drying_g,...,analyze_status,prox_prepro_count,xrf_prepro_count,cmp_prepro_count,xrd_prepro_count,icp_prepro_count,cal_prepro_count,ult_prepro_count,ftnir_prepro_count,rgb_prepro_count
0,f30c220e-637c-cf3b-c39d-d95de57164,prep_01,tomato pomace,oak-tmpm01,oakleaf,freeze dry,oak-tmpm01fd(64),room temp,-46.0,471.200012,...,wait,0,0,0,1,0,1,0,1,0
1,cf86943b-bd98-cc2d-d337-b30cb1aa85,prep_02,tomato pomace,oak-tmpm01,oakleaf,oven dry,oak-tmpm01o(85),rt vacuum sealed,40.0,200.500000,...,yes,0,0,15,2,0,0,0,0,1
2,4d584cfd-af8d-0f97-bfe6-682e1a3308,prep_03,tomato pomace,oak-tmpm01,oakleaf,freeze,oak-tmpm01f(08),frozen vac sealed,-20.0,525.299988,...,wait,0,0,0,0,0,0,0,0,0
3,00d6d902-b934-5d99-f8c2-1ec6d59973,prep_04,tomato pomace,oak-tmpm01,oakleaf,as is,oak-tmpm01a(73),4c,4.0,698.500000,...,yes,12,0,0,0,0,0,0,0,0
4,8fe4bd54-c63e-cb01-a003-931e7467d2,prep_05,tomato pomace,pin-tmpm02,pinecrest,freeze dry,pin-tmpm02fd(d2),rt vacuum sealed,-46.0,,...,wait,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
502,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
503,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0
504,,,,,,,,,,,...,wait,0,0,0,0,0,0,0,0,0


In [25]:
## SELECT THE COLUMNS FOR THE PREPARATION TABLE and NORMALIZE

preparation_selected = preparation_coerced[['prepared_sample', #prepared_sample.name
                                            'sample_name', #field_sample.name
                                            'preparation_method', #prep_method_id -> preparation_method.name
                                            'preparation_date', #prep_date
                                            'analyst_email', #preparer_id -> contact.email
                                            'note']] #note

preparation_method_selected = preparation_coerced[['preparation_method', #preparation_method.name
                                                   'prep_temp_c', #prep_temp_c
                                                   'drying_step' #drying_step
                                                   ]]

preparation_selected.head()

Unnamed: 0,prepared_sample,sample_name,preparation_method,preparation_date,analyst_email,note
0,oak-tmpm01fd(64),oak-tmpm01,freeze dry,2024-09-24,xkang2@lbl.gov,
1,oak-tmpm01o(85),oak-tmpm01,oven dry,2024-09-24,xkang2@lbl.gov,
2,oak-tmpm01f(08),oak-tmpm01,freeze,2024-09-24,xkang2@lbl.gov,
3,oak-tmpm01a(73),oak-tmpm01,as is,2024-09-24,xkang2@lbl.gov,
4,pin-tmpm02fd(d2),pin-tmpm02,freeze dry,2024-09-24,xkang2@lbl.gov,


In [26]:
## Normalization of preparation_selected

from ca_biositing.pipeline.utils.name_id_swap import normalize_dataframes
from ca_biositing.datamodels.schemas.generated.ca_biositing import FieldSample, PreparationMethod, Contact

normalize_columns = {
    'sample_name': (FieldSample, 'name'),
    'preparation_method': (PreparationMethod, 'name'),
    'analyst_email': (Contact, 'email')
}

preparation_normalized = normalize_dataframes(preparation_selected, normalize_columns)
preparation_normalized.head()

DEBUG: Starting normalization for 1 DataFrames
DEBUG: Opening database session...
DEBUG: Database session opened


Unnamed: 0,prepared_sample,preparation_date,note,sample_name_id,preparation_method_id,analyst_email_id
0,oak-tmpm01fd(64),2024-09-24,,114.0,7.0,1.0
1,oak-tmpm01o(85),2024-09-24,,114.0,1.0,1.0
2,oak-tmpm01f(08),2024-09-24,,114.0,9.0,1.0
3,oak-tmpm01a(73),2024-09-24,,114.0,3.0,1.0
4,pin-tmpm02fd(d2),2024-09-24,,126.0,7.0,1.0
