In [1]:
import os
import sys
import pandas as pd
import numpy as np
import janitor as jn
from IPython.display import display

# --- Robustly find the project root ---
# Walk upwards from the current working directory; the first directory that
# lists a 'pixi.toml' file is taken as the project root.
project_root = None
path = os.getcwd()
while True:
    parent = os.path.dirname(path)
    if parent == path:  # Reached the filesystem root without a match
        break
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = parent

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

# --- Add project root to sys.path ---
# Needed so the 'src....' package path used below can be imported.
if project_root in sys.path:
    print(f"Project root '{project_root}' is already in sys.path")
else:
    sys.path.insert(0, project_root)
    print(f"Added project root '{project_root}' to sys.path")

# --- Import the module ---
# An ImportError is reported rather than raised, so the rest of the cell can
# check which extractor names actually exist.
try:
    from src.ca_biositing.pipeline.ca_biositing.pipeline.etl.extract import proximate, ultimate, cmpana
except ImportError as e:
    print(f"Failed to import modules: {e}")
    print(f"\nFull sys.path: {sys.path}")
else:
    print("Successfully imported all module.")

# --- Run the extraction ---
# Each extractor is only invoked when its import above succeeded (the
# ImportError handler lets the cell continue, so the names may be missing).
# Every branch checks ITS OWN result for None: previously the Ultimate and
# CmpAna branches tested `df` (the proximate result), so a None return from
# those extractors surfaced as a generic error instead of "returned no data".
if 'proximate' in locals():
    try:
        # Pass the project_root to the extract function
        df = proximate.extract(project_root=project_root)
        if df is not None:
            print("\nSuccessfully extracted proximate data.")
            display(df.head())
        else:
            print("\n Prox extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        # Best-effort: report and carry on so the other extractors still run.
        print(f"\nAn error occurred during prox extraction: {e}")

if 'ultimate' in locals():
    try:
        df2 = ultimate.extract(project_root=project_root)
        if df2 is not None:
            print("\nSuccessfully extracted Ultimate data.")
            display(df2.head())
        else:
            print("\n Ultimate extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        print(f"\nAn error occurred during extraction: {e}")

if 'cmpana' in locals():
    try:
        df3 = cmpana.extract(project_root=project_root)
        if df3 is not None:
            print("\nSuccessfully extracted CmpAna data.")
            display(df3.head())
        else:
            print("\nCmpAna extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        print(f"\nAn error occurred during cmp ana extraction: {e}")
    finally:
        # Always printed, whether the CmpAna step succeeded or failed.
        print("\nCmp Ana extraction process completed.")

Added project root '/Users/pjsmitty301/ca-biositing' to sys.path
Successfully imported all module.



Successfully extracted proximate data.


Unnamed: 0,Prox_UUID_031,Record_ID,Source_codename,Prepared_sample,Resource,Preparation_method,Storage_cond,Exper_abbrev,Repl_no,Repl_ID,Parameter,Value,Unit,Created_at,Updated_at,QC_result,Upload_status,Note,Analysis_type,Analyst_email
0,D7965110-407F-E356-D41D-B3B9A2B7B7,(73)B7B7,Oakleaf,Oak-TmPm01A(73),Tomato pomace,As Is,4C,Prox01xk,1,Prox01xk(73)1,Moisture,61.85,% total weight,2024-10-02 10:31:01,,Pass,not ready,,Proximate analysis,xkang2@lbl.gov
1,C8FEA984-2E9A-8DEF-55FB-1A9D7D9BA8,(73)9BA8,Oakleaf,Oak-TmPm01A(73),Tomato pomace,As Is,4C,Prox01xk,2,Prox01xk(73)2,Moisture,63.21,% total weight,2024-10-02 10:31:31,,Pass,ready,,Proximate analysis,xkang2@lbl.gov
2,DF304D5D-3A85-4881-7142-6D4E5F957D,(73)957D,Oakleaf,Oak-TmPm01A(73),Tomato pomace,As Is,4C,Prox01xk,3,Prox01xk(73)3,Moisture,63.27,% total weight,2024-10-02 10:32:01,,Pass,imported,,Proximate analysis,xkang2@lbl.gov
3,01C6C5BE-CEA6-54AF-3924-B0BAD69335,(73)9335,Oakleaf,Oak-TmPm01A(73),Tomato pomace,As Is,4C,Prox01xk,1,Prox01xk(73)1,Ash,0.69,% total weight,2024-10-03 10:31:01,,Pass,import failed,,Proximate analysis,xkang2@lbl.gov
4,126745C7-DD41-2F6D-0DC5-28DBCA415F,(73)415F,Oakleaf,Oak-TmPm01A(73),Tomato pomace,As Is,4C,Prox01xk,2,Prox01xk(73)2,Ash,0.89,% total weight,2024-10-03 10:31:31,,Pass,,,Proximate analysis,xkang2@lbl.gov



Successfully extracted Ultimate data.


Unnamed: 0,Ult_UUID_037,Record_ID,Ult_sample_name,Prepared_sample,Resource,Preparation_method,Storage_cond,Exper_abbrev,Repl_no,Repl_ID,...,Unit,Created_at,Updated_at,QC_result,Note,Analysis_type,Equipment,Raw_data_URL,Analyst_email,Upload_status
0,421617A5-E50E-9642-2974-B3275FE822,)U22E822,Hum-AlmHu023KM2(15)U22,Hum-AlmHu023KM2(15),Almond Hulls,Knife Mill (2mm),RT vacuum sealed,Ult26kh,1,Ult26kh(15)1,...,pc,,,Pass,1,Ultimate analysis,,,,
1,7E7919C2-5DB4-6BEF-75E2-7E51321200,)U001200,Hum-AlmHu023KM2(15)U00,Hum-AlmHu023KM2(15),Almond Hulls,Knife Mill (2mm),RT vacuum sealed,Ult26kh,1,Ult26kh(15)1,...,pc,,,Fail,1 dup,Ultimate analysis,,,,
2,3AA85881-1185-642F-C44B-41AD2275D2,)UD275D2,Hum-AlmSh022KM2(13)UD2,Hum-AlmSh022KM2(13),Almond Shells,Knife Mill (2mm),RT vacuum sealed,Ult26kh,1,Ult26kh(13)1,...,pc,,,Pass,2,Ultimate analysis,,,,
3,FA418804-6C4F-4C90-D78F-84D7DF54D3,)UD354D3,Ene-WaSh017OKM2(82)UD3,Ene-WaSh017OKM2(82),Walnut Shells,Oven Dry + Knife Mill (2mm),RT vacuum sealed,Ult26kh,1,Ult26kh(82)1,...,pc,,,Pass,3,Ultimate analysis,,,,
4,6FDACBFC-7E0B-473B-444F-85B7650267,)U670267,Ebo-GpPm010OKM2(1B)U67,Ebo-GpPm010OKM2(1B),Grape pomace,Oven Dry + Knife Mill (2mm),RT vacuum sealed,Ult26kh,1,Ult26kh(1B)1,...,pc,,,Pass,4,Ultimate analysis,,,,



Successfully extracted CmpAna data.


Unnamed: 0,Cmp_UUID_033,Record_ID,Prepared_sample,Resource,Preparation_method,Storage_cond,Exper_abbrev,Repl_no,Repl_ID,Parameter,...,Unit,Created_at,Updated_at,QC_result,Note,Analysis_type,Equipment,Raw_data_URL,Analyst_email,Upload_status
0,3EE2993D-86E3-1F16-C7EA-F8D555E114,(85)E114,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,1,Cmp04xk(85)1,Glucan,...,% dry weight,1/23/2025 9:00:01,,pass,,Chemical composition,,,xkang2@lbl.gov,ready
1,46878EF9-1226-22A0-D5D8-CF65E241CB,(85)41CB,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,2,Cmp04xk(85)2,Glucan,...,% dry weight,1/23/2025 9:00:16,,pass,,Chemical composition,,,xkang2@lbl.gov,ready
2,76A7A2F4-C4E4-E60F-1187-DEC6E02246,(85)2246,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,3,Cmp04xk(85)3,Glucan,...,% dry weight,1/23/2025 9:00:31,,pass,,Chemical composition,,,xkang2@lbl.gov,ready
3,7A136832-286B-07CB-62DE-ACF52F9311,(85)9311,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,1,Cmp04xk(85)1,Glucose,...,% dry weight,1/23/2025 9:00:46,,pass,,Chemical composition,,,xkang2@lbl.gov,ready
4,B709ECEE-F9A6-A55D-A59E-93B7B863D7,(85)63D7,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,2,Cmp04xk(85)2,Glucose,...,% dry weight,1/23/2025 9:01:01,,pass,,Chemical composition,,,xkang2@lbl.gov,ready



Cmp Ana extraction process completed.


#### This function seeks to clean the incoming gsheet dataframes and coerce the types

In [2]:
def clean_the_gsheets(df):
    """Standardise column names and coerce the key columns of a sheet extract.

    Steps, in order:
      1. Call the frame's ``clean_names()`` method and drop every row that is
         missing ``repl_no`` or ``value``.
      2. Coerce ``repl_no`` to nullable ``Int32`` and ``value`` to ``float32``;
         entries that cannot be parsed become missing values.
      3. Parse ``created_at`` and ``updated_at`` as datetimes; unparseable
         entries become ``NaT``.
      4. Let pandas pick the best dtypes for the remaining columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing ``clean_names()`` and, after cleaning, the columns
        ``repl_no``, ``value``, ``created_at`` and ``updated_at``.

    Returns
    -------
    pandas.DataFrame
        The full cleaned frame (not a preview); the input is left untouched.
    """
    tidy = df.clean_names().dropna(subset=['repl_no', 'value'])

    typed = tidy.assign(
        # Capital-I 'Int32' is the nullable integer dtype, so coerced NaNs survive
        repl_no=pd.to_numeric(tidy['repl_no'], errors='coerce').astype('Int32'),
        value=pd.to_numeric(tidy['value'], errors='coerce').astype(np.float32),
        created_at=pd.to_datetime(tidy['created_at'], errors='coerce'),
        updated_at=pd.to_datetime(tidy['updated_at'], errors='coerce'),
    )

    return typed.convert_dtypes()

In [None]:
# Collect the three extracted frames in a fixed order so they can be
# addressed by position afterwards.
dataframes = [df, df2, df3]

# Run every frame through the shared cleaning function.
clean_dataframes = list(map(clean_the_gsheets, dataframes))

# Preview the third cleaned frame.
clean_dataframes[2].head()


In [None]:
# Descriptive statistics of 'value' for every (resource, parameter) pair in
# the first cleaned frame.
summary_stats = (
    clean_dataframes[0]
    .groupby(['resource', 'parameter'])['value']
    .agg(['mean', 'median', 'min', 'max', 'std', 'count'])
)

summary_stats

In [None]:
# Mean 'value' per (resource, parameter, unit), keeping only means above 30,
# largest first, rounded to one decimal place.
(
    clean_dataframes[0][['resource', 'parameter', 'value', 'unit']]
    .groupby(['resource', 'parameter', 'unit'], as_index=False)
    .agg({'value': 'mean'})
    .query('value > 30')
    .sort_values(by='value', ascending=False)
    .round({'value': 1})
)


In [None]:
# Parameter names that get the "In list" label; everything else is labelled "VS".
list_of_param = ("Moisture", "Total solids", "Ash")

def is_it_volatile_solids(df):
    """Label each row by whether its 'parameter' appears in ``list_of_param``.

    Adds (or overwrites) a 'check' column on ``df`` IN PLACE: rows whose
    'parameter' is in ``list_of_param`` get "In list", all other rows get "VS".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'parameter' column.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, now carrying the 'check' column.
    """
    # Start with the default label everywhere ...
    df['check'] = "VS"

    # ... then overwrite the rows whose parameter is one of the listed names.
    listed = df['parameter'].isin(list_of_param)
    df.loc[listed, 'check'] = "In list"
    return df

# `df` is the raw extract, whose header is still 'Parameter' (capitalised), so
# the 'parameter' lookup inside is_it_volatile_solids raised KeyError on it.
# Use the cleaned proximate frame instead, and pass a copy because the function
# adds its 'check' column in place and that should not leak into
# clean_dataframes[0].
proximate_checked = is_it_volatile_solids(clean_dataframes[0].copy())

proximate_checked[['check', 'parameter']]



In [None]:

# Adds a 'sqrtvalue' column holding the square root of the 'value' column.
def sqrtvalue(df):
    """Return a copy of ``df`` with an extra 'sqrtvalue' column.

    The new column is ``df['value']`` raised to the power 0.5. The input frame
    is not modified.
    """
    roots = df['value'] ** 0.5
    return df.assign(sqrtvalue=roots)

# Apply sqrtvalue to every cleaned frame, preserving their order
clean_rooted_df = [sqrtvalue(frame) for frame in clean_dataframes]

# Display the head of the third DataFrame
clean_rooted_df[2].head()

In [None]:
# Call the cmpana extractor again and keep the result under its own name,
# separate from the frames produced in the first cell.
cmpana_raw = cmpana.extract(project_root=project_root)

# Preview the first rows of the extract.
cmpana_raw.head()

In [3]:
from sqlmodel import Session, select, create_engine
import pandas as pd
import os
import sys


# Database connection
# SECURITY: credentials should not live in the notebook. The URL is read from
# the DATABASE_URL environment variable when it is set; the literal below is
# only the local-development fallback and should be dropped once every
# environment provides the variable.
DATABASE_URL = os.environ.get(
    "DATABASE_URL",
    "postgresql+psycopg2://biocirv_user:biocirv_dev_password@localhost:5432/biocirv_db",
)
engine = create_engine(DATABASE_URL)
# NOTE: create_engine() is lazy; no connection is opened until the first query
# below, so this message only confirms that the engine was configured.
print("Connected to database.")

primary_ag_product = pd.read_sql("SELECT * FROM primary_ag_product;", con=engine)

# Reorder columns so that id and name come first; all other columns keep
# their original relative order.
leading = ['id', 'name']
cols = leading + [c for c in primary_ag_product.columns if c not in leading]

primary_ag_product = primary_ag_product[cols]

primary_ag_product


Connected to database.


Unnamed: 0,id,name,note,description,uri
0,1,Tomatoes for processing,,,
1,2,Grapes,,,
2,3,Almonds,,,
3,4,Walnuts,,,
4,5,Sweet potatoes,,,
5,6,Algae,,,
6,7,Olives - processing,,,
7,8,Corn - all,,,
8,9,Hay - alfalfa,,,
9,10,Silage - wheat,,,


In [4]:
from sqlalchemy.orm import Session
from sqlalchemy import select
import pandas as pd
import os
import sys

# --- project root discovery (unchanged) ---
# Climb from the working directory until a directory listing 'pixi.toml' is
# found; stop when the filesystem root is reached.
project_root = None
path = os.getcwd()
parent = os.path.dirname(path)
while parent != path:
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path, parent = parent, os.path.dirname(parent)

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

# Put the project root first on sys.path (once) so the 'src....' imports resolve.
already_on_path = project_root in sys.path
if not already_on_path:
    sys.path.insert(0, project_root)

# --- imports ---
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.engine import engine
from src.ca_biositing.datamodels.ca_biositing.datamodels.schemas.generated.ca_biositing import PrimaryAgProduct

# --- query + dataframe ---
# Select every column of the PrimaryAgProduct table explicitly, so each result
# row can be read as a column-name -> value mapping.
stmt = select(*PrimaryAgProduct.__table__.columns)

with Session(engine) as db:
    result = db.execute(stmt)
    rows = result.mappings().all()

# One DataFrame row per mapping returned by the query.
df = pd.DataFrame(rows)

df.head()


Unnamed: 0,description,id,name,note,uri
0,,1,Tomatoes for processing,,
1,,2,Grapes,,
2,,3,Almonds,,
3,,4,Walnuts,,
4,,5,Sweet potatoes,,


In [14]:
from sqlalchemy.orm import Session
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.engine import engine
from src.ca_biositing.datamodels.ca_biositing.datamodels.schemas.generated.ca_biositing import *
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.name_id_swap import (
    replace_name_with_id_df,
)   

# Extract the raw cmpana data
df = cmpana.extract(project_root=project_root)

# Clean the column names and coerce types, then rename 'parameter' to 'name'
# so the column matches the one used for the lookup in the database
test_df = clean_the_gsheets(df)
test_df = test_df.rename(columns={'parameter': 'name'})

# Replace the names with IDs
with Session(engine) as db:
    parameter_ids = replace_name_with_id_df(
        db=db,
        df=test_df,
        ref_model=Parameter,
        name_column_name="name",           # column in df + table
        id_column_name="id",               # PK column in table
        final_column_name="parameter_id",
    )

# TODO: eventually add some logs about how many were added, how many were retrieved, etc. (where those logs should live is still to be decided)
#resource_id_mapping = df_with_ids.rename(columns={"id": "resource_id"})

#resource_id_mapping


In [15]:
# Display the frame returned by replace_name_with_id_df in the previous cell.
parameter_ids


Unnamed: 0,cmp_uuid_033,record_id,prepared_sample,resource,preparation_method,storage_cond,exper_abbrev,repl_no,repl_id,value,...,created_at,updated_at,qc_result,note,analysis_type,equipment,raw_data_url,analyst_email,upload_status,parameter_id
0,3EE2993D-86E3-1F16-C7EA-F8D555E114,(85)E114,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,1,Cmp04xk(85)1,14.16,...,2025-01-23 09:00:01,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,ready,6
1,46878EF9-1226-22A0-D5D8-CF65E241CB,(85)41CB,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,2,Cmp04xk(85)2,14.18,...,2025-01-23 09:00:16,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,ready,6
2,76A7A2F4-C4E4-E60F-1187-DEC6E02246,(85)2246,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,3,Cmp04xk(85)3,14.12,...,2025-01-23 09:00:31,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,ready,6
3,7A136832-286B-07CB-62DE-ACF52F9311,(85)9311,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,1,Cmp04xk(85)1,15.74,...,2025-01-23 09:00:46,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,ready,4
4,B709ECEE-F9A6-A55D-A59E-93B7B863D7,(85)63D7,Oak-TmPm01O(85),Tomato pomace,Oven dry,RT vacuum sealed,Cmp04xk,2,Cmp04xk(85)2,15.75,...,2025-01-23 09:01:01,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,ready,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,79799601-0E25-3832-3BBF-F4BD9EE973,(A0)E973,Hum-AlmBr024KM2(A0),Almond Branches,Knife Mill (2mm),RT vacuum sealed,Cmp17xk,2,Cmp17xk(A0)2,17.290001,...,2025-03-04 09:55:00,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,not ready,5
386,A4E3393F-72C3-A0ED-C041-105A33031A,(A0)031A,Hum-AlmBr024KM2(A0),Almond Branches,Knife Mill (2mm),RT vacuum sealed,Cmp17xk,3,Cmp17xk(A0)3,17.5,...,2025-03-04 09:55:15,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,not ready,5
387,3D5A85F8-31A2-0CC2-74E6-05D8197C3A,(A0)7C3A,Hum-AlmBr024KM2(A0),Almond Branches,Knife Mill (2mm),RT vacuum sealed,Cmp17xk,1,Cmp17xk(A0)1,35.970001,...,2025-03-04 09:55:30,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,not ready,3
388,C7AE456E-4DCD-AE34-A0D0-BD39D04E42,(A0)4E42,Hum-AlmBr024KM2(A0),Almond Branches,Knife Mill (2mm),RT vacuum sealed,Cmp17xk,2,Cmp17xk(A0)2,35.279999,...,2025-03-04 09:55:45,NaT,pass,,Chemical composition,,,xkang2@lbl.gov,not ready,3


In [None]:
import numpy as np
from prefect import get_run_logger

# Load the id/name pairs of the resource table.
resource = pd.read_sql("SELECT id, name FROM resource", con=engine)

# Lower-case the names and turn empty strings into missing values ...
resource['name'] = resource['name'].str.lower().replace('', np.nan)

# ... then drop every row that has no name.
resource = resource.dropna(subset=['name'])


resource


In [None]:
resource = pd.read_sql("SELECT id, name FROM resource", con=engine)

# Lower-case every string cell of the dataframe; non-string cells are left as-is
def _lower_if_str(cell):
    if isinstance(cell, str):
        return cell.lower()
    return cell

df = df.map(_lower_if_str)

print(df)

In [None]:
# Load the field_sample table, then keep only the three columns of interest.
field_sample = pd.read_sql_query("SELECT * FROM field_sample", con=engine)

wanted_columns = ['id', 'name', 'resource_id']
field_sample = field_sample[wanted_columns]

field_sample