In [None]:
import os
import sys
import pandas as pd
import numpy as np
import janitor as jn
from IPython.display import display

# --- Robustly find the project root ---
# The project root is the directory containing the 'pixi.toml' file.
path = os.getcwd()
project_root = None
while path != os.path.dirname(path): # Stop at the filesystem root
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = os.path.dirname(path)

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

# --- Add project root to sys.path ---
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added project root '{project_root}' to sys.path")
else:
    print(f"Project root '{project_root}' is already in sys.path")

# --- Import the module ---
try:
    from src.ca_biositing.pipeline.ca_biositing.pipeline.etl.extract import proximate, ultimate, cmpana
    print("Successfully imported all module.")
except ImportError as e:
    print(f"Failed to import modules: {e}")
    print(f"\nFull sys.path: {sys.path}")

# --- Run the extraction ---
if 'proximate' in locals():
    try:
        # Pass the project_root to the extract function
        df = proximate.extract(project_root=project_root)
        if df is not None:
            print("\nSuccessfully extracted proximate data.")
            display(df.head())
        else:
            print("\n Prox extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        print(f"\nAn error occurred during prox extraction: {e}")

if 'ultimate' in locals():
    try:
        df2 = ultimate.extract(project_root=project_root)
        if df is not None:
            print("\nSuccessfully extracted Ultimate data.")
            display(df2.head())
        else:
            print("\n Ultimate extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        print(f"\nAn error occurred during extraction: {e}")

if 'cmpana' in locals():
    try:
        df3 = cmpana.extract(project_root=project_root)
        if df is not None:
            print("\nSuccessfully extracted CmpAna data.")
            display(df3.head())
        else:
            print("\nCmpAna extraction returned no data. Check the logs above for errors.")
    except Exception as e:
        print(f"\nAn error occurred during cmp ana extraction: {e}")
    finally:
        print("\nCmp Ana extraction process completed.")

#### This function seeks to clean the incoming gsheet dataframes and coerce the types

In [6]:
def clean_the_gsheets(df):
    # 1. Clean names and drop rows
    df = df.clean_names().dropna(subset=['repl_no', 'value'])

    # 2. Coerce types (using errors='coerce' handles messy string data)
    df['repl_no'] = pd.to_numeric(df['repl_no'], errors='coerce').astype('Int32') # Capital 'I' handles NaNs
    df['value'] = pd.to_numeric(df['value'], errors='coerce').astype(np.float32)

    # 3. Dates
    df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
    df['updated_at'] = pd.to_datetime(df['updated_at'], errors='coerce')

    # 4. Convert remaining objects to best possible types (like strings)
    df = df.convert_dtypes()

    # 5. Convert all string data to lowercase
    df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)

    # 6. Convert empty strings to NaN
    df.replace("", np.nan, inplace=True)

    return df # Return the FULL dataframe, not just .head()

In [None]:
from sqlmodel import Session, select, create_engine
import pandas as pd
import os
import sys
from dotenv import load_dotenv

#This module queries the db via the ORM

load_dotenv(dotenv_path=project_root + "\\resources\\docker\\.env")

# Database Connection

POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWORD= os.getenv("POSTGRES_PASSWORD")

# 2. Host Port Mapping
# This is the port on your local machine that will connect to the container's port 5432.
POSTGRES_PORT= os.getenv("POSTGRES_PORT")

DATABASE_URL = f"postgresql+psycopg2://{POSTGRES_USER}:{POSTGRES_PASSWORD}@localhost:{POSTGRES_PORT}/biocirv_db"
engine = create_engine(DATABASE_URL)
print(f"Connected to database.")

primary_ag_product = pd.read_sql("SELECT * FROM primary_ag_product;", con=engine)

#reorders columns so id and name are first
cols = ['id', 'name'] + [c for c in primary_ag_product.columns if c not in ['id', 'name']]

primary_ag_product = primary_ag_product[[*cols]]

primary_ag_product


In [9]:
#This is a get_or_create type module for data normalization.

# Extract a df from a gsheet
df = cmpana.extract(project_root=project_root)

# Cleans the df names and coerces data types
df = clean_the_gsheets(df)

# Replace empty strings with NaN
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)


# These are columns that need to be normalized, AKA replaced with IDs.
# This is a mapping that has first, what it is called in pandas "resource"
# then, the SQLAlchemy model "Resource", and then what it is called in the
# database "name"


NORMALIZE_COLUMNS = {
    "resource": (Resource, "name"),
    "prepared_sample": (PreparedSample, "name"),
    "preparation_method": (PreparationMethod, "name"),
    "parameter": (Parameter, "name"),
    "unit": (Unit, "name"),
    "analyst_email": (Contact, "email"),
    "analysis_type": (AnalysisType, "name"),
    "primary_ag_product": (PrimaryAgProduct, "name")
}

df_normalized = df.copy()

with Session(engine) as db:
    for df_col, (model, model_name_attr) in NORMALIZE_COLUMNS.items():
        if df_col not in df_normalized.columns:
            continue

        df_normalized = replace_name_with_id_df(
            db=db,
            df=df_normalized,
            ref_model=model,
            df_name_column=df_col,
            model_name_attr=model_name_attr,
            id_column_name="id",
            final_column_name=f"{df_col}_id",
        )

    db.commit()

df_normalized.head()


  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df.replace("", np.nan, inplace=True)


NameError: name 'Resource' is not defined

In [10]:
dataframes = [df, df2, df3]

clean_dataframes = [clean_the_gsheets(df) for df in dataframes]

clean_dataframes[2].head()


  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df.replace("", np.nan, inplace=True)
  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df.replace("", np.nan, inplace=True)


Unnamed: 0,cmp_uuid_033,record_id,prepared_sample,resource,preparation_method,storage_cond,exper_abbrev,repl_no,repl_id,parameter,...,unit,created_at,updated_at,qc_result,note,analysis_type,equipment,raw_data_url,analyst_email,upload_status
0,3ee2993d-86e3-1f16-c7ea-f8d555e114,(85)e114,oak-tmpm01o(85),tomato pomace,oven dry,rt vacuum sealed,cmp04xk,1.0,cmp04xk(85)1,glucan,...,% dry weight,2025-01-23 09:00:01,NaT,pass,,compositional analysis,,,xkang2@lbl.gov,ready
1,46878ef9-1226-22a0-d5d8-cf65e241cb,(85)41cb,oak-tmpm01o(85),tomato pomace,oven dry,rt vacuum sealed,cmp04xk,2.0,cmp04xk(85)2,glucan,...,% dry weight,2025-01-23 09:00:16,NaT,pass,,compositional analysis,,,xkang2@lbl.gov,ready
2,76a7a2f4-c4e4-e60f-1187-dec6e02246,(85)2246,oak-tmpm01o(85),tomato pomace,oven dry,rt vacuum sealed,cmp04xk,3.0,cmp04xk(85)3,glucan,...,% dry weight,2025-01-23 09:00:31,NaT,pass,,compositional analysis,,,xkang2@lbl.gov,ready
3,7a136832-286b-07cb-62de-acf52f9311,(85)9311,oak-tmpm01o(85),tomato pomace,oven dry,rt vacuum sealed,cmp04xk,1.0,cmp04xk(85)1,glucose,...,% dry weight,2025-01-23 09:00:46,NaT,pass,,compositional analysis,,,xkang2@lbl.gov,ready
4,b709ecee-f9a6-a55d-a59e-93b7b863d7,(85)63d7,oak-tmpm01o(85),tomato pomace,oven dry,rt vacuum sealed,cmp04xk,2.0,cmp04xk(85)2,glucose,...,% dry weight,2025-01-23 09:01:01,NaT,pass,,compositional analysis,,,xkang2@lbl.gov,ready


In [None]:
from sqlalchemy.orm import Session
from sqlalchemy import select
import pandas as pd
import os
import sys

# --- project root discovery (unchanged) ---
path = os.getcwd()
project_root = None
while path != os.path.dirname(path):
    if 'pixi.toml' in os.listdir(path):
        project_root = path
        break
    path = os.path.dirname(path)

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- imports ---
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.engine import engine
from src.ca_biositing.datamodels.ca_biositing.datamodels.schemas.generated.ca_biositing import PrimaryAgProduct

# --- query + dataframe ---
with Session(engine) as db:
    stmt = select(*PrimaryAgProduct.__table__.columns)
    rows = db.execute(stmt).mappings().all()

df = pd.DataFrame(rows)

df.head()


In [None]:
summary_stats = clean_dataframes[0].\
    groupby(['resource', 'parameter'])['value'].\
        agg(['mean', 'median', 'min', 'max', 'std', 'count'])

summary_stats

In [None]:
clean_dataframes[0][['resource', 'parameter', 'value', 'unit']].\
    groupby(['resource', 'parameter', 'unit'], as_index=False).\
    agg({'value': 'mean'}).\
    query('value > 30').\
    sort_values(by='value', ascending=False).\
    round({'value': 1})


In [17]:
list_of_param = ("Moisture", "Total solids", "Ash")

def is_it_volatile_solids(df):
    df['check'] = "VS"

    df.loc[df['parameter'].isin(list_of_param), 'check'] = "In list"
    return df

is_it_volatile_solids(df)

df[['check', 'parameter']]



KeyError: 'parameter'

In [None]:

#This defines a function to calculate the square root of the 'value' column in a DataFrame
def sqrtvalue(df):
    df = df.assign(sqrtvalue = df['value'] ** 0.5)
    return df

#List comprehension to apply sqrtvalue to each DataFrame
clean_rooted_df = [sqrtvalue(df) for df in clean_dataframes]

# Display the head of the third DataFrame
clean_rooted_df[2].head()

In [None]:
cmpana_raw = cmpana.extract(project_root=project_root)

cmpana_raw.head()

In [20]:
from sqlalchemy.orm import Session
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.engine import engine
from src.ca_biositing.datamodels.ca_biositing.datamodels.schemas.generated.ca_biositing import *
from src.ca_biositing.pipeline.ca_biositing.pipeline.utils.name_id_swap import (
    replace_name_with_id_df,
)   

#This extractst the raw proximate data
df = cmpana.extract(project_root=project_root)

#this cleans the names to lowercase and parses data into a standard format. Also renames the column to match with what will be in the database
test_df = clean_the_gsheets(df).rename(columns={'parameter': 'name'})

#this replaces the names with IDs
with Session(engine) as db:
    parameter_ids = replace_name_with_id_df(
        db=db,
        df=test_df,
        ref_model=Parameter,
        name_column_name="name",   # column in df + table
        id_column_name="id",            # PK column in table
        final_column_name="parameter_id"
    )

##I EVENTUALLY WANT SOME LOGS ABOUT HOW MANY WERE ADDED, HOW MANY RETRIEVED, ETC. MAYBE PUT THAT IN THE 
#resource_id_mapping = df_with_ids.rename(columns={"id": "resource_id"})

#resource_id_mapping


  df = df.applymap(lambda s: s.lower() if isinstance(s, str) else s)
  df.replace("", np.nan, inplace=True)


TypeError: replace_name_with_id_df() got an unexpected keyword argument 'name_column_name'. Did you mean 'id_column_name'?

In [None]:
parameter_ids


In [None]:
import numpy as np
from prefect import get_run_logger

resource = pd.read_sql("SELECT id, name FROM resource", con=engine)

resource['name'] = resource['name'].str.lower()

resource['name'] = resource['name'].replace('', np.nan)
resource.dropna(subset=['name'], inplace=True)


resource


In [None]:
resource = pd.read_sql("SELECT id, name FROM resource", con=engine)

#this converts the entire dataframe to lowercase
df = df.map(lambda x: x.lower() if isinstance(x, str) else x)

print(df)

In [None]:
field_sample = pd.read_sql_query("SELECT * FROM field_sample", con=engine)

field_sample = field_sample[['id', 'name', 'resource_id']]

field_sample

In [None]:
#This is a get_or_create type module for data normalization.

# Extract a df from a gsheet
df = cmpana.extract(project_root=project_root)

# Cleans the df names and coerces data types
df = clean_the_gsheets(df)

# Replace empty strings with NaN
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)


# These are columns that need to be normalized, AKA replaced with IDs.
# This is a mapping that has first, what it is called in pandas "resource"
# then, the SQLAlchemy model "Resource", and then what it is called in the
# database "name"


NORMALIZE_COLUMNS = {
    "resource": (Resource, "name"),
    "prepared_sample": (PreparedSample, "name"),
    "preparation_method": (PreparationMethod, "name"),
    "parameter": (Parameter, "name"),
    "unit": (Unit, "name"),
    "analyst_email": (Contact, "email"),
    "analysis_type": (AnalysisType, "name"),
    "primary_ag_product": (PrimaryAgProduct, "name")
}

df_normalized = df.copy()

with Session(engine) as db:
    for df_col, (model, model_name_attr) in NORMALIZE_COLUMNS.items():
        if df_col not in df_normalized.columns:
            continue

        df_normalized = replace_name_with_id_df(
            db=db,
            df=df_normalized,
            ref_model=model,
            df_name_column=df_col,
            model_name_attr=model_name_attr,
            id_column_name="id",
            final_column_name=f"{df_col}_id",
        )

    db.commit()

df_normalized.head()
