In [None]:
import os
import sys
import pandas as pd
import janitor as jn
from IPython.display import display

# --- Robustly find the project root ---
# The project root is the nearest directory -- starting at the current working
# directory and walking upwards -- that contains a 'pixi.toml' file.
path = os.getcwd()
project_root = None
while True:
    if os.path.isfile(os.path.join(path, 'pixi.toml')):
        project_root = path
        break
    parent = os.path.dirname(path)
    if parent == path:
        # dirname() of the filesystem root is the root itself; the root has
        # already been checked above, so there is nowhere left to look.
        break
    path = parent

if not project_root:
    raise FileNotFoundError("Could not find project root containing 'pixi.toml'.")

# --- Add project root to sys.path ---
# Needed so the 'src....' package path below can be imported from the notebook.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added project root '{project_root}' to sys.path")
else:
    print(f"Project root '{project_root}' is already in sys.path")

# --- Import the module ---
# Print the diagnostics, then re-raise: every later cell depends on this import,
# so continuing would only fail further down with a less helpful error.
try:
    from src.ca_biositing.pipeline.ca_biositing.pipeline.etl.extract import proximate
except ImportError as e:
    print(f"Failed to import 'proximate' module: {e}")
    print(f"\nFull sys.path: {sys.path}")
    raise
else:
    print("Successfully imported 'proximate' module.")

# --- Run the extraction ---
# Same policy as the import: report the problem and stop, so that `df` is never
# left undefined (or stale from a previous kernel run) for the cells below.
try:
    # Pass the project_root to the extract function
    df = proximate.extract(project_root=project_root)
except Exception as e:
    print(f"\nAn error occurred during extraction: {e}")
    raise

if df is None:
    raise RuntimeError("Extraction returned no data. Check the logs above for errors.")

print("\nSuccessfully extracted data.")
display(df.head())

In [None]:
# Display the result of calling clean_names() on df (presumably the DataFrame
# method made available by the `janitor` import -- confirm).
# NOTE(review): the result is not assigned to anything, and the next cell still
# indexes `df` by the original capitalised column names, so `df` is presumably
# left unchanged by this call -- confirm.
df.clean_names()

In [None]:
# Show only the record, source and prepared-sample columns (all rows).
df.loc[:, ['Record_ID', 'Source_codename', 'Prepared_sample']]

In [None]:
## Convert 'Value' column to numeric
# Select the two columns and convert 'Value' in a single expression.
# `.assign` returns a new DataFrame, so no column is written into a selection
# taken from `df` (which is what triggers pandas' SettingWithCopyWarning).
# errors='coerce' turns entries that cannot be parsed as numbers into NaN.
pv_df = df[['Parameter', 'Value']].assign(
    Value=lambda frame: pd.to_numeric(frame['Value'], errors='coerce')
)

In [None]:
# Mean of the remaining columns per 'Parameter'; computed once and reused below.
parameter_means = pv_df.groupby('Parameter').mean()

# groupby(...).mean() always returns a DataFrame (never None), so check for
# real content instead: an empty or all-NaN result means there was nothing
# numeric to average.
if parameter_means.notna().any().any():
    print("Mean values by Parameter:")
    display(parameter_means)
else:
    print("No numeric values available to compute means.")

In [None]:
# Working frame for the per-resource statistics: keep four columns and convert
# 'Value' to numeric (unparsable entries become NaN via errors='coerce').
# `.assign` returns a new DataFrame, so no column is written into a selection
# taken from `df` (which is what triggers pandas' SettingWithCopyWarning).
df2 = df[['Resource', 'Parameter', 'Value', 'Unit']].assign(
    Value=lambda frame: pd.to_numeric(frame['Value'], errors='coerce')
)

In [None]:
# Show df2 (the Resource / Parameter / Value / Unit selection) as the cell output.
df2

In [None]:
# Keep only the rows whose 'Value' is present (not NaN / null).
has_value = df2['Value'].notna()
df2 = df2[has_value]

In [None]:
# Descriptive statistics of 'Value' for every (Resource, Parameter) pair.
stat_names = ['mean', 'median', 'min', 'max', 'std', 'count']
values_by_group = df2.groupby(['Resource', 'Parameter'])['Value']
summary_stats = values_by_group.agg(stat_names)

In [None]:
# Show the per-(Resource, Parameter) statistics table as the cell output.
summary_stats