In [None]:
# Cell 1: Imports
import os
import sqlite3
from functools import reduce

import numpy as np
import pandas as pd
import yaml
from icoscp import cpauth
from icoscp.cpb.dobj import Dobj
from icoscp_core.icos import bootstrap



In [None]:
# Cell 2: Read YAML config
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
OSVAS='/home/pn56/OSVASgh/'  # Main OSVAS path
Station_name='Loobos'
os.chdir(OSVAS)  # every relative path used below is resolved against OSVAS

CONFIG_PATH = f"config_files/Stations/{Station_name}.yml"

with open(CONFIG_PATH, "r") as f:
    config = yaml.safe_load(f)

station_info = config["Station_metadata"]
validation_data = config["Validation_data"]

# Validation window, parsed as timezone-aware (UTC) timestamps.
start_date = pd.to_datetime(validation_data["validation_start"], utc=True)
end_date = pd.to_datetime(validation_data["validation_end"], utc=True)

# Every key of the validation section that begins with "dataset" is kept as
# one dataset entry.
datasets = {k: v for k, v in validation_data.items() if k.startswith("dataset")}

# Get access to ICOS
# Authenticate using cookie (adjust if needed)
cookie_path = "icos_cookie.txt"  # Or point to config
# Read the token inside a context manager so the file handle is closed again.
with open(cookie_path, "r") as cookie_file:
    cookie_token = cookie_file.readline().strip()
meta, data = bootstrap.fromCookieToken(cookie_token)
cpauth.init_by(data.auth)

#Test: If the authentication went well, these lines of code will not fail:
# NOTE(review): this import rebinds the name `Dobj` (first imported from
# icoscp.cpb.dobj in the imports cell), so later calls to `Dobj` use
# icoscp.dobj.Dobj. Confirm that this is intended before moving it.
import icoscp
from icoscp.dobj import Dobj
obj_flux='https://meta.icos-cp.eu/objects/dDlpnhS3XKyZjB22MUzP_nAm'
dobj_flux=Dobj(obj_flux).data

In [None]:
# Cell 3: Helper functions

def fetch_flux_data(doi):
    """Return the data of the data object identified by ``doi``.

    The identifier is passed unchanged to ``Dobj``; the value of its ``data``
    attribute is returned as-is.
    """
    return Dobj(doi).data

def process_data(df, variable_map, station_info, start, end):
    """Cut a raw dataset down to the validation window and rename its columns.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a ``TIMESTAMP`` column and every source column
        named in ``variable_map``.
    variable_map : dict
        Maps output column name -> source column name in ``df``.
    station_info : dict
        Must provide the keys ``SID``, ``lat``, ``lon`` and ``elev``; each
        value is written to a column of the same name.
    start, end : timezone-aware timestamps
        Inclusive bounds of the time window (compared against UTC timestamps).

    Returns
    -------
    pandas.DataFrame
        Columns ``valid_dttm, SID, lat, lon, elev`` followed by the keys of
        ``variable_map``. Rows with a missing value in any mapped source
        column are dropped.

    Raises
    ------
    KeyError
        If ``TIMESTAMP`` or a mapped source column is absent from ``df``.
    """
    source_vars = list(variable_map.values())
    missing = [col for col in source_vars if col not in df.columns]
    if missing:
        raise KeyError(f"Source column(s) missing from dataset: {missing}")

    # Parse timestamps into a separate Series so the caller's frame is not
    # modified as a side effect.
    valid_dttm = pd.to_datetime(df['TIMESTAMP'], utc=True)
    in_window = (valid_dttm >= start) & (valid_dttm <= end)

    processed = (
        df.loc[in_window]
        .assign(valid_dttm=valid_dttm[in_window])
        # Drop rows with missing required vars
        .dropna(subset=source_vars)
        # Add station metadata
        .assign(
            SID=station_info['SID'],
            lat=station_info['lat'],
            lon=station_info['lon'],
            elev=station_info['elev'],
        )
        # Rename variables: source column name -> output column name
        .rename(columns={v: k for k, v in variable_map.items()})
    )

    selected_columns = ['valid_dttm', 'SID', 'lat', 'lon', 'elev'] + list(variable_map.keys())
    return processed[selected_columns]

def upsample_to_common_timedelta(datasets, dfs, common_td):
    """Bring every frame onto the time step ``common_td``.

    Parameters
    ----------
    datasets : dict
        Maps dataset name -> settings dict whose ``"timedelta"`` entry is the
        dataset's time step in minutes. Iteration order must match ``dfs``.
    dfs : list of pandas.DataFrame
        One frame per dataset, each with a ``valid_dttm`` column.
    common_td : pandas.Timedelta
        Target time step.

    Returns
    -------
    list of pandas.DataFrame
        Frames whose own step already equals ``common_td`` are passed through
        unchanged (same object); all others are resampled to ``common_td`` and
        filled by linear interpolation.
    """

    def _to_common_grid(name, frame):
        # Time step of this dataset as declared in its settings.
        native_td = pd.to_timedelta(datasets[name]["timedelta"], unit="m")
        if native_td == common_td:
            return frame
        return (
            frame.set_index("valid_dttm")
            .resample(common_td)
            .interpolate(method="linear")
            .reset_index()
        )

    return [_to_common_grid(name, frame) for name, frame in zip(datasets.keys(), dfs)]



In [None]:
# Cell 4: Main loop over datasets, abort if no data in range
# For each configured dataset: fetch the data, reduce it to the validation
# window via process_data, and collect the result. `dfs` holds one processed
# frame per dataset and `timedeltas` the matching time steps; both are rebuilt
# from scratch each time this cell runs.
dfs = []
timedeltas = []

for ds_name, ds_info in datasets.items():
    print(f"Processing {ds_name} from DOI: {ds_info['doi']}")
    doi = ds_info["doi"]
    timedelta_minutes = ds_info["timedelta"]
    # Recorded before fetching, so if this iteration raises, `timedeltas`
    # holds one more entry than `dfs`.
    timedeltas.append(timedelta_minutes)

    # Keep only the variables that have a source column configured (non-None).
    variable_map = {k: v for k, v in ds_info["variables"].items() if v is not None}

    # Fetch and process
    df_raw = fetch_flux_data(doi)
    df_processed = process_data(df_raw, variable_map, station_info, start_date, end_date)

    # Abort if no data in the specified time window
    if df_processed.empty:
        raise RuntimeError(
            f"❌ No data found in the time window ({start_date} to {end_date}) "
            f"for dataset {ds_name} (DOI: {doi}). Aborting."
        )

    dfs.append(df_processed)


In [None]:
# Cell 5: Harmonize resolution
# The smallest configured time step (in minutes) is the finest resolution;
# datasets with a different step are resampled to it.
common_td = pd.to_timedelta(min(timedeltas), unit="m")
dfs_resampled = upsample_to_common_timedelta(
    datasets=datasets,
    dfs=dfs,
    common_td=common_td,
)



In [None]:
# Cell 6: Merge all datasets
# Outer-join the resampled frames on timestamp + station metadata so that a
# timestamp present in any dataset is kept. `reduce` comes from the imports
# cell at the top of the notebook.
merge_keys = ['valid_dttm', 'SID', 'lat', 'lon', 'elev']
df_merged = reduce(
    lambda left, right: pd.merge(left, right, on=merge_keys, how='outer'),
    dfs_resampled,
)
df_merged = df_merged.sort_values("valid_dttm").reset_index(drop=True)

# Convert to Unix timestamp in seconds
# Vectorised epoch conversion: subtract the UTC epoch and floor-divide by one
# second. This does not depend on the datetime storage unit and avoids a
# Python-level call per row.
df_merged["valid_dttm"] = pd.to_datetime(df_merged["valid_dttm"], utc=True)
unix_epoch = pd.Timestamp("1970-01-01", tz="UTC")
df_merged["valid_dttm"] = (df_merged["valid_dttm"] - unix_epoch) // pd.Timedelta(seconds=1)


In [None]:
# Cell 7: Save to SQLite
# Writes the merged frame to table SYNOP in a per-year SQLite file, then
# rebuilds the table so that every column is declared REAL.
year = pd.to_datetime(start_date).year
output_dir = f"sqlites/data/observations/{station_info['Station_name']}"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, f"OBSTABLE_{year}.sqlite")

# `with conn:` only commits or rolls back the transaction; it does not close
# the connection, so close it explicitly in `finally`.
conn = sqlite3.connect(output_file)
try:
    with conn:
        df_merged.to_sql("SYNOP", conn, if_exists="replace", index=False)

        conn.execute("DROP TABLE IF EXISTS tmp")
        # Quote each identifier (doubling embedded quotes) so column names that
        # are SQL keywords or contain special characters stay valid.
        columns = ',\n'.join(
            '"{}" REAL'.format(str(col).replace('"', '""')) for col in df_merged.columns
        )
        conn.execute(f"CREATE TABLE tmp ({columns})")
        conn.execute("INSERT INTO tmp SELECT * FROM SYNOP")
        conn.execute("DROP TABLE SYNOP")
        conn.execute("ALTER TABLE tmp RENAME TO SYNOP")
finally:
    conn.close()

print(f"✅ Data written to {output_file}")