In [1]:
# %pip install earthaccess

#SETUP
import re

import earthaccess
import numpy as np
import xarray as xr
from PIL import Image, ImageEnhance

In [2]:
#PARAMETERS
# Search settings consumed by the earthaccess.search_data call in the
# data-search cell.
# Passed as `temporal`: (start date, end date) strings.
tspan = ("2024-09-22", "2024-09-28")
# Passed as `bounding_box`.
# NOTE(review): presumably (west, south, east, north) in decimal degrees —
# confirm against the earthaccess documentation.
bbox = (-125, 32, -116, 38)
# Passed as `cloud_cover`; presumably a (min, max) percentage range — confirm.
clouds = (0, 50)

In [3]:
#DATA SEARCH
# Query for PACE OCI L2 AOP granules matching the parameters cell.
results = earthaccess.search_data(
    short_name="PACE_OCI_L2_AOP",
    temporal=tspan,
    bounding_box=bbox,
    cloud_cover=clouds,
)
# Fail with a readable message instead of an IndexError on paths[0] below
# when the search comes back empty.
if not results:
    raise RuntimeError(
        "earthaccess.search_data returned no granules; "
        "widen tspan, bbox or clouds and re-run."
    )

# Open every granule as a file-like object, then load only the first one:
# read its groups as a tree and merge them into a single flat dataset.
paths = earthaccess.open(results)
datatree = xr.open_datatree(paths[0])
dataset = xr.merge(datatree.to_dict().values())

QUEUEING TASKS | :   0%|          | 0/8 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/8 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/8 [00:00<?, ?it/s]

In [4]:
#results

In [4]:
# Number of granules returned by the search above.
len(results)

8

In [5]:
# Display the first granule of the search results.
results[0]

In [None]:
#pip install https://seabass.gsfc.nasa.gov/wiki/seabass_tools/sb_utilities-0.0.2.tar.gz


In [6]:
# List l2 flags, then build them into a dict.
# Order matters: the position of each name in this list is used as its bit
# position when L2_FLAGS is built (1 << index), so entries must not be
# reordered or removed. "SPARE" marks the unnamed positions.
l2_flags_list = [
    "ATMFAIL",
    "LAND",
    "PRODWARN",
    "HIGLINT",
    "HILT",
    "HISATZEN",
    "COASTZ",
    "SPARE",
    "STRAYLIGHT",
    "CLDICE",
    "COCCOLITH",
    "TURBIDW",
    "HISOLZEN",
    "SPARE",
    "LOWLW",
    "CHLFAIL",
    "NAVWARN",
    "ABSAER",
    "SPARE",
    "MAXAERITER",
    "MODGLINT",
    "CHLWARN",
    "ATMWARN",
    "SPARE",
    "SEAICE",
    "NAVFAIL",
    "FILTER",
    "SPARE",
    "BOWTIEDEL",
    "HIPOL",
    "PRODFAIL",
    "SPARE",
]

In [7]:
# Map each flag name to its single-bit mask (bit position = list index).
# Because "SPARE" appears several times in l2_flags_list, the dict keeps only
# one "SPARE" key, holding the mask of the last SPARE position.
L2_FLAGS = {flag: 1 << idx for idx, flag in enumerate(l2_flags_list)}

# Bailey and Werdell 2006 exclusion criteria.
# Only the uncommented names are applied when get_fivebyfive builds its
# exclusion mask; as written that is CLDICE alone, so the full published
# criteria are NOT in effect until the other entries are re-enabled.
EXCLUSION_FLAGS = [
    #"LAND",
    #"HIGLINT",
    #"HILT",
    #"STRAYLIGHT",
    "CLDICE",
    #"ATMFAIL",
    #"LOWLW",
    #"FILTER",
    #"NAVFAIL",
    #"NAVWARN",
]


In [8]:
# Short names for earthaccess lookup.
# Keys are the values accepted by the `sat` argument of get_sat_ts_matchups;
# each value is passed to earthaccess.search_data as `short_name`.
SAT_LOOKUP = {
    "PACE_AOP": "PACE_OCI_L2_AOP",
}


In [9]:
##---------------------------------------------------------------------------##
#                 Load the OCI sensor file and return F0.                     #
##---------------------------------------------------------------------------##

def get_f0(wavelengths=None, window_size=10, sensor_file=None):
    """Load the OCI sensor file and return F0.

    Defaults to returning the full table. When `wavelengths` is given, F0 is
    band-averaged around each requested wavelength with `bandpass_avg`.

    Parameters
    ----------
    wavelengths : array-like, optional
        Wavelengths at which to compute the average irradiance.
        If None, returns the full wavelength and irradiance table.
    window_size : int, optional
        Bandpass filter size for mean filtering to selected wavelengths, in nm.
    sensor_file : str or pathlib.Path, optional
        Path to the OCI satellite sensor file containing wavelengths and F0.
        If None, the module-level ``OCI_SENSOR_FILE`` is used.

    Returns
    -------
    tuple of np.ndarray
        A tuple containing:
        - f0_spectra : np.ndarray
            The extraterrestrial solar irradiance, in uW/cm^2/nm.
        - f0_wave : np.ndarray
            The corresponding wavelengths, in nm.

    Raises
    ------
    ValueError
        If the sensor file has no ``Nbands`` entry.

    """
    if sensor_file is None:
        sensor_file = OCI_SENSOR_FILE

    # Read the file once and keep only "key = value" lines, split on the
    # first "=" so a stray "=" in the value cannot break the unpacking.
    with open(sensor_file, "r") as file_in:
        entries = [line.split("=", 1) for line in file_in if "=" in line]

    nbands = None
    for key, value in entries:
        if "Nbands" in key:
            nbands = int(value)
            break
    if nbands is None:
        raise ValueError(f"No 'Nbands' entry found in sensor file: {sensor_file}")

    wl = np.zeros(nbands, dtype=float)
    f0 = np.zeros(nbands, dtype=float)
    for key, value in entries:
        if "Lambda" in key:
            # The first integer in the key is the 1-based band number.
            band = int(re.findall(r"\d+", key)[0]) - 1
            wl[band] = float(value)
        if "F0" in key:
            # The "0" of "F0" is itself matched as a number, so the band
            # number is the second integer found in the key.
            band = int(re.findall(r"\d+", key)[1]) - 1
            f0[band] = float(value)

    if wavelengths is not None:
        f0_wave = np.array(wavelengths)
        f0_spectra = bandpass_avg(f0, wl, window_size, f0_wave)
    else:
        f0_wave = wl
        f0_spectra = f0

    return f0_spectra, f0_wave


In [10]:
##---------------------------------------------------------------------------##
#                 Apply a band-pass filter to the data.                       #
##---------------------------------------------------------------------------##
def bandpass_avg(
        data,
        input_wavelengths,
        window_size=10,
        target_wavelengths=None
        ):
    """Apply a band-pass (boxcar mean) filter to the data.

    For every target wavelength, averages all input bands whose wavelength
    lies within ``window_size / 2`` of it (inclusive on both sides), ignoring
    NaNs. Targets with no input band in range are returned as NaN.

    Parameters
    ----------
    data : array-like
        1D or 2D array containing the spectral data (samples x wavelengths).
        If 1D, it's assumed to be a single sample.
    input_wavelengths : array-like
        1D wavelength values corresponding to the columns of data.
    window_size : int, optional
        Size of the window to use for averaging. Default is 10 nm.
    target_wavelengths : array-like, optional
        1D target wavelengths for filtered values.
        If None, the input wavelengths are used.

    Returns
    -------
    np.ndarray
        2D array (samples x targets) of filtered data, or a 1D array when
        there is a single sample.

    """
    data = np.atleast_2d(data)
    # Coerce to arrays so plain lists/tuples (and a scalar target) work with
    # the element-wise comparisons below.
    input_wavelengths = np.atleast_1d(input_wavelengths)
    if target_wavelengths is None:
        target_wavelengths = input_wavelengths
    else:
        target_wavelengths = np.atleast_1d(target_wavelengths)

    half_window = window_size / 2
    num_samples = data.shape[0]
    filtered_data = np.full((num_samples, len(target_wavelengths)), np.nan)

    for idx, target_wl in enumerate(target_wavelengths):
        cols_in_range = np.flatnonzero(
            (input_wavelengths >= target_wl - half_window)
            & (input_wavelengths <= target_wl + half_window)
        )
        if cols_in_range.size > 0:
            filtered_data[:, idx] = np.nanmean(data[:, cols_in_range], axis=1)

    return filtered_data if num_samples > 1 else filtered_data.flatten()

In [11]:
##---------------------------------------------------------------------------##
#         Process a dataframe to create a dictionary of data products.        #
##---------------------------------------------------------------------------##

def get_column_prods(df, type_prefix):
    """Group a dataframe's product columns by product name.

    Columns are expected to look like ``<type_prefix>_<product>[<wavelength>]``;
    anything that does not match is ignored.

    Parameters
    ----------
    df : pandas DataFrame
        Table whose column names are scanned.
    type_prefix : str
        Prefix to identify the product columns, e.g. "aoc"

    Returns
    -------
    dict
        ``{product: {"wavelengths": [...], "columns": [...]}}``. Wavelengths
        are stored as int, or float when the column name contains a decimal
        point; columns without a trailing number add no wavelength.

    """
    matcher = re.compile(rf"{type_prefix}_(\w+?)(\d*\.?\d+)?$")
    products = {}

    for column in df.columns:
        hit = matcher.match(column)
        if hit is None:
            continue

        name, wl_text = hit.groups()
        entry = products.setdefault(name, {"wavelengths": [], "columns": []})
        entry["columns"].append(column)

        if not wl_text:
            continue
        parsed = float(wl_text) if "." in wl_text else int(wl_text)
        entry["wavelengths"].append(parsed)

    return products


In [12]:
##---------------------------------------------------------------------------##
#                Read SeaBASS file and returns just the data.                 #
##---------------------------------------------------------------------------##

import pandas as pd
import builtins

def read_sb(filename_sb):
    """Read a SeaBASS .sb file and attach its station metadata to every row.

    Parses the ``/key=value`` header, loads the data section with the column
    names from ``/fields`` (honouring ``/delimiter`` and ``/missing``), runs
    ``get_sb_datetime`` on the table, then adds ``profile_lat``,
    ``profile_lon`` and ``profile_time`` columns taken from the header.

    Parameters
    ----------
    filename_sb : str or path-like
        Path to the SeaBASS file.

    Returns
    -------
    pandas.DataFrame
        The data section plus the three ``profile_*`` columns.

    Raises
    ------
    ValueError
        If the file has no ``/end_header`` line.
    KeyError
        If a header needed here (fields, north_latitude, east_longitude,
        start_date, start_time) is absent.
    """
    # 1) Load all lines
    with builtins.open(filename_sb, "r") as f:
        lines = [line.rstrip("\n") for line in f]

    # 2) Find where the header ends (tolerating surrounding whitespace and
    #    trailing decoration after the keyword)
    endh = next(
        (i for i, line in enumerate(lines)
         if line.strip().startswith("/end_header")),
        None,
    )
    if endh is None:
        raise ValueError(
            f"No '/end_header' line found in SeaBASS file: {filename_sb}"
        )

    # 3) Parse header into a dict, but only lines with "/" **and** "="
    headers = {}
    for line in lines[:endh]:
        if not line.startswith("/") or "=" not in line:
            continue
        key, val = line[1:].split("=", 1)  # strip "/" then split
        headers[key.strip()] = val.strip()

    required = ("fields", "north_latitude", "east_longitude",
                "start_date", "start_time")
    absent = [key for key in required if key not in headers]
    if absent:
        raise KeyError(
            f"SeaBASS header of {filename_sb} is missing: {', '.join(absent)}"
        )

    # 4) Read the data portion into a DataFrame using the declared delimiter;
    #    files without a /delimiter header are read as comma-separated.
    separators = {"comma": ",", "tab": "\t", "space": r"\s+"}
    sep = separators.get(headers.get("delimiter", "comma").lower(), ",")
    df = pd.read_csv(
        filename_sb,
        skiprows=endh + 1,
        names=[name.strip() for name in headers["fields"].split(",")],
        sep=sep,
        na_values=headers.get("missing", "")
    )

    # 5) Build the datetime index when the table has date/time columns
    get_sb_datetime(df)

    # 6) Extract & clean metadata from headers
    #    Strip off any "[...]" unit suffix before converting to float
    lat = float(headers["north_latitude"].split("[", 1)[0])
    lon = float(headers["east_longitude"].split("[", 1)[0])

    #    Strip the "[...]" suffix (e.g. "[GMT]") from the time field
    time_str = headers["start_time"].split("[", 1)[0]
    dt0 = pd.to_datetime(headers["start_date"] + " " + time_str)

    # 7) Attach them as new columns on every row
    df["profile_lat"] = lat
    df["profile_lon"] = lon
    df["profile_time"] = dt0

    return df



In [13]:
##---------------------------------------------------------------------------##
#     Parse datetime from different combinations of dates and times.          #
##---------------------------------------------------------------------------##

def get_sb_datetime(df):
    """Build a "datetime" index in place from the date/time columns of df.

    The column combinations are tried in a fixed order; the first one fully
    present wins. If none is present a notice is printed and df is left
    untouched. Nothing is returned either way.
    """
    def has(*names):
        # True when every named column exists in df.
        return all(name in df.columns for name in names)

    if has("year", "month", "day", "hour", "minute", "second"):
        parts = ["year", "month", "day", "hour", "minute", "second"]
        stamps = pd.to_datetime(df[parts])
    elif has("year", "month", "day", "time"):
        ymd = (
            df["year"].astype(str)
            + df["month"].astype(str).str.zfill(2)
            + df["day"].astype(str).str.zfill(2)
        )
        stamps = pd.to_datetime(ymd + ' ' + df["time"])
    elif has("date", "time"):
        stamps = pd.to_datetime(df["date"].astype(str) + ' ' + df["time"])
    elif has("year", "month", "day"):
        stamps = pd.to_datetime(df[["year", "month", "day"]])
    elif has("date", "hour", "minute", "second"):
        date_part = df["date"].astype(str) + ' '
        hh = df["hour"].astype(str).str.zfill(2)
        mm = df["minute"].astype(str).str.zfill(2)
        ss = df["second"].astype(str).str.zfill(2)
        stamps = pd.to_datetime(date_part + hh + ':' + mm + ':' + ss)
    else:
        print("Unrecognized date/time format in DataFrame columns."
              "\nMay be a profile, but doublecheck.")
        return

    # Store the parsed timestamps and make them the index of the same frame
    df["datetime"] = stamps
    df.set_index("datetime", inplace=True)


In [14]:

# Full path of one Shearwater above-water radiometry file, assembled from
# adjacent string literals (directory pieces first, file name last).
file_path = (
    '/home/jovyan/shared-public/pace-hackweek/SeePACE/'
    'Hackweek_PACE-PAX_Rrs/NRL/PACE-PAX/'
    'PACE-PAX_Shearwater/archive/'
    'PVST_POL_PACE-PAX_Shearwater_above_water_radiometry_nflh_NRL_20240906_St_1_R1.sb'
)

# Parse the file with the metadata-aware reader and display the table.
df = read_sb(file_path)
df

Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.


Unnamed: 0,Wavelength,Rrs,Es,Lt,Lsky,Rrs_sd,Es_sd,Lt_sd,Lsky_sd,profile_lat,profile_lon,profile_time
0,350,0.003501,41.319572,0.374287,5.454650,0.000178,2.223760,0.012333,0.103090,34.2247,-119.6792,2024-09-06 21:11:00
1,351,0.003514,41.645705,0.373883,5.404339,0.000207,2.262834,0.013471,0.102088,34.2247,-119.6792,2024-09-06 21:11:00
2,352,0.003513,41.640951,0.372514,5.372438,0.000213,2.254194,0.014639,0.102735,34.2247,-119.6792,2024-09-06 21:11:00
3,353,0.003627,42.019692,0.377753,5.351352,0.000088,2.246336,0.012535,0.103807,34.2247,-119.6792,2024-09-06 21:11:00
4,354,0.003683,42.871810,0.382332,5.329134,0.000036,2.250763,0.010715,0.104403,34.2247,-119.6792,2024-09-06 21:11:00
...,...,...,...,...,...,...,...,...,...,...,...,...
721,1071,0.000393,50.975482,0.032284,0.245038,0.001673,1.892287,0.089971,0.035988,34.2247,-119.6792,2024-09-06 21:11:00
722,1072,0.000143,50.493906,0.020623,0.272989,0.001141,0.874190,0.060150,0.064116,34.2247,-119.6792,2024-09-06 21:11:00
723,1073,-0.000226,49.964856,0.003168,0.298876,0.001101,0.769092,0.055973,0.097953,34.2247,-119.6792,2024-09-06 21:11:00
724,1074,0.000165,50.412369,0.023002,0.303368,0.001666,3.064564,0.086094,0.047056,34.2247,-119.6792,2024-09-06 21:11:00


In [15]:
# df is the single-profile table returned by read_sb above: one row per
# wavelength, with a "Wavelength" column, the measured quantities and the
# profile_* metadata columns.
# Transpose it so each wavelength becomes a column and each quantity a row.
df_wide = df.set_index("Wavelength").T

print(df_wide.shape)   # → (11, 726) for the profile loaded above
df_wide.index.name = None          # drop the index name if you like
df_wide.columns.name = "λ (nm)"    # optional: name the wavelength axis

df_wide


(11, 726)


λ (nm),350,351,352,353,354,355,356,357,358,359,...,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075
Rrs,0.003501,0.003514,0.003513,0.003627,0.003683,0.003659,0.003737,0.003935,0.003821,0.003871,...,0.00055,0.00012,-0.001066,-0.000936,-0.000392,0.000393,0.000143,-0.000226,0.000165,-0.000296
Es,41.319572,41.645705,41.640951,42.019692,42.87181,43.295639,43.558171,43.016972,44.34717,44.325722,...,50.073565,49.766057,48.425902,48.972293,49.99225,50.975482,50.493906,49.964856,50.412369,49.457887
Lt,0.374287,0.373883,0.372514,0.377753,0.382332,0.383283,0.386338,0.390555,0.390681,0.392049,...,0.043926,0.022312,-0.036569,-0.031821,-0.006909,0.032284,0.020623,0.003168,0.023002,-0.001708
Lsky,5.45465,5.404339,5.372438,5.351352,5.329134,5.338909,5.307509,5.253226,5.250999,5.232545,...,0.344527,0.344054,0.314731,0.289037,0.256812,0.245038,0.272989,0.298876,0.303368,0.26271
Rrs_sd,0.000178,0.000207,0.000213,0.000088,0.000036,0.000173,0.000168,0.00011,0.000167,0.000157,...,0.000631,0.000289,0.001632,0.001355,0.001046,0.001673,0.001141,0.001101,0.001666,0.001981
Es_sd,2.22376,2.262834,2.254194,2.246336,2.250763,2.227197,2.219295,2.226539,2.449722,2.491744,...,1.859967,1.76014,2.88785,2.358109,1.775841,1.892287,0.87419,0.769092,3.064564,2.563841
Lt_sd,0.012333,0.013471,0.014639,0.012535,0.010715,0.010898,0.012751,0.01481,0.012315,0.012664,...,0.03096,0.010298,0.07591,0.066558,0.055003,0.089971,0.06015,0.055973,0.086094,0.094886
Lsky_sd,0.10309,0.102088,0.102735,0.103807,0.104403,0.104651,0.100955,0.097901,0.106514,0.110186,...,0.058612,0.088324,0.047263,0.078623,0.089678,0.035988,0.064116,0.097953,0.047056,0.047706
profile_lat,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,...,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247,34.2247
profile_lon,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,...,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792,-119.6792


In [16]:
import glob
import os
import pandas as pd

# 1) Point this at your "archive" folder containing all the .sb files
archive_dir = "/home/jovyan/shared-public/pace-hackweek/SeePACE/" \
            + "Hackweek_PACE-PAX_Rrs/NRL/PACE-PAX/PACE-PAX_Shearwater/archive"

# 2) Grab a sorted list of all the .sb paths
sb_files = sorted(glob.glob(os.path.join(archive_dir, "*.sb")))
# Stop early with a clear message; pd.concat on an empty list would
# otherwise fail with an unrelated-looking error.
if not sb_files:
    raise FileNotFoundError(f"No .sb files found in {archive_dir}")

# 3) Loop over them, reading each one and collecting into a list
#    (note: `df` ends up holding the last file read)
df_list = []
for sb_path in sb_files:
    df = read_sb(sb_path)          # metadata-aware reader
    df_list.append(df)

# 4) Stack them into a single long-format DataFrame
all_profiles = pd.concat(df_list, ignore_index=True)

# 5) Save to disk for fast reload later (CSV and pickle)
all_profiles.to_csv("all_SeaBASS_profiles.csv", index=False)
all_profiles.to_pickle("all_SeaBASS_profiles.pkl")

# 6) Inspect — the preview must be the last expression of the cell to be shown
print(all_profiles.shape)   # → (total number of rows, number_of_columns)
all_profiles.head()



Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame columns.
May be a profile, but doublecheck.
Unrecognized date/time format in DataFrame 

In [17]:
import pandas as pd

# Reload the combined table from the CSV written by the archive-loading cell
# and display it (CSV is human-readable, but a little slower to load).
df_csv = pd.read_csv("all_SeaBASS_profiles.csv")
df_csv


Unnamed: 0,Wavelength,Rrs,Es,Lt,Lsky,Rrs_sd,Es_sd,Lt_sd,Lsky_sd,profile_lat,profile_lon,profile_time
0,350,0.003501,41.319572,0.374287,5.454650,0.000178,2.223760,0.012333,0.103090,34.2247,-119.6792,2024-09-06 21:11:00
1,351,0.003514,41.645705,0.373883,5.404339,0.000207,2.262834,0.013471,0.102088,34.2247,-119.6792,2024-09-06 21:11:00
2,352,0.003513,41.640951,0.372514,5.372438,0.000213,2.254194,0.014639,0.102735,34.2247,-119.6792,2024-09-06 21:11:00
3,353,0.003627,42.019692,0.377753,5.351352,0.000088,2.246336,0.012535,0.103807,34.2247,-119.6792,2024-09-06 21:11:00
4,354,0.003683,42.871810,0.382332,5.329134,0.000036,2.250763,0.010715,0.104403,34.2247,-119.6792,2024-09-06 21:11:00
...,...,...,...,...,...,...,...,...,...,...,...,...
26857,1071,-0.000524,47.045571,-0.015877,0.162609,0.001095,1.049333,0.050013,0.023816,34.3406,-119.6475,2024-09-26 20:22:00
26858,1072,0.000058,47.191733,0.011105,0.148459,0.000410,1.396961,0.018904,0.028819,34.3406,-119.6475,2024-09-26 20:22:00
26859,1073,0.000470,48.127742,0.032211,0.188250,0.001322,1.233449,0.061650,0.056633,34.3406,-119.6475,2024-09-26 20:22:00
26860,1074,0.000858,48.392412,0.053234,0.261307,0.001073,0.793555,0.048896,0.118824,34.3406,-119.6475,2024-09-26 20:22:00


In [18]:
# assume df is your 26862×12 DataFrame loaded from CSV/pickle
import pandas as pd

# 0) Read the combined long-format SeaBASS table back from disk
df = pd.read_csv("all_SeaBASS_profiles.csv")

# 1-3) One row per profile (time, lat, lon), one column per wavelength
#      holding Rrs; then expose the keys as plain columns under shorter
#      names, parse the timestamp and derive date / time strings from it.
wide = (
    df.pivot(
        index=["profile_time", "profile_lat", "profile_lon"],
        columns="Wavelength",
        values="Rrs",
    )
    .reset_index()
    .rename(columns={
        "profile_time": "datetime",
        "profile_lat": "lat",
        "profile_lon": "lon",
    })
    .assign(
        datetime=lambda t: pd.to_datetime(t["datetime"]),
        date=lambda t: t["datetime"].dt.strftime("%Y%m%d"),
        time=lambda t: t["datetime"].dt.strftime("%H:%M:%S"),
    )
)

# 4) Metadata columns first, then the numeric (wavelength) columns ascending
wls = sorted(filter(lambda c: isinstance(c, (int, float)), wide.columns))
wide = wide[["datetime", "date", "time", "lat", "lon"] + wls]

# 5) Inspect
print(wide.shape)   # → (number_of_profiles, 5 + number_of_wavelengths)
wide

(37, 731)


Wavelength,datetime,date,time,lat,lon,350,351,352,353,354,...,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075
0,2024-09-06 21:11:00,20240906,21:11:00,34.2247,-119.6792,0.003501,0.003514,0.003513,0.003627,0.003683,...,0.00055,0.00012,-0.001066,-0.000936,-0.000392,0.000393,0.000143,-0.000226,0.000165,-0.000296
1,2024-09-08 19:28:00,20240908,19:28:00,34.3779,-120.0773,0.001397,0.001493,0.001629,0.00165,0.001647,...,0.001979,0.001414,0.00017,0.000861,0.001528,0.001255,4.2e-05,-0.000563,0.00034,0.000179
2,2024-09-08 22:09:00,20240908,22:09:00,34.3691,-119.6386,0.003461,0.003526,0.003651,0.003754,0.003756,...,0.000331,0.000284,-0.000393,-0.000731,-0.000495,-0.00015,-0.000641,-0.000585,0.000447,1.3e-05
3,2024-09-12 18:18:00,20240912,18:18:00,34.3676,-120.0343,0.003865,0.003929,0.004043,0.004049,0.004002,...,0.002587,0.001858,0.000239,-1.3e-05,2e-05,0.000216,0.001822,0.002368,0.000989,0.001558
4,2024-09-12 19:20:00,20240912,19:20:00,34.3559,-119.6483,0.004575,0.004592,0.004677,0.004697,0.004683,...,0.000795,0.000543,0.000294,-0.000138,6.2e-05,0.000999,0.001474,0.001641,0.001509,0.00162
5,2024-09-12 21:05:00,20240912,21:05:00,34.3685,-119.6358,0.004348,0.004402,0.004428,0.004514,0.004541,...,0.000378,0.000732,0.000392,-0.000104,-0.000116,0.000367,-0.000178,-0.000569,0.000158,0.00035
6,2024-09-13 17:45:00,20240913,17:45:00,34.3272,-119.5062,0.004156,0.004583,0.005754,0.00555,0.004971,...,-0.008188,-0.004141,0.008704,0.002662,-0.000686,0.004873,0.006212,0.00293,0.000156,-0.002717
7,2024-09-13 19:19:00,20240913,19:19:00,34.3349,-119.5721,0.004493,0.004563,0.004509,0.004554,0.004535,...,0.000784,-0.000997,-0.003354,-0.000157,0.001572,-0.000364,-0.001071,-0.001686,-0.003521,-0.0021
8,2024-09-13 21:12:00,20240913,21:12:00,34.123,-119.7015,0.004409,0.004446,0.00441,0.004423,0.004398,...,0.003337,0.003526,0.002365,0.001381,0.001052,0.001402,0.002771,0.003191,0.000956,0.000638
9,2024-09-14 20:50:00,20240914,20:50:00,34.1318,-119.3766,0.002806,0.002944,0.002828,0.00277,0.002815,...,-0.000705,-0.001468,-0.000606,0.000119,0.000153,-9.7e-05,0.001471,0.000891,-0.003561,-0.002162


In [65]:

# Rebuild the wide Rrs table from the saved CSV, parsing the profile
# timestamps while reading.
df = pd.read_csv("all_SeaBASS_profiles.csv", parse_dates=["profile_time"])

# One row per profile, one column per wavelength (Rrs values)
df_wide = df.pivot(
    index=["profile_time", "profile_lat", "profile_lon"],
    columns="Wavelength",
    values="Rrs",
)
df_wide = df_wide.reset_index()
df_wide = df_wide.rename(
    columns={"profile_time": "datetime", "profile_lat": "lat", "profile_lon": "lon"}
)

# Date and time-of-day strings derived from the timestamp
df_wide = df_wide.assign(
    date=df_wide["datetime"].dt.strftime("%Y%m%d"),
    time=df_wide["datetime"].dt.strftime("%H:%M:%S"),
)

# Metadata first, then numeric (wavelength) columns in ascending order;
# note that `df` is rebound here to the reordered wide table.
wls = sorted(filter(lambda c: isinstance(c, (int, float)), df_wide.columns))
df = df_wide[["datetime", "date", "time", "lat", "lon"] + wls]
df

Wavelength,datetime,date,time,lat,lon,350,351,352,353,354,...,1066,1067,1068,1069,1070,1071,1072,1073,1074,1075
0,2024-09-06 21:11:00,20240906,21:11:00,34.2247,-119.6792,0.003501,0.003514,0.003513,0.003627,0.003683,...,0.00055,0.00012,-0.001066,-0.000936,-0.000392,0.000393,0.000143,-0.000226,0.000165,-0.000296
1,2024-09-08 19:28:00,20240908,19:28:00,34.3779,-120.0773,0.001397,0.001493,0.001629,0.00165,0.001647,...,0.001979,0.001414,0.00017,0.000861,0.001528,0.001255,4.2e-05,-0.000563,0.00034,0.000179
2,2024-09-08 22:09:00,20240908,22:09:00,34.3691,-119.6386,0.003461,0.003526,0.003651,0.003754,0.003756,...,0.000331,0.000284,-0.000393,-0.000731,-0.000495,-0.00015,-0.000641,-0.000585,0.000447,1.3e-05
3,2024-09-12 18:18:00,20240912,18:18:00,34.3676,-120.0343,0.003865,0.003929,0.004043,0.004049,0.004002,...,0.002587,0.001858,0.000239,-1.3e-05,2e-05,0.000216,0.001822,0.002368,0.000989,0.001558
4,2024-09-12 19:20:00,20240912,19:20:00,34.3559,-119.6483,0.004575,0.004592,0.004677,0.004697,0.004683,...,0.000795,0.000543,0.000294,-0.000138,6.2e-05,0.000999,0.001474,0.001641,0.001509,0.00162
5,2024-09-12 21:05:00,20240912,21:05:00,34.3685,-119.6358,0.004348,0.004402,0.004428,0.004514,0.004541,...,0.000378,0.000732,0.000392,-0.000104,-0.000116,0.000367,-0.000178,-0.000569,0.000158,0.00035
6,2024-09-13 17:45:00,20240913,17:45:00,34.3272,-119.5062,0.004156,0.004583,0.005754,0.00555,0.004971,...,-0.008188,-0.004141,0.008704,0.002662,-0.000686,0.004873,0.006212,0.00293,0.000156,-0.002717
7,2024-09-13 19:19:00,20240913,19:19:00,34.3349,-119.5721,0.004493,0.004563,0.004509,0.004554,0.004535,...,0.000784,-0.000997,-0.003354,-0.000157,0.001572,-0.000364,-0.001071,-0.001686,-0.003521,-0.0021
8,2024-09-13 21:12:00,20240913,21:12:00,34.123,-119.7015,0.004409,0.004446,0.00441,0.004423,0.004398,...,0.003337,0.003526,0.002365,0.001381,0.001052,0.001402,0.002771,0.003191,0.000956,0.000638
9,2024-09-14 20:50:00,20240914,20:50:00,34.1318,-119.3766,0.002806,0.002944,0.002828,0.00277,0.002815,...,-0.000705,-0.001468,-0.000606,0.000119,0.000153,-9.7e-05,0.001471,0.000891,-0.003561,-0.002162


In [66]:
##---------------------------------------------------------------------------##
#                             Satellite Utilities                             #
##---------------------------------------------------------------------------##
def parse_quality_flags(flag_value):
    """Decode a combined bitwise flag value into the names of its set flags.

    Parameters
    ----------
    flag_value : int
        The integer representing the combined bitwise quality flags.

    Returns
    -------
    list of str
        Names from L2_FLAGS whose bit is set in flag_value, in the
        iteration order of L2_FLAGS.

    """
    active = []
    for name, bit in L2_FLAGS.items():
        if (flag_value & bit) != 0:
            active.append(name)
    return active

In [67]:
def get_fivebyfive(file, latitude, longitude, rrs_wavelengths):
    """Get stats on 5x5 box around station coordinates of a satellite granule.

    Finds the pixel closest to the station, extracts the surrounding 5x5 box
    (clipped at the granule edges), masks pixels using EXCLUSION_FLAGS, and
    returns the valid-pixel count, the coefficient of variation (cv), and the
    filtered-mean Rrs spectrum.

    Parameters
    ----------
    file : earthaccess granule object
        Opened satellite granule from earthaccess.
    latitude : float
        Station latitude in decimal degrees.
    longitude : float
        Station longitude in decimal degrees (negative West).
    rrs_wavelengths : numpy array
        Rrs wavelengths (from wavelength_3d for OCI)

    Returns
    -------
    dict
        A dictionary of the processed 5x5 box with:
            - "sat_datetime": pandas Timestamp
                Datetime of the overall granule start time
            - "sat_cv": float
                Median coefficient of variation of Rrs(405nm - 570nm)
            - "sat_latitude": float
                Latitude of center pixel
            - "sat_longitude": float
                Longitude of center pixel
            - "sat_pixel_valid": int
                Number of valid pixels in 5x5 box based on l2 flags
            - "sat_rrs<wavelength>": float
                Filtered-mean Rrs, one key per entry of rrs_wavelengths
                (wavelength truncated to int); NaN when no pixel is valid.

    Raises
    ------
    ValueError
        If every navigation pixel is NaN (raised by np.nanargmin).

    Notes
    -----
    This is set to use just Rrs data for the demo. As an exercise, make this
    function more generalized by adding an input for the desired product and
    removing the wavelength dependency (if not needed) as well as the cv
    calculation. This will also require refactoring the `match_data` function.
    """
    with xr.open_dataset(file, group="navigation_data") as ds_nav:
        sat_lat = ds_nav["latitude"].values
        sat_lon = ds_nav["longitude"].values

    # Euclidean distance in degree space for the 2D lat/lon arrays
    distances = np.sqrt((sat_lat - latitude) ** 2 + (sat_lon - longitude) ** 2)

    # Closest pixel; dimensions are (lines, pixels).
    # nanargmin is used because plain argmin returns the position of the
    # first NaN, which would silently pick an unnavigated pixel.
    center_line, center_pixel = np.unravel_index(
        np.nanargmin(distances), distances.shape
    )

    # Get indices for a 5x5 box around the center pixel, clipped to the granule
    line_start = max(center_line - 2, 0)
    line_end = min(center_line + 2 + 1, sat_lat.shape[0])
    pixel_start = max(center_pixel - 2, 0)
    pixel_end = min(center_pixel + 2 + 1, sat_lat.shape[1])
    box = dict(
        number_of_lines=slice(line_start, line_end),
        pixels_per_line=slice(pixel_start, pixel_end),
    )

    # Extract the data
    # NOTE: This is hard-coded to Rrs from an L2 AOP file.
    with xr.open_dataset(file, group="geophysical_data") as ds_data:
        rrs_data = ds_data["Rrs"].isel(**box).values
        flags_data = ds_data["l2_flags"].isel(**box).values

    # Bitwise OR of all flags in EXCLUSION_FLAGS (a true OR, so a flag listed
    # twice cannot corrupt the mask the way a sum would)
    exclude_mask = 0
    for flag in EXCLUSION_FLAGS:
        exclude_mask |= L2_FLAGS[flag]

    # True means the flag value does not contain any of the EXCLUSION_FLAGS
    valid_mask = np.bitwise_and(flags_data, exclude_mask) == 0

    # Get stats and averages
    # NOTE(review): np.std/np.mean propagate NaN, so a valid-by-flags pixel
    # with NaN Rrs makes the whole spectrum NaN — confirm whether the reduced
    # EXCLUSION_FLAGS list can let such pixels through.
    if valid_mask.any():
        rrs_valid = rrs_data[valid_mask]
        rrs_std_initial = np.std(rrs_valid, axis=0)
        rrs_mean_initial = np.mean(rrs_valid, axis=0)

        # Exclude spectra > 1.5 stdevs away at any wavelength
        std_mask = np.all(
            np.abs(rrs_valid - rrs_mean_initial) <= 1.5 * rrs_std_initial,
            axis=1
        )
        rrs_std = np.std(rrs_valid[std_mask], axis=0)
        rrs_mean = np.mean(rrs_valid[std_mask], axis=0).flatten()

        # Matchup criteria uses cv as median of 405-570nm
        rrs_cv = rrs_std / rrs_mean
        rrs_cv_median = np.median(
            rrs_cv[(rrs_wavelengths >= 405) & (rrs_wavelengths <= 570)]
        )
    else:
        rrs_cv_median = np.nan
        rrs_mean = np.full(np.shape(rrs_wavelengths), np.nan)

    # Put in dictionary of the row
    row = {
        "sat_datetime": pd.to_datetime(
            file.granule["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"],
            utc=False
        ),
        "sat_cv": rrs_cv_median,
        "sat_latitude": sat_lat[center_line, center_pixel],
        "sat_longitude": sat_lon[center_line, center_pixel],
        "sat_pixel_valid": np.sum(valid_mask),
    }

    # Add mean spectra to the row dictionary
    for wavelength, mean_value in zip(rrs_wavelengths, rrs_mean):
        row[f"sat_rrs{int(wavelength)}"] = mean_value

    return row


In [68]:
# We do not use this. 


def get_sat_ts_matchups(
    start_date,
    end_date,
    latitude,
    longitude,
    sat="PACE_AOP",
    selected_dates=None
):
    """Make satellite timeseries of matchups from single station.

    Caution: If the date or coordinates aren't formatted correctly, it might
    pull a huge granule list and take forever to run. If it takes more than 45
    seconds to print the number of granules, just kill the process.

    Uses the earthaccess package. Defaults to the PACE OCI L2 AOP dataset,
    but other satellites can be used if they have a corresponding short_name
    in the SAT_LOOKUP dictionary.

    Workflow:
        1. Get list of matchup granules
        2. Loop through each file and:
            2a. Find closest pixel to station, extract 5x5 pixel box
            2b. Exclude pixels based on l2_flags
            2c. Filtered mean to get single spectra
            2d. Compute statistics and save data row
        3. Organize output pandas dataframe

    Parameters
    ----------
    start_date : datetime or str
        First day of the search window (any time-of-day part is ignored).
    end_date : datetime or str
        Last day of the search window (any time-of-day part is ignored).
    latitude : float
        Station latitude in decimal degrees.
    longitude : float
        Station longitude in decimal degrees (negative West).
    sat : str
        Name of satellite to search. Must be in SAT_LOOKUP dict constant.
    selected_dates : list of str, optional
        If given, only pull granules whose start date ("YYYY-MM-DD") is in
        this list.

    Returns
    -------
    pandas DataFrame object
        Flattened table of all satellite granule matchups, one row per
        granule. Empty when no granule matches the search.

    Raises
    ------
    ValueError
        If `sat` is not a key of SAT_LOOKUP.

    """
    # Look up short name from constants
    if sat not in SAT_LOOKUP:
        raise ValueError(
            f"{sat} is not in the lookup dictionary. Available "
            f"sats are: {', '.join(SAT_LOOKUP)}"
        )
    short_name = SAT_LOOKUP[sat]

    # Format search parameters. Normalising through pandas lets datetime
    # inputs work too; formatting a datetime directly into the f-string
    # would yield a malformed "YYYY-MM-DD HH:MM:SST00:00:00" bound.
    start_day = pd.to_datetime(start_date).strftime("%Y-%m-%d")
    end_day = pd.to_datetime(end_date).strftime("%Y-%m-%d")
    time_bounds = (f"{start_day}T00:00:00", f"{end_day}T23:59:59")

    # Run Earthaccess data search
    results = earthaccess.search_data(
        point=(longitude, latitude),
        temporal=time_bounds,
        short_name=short_name
    )
    if selected_dates is not None:
        filtered_results = [
            result
            for result in results
            if result["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"][:10]
            in selected_dates
        ]
        print(f"Filtered to {len(filtered_results)} Granules.")
        files = earthaccess.open(filtered_results)
    else:
        files = earthaccess.open(results)

    # Nothing to process: return an empty table instead of failing on files[0]
    if not files:
        print("No granules found for the requested point and dates.")
        return pd.DataFrame()

    # Pull out Rrs wavelengths for easier processing
    with xr.open_dataset(files[0], group="sensor_band_parameters") as ds_bands:
        rrs_wavelengths = ds_bands["wavelength_3d"].values

    # Loop through files and process
    sat_rows = []
    for file in files:
        granule_date = pd.to_datetime(
            file.granule["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"]
        )
        print(f"Running Granule: {granule_date}")
        sat_rows.append(
            get_fivebyfive(file, latitude, longitude, rrs_wavelengths)
        )

    return pd.DataFrame(sat_rows)

In [73]:
# New match up data (We do not use this just for test)
import pandas as pd

def match_data(
    df_sat,
    df_field,
    cv_max=0.15,
    senz_max=60.0,
    min_percent_valid=55.0,
    max_time_diff=180,
    std_max=1.5,
):
    """Create matchup dataframe based on selection criteria.

    Each satellite row that passes the quality screens is paired with the
    field record closest in time among those within ``max_time_diff`` minutes
    and 0.2 degrees of latitude and longitude.

    Parameters
    ----------
    df_sat : pandas DataFrame object
        Satellite rows with columns ``sat_cv``, ``sat_pixel_valid``,
        ``sat_datetime`` (tz-aware timestamps), ``sat_latitude`` and
        ``sat_longitude``.
    df_field : pandas DataFrame object
        Field records with the observation datetime as index and ``lat`` /
        ``lon`` columns. ``field_datetime``, ``field_latitude`` and
        ``field_longitude`` are always (re)derived from those, overwriting
        any existing columns with these names in the working copy.
    cv_max : float, default 0.15
        Maximum ``sat_cv`` for a satellite row to be used.
    senz_max : float, default 60.0
        Currently unused; accepted so existing callers keep working.
    min_percent_valid : float, default 55.0
        Minimum percentage of the 25 pixels that must be valid, compared
        against the ``sat_pixel_valid`` count.
    max_time_diff : int, default 180
        Maximum satellite/field time difference, in minutes.
    std_max : float, default 1.5
        When more than 5 field records match, records with any
        ``field_rrs<wavelength>`` value (400-700) further than ``std_max``
        standard deviations from the mean of the matches are dropped.

    Returns
    -------
    pandas DataFrame object
        One row per matched satellite row, holding the field columns plus
        the satellite columns. Empty if nothing matched.

    """
    # Setup
    time_window = pd.Timedelta(minutes=max_time_diff)
    df_match_list = []

    df_field_filtered = df_field.copy()

    # 1) Pull the datetime (may be tz-aware) out of the index and make it
    #    tz-naive so it can be compared with the tz-naive satellite time.
    df_field_filtered["field_datetime"] = df_field_filtered.index
    df_field_filtered["field_datetime"] = (
        pd.to_datetime(df_field_filtered["field_datetime"])
          .dt.tz_localize(None)
    )

    # 2) Rename lat/lon into the names used below
    df_field_filtered["field_latitude"] = df_field_filtered["lat"]
    df_field_filtered["field_longitude"] = df_field_filtered["lon"]

    # Filter satellite data based on cv threshold
    df_sat_filtered = df_sat[df_sat["sat_cv"] <= cv_max]
    # Filter satellite data based on percent good pixels
    df_sat_filtered = df_sat_filtered[
        df_sat_filtered["sat_pixel_valid"] >= min_percent_valid * 25 / 100
    ]

    for _, sat_row in df_sat_filtered.iterrows():
        # Strip the timezone once; every comparison below uses this value.
        sat_time = sat_row["sat_datetime"].tz_convert(None)

        time_diff = abs(df_field_filtered["field_datetime"] - sat_time)

        time_mask = time_diff <= time_window
        lat_mask = abs(df_field_filtered["field_latitude"] - sat_row["sat_latitude"]) <= 0.2
        lon_mask = abs(df_field_filtered["field_longitude"] - sat_row["sat_longitude"]) <= 0.2

        field_matches = df_field_filtered[time_mask & lat_mask & lon_mask]

        if field_matches.shape[0] > 5:
            # Filter by Standard Deviation for rrs columns. Non-string labels
            # (e.g. integer wavelength columns) are ignored instead of
            # crashing on .startswith.
            rrs_cols = [
                col for col in field_matches.columns
                if isinstance(col, str)
                and col.startswith("field_rrs")
                and 400 <= int(col.rsplit("_rrs")[1]) <= 700
            ]
            if rrs_cols:
                mean_spectra = field_matches[rrs_cols].mean(axis=0)
                std_spectra = field_matches[rrs_cols].std(axis=0)
                within_std = (
                    abs(field_matches[rrs_cols] - mean_spectra)
                    <= std_max * std_spectra
                )
                field_matches = field_matches[within_std.all(axis=1)]

        if not field_matches.empty:
            # Select the best match based on time delta, using the tz-naive
            # satellite time (the tz-aware value cannot be subtracted from
            # tz-naive field datetimes).
            time_diff = abs(field_matches["field_datetime"] - sat_time)
            # Positional lookup: a label lookup would return several rows
            # when the datetime index contains duplicates.
            best_match = field_matches.iloc[time_diff.argmin()]
            df_match_list.append({**best_match.to_dict(), **sat_row.to_dict()})

    df_match = pd.DataFrame(df_match_list)
    return df_match


In [74]:
# Station position: taken from the first record of the SeaBASS/field DataFrame.
station_lat = df["lat"].iloc[0]
station_lon = df["lon"].iloc[0]

# Disabled multi-day variant, kept for reference: restrict the search to the
# days present in the field index through `selected_dates`.
# unique_days = df.index.date
# unique_days_str = sorted({d.strftime("%Y-%m-%d") for d in unique_days})
# df_satellite = get_sat_ts_matchups(
#     start_date="2024-03-01",
#     end_date="2024-03-05",
#     latitude=station_lat,
#     longitude=station_lon,
#     sat="PACE_AOP",
#     selected_dates=unique_days_str,
# )

# Single-day PACE search at the station position.
df_satellite = get_sat_ts_matchups(
    sat="PACE_AOP",
    latitude=station_lat,
    longitude=station_lon,
    start_date="2024-09-26",
    end_date="2024-09-26",
)


QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

Running Granule: 2024-09-26 20:25:11+00:00


In [76]:
# Pair the satellite rows (a DataFrame, not the granule metadata list) with
# the in-situ SeaBASS DataFrame, using looser CV and time-window limits than
# the function defaults.
matchups = match_data(
    df_sat=df_satellite,
    df_field=df,
    max_time_diff=240,
    cv_max=0.60,
    min_percent_valid=55.0,
    senz_max=60.0,
    std_max=1.5,
)
matchups

In [82]:
# Final matchup function
import pandas as pd

def match_data(
    df_sat,
    df_field,
    cv_max=0.15,
    senz_max=60.0,
    min_percent_valid=55.0,
    max_time_diff=180,
    std_max=1.5,
):
    """Create matchup dataframe based on selection criteria.

    Each satellite row that passes the quality screens is paired with the
    field record closest in time among those within ``max_time_diff`` minutes
    and 0.2 degrees of latitude and longitude.

    Parameters
    ----------
    df_sat : pandas DataFrame object
        Satellite rows with columns ``sat_cv``, ``sat_pixel_valid``,
        ``sat_datetime`` (tz-aware timestamps), ``sat_latitude`` and
        ``sat_longitude``.
    df_field : pandas DataFrame object
        Field records with the observation datetime as index and ``lat`` /
        ``lon`` columns. ``field_datetime``, ``field_latitude`` and
        ``field_longitude`` are always (re)derived from those, overwriting
        any existing columns with these names in the working copy.
    cv_max : float, default 0.15
        Maximum ``sat_cv`` for a satellite row to be used.
    senz_max : float, default 60.0
        Currently unused; accepted so existing callers keep working.
    min_percent_valid : float, default 55.0
        Minimum percentage of the 25 pixels that must be valid, compared
        against the ``sat_pixel_valid`` count.
    max_time_diff : int, default 180
        Maximum satellite/field time difference, in minutes.
    std_max : float, default 1.5
        When more than 5 field records match, records with any
        ``field_rrs<wavelength>`` value (400-700) further than ``std_max``
        standard deviations from the mean of the matches are dropped.

    Returns
    -------
    pandas DataFrame object
        One row per matched satellite row, holding the field columns plus
        the satellite columns. Empty if nothing matched.

    """
    time_window = pd.Timedelta(minutes=max_time_diff)
    df_match_list = []

    # 1) prepare the field table
    df_field_filtered = df_field.copy()

    # pull the datetimes out of the index and make them tz-naive
    df_field_filtered["field_datetime"] = df_field_filtered.index
    df_field_filtered["field_datetime"] = (
        pd.to_datetime(df_field_filtered["field_datetime"])
          .dt.tz_localize(None)
    )

    # rename lat/lon
    df_field_filtered["field_latitude"] = df_field_filtered["lat"]
    df_field_filtered["field_longitude"] = df_field_filtered["lon"]

    # 2) filter satellite rows by CV and by valid-pixel count
    df_sat_filtered = df_sat[df_sat["sat_cv"] <= cv_max]
    df_sat_filtered = df_sat_filtered[
        df_sat_filtered["sat_pixel_valid"] >= min_percent_valid * 25/100
    ]

    for _, sat_row in df_sat_filtered.iterrows():
        # drop the timezone once; sat_time is reused for every comparison
        sat_time = sat_row["sat_datetime"].tz_convert(None)

        # mask by time, then by position
        time_diff = abs(df_field_filtered["field_datetime"] - sat_time)
        time_mask = time_diff <= time_window

        lat_mask = abs(df_field_filtered["field_latitude"] - sat_row["sat_latitude"]) <= 0.2
        lon_mask = abs(df_field_filtered["field_longitude"] - sat_row["sat_longitude"]) <= 0.2

        field_matches = df_field_filtered[time_mask & lat_mask & lon_mask]

        # standard-deviation screen, only with enough records to be meaningful
        if field_matches.shape[0] > 5:
            # Non-string labels (e.g. integer wavelength columns from a
            # pivoted table) are ignored instead of crashing on .startswith.
            rrs_cols = [
                c for c in field_matches.columns
                if isinstance(c, str)
                and c.startswith("field_rrs")
                and 400 <= int(c.rsplit("_rrs")[1]) <= 700
            ]
            if rrs_cols:
                mean_spectra = field_matches[rrs_cols].mean(axis=0)
                std_spectra = field_matches[rrs_cols].std(axis=0)
                mask = (
                    abs(field_matches[rrs_cols] - mean_spectra)
                    <= std_max * std_spectra
                )
                field_matches = field_matches[mask.all(axis=1)]

        if not field_matches.empty:
            time_diff = abs(field_matches["field_datetime"] - sat_time)
            # Positional lookup: a label lookup would return several rows
            # when the datetime index contains duplicates.
            best_match = field_matches.iloc[time_diff.argmin()]
            df_match_list.append({**best_match.to_dict(), **sat_row.to_dict()})

    return pd.DataFrame(df_match_list)


In [80]:
# Run code only for one date as test

import pandas as pd

# -------------------------------
# 1) Rebuild the wide in-situ table
# -------------------------------
# (a) Read the long-form CSV, which holds one row per wavelength.
df_long = pd.read_csv(
    "all_SeaBASS_profiles.csv",
    parse_dates=["profile_time"],
)

# (b) Pivot to one row per cast with one Rrs column per wavelength, then give
#     the cast keys the short names used by the matchup code.
df_wide = df_long.pivot(
    index=["profile_time", "profile_lat", "profile_lon"],
    columns="Wavelength",
    values="Rrs",
)
df_wide = df_wide.reset_index()
df_wide = df_wide.rename(
    columns={
        "profile_time": "datetime",
        "profile_lat": "lat",
        "profile_lon": "lon",
    }
)

# (c) Convenience string columns holding the date and the time of day.
df_wide["date"] = df_wide["datetime"].dt.strftime("%Y%m%d")
df_wide["time"] = df_wide["datetime"].dt.strftime("%H:%M:%S")

# (d) Metadata columns first, then the numeric wavelength columns ascending.
wls = [c for c in df_wide.columns if isinstance(c, (int, float))]
wls.sort()
df_wide = df_wide[["datetime", "date", "time", "lat", "lon"] + wls]

# Sanity check: table shape and column order.
print(df_wide.shape)
print(df_wide.columns.tolist())


# -------------------------------
# 2) Use the cast datetime as the index
# -------------------------------
df_wide = df_wide.set_index("datetime")


# -------------------------------
# 3) Run the satellite matchup for a single day
# -------------------------------
# Search position: coordinates of the very first cast.
station_lat = df_wide["lat"].iloc[0]
station_lon = df_wide["lon"].iloc[0]

# PACE_AOP granules at that position on the test date.
df_satellite = get_sat_ts_matchups(
    sat="PACE_AOP",
    latitude=station_lat,
    longitude=station_lon,
    start_date="2024-09-26",
    end_date="2024-09-26",
)

# Pair satellite rows with the casts.
matchups = match_data(
    df_sat=df_satellite,
    df_field=df_wide,
    max_time_diff=240,
    cv_max=0.60,
    min_percent_valid=55.0,
    senz_max=60.0,
    std_max=1.5,
)

matchups


(37, 731)
['datetime', 'date', 'time', 'lat', 'lon', 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 53

QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

Running Granule: 2024-09-26 20:25:11+00:00


Unnamed: 0,date,time,lat,lon,350,351,352,353,354,355,...,sat_rrs706,sat_rrs707,sat_rrs708,sat_rrs709,sat_rrs711,sat_rrs712,sat_rrs713,sat_rrs714,sat_rrs717,sat_rrs719
0,20240926,20:22:00,34.3406,-119.6475,0.00413,0.004322,0.004447,0.004257,0.004062,0.004217,...,0.000134,0.000127,0.000124,0.000118,0.000109,0.000107,0.00011,0.000107,7.6e-05,0.000136


In [81]:
# Requires the `df_wide` table built earlier (datetime index plus lat/lon
# columns).

# Search position: coordinates of the first cast only. The printed matchups
# show that lat/lon differ between casts, so this is a single reference point.
station_lat = df_wide["lat"].iloc[0]
station_lon = df_wide["lon"].iloc[0]

# Distinct sampling days as sorted "YYYY-MM-DD" strings.
unique_days = sorted(set(ts.strftime("%Y-%m-%d") for ts in df_wide.index))
print("Sampling days:", unique_days)

# All PACE granules between the first and last sampling day at that position,
# restricted to the days that were actually sampled.
df_sat_all = get_sat_ts_matchups(
    sat="PACE_AOP",
    latitude=station_lat,
    longitude=station_lon,
    start_date=unique_days[0],
    end_date=unique_days[-1],
    selected_dates=unique_days,
)

print("Found", len(df_sat_all), "satellite rows across all days.")

# Full matchup pass over every satellite row.
matchups_all = match_data(
    df_sat=df_sat_all,
    df_field=df_wide,
    max_time_diff=240,
    cv_max=0.60,
    min_percent_valid=55.0,
    senz_max=60.0,
    std_max=1.5,
)

print("Got", len(matchups_all), "total matchups:")
matchups_all


Sampling days: ['2024-09-06', '2024-09-08', '2024-09-12', '2024-09-13', '2024-09-14', '2024-09-15', '2024-09-17', '2024-09-18', '2024-09-19', '2024-09-20', '2024-09-21', '2024-09-22', '2024-09-23', '2024-09-25', '2024-09-26']
Filtered to 18 Granules.


QUEUEING TASKS | :   0%|          | 0/18 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/18 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/18 [00:00<?, ?it/s]

Running Granule: 2024-09-06 20:27:04+00:00
Running Granule: 2024-09-12 20:36:31+00:00
Running Granule: 2024-09-13 19:32:34+00:00
Running Granule: 2024-09-13 21:10:51+00:00
Running Granule: 2024-09-14 20:06:54+00:00
Running Granule: 2024-09-15 20:41:13+00:00
Running Granule: 2024-09-17 20:11:28+00:00
Running Granule: 2024-09-18 20:45:46+00:00
Running Granule: 2024-09-19 19:41:46+00:00
Running Granule: 2024-09-19 21:20:03+00:00
Running Granule: 2024-09-20 20:16:03+00:00
Running Granule: 2024-09-21 20:50:20+00:00
Running Granule: 2024-09-22 19:46:20+00:00
Running Granule: 2024-09-22 21:24:37+00:00
Running Granule: 2024-09-23 20:20:39+00:00
Running Granule: 2024-09-25 19:50:55+00:00
Running Granule: 2024-09-25 21:29:12+00:00
Running Granule: 2024-09-26 20:25:11+00:00
Found 18 satellite rows across all days.
Got 9 total matchups:


Unnamed: 0,date,time,lat,lon,350,351,352,353,354,355,...,sat_rrs706,sat_rrs707,sat_rrs708,sat_rrs709,sat_rrs711,sat_rrs712,sat_rrs713,sat_rrs714,sat_rrs717,sat_rrs719
0,20240906,21:11:00,34.2247,-119.6792,0.003501,0.003514,0.003513,0.003627,0.003683,0.003659,...,3.2e-05,3.4e-05,3.3e-05,2.842929e-05,2.385809e-05,2.6e-05,2.3e-05,2.4e-05,-3e-06,9.1e-05
1,20240912,21:05:00,34.3685,-119.6358,0.004348,0.004402,0.004428,0.004514,0.004541,0.004538,...,0.000253,0.000245,0.00024,0.000228001,0.0002190918,0.000213,0.000208,0.000199,0.000164,0.000258
2,20240917,20:24:00,34.261,-119.6197,0.004914,0.004915,0.004887,0.004971,0.004996,0.004988,...,4.3e-05,4.3e-05,4.1e-05,3.920123e-05,3.74008e-05,3.5e-05,3.7e-05,3.7e-05,2.3e-05,0.000113
3,20240918,20:48:00,34.2994,-119.758,0.00342,0.003421,0.003537,0.003656,0.003669,0.003632,...,8.7e-05,8.4e-05,7.9e-05,7.727302e-05,7.418272e-05,7.2e-05,7.3e-05,7.2e-05,5.4e-05,0.000177
4,20240919,19:39:00,34.2861,-119.5197,0.003676,0.003727,0.003864,0.003843,0.003768,0.003819,...,-2.1e-05,-1.7e-05,-1.4e-05,-1.288826e-05,-1.088861e-05,-1e-05,-1.6e-05,-1.5e-05,-2e-05,0.000245
5,20240919,22:29:00,34.2774,-119.675,0.003078,0.003167,0.0031,0.003109,0.003116,0.00311,...,-7.2e-05,-7.3e-05,-7e-05,-5.999878e-05,-5.759895e-05,-5e-05,-4.3e-05,-3.2e-05,-3e-06,0.000235
6,20240920,20:40:00,34.3619,-119.719,0.003744,0.003724,0.00369,0.00379,0.003833,0.003803,...,5e-06,4e-06,1e-06,6.680687e-07,-4.991889e-07,2e-06,1e-06,-4e-06,-1.9e-05,0.00012
7,20240925,21:56:00,34.1872,-119.6993,0.003192,0.003204,0.00325,0.003366,0.003443,0.003475,...,-8.7e-05,-7.6e-05,-6.7e-05,-5.39991e-05,-4.472651e-05,-3.7e-05,-3e-05,-4.4e-05,-8.9e-05,0.000176
8,20240926,20:22:00,34.3406,-119.6475,0.00413,0.004322,0.004447,0.004257,0.004062,0.004217,...,0.000134,0.000127,0.000124,0.0001182012,0.0001091998,0.000107,0.00011,0.000107,7.6e-05,0.000136


In [80]:
# Count the satellite rows that survive each quality screen in turn.
print("Total satrows:", len(df_satellite))

# CV screen.
sat1 = df_satellite.loc[df_satellite["sat_cv"].le(0.60)]
print("After CV ≤ 0.60:", len(sat1))

# Valid-pixel screen: 55% of 25 pixels = 13.75 pixels.
sat2 = sat1.loc[sat1["sat_pixel_valid"].ge(55 * 25 / 100)]
print("After ≥55% valid pixels:", len(sat2))


Total satrows: 9
After CV ≤ 0.60: 2
After ≥55% valid pixels: 1


In [82]:
# Take the first satellite row that passes both the CV and valid-pixel screens.
sat2 = df_satellite[
    (df_satellite["sat_cv"] <= 0.60)
    & (df_satellite["sat_pixel_valid"] >= 55 * 25 / 100)
]
example = sat2.iloc[0]
# NOTE(review): `df_field` is not defined in any visible cell; it presumably
# comes from earlier kernel state and must already carry the field_* columns
# - TODO confirm.
df_field_filtered = df_field

# Satellite timestamp versus the time span covered by the field data.
print("Satellite time (naive):", example["sat_datetime"].tz_convert(None))
print(
    "Field time range:   ",
    df_field_filtered["field_datetime"].min(),
    "to",
    df_field_filtered["field_datetime"].max(),
)

# Absolute time differences between every field record and the granule.
td = (
    df_field_filtered["field_datetime"] - example["sat_datetime"].tz_convert(None)
).abs()
print(
    "Min time diff (min):",
    td.min().total_seconds()/60,
    "  Max time diff (min):",
    td.max().total_seconds()/60,
)

# Satellite position versus station position.
print("Satellite lat/lon:", example["sat_latitude"], example["sat_longitude"])
print("Station   lat/lon:", station_lat, station_lon)

# Largest latitude / longitude offsets between field records and the granule.
dl = (df_field_filtered["field_latitude"] - example["sat_latitude"]).abs()
dlo = (df_field_filtered["field_longitude"] - example["sat_longitude"]).abs()
print("Max Δlat:", dl.max(), "  Max Δlon:", dlo.max())


Satellite time (naive): 2024-09-26 20:25:11
Field time range:    2024-09-22 16:22:40 to 2024-09-22 16:52:40
Min time diff (min): 5972.516666666666   Max time diff (min): 6002.516666666666
Satellite lat/lon: 34.21881 -119.604904
Station   lat/lon: 34.2163 -119.598
Max Δlat: 0.14201103515625135   Max Δlon: 0.0576041748046805


In [136]:
# Station position: taken from the first record of the SeaBASS/field DataFrame.
station_lat = df["lat"].iloc[0]
station_lon = df["lon"].iloc[0]

# Disabled multi-day variant, kept for reference: restrict the search to the
# days present in the field index through `selected_dates`.
# unique_days = df.index.date
# unique_days_str = sorted({d.strftime("%Y-%m-%d") for d in unique_days})
# df_satellite = get_sat_ts_matchups(
#     start_date="2024-03-01",
#     end_date="2024-03-05",
#     latitude=station_lat,
#     longitude=station_lon,
#     sat="PACE_AOP",
#     selected_dates=unique_days_str,
# )

# Single-day PACE search at the station position.
df_satellite = get_sat_ts_matchups(
    sat="PACE_AOP",
    latitude=station_lat,
    longitude=station_lon,
    start_date="2024-09-26",
    end_date="2024-09-26",
)


QUEUEING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/1 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/1 [00:00<?, ?it/s]

Running Granule: 2024-09-26 20:25:11+00:00
Unique L2 flags in your 5×5 box: [0]
Exclude mask (hex): 0x200
Valid pixels count: 25 / 25


In [137]:
# Pair the satellite rows (a DataFrame, not the granule metadata list) with
# the in-situ SeaBASS DataFrame, using looser CV and time-window limits than
# the function defaults.
matchups = match_data(
    df_sat=df_satellite,
    df_field=df,
    max_time_diff=240,
    cv_max=0.60,
    min_percent_valid=55.0,
    senz_max=60.0,
    std_max=1.5,
)
matchups

In [89]:
# Field table with the field_* column names used by the matchup code.
df_field_filtered = df.assign(
    field_datetime=df.index,
    field_latitude=df["lat"],
    field_longitude=df["lon"],
)

# Satellite rows that pass both the CV and the valid-pixel screens.
sat2 = df_satellite[
    (df_satellite["sat_cv"] <= 0.60)
    & (df_satellite["sat_pixel_valid"] >= 55 * 25 / 100)
]
print("Surviving granules:\n", sat2[["sat_datetime","sat_cv","sat_pixel_valid"]])

# For each surviving granule, report the smallest and largest time offset to
# the field samples.
for _, row in sat2.iterrows():
    sat_t_naive = row["sat_datetime"].tz_convert(None)
    deltas = (df_field_filtered["field_datetime"] - sat_t_naive).abs()
    print(f"\nGranule at {sat_t_naive} UTC:")
    print("  Min Δt (min):", deltas.min().total_seconds()/60)
    print("  Max Δt (min):", deltas.max().total_seconds()/60)


Surviving granules:
 Empty DataFrame
Columns: [sat_datetime, sat_cv, sat_pixel_valid]
Index: []


In [90]:
# Quality statistics of every satellite row.
print(df_satellite.loc[:, ["sat_datetime", "sat_cv", "sat_pixel_valid"]])

# Range of the coefficient of variation.
print(
    "CV  min/max:",
    df_satellite["sat_cv"].min(),
    "/",
    df_satellite["sat_cv"].max(),
)

# Range of the valid-pixel counts.
print(
    "Valid pixels min/max:",
    df_satellite["sat_pixel_valid"].min(),
    "/",
    df_satellite["sat_pixel_valid"].max(),
)


               sat_datetime  sat_cv  sat_pixel_valid
0 2024-09-22 19:46:20+00:00     NaN                0
1 2024-09-22 21:24:37+00:00     NaN                0
CV  min/max: nan / nan
Valid pixels min/max: 0 / 0


In [91]:
# Combine the bit of every excluded L2 flag into a single integer mask.
exclude_mask = 0
for flag in EXCLUSION_FLAGS:
    exclude_mask = exclude_mask | L2_FLAGS[flag]


In [92]:
# NOTE(review): `flags_data` is not defined at notebook scope, so this cell
# fails with NameError as recorded in its output. It presumably exists only
# inside the granule-processing helper - TODO confirm, then expose it or
# drop this cell.
print("Unique L2 flags in your 5×5 box:", np.unique(flags_data))
print("Exclude mask (hex):", hex(exclude_mask))


NameError: name 'flags_data' is not defined

In [99]:
# Decode the flag value 1073742368 (0x40000220) into the names of its set bits.
for name, bit in L2_FLAGS.items():
    if (bit & 0x40000220) != 0:
        print(name, hex(bit))


HISATZEN 0x20
CLDICE 0x200
PRODFAIL 0x40000000


In [133]:
# Keep only the satellite rows that have a CV value (rows without one are
# dropped).
df_sat_clean = df_satellite[df_satellite["sat_cv"].notna()].copy()

# Quality statistics of the remaining rows.
print(df_sat_clean.loc[:, ["sat_datetime", "sat_cv", "sat_pixel_valid"]])

print(
    "CV min/max:",
    df_sat_clean["sat_cv"].min(),
    "/",
    df_sat_clean["sat_cv"].max(),
)
print(
    "Valid-pixels min/max:",
    df_sat_clean["sat_pixel_valid"].min(),
    "/",
    df_sat_clean["sat_pixel_valid"].max(),
)


               sat_datetime    sat_cv  sat_pixel_valid
5 2024-09-25 21:29:12+00:00  0.123433               21
6 2024-09-26 20:25:11+00:00  0.084610               25
CV min/max: 0.084609754383564 / 0.12343292683362961
Valid-pixels min/max: 21 / 25


In [134]:
# Keep only the satellite rows that have a CV value.
df_sat_clean = df_satellite[df_satellite["sat_cv"].notna()].copy()

# Field table with the field_* column names used by the matchup code.
df_field_prepped = df.assign(
    field_datetime=df.index,
    field_latitude=df["lat"],
    field_longitude=df["lon"],
)

# Run the matchup on the cleaned satellite rows.
matchups = match_data(
    df_sat=df_sat_clean,
    df_field=df_field_prepped,
    max_time_diff=240,
    cv_max=0.60,
    min_percent_valid=55.0,
    senz_max=60.0,
    std_max=1.5,
)
print("Number of matchups:", len(matchups))
matchups


Number of matchups: 0
