### Reading in SeaBASS (.sb) Datafiles

##### Ceridwyn Hunter - 2025/08/04
_____________________________________________________

In [13]:
# Install plotly (imported in the next cell as plotly.express) for the running kernel;
# restart the kernel afterwards if the import does not pick it up.
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [1]:
# Imports

import re
import pandas as pd
import xarray as xr
from io import StringIO
from pathlib import Path
from datetime import datetime
import plotly.express as px

In [2]:
# ─── 1) DEFINE SB_PARSER ───────────────────────────────────────────────────────

def parse_sb(file_path):
    """Parse a SeaBASS (.sb) file into header metadata and a position/time table.

    Parameters
    ----------
    file_path : str or pathlib.Path
        Path to a SeaBASS text file. The header must be terminated by an
        ``/end_header`` line and contain at least ``/fields``, ``/start_date``,
        ``/start_time``, ``/end_date`` and ``/end_time``.

    Returns
    -------
    metadata : dict
        The selected header entries (bracketed annotations such as ``[GMT]``
        or ``[DEG]`` removed), plus the derived keys ``fields_list`` (list of
        column names), ``single_spectrum`` (bool), ``start_datetime`` and
        ``end_datetime`` (``datetime.datetime``).
    df : pandas.DataFrame
        One row per data record with exactly the columns
        ``date, time, lat, lon, datetime``.

    Raises
    ------
    ValueError
        If the file has no ``/end_header`` marker.
    KeyError
        If one of the required header entries is missing.
    """
    text = Path(file_path).read_text()
    header, marker, body = text.partition('/end_header')
    if not marker:
        raise ValueError(f"{file_path}: no '/end_header' line found")

    # 1a) extract header entries
    wanted_keys = {
        'station', 'data_file_name',
        'start_date', 'end_date',
        'start_time', 'end_time',
        'fields',
        'north_latitude', 'south_latitude',
        'west_longitude', 'east_longitude',
    }
    # SeaBASS declares the column separator of the data block in /delimiter;
    # map the declared keyword onto what pandas expects.
    separators = {'comma': ',', 'tab': '\t', 'space': r'\s+'}
    delimiter = 'comma'  # historical behaviour when the header does not say

    metadata = {}
    for line in header.splitlines():
        if not line.startswith('/') or '=' not in line:
            continue
        key, val = line[1:].split('=', 1)
        key = key.strip()
        val = re.sub(r'\[.*?\]', '', val).strip()  # drop "[GMT]", "[DEG]", ...
        if key == 'delimiter':
            delimiter = val.lower()
        elif key in wanted_keys:
            metadata[key] = val

    required = ('fields', 'start_date', 'start_time', 'end_date', 'end_time')
    missing = [k for k in required if k not in metadata]
    if missing:
        raise KeyError(f"{file_path}: missing required header(s): {', '.join(missing)}")

    # 1b) parse the /fields list & store it
    fields_list = [f.strip() for f in metadata['fields'].split(',')]
    metadata['fields_list'] = fields_list

    # 1c) read the body into a DataFrame, honouring the declared delimiter
    df = pd.read_csv(
        StringIO(body.strip()),
        sep=separators.get(delimiter, ','),
        names=fields_list,
        comment='/'
    )

    # 1d) detect single-spectrum files & fill lat/lon from the header bounds
    is_spectrum = ('lat' not in df.columns or 'lon' not in df.columns)
    metadata['single_spectrum'] = is_spectrum
    if is_spectrum:
        lat0 = float(metadata.get('north_latitude', metadata.get('south_latitude', 0)))
        lon0 = float(metadata.get('west_longitude', metadata.get('east_longitude', 0)))
        df['lat'], df['lon'] = lat0, lon0

    # 1e) ensure date/time exist (fall back to the header start values)
    if 'date' not in df.columns:
        df['date'] = metadata['start_date']
    if 'time' not in df.columns:
        df['time'] = metadata['start_time']

    # 1f) build datetime column
    df['datetime'] = pd.to_datetime(df['date'].astype(str) + ' ' + df['time'])

    # 1g) parse header start/end datetimes
    def _header_datetime(d, t):
        # Accept both YYYY-MM-DD and YYYYMMDD date spellings.
        t_clean = re.sub(r'\[.*?\]', '', t)
        dfmt = '%Y-%m-%d' if '-' in d else '%Y%m%d'
        return datetime.strptime(f"{d} {t_clean}", f"{dfmt} %H:%M:%S")

    metadata['start_datetime'] = _header_datetime(metadata['start_date'], metadata['start_time'])
    metadata['end_datetime'] = _header_datetime(metadata['end_date'], metadata['end_time'])

    # 1h) keep only the five position/time columns
    df = df[['date', 'time', 'lat', 'lon', 'datetime']]

    return metadata, df


In [3]:
# ─── 2.a) COLLECT & PREPARE ALL FILES ──────────────────────────────────────────

root = Path("/home/jovyan/shared-public/pace-hackweek/SeePACE/Hackweek_PACE-PAX_09_22-28")
metadata_list = []
search_types = ['Rrs', 'AOP']  # add further search strings here if desired

# Fail early with a clear message rather than silently producing an empty list.
if not root.is_dir():
    raise FileNotFoundError(f"SeaBASS data directory not found: {root}")

# sorted(): rglob order depends on the filesystem, and later cells index
# metadata_list by position, so a stable order keeps re-runs reproducible.
for sb in sorted(root.rglob('*.sb')):
    # Skip anything inside hidden folders (e.g. .ipynb_checkpoints) and hidden
    # files, so editor checkpoint copies are not counted as extra data files.
    if any(part.startswith('.') for part in sb.relative_to(root).parts):
        continue

    meta, df = parse_sb(sb)

    # collapse spectral files to exactly one row
    if meta['single_spectrum']:
        df = df.head(1)

    meta['data'] = df
    # record which of the search_types occur (as substrings) in the file's field names
    meta['Data_Type'] = [
        t for t in search_types
        if any(t in f for f in meta['fields_list'])
    ]
    metadata_list.append(meta)


In [4]:
# ─── 2.b) CONFIRM PATH/FILES EXIST ──────────────────────────────────────────

# i. Report whether the root path exists and whether it is a directory
print("Exists (T/F) ", root.exists())
print("Is directory (T/F) ", root.is_dir())

# ii. Show at most the first 10 entries as a sanity check
#     (is_dir() is already False for a path that does not exist)
if not root.is_dir():
    print("Path not found—check your spelling or mount points.")
else:
    for i, p in zip(range(10), root.iterdir()):
        print("-", p.name)


Exists (T/F)  True
Is directory (T/F)  True
- CCNY
- .ipynb_checkpoints
- NASA_GSFC
- myplot2.html
- myplot.html
- Interactive Plot Test.ipynb
- NRL


In [5]:
# ─── 3) BUILD SUMMARY DATAFRAME ─────────────────────────────────────────────

# One summary row per parsed file, taken from its header metadata
summary_df = pd.DataFrame([
    dict(
        Station=meta['station'],
        Data_File_Name=meta['data_file_name'],
        Start_Datetime=meta['start_datetime'],
        End_Datetime=meta['end_datetime'],
        Data_Type=meta['Data_Type'],
    )
    for meta in metadata_list
])

display(summary_df)


Unnamed: 0,Station,Data_File_Name,Start_Datetime,End_Datetime,Data_Type
0,30,PACE-PAX_Shearwater_2024_GER_St_30.sb,2024-09-22 19:25:46,2024-09-22 19:25:46,[Rrs]
1,31,PACE-PAX_Shearwater_2024_GER_St_31.sb,2024-09-22 20:34:41,2024-09-22 20:34:41,[Rrs]
2,32,PACE-PAX_Shearwater_2024_GER_St_32.sb,2024-09-23 18:05:43,2024-09-23 18:05:43,[Rrs]
3,33,PACE-PAX_Shearwater_2024_GER_St_33.sb,2024-09-23 20:08:33,2024-09-23 20:08:33,[Rrs]
4,34,PACE-PAX_Shearwater_2024_GER_St_34.sb,2024-09-25 19:36:20,2024-09-25 19:36:20,[Rrs]
...,...,...,...,...,...
67,36,PVST_POL_PACE-PAX_Shearwater_above_water_radio...,2024-09-25 21:56:00,2024-09-25 21:58:00,[Rrs]
68,37,PVST_POL_PACE-PAX_Shearwater_above_water_radio...,2024-09-25 23:07:00,2024-09-25 23:09:00,[Rrs]
69,38,PVST_POL_PACE-PAX_Shearwater_above_water_radio...,2024-09-26 17:06:00,2024-09-26 17:08:00,[Rrs]
70,39,PVST_POL_PACE-PAX_Shearwater_above_water_radio...,2024-09-26 19:37:00,2024-09-26 19:39:00,[Rrs]


In [6]:
# ─── 4) BUILD & CONCATENATE XR DATASETS ─────────────────────────────────────

ds_list = []
file_names = []

for m in metadata_list:
    # Number the rows 0..n-1 and expose that numbering as a 'record' column
    df = m['data'].reset_index(drop=True).rename_axis('record').reset_index()

    # lat / lon / datetime all share the single 'record' dimension
    ds = xr.Dataset(
        {name: ('record', df[name]) for name in ('lat', 'lon', 'datetime')},
        coords={'record': df['record']},
    )

    # add a length-1 'file' dimension labelled with the file name
    ds = ds.expand_dims(file=[m['data_file_name']])

    file_names.append(m['data_file_name'])
    ds_list.append(ds)

# Stack every per-file dataset along the 'file' dimension
ds_combined = xr.concat(ds_list, dim='file', compat='override', coords='minimal')

print(ds_combined)

<xarray.Dataset> Size: 18kB
Dimensions:   (file: 72, record: 10)
Coordinates:
  * record    (record) int64 80B 0 1 2 3 4 5 6 7 8 9
  * file      (file) object 576B 'PACE-PAX_Shearwater_2024_GER_St_30.sb' ... ...
Data variables:
    lat       (file, record) float64 6kB 33.68 nan nan nan ... nan nan nan nan
    lon       (file, record) float64 6kB -119.6 nan nan nan ... nan nan nan nan
    datetime  (file, record) datetime64[ns] 6kB 2024-09-22T19:25:46 NaT ... NaT


In [7]:
# Spot-check the parsed table of a single file. Index 30 is an arbitrary pick and
# refers to whatever file sits at that position in metadata_list.
display(metadata_list[30]['data'])

Unnamed: 0,date,time,lat,lon,datetime
0,20240923,19:04:25,34.2272,-119.6012,2024-09-23 19:04:25
1,20240923,19:08:24,34.222,-119.5931,2024-09-23 19:08:24
2,20240923,19:35:02,34.2133,-119.5854,2024-09-23 19:35:02
3,20240923,19:39:54,34.2128,-119.5864,2024-09-23 19:39:54


_____________________________________________________
### Making a quick map of your SeaBASS data file locations


In [12]:

# Bounding box used to zoom the map
south, north = 33.0, 35.0
west, east = -120.0, -118.0

# Use each file's first record as its map position
summary_df['Lat'] = [meta['data']['lat'].iat[0] for meta in metadata_list]
summary_df['Lon'] = [meta['data']['lon'].iat[0] for meta in metadata_list]

# Interactive geo-scatter: one point per file, file name shown on hover
fig = px.scatter_geo(
    summary_df,
    lat='Lat',
    lon='Lon',
    projection='natural earth',
    hover_name='Data_File_Name',
    hover_data=dict(
        Start_Datetime=True,
        End_Datetime=True,
        Data_Type=True,
        Lat=False,
        Lon=False,
    ),
)

# Restrict the view to the bounding box and apply optional styling
fig.update_geos(
    lataxis_range=[south, north],
    lonaxis_range=[west, east],
    landcolor="LightGreen",
    showcountries=True,
)

fig.update_layout(title='Geographical Locations (Zoomed)')
fig.show()


In [11]:
# Same scatter as the zoomed map, but without restricting the axis ranges.
# Relies on the 'Lat' / 'Lon' columns already being present in summary_df.
fig = px.scatter_geo(
    summary_df,
    lat='Lat',
    lon='Lon',
    projection='natural earth',
    hover_name='Data_File_Name',
    hover_data=dict(
        Start_Datetime=True,
        End_Datetime=True,
        Data_Type=True,
        Lat=False,
        Lon=False,
    ),
)

fig.show()