In [None]:
# Reload imported modules automatically before each cell runs, so edits to the
# project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
from pathlib import Path
from tqdm import tqdm
import gzip
import holoviews as hv

from utig_radar_loading import file_util, stream_util, geo_util, opr_gps_file_generation

In [None]:
# Opt in to pandas copy-on-write behaviour for the whole notebook.
pd.options.mode.copy_on_write = True
# Register tqdm's progress-bar helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()
# Render all HoloViews plots below with the Bokeh backend.
hv.extension('bokeh')

In [None]:
# Where the raw UTIG data lives and where the file index is cached.
# NOTE(review): despite its name, cache_dir holds the path of a CSV file, not a directory.
use_cache = True
cache_dir = "outputs/file_index.csv"
base_path = "/kucresis/scratch/data/UTIG"

# Index of the files under base_path; presumably read back from cache_dir when
# use_cache is True - confirm against file_util.load_file_index_df.
df_files = file_util.load_file_index_df(base_path, cache_dir, read_cache=use_cache)

# Artifact table derived from the file index; everything below filters and groups it.
df_artifacts = file_util.create_artifacts_df(df_files)

In [None]:
# Just deal with UTIG2 for now: keep only the artifacts whose 'dataset' is 'UTIG2'.
# Note that this overwrites df_artifacts, so the unfiltered table is only
# recoverable by re-running the previous cell.
df_artifacts = df_artifacts[df_artifacts['dataset'] == 'UTIG2']

In [None]:
def arrange_by_transect(df_artifacts, streams):
    """
    Group by transects (unique combinations of (prj, set, trn)) and pull out paths
    to the desired data streams.

    streams is a dictionary mapping names of data categories to a list of acceptable
    stream types. For example:
    { "gps": ["GPSnc1", "GPSnc2"],
      "radar": ["RADnh5", "RADnh6"] }

    The resulting dataframe will have two columns per entry in the streams dictionary:
    <data category>_stream_type will contain the matched stream type and
    <data category>_path will contain the path to the data file.

    If multiple matching stream types are available, preference will be given to the
    first stream type in the list. If no matching stream types are available, columns
    will be filled with NaN.
    """
    
    def agg_fn(group):
        df = pd.DataFrame(index=[0])
        
        # Look for requested data streams
        for data_category in streams.keys():
            df[f"{data_category}_stream_type"] = np.nan
            df[f"{data_category}_path"] = np.nan
            for stream_type in streams[data_category]:
                if stream_type in group['stream'].values:
                    df[f"{data_category}_stream_type"] = stream_type
                    df[f"{data_category}_path"] = group.loc[group['stream'] == stream_type, 'full_path'].values[0]
                    break

        # Add in any other unique keys
        for k in group:
            if k in ['full_path', 'stream', 'processing_level', 'processing_type']:
                continue
            
            if len(group[k].unique()) == 1:
                df[k] = group[k].values[0]

        return df

    df = df_artifacts.groupby(['prj', 'set', 'trn']).apply(agg_fn, include_groups=False)
    df.index = df.index.droplevel(-1)
    return df

# One row per transect, with the GPSnc1 stream as the GPS source and the RADnh5
# stream as the radar source. Transects lacking one of them get NaN in the
# corresponding columns.
df_transects = arrange_by_transect(df_artifacts, {
    "gps": ["GPSnc1"],
    "radar": ["RADnh5"]
})
df_transects

In [None]:
# df_transects = df_artifacts.groupby(['dataset', 'prj', 'set', 'trn']).agg(list).reset_index()
# df_transects.head()

# df_transects['has_radar'] = df_transects['artifact'].apply(lambda x: 'RAD' in str(x))
# df_transects['has_gps'] = df_transects['artifact'].apply(lambda x: ('GPSnc1' in str(x)) or ('GPStp2' in str(x)))
# #df_transects.head()

In [None]:
def get_start_timestamp(transect):
    """
    Return the TIMESTAMP of the first context (CT) record of a transect.

    Only the transect's 'gps_path' entry is consulted. A single row is
    requested from stream_util.load_ct_file, run through stream_util.parse_CT,
    and its 'TIMESTAMP' value is returned.
    """
    first_record = stream_util.load_ct_file(transect['gps_path'], read_csv_kwargs={'nrows': 1})
    parsed = stream_util.parse_CT(first_record)
    return parsed.iloc[0]['TIMESTAMP']

def get_end_timestamp(transect):
    """
    Return the TIMESTAMP parsed from the last record of a transect's gzipped file.

    The file at transect['gps_path'] is decompressed in one forward pass and
    its last non-blank line is kept. A gzip stream cannot be positioned
    relative to its end without decompressing it, so scanning forward costs
    no more than seeking backwards. It also terminates on files that hold a
    single line or no newline at all.

    The line is split on whitespace into the CT columns, passed through
    stream_util.parse_CT, and the resulting 'TIMESTAMP' value is returned.

    Raises
    ------
    ValueError
        If the file contains no non-blank line.
    """
    from io import StringIO

    fp = transect['gps_path']

    # NOTE(review): get_start_timestamp hands this same path to
    # stream_util.load_ct_file, whereas here the file itself is parsed as CT
    # columns - confirm that this is the file that is meant to be read.
    last_line = b''
    with gzip.open(fp, 'rb') as f:
        for line in f:
            # Skip blank lines so a trailing empty line does not mask the last record
            if line.strip():
                last_line = line

    if not last_line:
        raise ValueError(f"No records found in {fp}")

    # Load and parse just the last line
    ct_columns = ['prj', 'set', 'trn', 'seq', 'clk_y', 'clk_n', 'clk_d', 'clk_h', 'clk_m', 'clk_s', 'clk_f', 'tim']
    ct_df = pd.read_csv(StringIO(last_line.decode()), sep=r'\s+', names=ct_columns, index_col=False)
    ct_df = stream_util.parse_CT(ct_df)
    return ct_df.iloc[0]['TIMESTAMP']

def season_from_datetime(d):
    """
    Return the season year a date belongs to.

    Dates from June onwards count towards the season of their own calendar
    year; dates from January through May count towards the previous year's
    season.
    """
    return d.year if d.month >= 6 else d.year - 1

# This binds a second name to the same DataFrame, so the two columns added
# below also show up on df_transects.
df_all_seasons = df_transects

# Start time of every transect and the season it falls in.
# NOTE(review): transects without a matched GPS stream have NaN in 'gps_path';
# presumably get_start_timestamp cannot handle those - confirm none are present.
df_all_seasons['start_timestamp'] = df_all_seasons.apply(get_start_timestamp, axis=1)
df_all_seasons['season'] = df_all_seasons['start_timestamp'].apply(season_from_datetime)
# sort_values returns a new frame; from here on df_all_seasons no longer aliases df_transects
df_all_seasons = df_all_seasons.sort_values('prj')
#df_all_seasons.to_csv('tmp.csv')
df_all_seasons.head()

In [None]:
# List the seasons that are present so one can be picked in the next section
print(
    "The following seasons were found in the dataset:",
    df_all_seasons['season'].unique(),
    sep="\n",
)

### Select a single season to extract

In [None]:
# season_year is matched against the 'season' column to select transects;
# season_name is used for the output directory and the plot title.
season_year = 2018
season_name = "2018_Antarctica_BaslerJKB"

In [None]:
# Transects of the selected season, ordered by their start time
df_season = (
    df_all_seasons
    .loc[df_all_seasons['season'] == season_year]
    .sort_values(by='start_timestamp')
)

In [None]:
# Merge segments

# Merge transects into segments.
#
# Transects are visited in df_season row order (sorted by start time). A new
# segment starts whenever the 'tim' value at the start of a transect's radar CT
# file differs from the 'tim' value at the end of the previous one by more than
# tim_diff_threshold. Segments are labelled <YYYYMMDD>_<NN>; NN counts up within
# a date and restarts at 01 when the date changes.

last_segment_ct = stream_util.load_ct_file(df_season.iloc[0]['radar_path'])
# Start tim difference threshold at 99th percentile of the consecutive 'tim'
# differences within the first transect
tim_diff_threshold = np.percentile(np.diff(last_segment_ct['tim']), 99)
print(f"Using 'tim' difference threshold: {tim_diff_threshold} (10s of us)")

current_segment_datestring = df_season.iloc[0]['start_timestamp'].strftime("%Y%m%d")
current_segment_idx = 1

# One label per transect, collected in row order and written to df_season in a
# single assignment after the loop
segment_date_strs = [current_segment_datestring]
segment_numbers = [current_segment_idx]

print(f"Initial segment path is: {current_segment_datestring}_{current_segment_idx:02d}")

for row_iloc in tqdm(range(1, len(df_season))):
    curr_segment_ct = stream_util.load_ct_file(df_season.iloc[row_iloc]['radar_path'])
    tim_delta_from_last = curr_segment_ct['tim'].iloc[0] - last_segment_ct['tim'].iloc[-1]

    if np.abs(tim_delta_from_last) > tim_diff_threshold:
        new_datestring = df_season.iloc[row_iloc]['start_timestamp'].strftime("%Y%m%d")
        if new_datestring == current_segment_datestring:
            current_segment_idx += 1
        else:
            current_segment_idx = 1
            current_segment_datestring = new_datestring

        print(f"Segment path changed to {current_segment_datestring}_{current_segment_idx:02d}. Delta in 'tim' was {tim_delta_from_last}")

    segment_date_strs.append(current_segment_datestring)
    segment_numbers.append(current_segment_idx)

    last_segment_ct = curr_segment_ct

df_season['segment_path'] = [
    f"{date_str}_{number:02d}" for date_str, number in zip(segment_date_strs, segment_numbers)
]
df_season['segment_date_str'] = segment_date_strs
df_season['segment_number'] = segment_numbers

df_season.head()

In [None]:
def load_gps_data(transects_df):
    """
    Load the GPS stream of every transect and tag it with its segment.

    Parameters
    ----------
    transects_df : pandas.DataFrame
        One row per transect; must provide the columns 'gps_path' and
        'segment_path'.

    Returns
    -------
    list of pandas.DataFrame
        One frame per transect, in input row order, with the columns
        prj, set, trn, clk_y, LAT, LON, TIMESTAMP and segment_path.
        A column that the loaded stream does not provide is filled with NaN.
    """
    necessary_keys = ['prj', 'set', 'trn', 'clk_y', 'LAT', 'LON', 'TIMESTAMP']
    segment_dfs = []

    for _, row in tqdm(transects_df.iterrows(), total=len(transects_df)):
        df = stream_util.load_gzipped_stream_file(
            row['gps_path'], debug=False, parse=True, parse_kwargs={'use_ct': True})

        # reindex returns a new frame holding exactly the key columns, in this
        # order, and creates any column the stream lacks as all-NaN. No track
        # length is computed here: nothing in the result depends on it, and
        # reading LAT/LON first would fail before the NaN fallback can apply.
        df_sub = df.reindex(columns=necessary_keys)
        df_sub['segment_path'] = row['segment_path']

        segment_dfs.append(df_sub)

    return segment_dfs

# One GPS DataFrame per transect of the selected season, each tagged with its 'segment_path'
segment_dfs = load_gps_data(df_season)

In [None]:
# Draw one labelled path per segment on top of the Antarctica basemap
paths = []
for segment_path in df_season['segment_path'].unique():
    # GPS frames whose transect was assigned to this segment
    segment_members = [
        gps_df for gps_df in segment_dfs
        if gps_df['segment_path'].iloc[0] == segment_path
    ]
    _, segment_curve = geo_util.create_path(segment_members)
    paths.append(segment_curve.relabel(f"Segment {segment_path}"))

p = (stream_util.create_antarctica_basemap() * hv.Overlay(paths)).opts(
    aspect='equal', frame_width=500, frame_height=500, tools=['hover'])
p.opts(title=season_name)

In [None]:
# Inspect the season table, now including the segment columns
df_season

In [None]:
# A one-row slice stays a DataFrame (not a Series); the frame allocation below
# builds its entries the same way
df_season.iloc[0:1].copy()

## Break segments into frames

In [None]:
# Split every segment into frames of roughly break_distance km of track.
#
# The transects of a segment are walked in time order. Each transect is cut at
# the GPS sample whose along-track distance is closest to what the current frame
# still needs; every piece becomes one "entry" (a one-row copy of the transect's
# df_season row plus the GPS index / 'tim' range and the frame number).

break_distance = 50 # km

frame_outputs = {}  # segment path -> {frame number -> list of entries}
all_entries = []    # every entry, in the order it was allocated

segment_paths = df_season['segment_path'].unique()
for seg in segment_paths:
    print(f"Processing segment: {seg}")
    # Note: Should have already been sorted, but just in case
    seg_df = df_season[df_season['segment_path'] == seg].sort_values('start_timestamp')

    frame_idx = 1 # Frame index we're currently assigning
    accumulated_km = 0 # Sum of line-km currently assigned to frame_idx

    frame_outputs[seg] = {frame_idx: []}
    last_x, last_y = None, None

    for transect_iloc in range(len(seg_df)):
        print(f" -> Allocating transect {transect_iloc} {seg_df.index[transect_iloc]}")

        # Load the geometry of this transect
        df = stream_util.load_gzipped_stream_file(
            seg_df.iloc[transect_iloc]['gps_path'],
            debug=False, parse=True, parse_kwargs={'use_ct': True}
        )

        x_proj, y_proj, line_length_m = geo_util.project_split_and_simplify(
            df['LON'].values, df['LAT'].values, calc_length=True, simplify_tolerance=None)

        # NOTE(review): the last projected point is dropped; presumably the
        # helper appends one extra element so that the arrays line up with the
        # rows of df again - confirm against geo_util.
        x_proj = x_proj[:-1]
        y_proj = y_proj[:-1]

        # Calculate the along-track distance, accounting for possible distance from the
        # end of the last transect
        # NOTE(review): the gap is stored as the first delta, but every use of
        # `dist` below is a difference of two of its elements, so the gap
        # cancels out and is never added to a frame - confirm the intent.
        deltas = np.sqrt(np.diff(x_proj)**2 + np.diff(y_proj)**2) / 1000  # Convert to km
        if last_x is not None:  # explicit None test: a coordinate of 0.0 is a valid previous point
            deltas = np.insert(deltas, 0, np.sqrt((x_proj[0] - last_x)**2 + (y_proj[0] - last_y)**2) / 1000)
        else:
            deltas = np.insert(deltas, 0, 0)
        dist = np.cumsum(deltas)

        # Allocate parts of this transect to frames
        transect_start_tim = df['tim'].iloc[0]
        transect_start_idx = 0
        while transect_start_tim < df['tim'].iloc[-1]:
            if transect_start_idx >= len(dist) - 1:
                # No projected points left to allocate; stop rather than spin
                break

            # Find the 'tim' index that fits into the current frame
            remaining_distance = break_distance - accumulated_km

            if dist[-1] - dist[transect_start_idx] <= remaining_distance:
                # The rest of the transect fits into the current frame: take all
                # of it, including any stationary samples at the very end
                break_idx = len(dist) - 1
            else:
                # Only look at samples after the current start so that every
                # pass moves forward
                dists_from_idx = dist[transect_start_idx + 1:] - dist[transect_start_idx]
                break_idx = transect_start_idx + 1 + np.argmin(np.abs(dists_from_idx - remaining_distance))
            break_tim = df['tim'].iloc[break_idx]

            entry = seg_df.iloc[transect_iloc:transect_iloc+1].copy()
            entry['gps_idx_start'] = transect_start_idx
            entry['gps_idx_stop'] = break_idx
            entry['tim_start'] = transect_start_tim
            entry['tim_stop'] = break_tim
            entry['frame_number'] = frame_idx

            all_entries.append(entry)

            # Add an entry to this frame and update distance
            frame_outputs[seg][frame_idx].append(entry)
            accumulated_km += dist[break_idx] - dist[transect_start_idx]
            print(f"   -> Assigned indices {transect_start_idx} to {break_idx} (distance {dist[break_idx] - dist[transect_start_idx]} km) to frame {frame_idx}, now at {accumulated_km} km")

            # Move transect start index
            transect_start_idx = break_idx
            transect_start_tim = break_tim

            # Check if the frame is full (within 2% of the target length)
            if accumulated_km >= 0.98*break_distance:
                print(f"    Frame {frame_idx} is full with {accumulated_km} km")
                frame_idx += 1
                accumulated_km = 0
                frame_outputs[seg][frame_idx] = []

        last_x, last_y = x_proj[-1], y_proj[-1]


In [None]:
# Stack all entries into one table indexed by segment date, segment number and
# frame number. reset_index() first turns the transect index levels into columns.
frames_plan_df = pd.concat(all_entries).reset_index().set_index(['segment_date_str', 'segment_number', 'frame_number'])

In [None]:
def make_segment_gps_file(x):
    """
    Generate the GPS file of one segment.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame-plan rows of a single segment. 'segment_date_str' and
        'segment_number' must be available as columns after reset_index(),
        and 'gps_path' must be a column.

    Returns
    -------
    str
        Path that was handed to generate_gps_file:
        outputs/gps/<season_name>/gps_<segment_date_str>_<segment_number>.mat
    """
    x = x.reset_index()

    # Note: the segment number is not zero-padded here, unlike in 'segment_path'
    segment_label = f"{x['segment_date_str'].iloc[0]}_{x['segment_number'].iloc[0]}"
    print(segment_label)

    gps_paths = list(x['gps_path'].unique())
    output_path = f"outputs/gps/{season_name}/gps_{segment_label}.mat"

    # Make sure the season's output directory exists before the file is written
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    opr_gps_file_generation.generate_gps_file(gps_paths, output_path, format='hdf5')

    return output_path

# Write one GPS file per segment; the displayed result maps each
# (segment_date_str, segment_number) to the path of its file
frames_plan_df.groupby(['segment_date_str', 'segment_number']).apply(make_segment_gps_file)

In [None]:
# Verify that we actually wrote an HDF5 file

import h5py

# NOTE(review): hard-coded absolute path to one specific output file; it looks
# like the location of outputs/gps/<season_name>/gps_20190114_1.mat as written
# by the cell above - confirm, and consider deriving it from season_name.
fn = '/kucresis/scratch/tteisberg_sta/scripts/python/utig_radar_loading/outputs/gps/2018_Antarctica_BaslerJKB/gps_20190114_1.mat'

# Opening the file with h5py succeeds only if it is a valid HDF5 file;
# print its top-level keys as a quick sanity check
with h5py.File(fn, 'r') as f:
    print("Keys: %s" % f.keys())