In [61]:
import pandas as pd
import numpy as np
## spatial filtering
import geopandas as gpd
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('seaducks'), '..')))
from seaducks.utils import iho_region_geometry,discard_undrogued_drifters,assign_each_position_a_bin,identify_time_series_segments, downsample_to_daily,get_corners,haversine_distance,format_coordinates
from seaducks.data_processing.derived_quantities import sst_gradient_pointwise,interpolate_sst_gradient
from shapely import points

In [62]:
dataset = pd.read_hdf('../data/corrected_velocity_drifter_full.h5')

In [63]:
df = dataset.copy()

In [64]:
lon_lim_W = -83
lon_lim_E = -40

df = df.query('@lon_lim_W < lon < @lon_lim_E').copy()

In [5]:
import xarray as xr
filename=r"CMEMS_West_NA_sst_1993.nc"
file_directory = r"D:\PhD\ocean-datasets\copernicus-data"
output_path = os.path.join(file_directory, filename)

dataset = xr.open_dataset(output_path)
sst = dataset['analysed_sst'].isel(time=0)

lat, lon = sst.indexes.values()

In [65]:
arrays_by_year = {}
file_directory = r"D:\PhD\ocean-datasets\copernicus-data"
for year in np.arange(1993,2020):

    filename=f"CMEMS_West_NA_sst_{year}.nc"
    output_path = os.path.join(file_directory, filename)
    
    dataset = xr.open_dataset(output_path)
    
    arrays_by_year[year] = dataset['analysed_sst']

arrays_by_year


FileNotFoundError: [Errno 2] No such file or directory: b'D:\\PhD\\ocean-datasets\\copernicus-data\\CMEMS_West_NA_sst_1993.nc'

In [None]:
sst_array = dataset['analysed_sst']

In [None]:
sst_array.coords

lat_str = sst_array.coords['latitude'].values.astype(str)  # Convert lat values to strings
lon_str = sst_array.coords['longitude'].values.astype(str)  # Convert lat values to strings

In [8]:
sst_array = sst_array.assign_coords(lat_str=('latitude', lat_str))
sst_array = sst_array.assign_coords(lon_str=('longitude', lon_str))

In [9]:
sst_array

In [10]:

time = sst_array['time'].values
lat = sst_array['lat_str'].values
lon = sst_array['lon_str'].values
lat_grid = np.array(lat)
lon_grid = np.array(lon)

In [11]:
h = 0.05                       # degrees
earth_radius = 6371
#h = np.deg2rad(h)*earth_radius # convert to metres

## Aside: Check SST gradient works pointwise

In [12]:
sst_idx = 4
lat_val = sst_array['lat_str'][sst_idx]
lon_val = sst_array['lon_str'][sst_idx]
time_val = time[sst_idx]
lat_neighbours = [format_coordinates(float(lat_val)+ii*h) for ii in np.arange(-2,3,1)]
lon_neighbours = [format_coordinates(float(lon_val)+jj*h) for jj in np.arange(-2,3,1)]

lat_neighbours

['0.1250', '0.1750', '0.2250', '0.2750', '0.3250']

In [13]:
sst_x_neighbours = sst_array.loc[time_val,lat_val, lon_val].values
sst_y_neighbours = sst_array.loc[time_val,lat_val, lon_val].values

In [14]:
sst_x_neighbours

array(298.02999334)

In [15]:
sst_gradient_pointwise(sst_array,(lat_val,lon_val),time_val)

(0.010192311305930346, 0.007194572686535672)

## Interpolating SST gradients to drifter positions

In [23]:
lat_grid_float = [float(val) for val in lat_grid]
lon_grid_float = [float(val) for val in lon_grid]
test_df = assign_each_position_a_bin(df.head(9),lat_grid_float,lon_grid_float,bin_size = 0.05)
test_df

Unnamed: 0,lon,lat,id,time,drogue,u,v,Wx,Wy,Tx,...,v_av,adt,sla,ugos,vgos,adt_err,lon_var,lat_var,lon_bin_size_0.05,lat_bin_size_0.05
1,-60.344002,37.969002,7708593,1993-01-01,True,-15.553,16.743999,12.07761,1.499027,0.254826,...,0.221598,0.730673,0.043759,-0.12704,0.252754,0.113529,0.10961,0.073453,"(-60.375, -60.325]","(37.925, 37.975]"
4,-40.561001,56.797001,7702897,1993-01-01,True,5.719,1.207,17.356215,2.461097,0.874598,...,-0.031946,-0.619196,-0.047426,0.009987,-0.00879,0.013229,0.004689,0.00342,"(-40.575, -40.525]","(56.775, 56.825]"
6,-60.153,36.118999,7701863,1993-01-01,True,52.662003,45.841,11.607692,3.963145,0.249163,...,0.495355,0.29183,-0.352319,0.491027,0.495859,0.057165,0.008439,0.006066,"(-60.175, -60.125]","(36.075, 36.125]"
20,-44.977001,17.966,7712334,1993-01-01,True,-8.535,-0.051,-8.326294,-4.56338,-0.125412,...,-0.005756,0.44286,0.012753,-0.014531,-0.03806,0.014293,0.005477,0.003981,"(-45.025, -44.975]","(17.925, 17.975]"
23,-50.573002,33.060001,7702008,1993-01-01,True,10.243,-6.59,9.770402,2.022355,0.152257,...,-0.073303,0.523113,0.044176,0.017035,-0.081205,0.020019,0.005221,0.0038,"(-50.575, -50.525]","(33.025, 33.075]"
28,-45.449001,23.767,7708594,1993-01-01,True,-19.798,-2.008,-6.067034,-5.806329,-0.079075,...,-0.001869,0.433594,-0.029101,-0.015528,-0.013118,0.012284,0.000219,0.00017,"(-45.475, -45.425]","(23.725, 23.775]"
33,-46.783001,36.48,7712326,1993-01-01,True,7.044,-30.161999,17.758003,6.619777,0.796861,...,-0.225871,0.190379,-0.259014,-0.106132,-0.26112,0.077673,0.004968,0.00362,"(-46.825, -46.775]","(36.475, 36.525]"
40,-41.259998,53.855999,7702076,1993-01-01,True,10.378,-17.363001,14.825249,-0.466872,0.511928,...,-0.057084,-0.589112,-0.02118,0.011623,-0.040094,0.01216,0.000176,0.000138,"(-41.275, -41.225]","(53.825, 53.875]"
54,-40.803001,33.633999,7714456,1993-01-01,True,-5.791,0.811,6.262232,7.324899,0.096579,...,0.061864,0.180863,-0.166515,-0.19232,0.063003,0.021357,0.00026,0.000201,"(-40.825, -40.775]","(33.625, 33.675]"


In [24]:
variables = list(test_df.columns)
corners = test_df.groupby(["lon_bin_size_0.05", "lat_bin_size_0.05"], sort=False, observed=False)[variables]


In [None]:
test_df.loc[:,'corners'] =  corners.apply(lambda x:x).index.map(lambda idx: get_corners(idx)) # this doesn't work because there are still nans in the columns
# new function content

test_df['year'] = test_df['time'].apply(lambda t:t.year)


In [29]:
idx = 0

corners = test_df['corners'].iloc[idx]
drifter_lat_str = str(test_df['lat'].iloc[idx])
drifter_lon_str = str(test_df['lon'].iloc[idx])
drifter_time_val = test_df['time'].iloc[idx]

print(drifter_time_val)

1993-01-01 00:00:00


In [37]:
interpolate_sst_gradient(drifter_lat_str,drifter_lon_str,drifter_time_val,[sst_array],corners)

(-0.010032717808263593, -0.011499763434247915)

In [40]:
test_df['time'].apply(lambda x: arrays_by_year[x.year])

KeyboardInterrupt: 

In [53]:
#test_df[['sst_x_derivative', 'sst_y_derivative']] = 
test_df.apply(
    lambda row: interpolate_sst_gradient(str(row['lat']),str(row['lon']),row['time'],[arrays_by_year[row['year']].sel(time=row['time'])],row['corners']),
    axis=1,
    result_type='expand'
)

IndexError: too many indices

In [54]:
time = pd.Timestamp(test_df['time'].values[0])
sst = arrays_by_year[time.year].sel(time=time)
lat, lon = sst.indexes.values()

In [60]:
test_df.groupby(['year']).apply(lambda row: interpolate_sst_gradient(str(row['lat']),str(row['lon']),row['time'],[arrays_by_year['year']],row['corners'])),
    axis=1,
    result_type='expand'
)

IndentationError: unexpected indent (2700043086.py, line 2)

In [57]:
# new option. Calculate sst by year. As in group the sst array by year and then pass it in to the interpolate per year.


def my_func(df):
    lats = df['lat'].apply(lambda x:str(x))
    lons = df['lon'].apply(lambda x:str(x))
    times = df['time']

    print(arrays_by_year[])

#interpolate_sst_gradient(str(row['lat']),str(row['lon']),row['time'],[arrays_by_year[row['year']].sel(time=row['time'])],row['corners'])

my_func(test_df)

In [None]:
iho_file_path = '../data/world_seas_iho_v3/World_Seas_IHO_v3.shp'
world_seas = gpd.read_file(iho_file_path)


In [None]:
iho_region = 'North Atlantic Ocean'
region = iho_region_geometry(iho_file_path,iho_region)

In [None]:
# unit conversion cm/s -> m/s
df.loc[:, 'u']/=100
df.loc[:, 'v']/=100

In [None]:
for var in ['u','v','Tx','Ty','Wy','Wx']:
            extreme_val_mask = np.abs(df[var] )> 900
            df.loc[extreme_val_mask,var] = np.nan

In [None]:
mask=np.abs(df[['u','v','Tx','Ty','Wy','Wx']])>900
mask.sum().sum()

In [None]:
undrogued_mask = df['drogue'] == False
print(undrogued_mask.sum())

In [None]:
df = discard_undrogued_drifters(df).copy()
undrogued_mask = df['drogue'] == False

In [None]:
undrogued_mask.sum()

In [None]:
df.query('lon_var >=0.25 or lat_var >= 0.25').shape

In [None]:
df = df.query('lon_var<0.25 and lat_var<0.25').copy()

In [None]:
lon_lim_W = -83 
lon_lim_E = -40
df.query('@lon_lim_W > lon or lon > @lon_lim_E').shape

In [None]:
df = df.query('@lon_lim_W < lon < @lon_lim_E').copy()

In [None]:
df.query('@lon_lim_W > lon or lon > @lon_lim_E').shape

In [None]:
drifter_locs = points(df[["lon","lat"]].values).tolist() # (lon,lat) in (x,y) form for geometry
region_mask = [loc.within(region) for loc in drifter_locs]

In [None]:
print(np.array(region_mask).sum()/len(drifter_locs)) # proportion of drifters in the NA region

In [None]:
df = df[region_mask].copy() 

In [None]:
print(np.array(region_mask).sum()/df.shape[0])

In [None]:
df.groupby('id')['time'].transform(identify_time_series_segments).loc[35553860]

In [None]:
df.loc[:,'segment_id'] = df.groupby('id')['time'].transform(identify_time_series_segments)

In [None]:
def test_butterworth_filter(time_series: np.ndarray, latitude: np.ndarray, order: int=5) -> np.ndarray: 
    """
    Applies a 1D Butterworth filter to each column of the input time series data.

    Parameters:
    - time_series: A 2D numpy array of shape (N, P) where N is the number of time points and P is 
        the number of variables.
    - latitude: A 1D numpy array of latitude values corresponding to each time point.
    - order: An integer specifying the order of the Butterworth filter. Default value is 5.

    Returns:
    - A 2D numpy array of the same shape as the input array, with filtered data.
    """
    time_series_len,num_time_series = time_series.shape
    dtype = time_series.dtype
    # initialise output with same shape and dtype as input
    out = np.zeros(time_series.shape,dtype=dtype) 

    # temporarily set missing values to zero
    nan_mask = np.isnan(time_series)

    # prevent changes to the time series outside of this function
    time_series = time_series.copy()
    time_series[nan_mask] = 0

    sample_freq = 1/(6*60*60) #Hz
    nyquist_freq = 0.5*sample_freq 

    return out

def test_apply_butterworth_filter(df: pd.DataFrame) -> pd.DataFrame:
    """
    Applies the butterworth_filter function to each covariate column in the DataFrame.

    This function filters the 'u', 'v', 'Wx', 'Wy', 'Tx', and 'Ty' columns of the DataFrame,
    creating copies if they do not already exist, and replacing the filtered values back
    into the DataFrame.

    Parameters:
    - df: A pandas DataFrame containing the data to be filtered. The DataFrame must contain
          a 'lat' column and the columns specified in cols_to_filter.

    Returns:
    - A pandas DataFrame with the filtered data.
    """
    lat = df['lat'].values
    time_dependent_vars = ['u','v','Wx','Wy','Tx','Ty']

    # prevent changes to the data outside of this function
    for var in time_dependent_vars:
        if var + '_filtered' not in df.columns:
            df[var + '_filtered'] = df[var].copy()
    vars_to_filter = [var + '_filtered' for var in time_dependent_vars]
    
    time_series = df[vars_to_filter].values
    filtered_vars = test_butterworth_filter(time_series,lat)

    df[vars_to_filter] = filtered_vars
    return df




In [None]:
variables = list(df.columns)

In [None]:
test_df = df.copy()

In [None]:

test_df = test_df.groupby(['id', 'segment_id'])[variables]

In [None]:
test_df = test_df.apply(test_apply_butterworth_filter)

In [None]:
test_df = downsample_to_daily(test_df).drop(['segment_id','id'],axis=1)

In [None]:
test_df.reset_index()

In [None]:
## testing sst gradient
import numpy as np
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname('seaducks'), '..')))
from seaducks.utils import stencil_mask

# numerical differentiation
from scipy.ndimage import convolve1d
from astropy.convolution import convolve

h = 1

kernel_stencil = np.array([1/(12*h),-8/(12*h),0, 8/(12*h),-1/(12*h)])[::-1]
kernel_central = np.array([-1/(2*h),0,1/(2*h)])[::-1]
kernel_onesided = np.array([-1/h,1/h])[::-1]

row = np.array([12.0,8.0,np.nan,-8.0,24.0,np.nan,12.0,np.nan])

# evaluate derivatives
dx_stencil = stencil_mask(row,len(kernel_stencil))*convolve(row,kernel_stencil,normalize_kernel=False,nan_treatment='fill',boundary=None,
                    mask=np.isnan(row))
dx_central = stencil_mask(row,len(kernel_central))*convolve(row,kernel_central,normalize_kernel=False,nan_treatment='fill',boundary=None,
                    mask=np.isnan(row))
dx_right = convolve1d(row,kernel_onesided,mode="constant",cval=np.nan)
dx_left = np.roll(dx_right,shift=1,axis=0)


print(f"x: {row}")
print(f"dx_stencil: {dx_stencil}")
print(f"dx_central: {dx_central}")
print(f"dx_left: {dx_left}")
print(f"dx_right: {dx_right}")
