In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import geopy.distance
from tqdm import tqdm
from pathlib import Path
from scipy import interpolate
import matplotlib.pyplot as plt
from concorde.tools import get_list, readFort22, from_mag_to_uv
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
ccolors = plt.rcParams['axes.prop_cycle'].by_key()['color']

pd.options.display.max_rows = 10

import warnings
warnings.filterwarnings("ignore")
from matplotlib.offsetbox import AnchoredText

In [2]:
# Working directory of the preprocessing stage: the per-run track pickles are read
# from here and the merged dictionary, sample CSV and padded array are written back to it.
pathout = Path(r'../models/adcirc/concorde/batch02/_postprocessing/_preprocessForNN')

In [3]:
# Load the two per-run track dictionaries (same keys expected in both);
# they are merged key by key in the next cell.
dctTracks0, dctTracks1 = (
    pd.read_pickle(pathout/fname)
    for fname in (
        'dct_tracksAll_batch02_lengthCorr_tides_resampled.pkl',
        'dct_tracksAll_batch02_estuaries_lengthCorr_tides_resampled.pkl',
    )
)

In [4]:
# Column groups of the merged track table; `cols` fixes the final column order
# (and therefore the feature order of the padded input array built later).

# storm-track parameters
cols0 = [
    'lon', 'lat', 'wind_speed', 'pressure', 'rad_to_max_ws',
    'heading_dir', 'forward_speed', 'forward_speed_u', 'forward_speed_v',
]
# one '<name>_fft' column per track parameter
cols1 = [name + '_fft' for name in cols0]
# station / estuary columns
cols3 = [
    'Duck', 'Oregon', 'Hatteras', 'Beaufort', 'Wilmington', 'Wrightsville',
    'Albemarle', 'Pamlico', 'Neuse',
]
# distance columns: the first six names are lower-cased in the column label,
# the last three keep their capitalisation
cols21 = ['dist_to_' + name.lower() for name in cols3[:6]]
cols22 = ['dist_to_' + name for name in cols3[6:]]

cols = [*cols0, *cols1, *cols21, *cols22, *cols3, 'Boundary']

In [22]:
# Estuary columns taken from the second dictionary for every run.
auxcols = ['Albemarle', 'dist_to_Albemarle', 'Pamlico', 'dist_to_Pamlico', 'Neuse', 'dist_to_Neuse']
sampleCsv = pathout/'dct_tracksAll_batch02_ALL_lengthCorr_tides_resampled_SAMPLE.csv'

dctTracks = {}
for position, (runKey, baseTrack) in enumerate(dctTracks0.items()):
    # Join the estuary columns next to the base table (aligned on the row index),
    # then select/reorder the columns according to `cols`.
    merged = pd.concat([baseTrack, dctTracks1[runKey][auxcols]], axis = 1).loc[:, cols]
    dctTracks[runKey] = merged
    # Only the first run is exported as CSV, as a sample of the merged layout.
    if position == 0:
        merged.to_csv(sampleCsv)

# Persist the complete merged dictionary; it is read back in the augmentation section.
with open(pathout/'dct_tracksAll_batch02_ALL_lengthCorr_tides_resampled.pkl', 'wb') as fout:
    pickle.dump(dctTracks, fout)

### Zero Padding

In [6]:
# Number of rows of every merged track, one entry per run.
# Both columns are derived from the same sorted key list so that each length
# stays on the same row as the run it belongs to (previously the lengths were
# collected in sorted order while 'run' used the dictionary's own order).
runs = sorted(dctTracks.keys())
lengths = [len(dctTracks[run]) for run in runs]
dfl = pd.DataFrame({'length': lengths, 'run': runs})
# Summary statistics; the maximum length is used as the padded sequence length below.
dfl.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
length,1813.0,99.923883,28.306754,28.0,79.0,109.0,118.0,235.0


In [7]:
# Prepend zero rows to every track so that all runs share the longest length,
# then stack the padded tables into a single float array (run, row, column).
# Columns are never padded; only the leading side of the row axis is.
maxLength = dfl['length'].max()
inpArrPadded = np.asarray(
    [
        np.pad(track.values, ((maxLength - len(track), 0), (0, 0)), mode = 'constant')
        for track in dctTracks.values()
    ],
    dtype = float,
)

np.save(pathout/f'arr_tracksAll_batch02_ALL_lengthCorr_tides_resampled.npy', inpArrPadded, allow_pickle=False)

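## Train/test split

The split itself is performed after the augmentation, at the end of the "Augment all storms" section below.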

## Augment data

In [8]:
# Seed NumPy's global random generator so that the random tide start indices
# drawn with np.random.randint in the augmentation loop are repeatable.
np.random.seed(42)

### Augment all storms

In [9]:
pathin = Path(r'../models/adcirc/concorde/batch02/_postprocessing/_preprocessForNN')
## data from fort.22
dctTracks = pd.read_pickle(pathin/'dct_tracksAll_batch02_ALL_lengthCorr_tides_resampled.pkl')
## data from fort.63 (stored one directory above pathin)
wlDir = pathin.parent
dctWL0 = pd.read_pickle(wlDir/'time_series_water_level_at_NOAA_NC_closest.pkl')
dctWL1 = pd.read_pickle(wlDir/'time_series_water_level_at_NC_estuaries_closest.pkl')
## the estuary tables are relabelled (in place) to zeta_pnt006..zeta_pnt008
## and appended column-wise to the table of the same run in dctWL0
estuaryLabels = [f'zeta_pnt00{i}' for i in range(6, 9)]
dctWL = {}
for runKey, stationWL in dctWL0.items():
    estuaryWL = dctWL1[runKey]
    estuaryWL.columns = estuaryLabels
    dctWL[runKey] = pd.concat([stationWL, estuaryWL], axis = 1)
## zero padded input array written by the padding cell
inpArrPadded = np.load(pathin/f'arr_tracksAll_batch02_ALL_lengthCorr_tides_resampled.npy')

def _read_base_series(filename):
    """Read one base-simulation CSV located next to `pathin`.

    The first column is parsed as a datetime index and any timezone
    information is dropped from that index before the frame is returned.
    """
    frame = pd.read_csv(pathin.parent/filename, index_col = 0, parse_dates = True)
    return frame.tz_localize(None)


## fort.63 of base simulation
baseNOAA = _read_base_series('time_series_water_level_at_NOAA_NC_closest_baseSim.csv')
baseOregon = _read_base_series('time_series_water_level_at_NOAA_NC_closest_baseSim_newOregon.csv')
baseDom = _read_base_series('time_series_water_level_at_domCenter_baseSim.csv')
baseEst = _read_base_series('time_series_water_level_at_NC_estuaries_closest_baseSim.csv')

## replace the Oregon series with the one from the "newOregon" file
## (copied by position, not by timestamp)
baseNOAA['Oregon'] = baseOregon['Oregon'].values
## estuary series are appended column-wise, aligned on the time index
baseNOAA = pd.concat([baseNOAA, baseEst], axis = 1)
## domain-centre series becomes the last column (again copied by position)
baseNOAA['Boundary'] = baseDom.values.reshape(-1)

## read outputs
outDir = pathin.parent
dfout0 = pd.read_csv(outDir/'max_water_level_at_NC_NOAA_stations.csv', index_col = 0)
dfout1 = pd.read_csv(outDir/'max_water_level_at_NC_NC_estuaries.csv', index_col = 0).sort_index()
## estuary table is relabelled with zero-padded four-digit run ids
dfout1.index = ['{:04d}'.format(i) for i in dfout1.index]
### remove base simulation (dropped as the last row of the station table) and join the estuary columns
dfout = pd.concat([dfout0.iloc[:-1, :], dfout1], axis = 1)
## add a trailing singleton axis: (run, point) -> (run, point, 1)
nRuns, nPoints = dfout.shape
arrOut = np.array(dfout).reshape((nRuns, nPoints, 1))

In [10]:
# Range (max - min) of every base-simulation series, printed with two decimals.
baseStats = baseNOAA.describe().T
seriesRange = baseStats['max'] - baseStats['min']
for station, span in seriesRange.items():
    print(f'Tidal range {station} = {span:0.2f}')

Tidal range Duck = 1.93
Tidal range Oregon = 0.53
Tidal range Hatteras = 0.21
Tidal range Beaufort = 2.05
Tidal range Wilmington = 1.78
Tidal range Wrightsville = 2.40
Tidal range Albemarle = 0.13
Tidal range Pamlico = 0.19
Tidal range Neuse = 0.17
Tidal range Boundary = 0.73


In [20]:
# Data augmentation: for every storm, the surge-only signal (storm run minus
# base run) is recombined with `nrep_per_storm` randomly shifted windows of the
# base-simulation series. Each repetition yields one new input sample (padded
# track with the last 10 columns replaced by the shifted series) and one new
# output sample (maximum of the recombined water level per column).
augmented_inputs = []
augmented_outputs = []
nrep_per_storm = 50
## position lookup in the base-simulation index; built once instead of once per storm
baseIndexList = baseNOAA.index.to_list()
## total is passed explicitly because enumerate() hides the length from tqdm
## NOTE(review): `ik` indexes inpArrPadded, which assumes the array rows follow the
## key order of dctTracks -- confirm both were produced from the same dictionary
for ik, k in tqdm(enumerate(dctTracks.keys()), total = len(dctTracks)):
    track = dctTracks[k]
    dfWL = dctWL[k]
    dfWL = dfWL.tz_localize(None)
    ## intersect timestep with storm k and remove time series at model boundary
    dftide = baseNOAA.loc[baseNOAA.index.isin(dfWL.index), :].iloc[:, :-1]
    ## columns are relabelled by position
    ## NOTE(review): assumes dfWL and baseNOAA list the stations in the same order -- confirm
    dfWL.columns = dftide.columns
    ## get only surge
    dfSurgeOnly = dfWL - dftide
    ## duration of storm
    durSurge = dfSurgeOnly.index[-1] - dfSurgeOnly.index[0]
    ## last possible random start
    lastStart = baseNOAA.index[-1] - durSurge
    ixLastStart = baseIndexList.index(lastStart)
    ## quantities that do not change between repetitions of the same storm
    inTrack = dfWL.index.isin(track.index)
    basePadded = inpArrPadded[ik, :, :]
    ## random starting date ensuring all track time series is included in the new tide time series
    for r in range(nrep_per_storm):
        ## randint excludes its upper bound: +1 makes the last possible start
        ## eligible and avoids an error when the only valid start is index 0
        ## (this changes the sequence of draws compared with earlier runs)
        ixRandomStart = np.random.randint(0, ixLastStart + 1)
        randomStart = baseNOAA.index[ixRandomStart]
        ## subset
        newTide = baseNOAA.loc[randomStart:randomStart+durSurge, :].resample('1H').mean()
        ## change dates
        newTide.index = dfWL.index
        ## new tide added to the surge only series after ramp
        newWL = (newTide.iloc[:, :-1] + dfSurgeOnly).loc[track.index[0]:, :]
        ## get max total water level
        maxNewWL = newWL.max(axis = 0)

        ## copy of the zero padded input of this storm; the copy keeps inpArrPadded untouched
        inpArrCp2 = basePadded.copy()
        ## overwrite the last 10 columns of the non-padded rows with the shifted series
        inpArrCp2[-len(track):, -10:] = newTide.loc[inTrack, :].values
        augmented_inputs.append(inpArrCp2)

        augmented_outputs.append(maxNewWL.values.reshape((maxNewWL.shape[0], 1)))

augmented_inputs = np.asarray(augmented_inputs)
augmented_outputs = np.asarray(augmented_outputs)

pathoutNN = Path(r'../models/NNmodel/inputs/random_split')

# Original samples first, augmented samples appended after them.
mergedInputs = np.concatenate((inpArrPadded, augmented_inputs), axis = 0)
mergedOutputs = np.concatenate((arrOut, augmented_outputs), axis = 0)

# Random 85/15 split over all samples; the positional sample ids are split
# alongside so the membership of each subset can be saved.
# NOTE(review): repetitions of the same storm can end up on both sides of the
# split -- confirm this is intended for the "random_split" dataset.
sampleIds = range(mergedInputs.shape[0])
splitParts = train_test_split(mergedInputs, mergedOutputs, sampleIds,
                              test_size=0.15, random_state=42, shuffle=True)
X_train, X_test, Y_train, Y_test, idx_train, idx_test = splitParts

def _scale_padded(X, scaler, fit = False):
    """Scale a zero-padded 3-D array feature-wise and mark the padding.

    The array is flattened to (sample * step, feature). Every exact zero is
    treated as padding: it is turned into NaN before scaling (so it neither
    influences the fitted statistics nor gets scaled) and written back as the
    sentinel -9999 afterwards.
    NOTE(review): a genuine 0.0 feature value cannot be told apart from
    padding here and also becomes -9999 -- confirm this is acceptable.

    Parameters
    ----------
    X : np.ndarray
        Float array with three axes; it is not modified.
    scaler : object
        Scaler exposing `fit_transform` / `transform`.
    fit : bool
        If True the scaler is fitted on `X`, otherwise the already fitted
        scaler is only applied.

    Returns
    -------
    np.ndarray
        Scaled array with the same shape as `X`.
    """
    # explicit copy: reshape alone returns a view, and writing NaN through it
    # would silently overwrite the zeros of the caller's array
    flat = X.reshape(-1, X.shape[-1]).copy()
    flat[flat == 0] = np.nan
    scaled = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    del flat
    # `scaled` never aliases X, so the sentinel can be written in place
    scaled = np.nan_to_num(scaled, copy = False, nan = -9999)
    return scaled.reshape(X.shape)


scaler = StandardScaler()

# statistics are fitted on the training set only and reused for the test set
X_train_sc = _scale_padded(X_train, scaler, fit = True)
print(X_train_sc.shape)
print(Y_train.shape)

X_test_sc = _scale_padded(X_test, scaler)
print(X_test_sc.shape)
print(Y_test.shape)

# Write the scaled inputs and the targets as .npy files (same order as before:
# X_train, y_train, X_test, y_test).
npyOutputs = (
    (f'X_train_standardScaled_allInputs_augmentedAllX{nrep_per_storm:02d}_ALL.npy', X_train_sc),
    (f'y_train_augmentedAllX{nrep_per_storm:02d}_ALL.npy', Y_train),
    (f'X_test_standardScaled_allInputs_augmentedAllX{nrep_per_storm:02d}_ALL.npy', X_test_sc),
    (f'y_test_augmentedAllX{nrep_per_storm:02d}_ALL.npy', Y_test),
)
for fname, array in npyOutputs:
    np.save(pathoutNN/fname, array, allow_pickle = False)

# Positional sample ids of each subset, one integer per line.
indexOutputs = (
    (f'indices_train_standardScaled_allInputs_augmentedAllX{nrep_per_storm:02d}_ALL.txt', idx_train),
    (f'indices_test_standardScaled_allInputs_augmentedAllX{nrep_per_storm:02d}_ALL.txt', idx_test),
)
for fname, sampleIds in indexOutputs:
    np.savetxt(pathoutNN/fname, sampleIds, fmt='%d')

# Fitted scaler, so the same transformation can be applied outside this notebook.
with open(pathoutNN/f'standarScaler_augmentedAllX{nrep_per_storm:02d}_ALL.pkl', 'wb') as fout:
    pickle.dump(scaler, fout)

1813it [05:06,  5.91it/s]


(78593, 235, 37)
(78593, 9, 1)
(13870, 235, 37)
(13870, 9, 1)
