In [1]:
# Import packages 
import pandas as pd
import numpy as np
import netCDF4
import h5netcdf
import xarray as xr
from os.path import join, exists
import joblib
from glob import glob
import datetime as dt
import sys, os
import pyresample
import itertools
from pathlib import Path

#Filters
from scipy.ndimage import uniform_filter, maximum_filter, gaussian_filter

#Custom Packages
sys.path.append('/home/samuel.varga/python_packages/WoF_post') #WoF post package
sys.path.append('/home/samuel.varga/python_packages/wofs_ml_severe/')
sys.path.append('/home/samuel.varga/python_packages/MontePython/')
sys.path.append('/home/samuel.varga/projects/deep_learning/')

from wofs.post.utils import (
    save_dataset,
    load_multiple_nc_files,
)
from main.dl_2to6_data_pipeline import get_files, load_dataset
from collections import ChainMap

#Plotting packages
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import shapely
import cartopy
import cartopy.crs as ccrs
from cartopy.feature import NaturalEarthFeature
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader
from cartopy.feature import ShapelyFeature
from wofs.plotting.wofs_colors import WoFSColors
from wofs_ml_severe.data_pipeline.storm_report_loader import StormReportLoader
%matplotlib inline

lookup_file: /home/samuel.varga/python_packages/WoF_post/wofs/data/psadilookup.dat


In [2]:
# Locations of the patch files: each run lives in <path_base>/<case date>/<init time>/
path_base = '/work/samuel.varga/data/2to6_hr_severe_wx/DEEP_LEARNING/SummaryFiles/'

# File names expected inside every run directory (data patches and their metadata)
file_base, meta_file_base = (
    'wofs_DL2TO6_16_16_data.feather',
    'wofs_DL2TO6_16_16_meta.feather',
)

# Destination directory for the combined output files
out_path = '/work/samuel.varga/data/DEEP_LEARNING/'

In [3]:
# Case-date directories under path_base (anything with '.txt' in its name is skipped)
dates = [entry for entry in os.listdir(path_base) if '.txt' not in entry]

paths = []      # Valid paths for worker function
bad_paths = []  # Stays empty here; a later cell records paths with bad MRMS data

for d in dates:
    # Characters 4-5 of the directory name must be '05' (the month part of a YYYYmmdd name)
    if d[4:6] == '05':
        day_dir = join(path_base, d)
        times = [t for t in os.listdir(day_dir) if 'basemap' not in t]  # Init time

        # Keep only the init-time directories that actually contain the data file
        for t in times:
            path = join(day_dir, t)
            if not exists(join(path, file_base)):
                continue
            paths.append(path)

In [4]:
# Sanity check: show the first collected run directory and how many were found
print(paths[0], f'Num Total Paths: {len(paths)} ', sep='\n')

/work/samuel.varga/data/2to6_hr_severe_wx/DEEP_LEARNING/SummaryFiles/20200518/1800
Num Total Paths: 961 


In [5]:
#Check files to see where bad MRMS data, drop cases from list of files
# Build new lists instead of calling paths.remove() inside the loop: removing from the
# list being iterated shifts the remaining items, so the path right after each bad one
# was skipped and never checked.
clean_paths = []
bad_paths = []  # Reset here so re-running this cell does not record the same path twice
for path in paths:
    # `path` is already the full run directory, so only the file name has to be appended
    ds = xr.load_dataset(join(path, file_base))
    try:
        # A negative value in either field is treated as bad/missing MRMS data
        has_bad_mrms = bool(
            np.any(ds['MESH_severe__4km'].values < 0) or np.any(ds['MRMS_DZ'].values < 0)
        )
    finally:
        ds.close()

    if has_bad_mrms:
        print('Bad path found')
        bad_paths.append(path)
    else:
        clean_paths.append(path)

paths = clean_paths
print(f'Num Paths w/ no Missing data: {len(paths)}')

Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Bad path found
Num Paths w/ no Missing data: 949


In [6]:
#Convert remaining files into train/validation/test based on day
# Each path ends in .../<date directory>/<init time>. Only the first 8 characters of the
# date directory are kept, so different domains on the same day are treated as identical
# for the purposes of the train/test split.
temp_paths = []
for path in paths:
    parts = path.split('/')
    temp_paths.append(parts[-2][0:8] + parts[-1])

# One timestamp per path, parsed from the combined YYYYmmddHHMM string
dates = [pd.to_datetime(stamp, format='%Y%m%d%H%M') for stamp in temp_paths]

# Display the distinct case days that remain
np.unique([date.strftime('%Y%m%d') for date in dates])

array(['20190501', '20190502', '20190503', '20190506', '20190507',
       '20190508', '20190509', '20190510', '20190513', '20190514',
       '20190515', '20190516', '20190517', '20190518', '20190520',
       '20190521', '20190522', '20190523', '20190524', '20190526',
       '20190528', '20190529', '20190530', '20200501', '20200504',
       '20200505', '20200506', '20200507', '20200508', '20200513',
       '20200515', '20200518', '20200519', '20200520', '20200521',
       '20200522', '20200526', '20200527', '20200528', '20200529',
       '20210503', '20210504', '20210505', '20210507', '20210510',
       '20210513', '20210514', '20210517', '20210519', '20210521',
       '20210524', '20210525', '20210526', '20210528', '20220502',
       '20220505', '20220506', '20220511', '20220512', '20220516',
       '20220518', '20220520', '20220523', '20220524', '20220526',
       '20220527', '20220529', '20220530', '20230501', '20230502',
       '20230503', '20230504', '20230505', '20230508', '202305

In [27]:
#Split into train/test
from sklearn.model_selection import KFold as kfold, train_test_split
import random

SPLIT_SEED = 42  # One seed for every random step in this cell so the split can be reproduced

all_dates = np.unique([date.strftime('%Y%m%d') for date in dates])
random.Random(SPLIT_SEED).shuffle(all_dates)
# random_state is required here: without it train_test_split draws a different
# train/test partition on every run, so the printed dates could not be reproduced.
train_dates, test_dates = train_test_split(all_dates, test_size=0.3, random_state=SPLIT_SEED)
print('Training Dates:')
print(train_dates)

print('Testing Dates:')
print(test_dates)

#Split training set into 5 folds
# KFold.split() returns a generator that can be consumed only once. Store the folds in a
# list so that every later cell that loops over train_folds sees all five rotations.
train_folds = list(kfold(n_splits = 5, random_state=SPLIT_SEED, shuffle=True).split(train_dates))

# NOTE: `pickle` must be imported before the two lines below are re-enabled.
#with open(f'/work/samuel.varga/data/dates_split_deep_learning.pkl', 'wb') as date_file:
    #pickle.dump({'train_dates':train_dates,'test_dates':test_dates}, date_file)

Training Dates:
['20210525' '20200526' '20200515' '20220502' '20200529' '20190509'
 '20220529' '20230504' '20190521' '20230522' '20230509' '20200506'
 '20190520' '20190524' '20200504' '20200528' '20220511' '20200505'
 '20190503' '20210507' '20230523' '20190526' '20200518' '20230526'
 '20190513' '20210521' '20220524' '20230510' '20200501' '20190514'
 '20230516' '20190501' '20210528' '20230521' '20230517' '20220527'
 '20200507' '20190517' '20190522' '20190529' '20210524' '20190528'
 '20220526' '20210504' '20230501' '20190510' '20230511' '20210517'
 '20220518' '20210519' '20230525' '20200522' '20230518' '20230515'
 '20220523' '20190506' '20210514' '20190516' '20230530' '20210526'
 '20200519' '20210510' '20210505']
Testing Dates:
['20230531' '20190518' '20220506' '20190530' '20200520' '20190508'
 '20230503' '20190507' '20210503' '20230519' '20230524' '20210513'
 '20220516' '20190523' '20200521' '20220530' '20200527' '20200508'
 '20220505' '20200513' '20230512' '20230505' '20190502' '202205

In [8]:
# Report, for every rotation, the fold indices and how many run directories fall on the
# training days and on the validation days.
path_array = np.array(paths)
day_labels = np.array([date.strftime('%Y%m%d') for date in dates])  # One YYYYmmdd label per path

for i, (train_index, val_index) in enumerate(train_folds):
    print(f'Rotation: {i}')
    print(train_index, val_index)
    for fold_index in (train_index, val_index):
        on_fold_days = np.isin(day_labels, train_dates[fold_index])
        print(len(path_array[on_fold_days]))

Rotation: 0
[ 1  2  3  4  6  7  8 10 11 13 14 15 17 18 19 20 21 22 23 24 26 27 28 29
 30 31 32 33 34 35 37 38 39 41 42 44 45 46 47 48 49 50 51 52 53 54 56 58
 59 62] [ 0  5  9 12 16 25 36 40 43 55 57 60 61]
534
129
Rotation: 1
[ 0  1  2  5  7  9 10 11 12 14 15 16 18 20 21 22 23 24 25 26 27 28 29 30
 31 32 35 36 37 38 39 40 41 42 43 44 45 47 50 51 52 54 55 56 57 58 59 60
 61 62] [ 3  4  6  8 13 17 19 33 34 46 48 49 53]
521
142
Rotation: 2
[ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 16 17 18 19 20 21 22 23 25 28
 29 33 34 35 36 37 38 39 40 42 43 44 46 47 48 49 50 51 52 53 55 56 57 60
 61 62] [11 15 24 26 27 30 31 32 41 45 54 58 59]
527
136
Rotation: 3
[ 0  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 22 24 25 26 27
 28 30 31 32 33 34 36 38 40 41 42 43 45 46 48 49 51 52 53 54 55 57 58 59
 60 61 62] [ 1  2 21 23 29 35 37 39 44 47 50 56]
539
124
Rotation: 4
[ 0  1  2  3  4  5  6  8  9 11 12 13 15 16 17 19 21 23 24 25 26 27 29 30
 31 32 33 34 35 36 37 39 40 41 43 44 45 46 47 48 49 50 

In [26]:
def _concat_patch_files(file_paths, dim='patch_no'):
    '''Open every file in file_paths and return their concatenation along dim, loaded in memory.'''
    if len(file_paths) == 0:
        raise ValueError('No files were given to concatenate')
    opened = []
    try:
        for f in file_paths:
            opened.append(xr.open_dataset(f))
        # Load before the sources are closed so the result never depends on an open file handle
        return xr.concat(opened, dim=dim).load()
    finally:
        # The original code only closed the concatenated result and left every source file open
        for source in opened:
            source.close()


def save_rotation_nc(rot_num, train_ind, val_ind, unique_dates, path_list, date_list, filenames=None,
                     max_files=10, save=False):
    '''Build, and optionally write, the combined datasets for one cross-validation rotation.

        rot_num: int - rotation number
        train_ind: array of int - indices of the training days - indices correspond to entries of unique_dates
        val_ind: array of int - indices of the validation days - indices correspond to entries of unique_dates
        unique_dates: numpy array - unique dates in the training set as YYYYmmdd strings (indexed with train_ind/val_ind)
        path_list: list - list of file paths of length N that contain directory info and init time
        date_list: list - list of length N of datetime-like objects (each must provide .strftime), one for the corresponding path in path_list
        filenames: optional sequence of four output file names, in the order
            (training data, training meta, validation data, validation meta). Only used when save is True.
            Defaults to 'wofs_dl_severe__2to6hr__rot_<rot_num>__<training|validation>_<data|meta>'.
        max_files: int or None - at most this many files are read per split. The default of 10 keeps the
            truncation that was hard-coded before; pass None to read every file.
        save: bool - when False (default) only the training data is built and returned, nothing is written.
            When True the training/validation data and metadata are all built and written into out_path.

        Returns the concatenated training dataset.
        Raises ValueError if a split has no files, or if save is True and filenames does not have four entries.
    '''
    #Get list of paths for current rotation
    day_labels = np.array([date.strftime('%Y%m%d') for date in date_list])
    path_array = np.array(path_list)
    training_paths = list(path_array[np.isin(day_labels, unique_dates[train_ind])])
    validation_paths = list(path_array[np.isin(day_labels, unique_dates[val_ind])])

    #Add the filename to each of the paths
    print('Appending Filenames')
    training_file_paths = [join(path, file_base) for path in training_paths[:max_files]]
    training_meta_paths = [join(path, meta_file_base) for path in training_paths[:max_files]]
    validation_file_paths = [join(path, file_base) for path in validation_paths[:max_files]]
    validation_meta_paths = [join(path, meta_file_base) for path in validation_paths[:max_files]]

    #Create Training Data
    print(f'Saving training data for Rot {rot_num}')
    training_ds = _concat_patch_files(training_file_paths)
    #Save mean/variance for use in scaling <-----
    if not save:
        # Dry run: same result as before these options existed (training data only, no files written)
        return training_ds

    if filenames is None:
        filenames = [f'wofs_dl_severe__2to6hr__rot_{rot_num}__{split}_{kind}'
                     for split in ('training', 'validation') for kind in ('data', 'meta')]
    elif len(filenames) != 4:
        raise ValueError('filenames must hold four names: training data, training meta, '
                         'validation data, validation meta')

    training_ds.to_netcdf(join(out_path, filenames[0]))

    # NOTE(review): the metadata files are assumed to be concatenated along 'patch_no' like the
    # data files; the original call passed no dimension at all - TODO confirm.
    remaining_outputs = [
        (f'Saving metadata for Rot {rot_num}', training_meta_paths, filenames[1]),
        (f'Saving validation data for Rot {rot_num}', validation_file_paths, filenames[2]),
        (f'Saving metadata for Rot {rot_num}', validation_meta_paths, filenames[3]),
    ]
    for message, file_paths, out_name in remaining_outputs:
        print(message)
        combined = _concat_patch_files(file_paths)
        combined.to_netcdf(join(out_path, out_name))
        combined.close()

    return training_ds

In [28]:
#Save training folds:
# `d` is overwritten on every pass, so only the result of the last rotation is kept
for i, (train_ind, val_ind) in enumerate(train_folds):
    d = save_rotation_nc(
        rot_num=i,
        train_ind=train_ind,
        val_ind=val_ind,
        unique_dates=train_dates,
        path_list=paths,
        date_list=dates,
    )

Appending Filenames
Saving training data for Rot 0
Appending Filenames
Saving training data for Rot 1
Appending Filenames
Saving training data for Rot 2
Appending Filenames
Saving training data for Rot 3
Appending Filenames
Saving training data for Rot 4


In [None]:
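#Save testing set
#TODO: the call below does not match the signature of save_rotation_nc
#(rot_num, train_ind, val_ind, unique_dates, path_list, date_list, filenames); update it before enabling
#save_rotation_nc(rotation, paths, dates, test_dates, ('wofs_dl_severe__2to6hr__test_data','wofs_dl_severe__2to6hr__test_meta'))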

In [None]:
#Rename directory to data_workflow