In [4]:
# Import packages 
import pandas as pd
import numpy as np
import netCDF4
import h5netcdf
import xarray as xr
from os.path import join, exists
import joblib
from glob import glob
import datetime as dt
import sys, os
import pyresample
import itertools
from pathlib import Path

#Filters
from scipy.ndimage import uniform_filter, maximum_filter, gaussian_filter

#Custom Packages
sys.path.append('/home/samuel.varga/python_packages/WoF_post') #WoF post package
sys.path.append('/home/samuel.varga/python_packages/wofs_ml_severe/')
sys.path.append('/home/samuel.varga/python_packages/MontePython/')
sys.path.append('/home/samuel.varga/projects/deep_learning/')

from wofs.post.utils import (
    save_dataset,
    load_multiple_nc_files,
)
from main.dl_2to6_data_pipeline import get_files, load_dataset
from collections import ChainMap

#Plotting packages
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import shapely
import cartopy
import cartopy.crs as ccrs
from cartopy.feature import NaturalEarthFeature
import cartopy.feature as cfeature
import cartopy.io.shapereader as shpreader
from cartopy.feature import ShapelyFeature
from wofs.plotting.wofs_colors import WoFSColors
from wofs_ml_severe.data_pipeline.storm_report_loader import StormReportLoader
%matplotlib inline

In [10]:
#Get list of Patch files - convert cases to datetime
path_base = f'/work/samuel.varga/data/2to6_hr_severe_wx/DEEP_LEARNING/SummaryFiles/'
file_base = f'wofs_DL2TO6_16_16_data.feather'
meta_file_base = f'wofs_DL2TO6_16_16_meta.feather'
out_path = '/work/samuel.varga/data/DEEP_LEARNING/'

In [13]:
dates=[d for d in os.listdir(path_base) if '.txt' not in d]

paths=[] #Valid paths for worker function
bad_paths=[]
for d in dates:
    if d[4:6] !='05': 
        continue

    times = [t for t in os.listdir(join(path_base, d)) if 'basemap' not in t] #Init time

    for t in times:
        path = join(path_base, d , t)
        if exists(join(path,file_base)):
            paths.append(path)

In [24]:
print(paths[0])
print(f'Num Total Paths: {len(paths)} ')

/work/samuel.varga/data/2to6_hr_severe_wx/DEEP_LEARNING/SummaryFiles/20180510/1900
Num Total Paths: 673 


In [33]:
#Check files to see where bad MRMS data, drop cases from list of files
for path in paths:
    ds = xr.load_dataset(join(join(path_base, path), file_base))
    if np.any(ds['MESH_severe__4km'].values<0) or np.any(ds['MRMS_DZ'].values<0):
        print('Bad path found')
        bad_paths.append(path)
        paths.remove(path)
print(f'Num Paths w/ no Missing data: {len(paths)}') 

In [63]:
#Convert remaining files into train/validation/test based on day
temp_paths=[path.split('/')[-2][0:8]+path.split('/')[-1] for path in paths] #Different domains on the same day are treated as identical for the purposes of T/T split
dates=[pd.to_datetime(path, format=f'%Y%m%d%H%M') for path in temp_paths]
np.unique([date.strftime('%Y%m%d') for date in dates])

array(['20180501', '20180502', '20180503', '20180504', '20180507',
       '20180509', '20180510', '20180511', '20180512', '20180514',
       '20180515', '20180516', '20180519', '20180521', '20180523',
       '20180524', '20180525', '20180527', '20180528', '20180529',
       '20180530', '20180531', '20190501', '20190502', '20190503',
       '20190506', '20190507', '20190508', '20190509', '20190510',
       '20190513', '20190514', '20190515', '20190516', '20190517',
       '20190518', '20190520', '20190521', '20190522', '20190523',
       '20190524', '20190526', '20190528', '20190529', '20190530',
       '20200501', '20200504', '20200505', '20200506', '20200507',
       '20200508', '20200513', '20200515', '20200518', '20200519',
       '20200520', '20200521', '20200522', '20200526', '20200527',
       '20200528', '20200529', '20230501', '20230502', '20230503',
       '20230504', '20230505', '20230508', '20230509', '20230510',
       '20230511', '20230512', '20230515', '20230516', '202305

In [None]:
#Split into train/test
from sklearn.model_selection import KFold as kfold

all_dates = np.unique([date.strftime('%Y%m%d') for date in dates])
random.Random(42).shuffle(all_dates)
train_dates, test_dates = train_test_split(all_dates, test_size=0.3)
print(test_dates)

#Split training set into 5 folds
train_folds = kfold(n_splits = 5, random_state=42).split(train_dates)

with open(f'/work/samuel.varga/data/dates_split_deep_learning.pkl', 'wb') as date_file:
    pickle.dump({'train_dates':train_dates,'test_dates':test_dates}, date_file)

In [None]:
def save_rotation_nc(rot_num, path_list, date_list, rot_dates, filenames=None):
    #Get list of paths for current rotation
    rotation_paths=path_list[np.array([date.strftime('%Y%m%d') for date in date_list]).isin(rot_dates)] 
    
    #Add the filename to each of the paths
    print('Appending Filename')
    rotation_file_paths = [join(path, file_base) for path in rotation_paths]
    rotation_meta_paths=[join(path, meta_file_base) for path in rotation_paths]
    
    
    #Open and concat the datasets, then save
    print(f'Saving ML Data for Rot {rot_num}')
    ds = [xr.open_dataset(f) for f in rotation_file_paths]
    ds = xr.concat(datasets)
    ds.to_netcdf(join(out_path, filenames[0] if filenames else f'wofs_dl_severe__2to6hr__rot_{rot_num}__data'))
    ds.close()
    
    print(f'Saving metadata for Rot {rot_num}')
    meta_ds = [xr.open_dataset(f) for f in meta_file_paths]
    meta_ds = xr.concat(meta_ds)
    meta_ds.to_netcdf(join(out_path, filenames[0] if filenames else f'wofs_dl_severe__2to6hr__rot_{rot_num}__meta'))
    meta_ds.close()

In [None]:
#Save training folds:
for rotation in enumerate([train_folds]):
    save_rotation_nc(rotation, paths, dates, train_folds[rotation])

In [None]:
#Save testing set
save_rotation_nc(rotation, paths, dates, test_dates, ('wofs_dl_severe__2to6hr__test__data','wofs_dl_severe__2to6hr__test__meta'))