# Generate test data for iterative ensemble algorithms

Test data is of the GEFS forecasts of the 2018 hurricanes 

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot  as plt
import numpy as np
import copy
from datetime import datetime  
from datetime import timedelta  
import os
os.environ["PROJ_LIB"] = "C:\\ProgramData\\Anaconda3\\Library\\share";
from mpl_toolkits.basemap import Basemap
import numpy as np
import pickle

In [2]:
def parse_lat_lon(lat, lon):
    '''
        parse the latitude, longitude values 
        input is an array of the format 214N, 859W 
        (lat: 0 - 900 tenths of degrees, lon: 0 - 900 tenths of degrees,)
    '''   
    lat_ = []
    lon_ = []
    for x in lat:
        if 'N' in x:
            lat_.append(float(x.strip('N')))
        elif 'S' in x:
            lat_.append(-1*float(x.strip('S')))
    
    for x in lon:
        if 'E' in x:
            lon_.append(float(x.strip('E')))
        elif 'W' in x:
            lon_.append(-1*float(x.strip('W')))

    lat_ = np.array(lat_)/10.0
    lon_ = np.array(lon_)/10.0
    
    return lat_, lon_

def normalized_xy_Coords(lat, lon, m):
    nF = max(m.xmax, m.ymax)

    x, y = np.array(m(lon,lat,inverse=False))
    
    return [x/nF, y/nF]

def xy_Coords(lat, lon, m):

    x, y = np.array(m(lon,lat,inverse=False))
    
    return [x, y]


In [3]:
# Set some parameters
wdir = 'C:/Users/vravindr/Dropbox/my_workspace/tracking storms/'

data_dir = wdir + 'data/2018/'
ens_data_dir = data_dir + 'ensemble/'
bt_data_dir = data_dir + 'best_track/'

xfn = ['al012018', 'al022018', 'al032018', 'al042018', 'al052018', 'al062018', 'al072018', 'al082018',
       'al092018', 'al102018', 'al112018', 'al122018', 'al132018', 'al142018', 'al152018', 'al162018']
basin ='Al'
Storms = ['ALBERTO', 'BERYL', 'CHRIS', 'DEBBY', 'ERNESTO', 'FLORENCE', 'GORDON', 'HELENE',
          'ISAAC', 'JOYCE', 'ELEVEN', 'KIRK', 'LESLIE', 'MICHAEL', 'NADINE', 'OSCAR']


'''
al012018 ALBERTO
al022018 BERYL
al032018 CHRIS
al042018 DEBBY 
al052018 ERNESTO
al062018 FLORENCE
al072018 GORDON
al082018 HELENE
al092018 ISAAC
al102018 JOYCE
al112018 ELEVEN
al122018 KIRK
al132018 LESLIE
al142018 MICHAEL
al152018 NADINE
al162018 OSCAR
'''    
# Storm to consider (all the forecasts of the storm are used)
ens_tn = ['AC00', 'AP01','AP02','AP03','AP04','AP05','AP06','AP07','AP08','AP09','AP10','AP11','AP12','AP13','AP14',
          'AP15','AP16','AP17','AP18','AP19','AP20']
ens_mean_tn = ['AEMN'] # ensemble mean track name
best_tn = ['BEST']

forecast_periods = [0, 6, 12, 18, 24, 30, 36, 42, 48] # forecast horizon is 48hrs, and time step is 6 hrs, origin is from 0hrs

nens = len(ens_tn)
ntst = len(forecast_periods) # number of time steps. 6hrs is the time step for the GEFS forecasts.

    
if(basin == 'Al'):
    # North Atlantic basin borders: lat: 0 deg to 50 deg, lon: 10W to 100W
    # Choose standard parallels: 16.67N and  33.33N, central lon: 45W, llcrnrlon=-100 , llcrnrlat=0, urcrnrlon=-10, urcrnrlat=50
    proj = Basemap(resolution='l',projection='eqdc',\
                lat_1=16.67,lat_2=33.33,lon_0=-45.0,
                llcrnrlon=-100 , llcrnrlat=0 , urcrnrlon=-10, urcrnrlat=50  )

nF = max(proj.xmax, proj.ymax) # normalizing factor of the coordinates (X, Y) used later
print(nF)

cols_names = ['BASIN', 'CY', 'YYYYMMDDHH', 'TECHNUM/MIN', 'TECH', 'TAU', 'LatN/S', 'LonE/W', 'VMAX', 'MSLP', 'TY', 'RAD', 'WINDCODE', 
        'RAD1', 'RAD2', 'RAD3', 'RAD4', 'POUTER', 'ROUTER', 'RMW', 'GUSTS', 'EYE', 'SUBREGION', 'MAXSEAS', 'INITIALS', 'DIR', 
        'SPEED', 'STORMNAME', 'DEPTH', 'SEAS', 'SEASCODE', 'SEAS1', 'SEAS2', 'SEAS3', 'SEAS4', 'USERDEFINED1', 'userdata1',
        'USERDEFINED2', 'userdata2', 'USERDEFINED3', 'userdata3', 'USERDEFINED4', 'userdata4', 'USERDEFINED5', 'userdata5']


The dedent function was deprecated in Matplotlib 3.1 and will be removed in 3.3. Use inspect.cleandoc instead.


9173846.41491117


In [4]:
datafn = 0

ensemble_data = np.ndarray(shape=(1, nens*ntst), dtype=float)
mean_ens_data = np.ndarray(shape=(1, ntst), dtype=float)
bt_label_data = np.ndarray(shape=(1, ntst), dtype=float)

nD = 0 # number of data-sets collected
    
for _xfn in xfn: # loop over all the avialble data files
    
    ens_fn = ens_data_dir + 'a' + _xfn + '.dat'
    bt_fn = bt_data_dir + 'b' + _xfn + '.dat'

    ens_data = pd.read_csv(ens_fn,names=cols_names, engine='python')
    bt_data = pd.read_csv(bt_fn,names=cols_names, engine='python')

    # parse and replace the latitude and longitude values
    [lat, lon] = parse_lat_lon(ens_data['LatN/S'], ens_data['LonE/W'])
    ens_data['LatN/S'] = lat
    ens_data['LonE/W'] = lon

    [lat, lon] = parse_lat_lon(bt_data['LatN/S'], bt_data['LonE/W'])
    bt_data['LatN/S'] = lat
    bt_data['LonE/W'] = lon

    ###################################################################################

    ens_data2 = copy.deepcopy(ens_data)

    # populate the Storm name in rows which dont have it
    storm_name = ens_data2.iloc[0]['STORMNAME'] # initialize storm name
    for index, row in ens_data.iterrows():
        if(row['STORMNAME'] is None):
            ens_data2.iat[index, 27]= storm_name
        else:
            storm_name = row['STORMNAME'] 

    ens_data2 = ens_data2.loc[ens_data2['TAU']>= min(forecast_periods)]
    ens_data2 = ens_data2.loc[ens_data2['TAU']<= max(forecast_periods)]

    # Add X, Y coordinates to the dataframes
    [X, Y] = xy_Coords(np.array(ens_data2['LatN/S']),np.array(ens_data2['LonE/W']), proj)
    ens_data2.insert(len(ens_data2.columns), "X", X) 
    ens_data2.insert(len(ens_data2.columns), "Y", Y) 

    [X, Y] = xy_Coords(np.array(bt_data['LatN/S']),np.array(bt_data['LonE/W']), proj)
    bt_data.insert(len(bt_data.columns), "X", X) 
    bt_data.insert(len(bt_data.columns), "Y", Y) 
    ########################################################################################

  
    st = Storms[datafn]
    # extract the data containing the storm name
    ens_data3 = ens_data2[ens_data2['STORMNAME'].str.contains(st)]


    # find unique forecasts
    forecasts = ens_data3['YYYYMMDDHH'].unique()

    # regrid the (lat, lon) of the storms to (X,Y), and add the (X,Y) cols

    d1 = copy.deepcopy(ens_data3)    

    ''' loop over each forecast of the data file'''
    for fc in forecasts:

        d2 = d1[d1['YYYYMMDDHH'] == fc]

        # extract set of [ensembles, mean ensemble, best-track] of this forecast, place them one below another
        ens_d = pd.DataFrame(columns=['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W'])
        bt_d = pd.DataFrame(columns=['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W'])   
        tracks = ens_tn + ens_mean_tn + best_tn
        for tn in tracks:
            d3 = copy.deepcopy(d2)
            d3 = d3[(d3['TECH'].str.contains(tn))]  
            d3 = d3.drop_duplicates(subset ="TAU") # drop rows with duplicate forecast period
            d3 = d3.sort_values(by='TAU', ascending=True) # make sure they are in ascending order
            ens_d = ens_d.append(d3, sort=False)
            # for best tracks there is no "forecast" date, rather the date in the column corresponds to the date of the track info
            if(tn =='BEST'):
                bd1 = copy.deepcopy(bt_data)
                fc_dt = datetime.strptime(str(fc), '%Y%m%d%H')
                array = []
                for fp in forecast_periods:
                    dt = fc_dt + timedelta(hours = fp)
                    array.append(dt.strftime('%Y%m%d%H'))
                bd1 = bd1.loc[bd1['YYYYMMDDHH'].isin(array)]
                bd1 = bd1.drop_duplicates(subset ="YYYYMMDDHH") # drop rows with duplicate dates
                bd1 = bd1.sort_values(by='YYYYMMDDHH', ascending=True) # make sure they are in ascending order

        if(len(ens_d) == (nens+1)*ntst and len(bd1)==ntst): # sometimes some enseble tracks may not have certain forecasts, 
            # and/or best track data may not be avialable. In that case the data is not added to the dataset.
            nD = nD + 1
            ed = ens_d[~ens_d['TECH'].str.contains(ens_mean_tn[0])]
            med = ens_d[ens_d['TECH'].str.contains(ens_mean_tn[0])]
            if(nD ==1): # very first dataset
                ensemble_data = ed.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()
                mean_ens_data = med.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()
                bt_label_data = bd1.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()
                print(nD)
                print(bd1['STORMNAME'].iloc[0])
            else:
                ensemble_data = np.vstack((ensemble_data, ed.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()))
                mean_ens_data = np.vstack((mean_ens_data, med.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()))
                bt_label_data = np.vstack((bt_label_data, bd1.filter(['YYYYMMDDHH', 'TECH', 'LatN/S', 'LonE/W', 'X', 'Y']).to_numpy()))
                print(nD)
                print(bd1['STORMNAME'].iloc[0])
        
    ''' End loop over forecasts '''
    datafn = datafn + 1 # increment data file number


1
    ALBERTO
2
    ALBERTO
3
    ALBERTO
4
    ALBERTO
5
    ALBERTO
6
    ALBERTO
7
    ALBERTO
8
    ALBERTO
9
    ALBERTO
10
    ALBERTO
11
    ALBERTO
12
    ALBERTO
13
    ALBERTO
14
    ALBERTO
15
    ALBERTO
16
      BERYL
17
      BERYL
18
      BERYL
19
      BERYL
20
      BERYL
21
      BERYL
22
      BERYL
23
      BERYL
24
      BERYL
25
      BERYL
26
      BERYL
27
      BERYL
28
      BERYL
29
      BERYL
30
      BERYL
31
      BERYL
32
      BERYL
33
      BERYL
34
      BERYL
35
      BERYL
36
      BERYL
37
      BERYL
38
      BERYL
39
      BERYL
40
      BERYL
41
      BERYL
42
      BERYL
43
      BERYL
44
      BERYL
45
      BERYL
46
      BERYL
47
      BERYL


KeyboardInterrupt: 

In [5]:
print("Number of data sets collected: ", nD)

if(nD>0):
    ensemble_data = ensemble_data.reshape(nD,nens*ntst,6)
    mean_ens_data = mean_ens_data.reshape(nD,ntst,6)
    bt_label_data = bt_label_data.reshape(nD,ntst,6)

    with open('2018_ensTest_6hrs.pickle', 'wb') as handle:
        pickle.dump(ensemble_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('2018_meanEnsTest_48hrs.pickle', 'wb') as handle:
        pickle.dump(mean_ens_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open('2018_bestLabelTest_48hrs.pickle', 'wb') as handle:
        pickle.dump(bt_label_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

Number of data sets collected:  318


In [6]:
'''
with open('ensTest.pickle', 'rb') as handle:
    ensemble_data = pickle.load(handle)
'''


"\nwith open('ensTest.pickle', 'rb') as handle:\n    ensemble_data = pickle.load(handle)\n"