In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import os
# enable autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# define filenames
from config import NC_FILE_NAMES, DF_LABELS
# Project root on the shared filesystem, with its raw-input and
# processed-output subdirectories.
base_dir = '/mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/'
raw_data_dir, dest_dir = (
    os.path.join(base_dir, subdir)
    for subdir in ('raw_datasets/', 'processed_datasets/')
)

# Inputs read from the raw-data directory.
filename_skn_locations, filename_stations = (
    os.path.join(raw_data_dir, basename)
    for basename in ('SKNlocations.csv', 'FilledDataset2012.xlsx')
)
# Output CSV written once the station and reanalysis data have been merged.
dest_filename = os.path.join(dest_dir, 'dataset_6grid.csv')

# Load stations data

In [3]:
# process skn location data
print(f'Loading file: {filename_skn_locations}')
# lower-case every column name, then shorten the coordinate columns to lat / lon
df_locations = (
    pd.read_csv(filename_skn_locations)
    .rename(columns=str.lower)
    .rename(columns={'lat_dd': 'lat', 'lon_dd': 'lon'})
)

# shift the longitude by 360 degrees; this moves negative (west) longitudes
# from the (-180, 0) range into the (180, 360) range
df_locations['lon_updated'] = 360 + df_locations['lon']
# for each row, compute the coordinates of the closest gridpoint in netCDF files
def closest_grid(lat, lon):
    """Return the ``(lat, lon)`` of the candidate grid point nearest to a location.

    Candidates lie on a 2.5-degree grid: latitudes 12.5 .. 25.0 and longitudes
    200.0 .. 210.0 (longitude expressed as a positive value, as in
    ``lon_updated``). Distance is the plain Euclidean distance in degree space.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the location, ``lon`` on the same positive scale as
        the grid.

    Returns
    -------
    tuple
        ``(grid_lat, grid_lon)`` as NumPy floats. When several grid points are
        exactly equidistant, the first one in array order is returned, so the
        result always has exactly two elements. If either input is NaN an
        empty tuple is returned.
    """
    # to save computation, only include grid points around Hawaii
    lat_grids = np.arange(12.5, 27.5, 2.5)
    lon_grids = np.arange(200, 212.5, 2.5)

    # xx holds the latitude and yy the longitude of every candidate point
    xx, yy = np.meshgrid(lat_grids, lon_grids)
    d = np.sqrt((xx - lat)**2 + (yy - lon)**2)
    if np.isnan(d).all():
        # no distance is defined for NaN coordinates; argmin would otherwise
        # silently pick the first grid point
        return ()

    # argmin yields exactly one index even when distances tie, whereas
    # np.where(d == d.min()) would return every tied point
    nearest = np.unravel_index(np.argmin(d), d.shape)
    return (xx[nearest], yy[nearest])

# map every station to its nearest grid point, stored as a (lat, lon) tuple
df_locations['closest_grid'] = [
    closest_grid(station_lat, station_lon)
    for station_lat, station_lon in zip(df_locations['lat'], df_locations['lon_updated'])
]

Loading file: /mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/raw_datasets/SKNlocations.csv


In [4]:
# preview the station table, including the derived lon_updated / closest_grid columns
df_locations.head()

Unnamed: 0,skn,name,lat,lon,lon_updated,closest_grid
0,1.0,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
1,1.1,MORSE FIELD,18.91368,-155.68055,204.31945,"(20.0, 205.0)"
2,1.2,KALAE S TRK STA,18.938669,-155.680549,204.319451,"(20.0, 205.0)"
3,2.0,MANUKA,19.10866,-155.825545,204.174455,"(20.0, 205.0)"
4,2.1,KAHUKU MAUKA 2.10,19.10889,-155.74667,204.25333,"(20.0, 205.0)"


In [5]:
# process rainfall data
# (log the workbook that is actually read below, not the station-location CSV)
print(f'Loading file: {filename_stations}')
df_data = pd.read_excel(filename_stations, sheet_name='Data_in')
df_data.columns = map(str.lower, df_data.columns)
# melt cells into rows: every column other than skn / year becomes one row,
# with the former column name stored in 'month' and its cell in 'data_in'
df_data = df_data.melt(
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='data_in'
)

# convert string month to digit; names missing from this table map to NaN
month_to_digit = dict(
    jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
    jul=7, aug=8, sep=9, oct=10, nov=11, dec=12
)
df_data.month = df_data.month.map(month_to_digit)
# replace empty space to NaN so the downstream process can remove those values
df_data['data_in'] = df_data['data_in'].replace(r'^\s*$', np.nan, regex=True)

Loading file: /mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/raw_datasets/SKNlocations.csv


In [6]:
# preview the melted rainfall table (one row per skn / year / month)
df_data.head()

Unnamed: 0,skn,year,month,data_in
0,1.0,1920,1,4.76
1,1.0,1921,1,7.78
2,1.0,1922,1,4.02
3,1.0,1923,1,11.47
4,1.0,1924,1,0.0


In [7]:
# we need information on how each data is collected.
# (log the workbook that is actually read, and do so before the read starts)
print(f'Loading file: {filename_stations}')
df_source = pd.read_excel(filename_stations, sheet_name='Source')
df_source.columns = map(str.lower, df_source.columns)

# melt cells into rows: every column other than skn / year becomes one row,
# with the former column name stored in 'month' and its cell in 'filled'
df_source = df_source.melt(
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='filled'
)

# source label -> was the value gap-filled (True) or taken from a listed source (False)
b_filled = {
    # filled
    'Fill_1': True, 'Fill_2': True, 'Fill_3': True, 'Fill_4': True, 'Fill_5': True, 'NRFill': True,
    # nonfilled
    'State/NCDC': False, 'NCDC': False, 'State': False, 'Hydronet': False, 'RAWS': False,
    'SCAN': False, 'USGS': False, 'Hydronet/NCDC': False, 'HaleNet': False, 'HC&S': False,
    'AlanMair': False, 'USGS/State': False, 'USGS/NCDC': False, 'AlanMair/State': False,
}
# NOTE: any label not listed in b_filled becomes NaN here
df_source.filled = df_source.filled.map(b_filled)
# month_to_digit is the month-name -> number table defined in the rainfall cell
df_source.month = df_source.month.map(month_to_digit)

Loading file: /mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/raw_datasets/SKNlocations.csv


In [8]:
# preview the melted source table with the boolean 'filled' flag
df_source.head()

Unnamed: 0,skn,year,month,filled
0,1.0,1920,1,True
1,1.0,1921,1,True
2,1.0,1922,1,True
3,1.0,1923,1,True
4,1.0,1924,1,True


In [9]:
# attach the 'filled' flag to each rainfall record (matched on skn, year, month),
# then discard every row that still contains a NaN
df_data_w_source = (
    df_data
    .merge(df_source, on=['skn', 'year', 'month'])
    .dropna()
)

In [10]:
# preview the rainfall records joined with their 'filled' flag
df_data_w_source.head()

Unnamed: 0,skn,year,month,data_in,filled
0,1.0,1920,1,4.76,True
1,1.0,1921,1,7.78,True
2,1.0,1922,1,4.02,True
3,1.0,1923,1,11.47,True
4,1.0,1924,1,0.0,True


In [11]:
# add the station metadata (name, coordinates, closest grid point) to every record
df_ready_for_cdf = df_data_w_source.merge(df_locations, on='skn')
df_ready_for_cdf.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,lon_updated,closest_grid
0,1.0,1920,1,4.76,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
1,1.0,1921,1,7.78,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
2,1.0,1922,1,4.02,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
3,1.0,1923,1,11.47,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
4,1.0,1924,1,0.0,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"


## Load Reanalysis Datasets

In [12]:
# 2-D array with one row per distinct closest grid point; columns are (lat, lon)
unique_closest_coords = np.array(df_locations['closest_grid'].unique().tolist())
# sorted distinct latitudes / longitudes that appear among those grid points
unique_lat, unique_lon = (np.unique(axis_values) for axis_values in unique_closest_coords.T)

In [13]:
# Load every reanalysis netCDF file as a single-column DataFrame indexed by
# (lat, lon, time), restricted to the grid latitudes / longitudes selected above.
dfs = []
for filename, df_label in zip(NC_FILE_NAMES, DF_LABELS):
    filename_reanalysis = os.path.join(raw_data_dir, f'reanalysis_data/{filename}')
    # the context manager closes the file handle as soon as the needed subset
    # has been copied into pandas, instead of leaving one open file per dataset
    with xr.open_dataset(filename_reanalysis) as ds:
        df = ds.sel(lat=unique_lat, lon=unique_lon).to_dataframe() # only load relevant portion
    if "level" in df.index.names: # if there is extra level then we have to drop the level
        df = df.droplevel(level="level")
    df = df.reorder_levels(['lat', 'lon', 'time'])
    # assumes each file contributes exactly one data column -- TODO confirm
    df.columns = [df_label]
    dfs.append(df)

In [14]:
# combine all the netCDF files: one column per variable, index levels become columns
df_cdf_combined = pd.concat(dfs, axis=1).reset_index()
# derive the merge keys: calendar year / month and the (lat, lon) grid tuple
df_cdf_combined = df_cdf_combined.assign(
    year=df_cdf_combined['time'].dt.year,
    month=df_cdf_combined['time'].dt.month,
    grid=list(zip(df_cdf_combined['lat'], df_cdf_combined['lon'])),
)

# Merge station data and reanalysis data

In [15]:
# join each station record with the reanalysis values of its closest grid point
# for the same year and month; the grid's own lat / lon columns are dropped so
# they do not clash with the station coordinates
df_w_closest_obs = df_ready_for_cdf.merge(
    df_cdf_combined.drop(columns=['lat', 'lon']),
    left_on=['year', 'month', 'closest_grid'],
    right_on=['year', 'month', 'grid'],
)

In [16]:
# preview the station records joined with their closest-grid reanalysis values
df_w_closest_obs.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,lon_updated,closest_grid,...,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp,grid
0,1.0,1948,1,3.2,False,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)",...,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849,"(20.0, 205.0)"
1,2.0,1948,1,5.95,True,MANUKA,19.10866,-155.825545,204.174455,"(20.0, 205.0)",...,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849,"(20.0, 205.0)"
2,2.2,1948,1,11.5,True,KAHUKU SHED 3,19.16474,-155.68228,204.31772,"(20.0, 205.0)",...,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849,"(20.0, 205.0)"
3,2.25,1948,1,5.515941,True,RESERVOIR (2940),19.160603,-155.822488,204.177512,"(20.0, 205.0)",...,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849,"(20.0, 205.0)"
4,2.26,1948,1,4.310617,True,CASTLE,19.225323,-155.778876,204.221124,"(20.0, 205.0)",...,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849,"(20.0, 205.0)"


# extract grid data

In [17]:
# For every time step, flatten the reanalysis values of all grid points into a
# single row whose columns are named "<variable>_<k>", where k is the position
# of the grid point after sorting by (lat, lon).
dfs = []
for _, group in df_cdf_combined.groupby(by='time'):
    time = group['time'].unique()[0]
    # number of grid points at this time step (previously hard-coded as 6)
    n_grid = len(group)
    group = (
        group
        .sort_values(by=['lat', 'lon'])
        .drop(columns=['lat', 'lon', 'year', 'grid', 'month'])
        .melt(id_vars=['time'])
    )
    # melt stacks the variables one after another on a fresh 0..n-1 index, so
    # index % n_grid is the grid point's position within its variable
    group['label'] = [
        f"{variable}_{position % n_grid}"
        for position, variable in zip(group.index, group['variable'])
    ]
    # turn the (label, value) pairs into a one-row frame with labels as columns
    group = group.drop(columns=['time', 'variable']).transpose()
    group.columns = group.loc['label']
    group = group.drop('label')
    group['time'] = [time]
    dfs.append(
        group
    )

In [18]:
# stack the per-time-step rows and add the year / month keys used for merging
df_grid = pd.concat(dfs)
df_grid = df_grid.assign(
    year=df_grid['time'].dt.year,
    month=df_grid['time'].dt.month,
)

In [19]:
# preview the flattened grid table (one row per time step)
df_grid.head()

label,air2m_0,air2m_1,air2m_2,air2m_3,air2m_4,air2m_5,air1000_500_0,air1000_500_1,air1000_500_2,air1000_500_3,...,skt_5,slp_0,slp_1,slp_2,slp_3,slp_4,slp_5,time,year,month
value,295.726959,295.300873,295.396027,295.298279,294.962952,294.857971,31.169991,31.37999,31.299995,31.519989,...,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311,1948-01-01,1948,1
value,295.328339,294.948212,294.804077,294.726898,294.499969,294.384399,31.490005,31.980003,32.330002,32.190002,...,22.042482,1015.343079,1015.235779,1015.476929,1016.235535,1015.979919,1015.875488,1948-02-01,1948,2
value,295.800781,295.274536,294.97821,295.364563,294.946594,294.533051,32.830002,32.670006,32.189995,32.87001,...,22.307774,1016.237976,1016.239624,1016.627136,1017.947021,1017.83136,1017.96936,1948-03-01,1948,3
value,296.61438,296.051941,295.796539,296.104492,295.702728,295.371246,30.32,30.140007,29.920006,30.399994,...,23.057003,1015.145142,1015.181641,1015.431641,1016.125305,1016.093994,1016.186157,1948-04-01,1948,4
value,297.482941,296.844818,296.500061,297.024323,296.539459,296.165039,29.990005,29.919991,29.710007,29.919998,...,24.052345,1016.440674,1016.371704,1016.578308,1017.76709,1017.747559,1017.908569,1948-05-01,1948,5


# Merge grid

In [20]:
# add the flattened per-grid-point columns to every station record of the same
# year and month; helper columns that are no longer needed are dropped first
df_6grid_added = (
    df_w_closest_obs
    .drop(columns=['closest_grid', 'grid', 'time', 'lon_updated'])
    .merge(df_grid.drop(columns=['time']), on=['year', 'month'])
)

In [21]:
# preview the station records with the per-grid-point columns attached
df_6grid_added.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,air2m,air1000_500,...,skt_2,skt_3,skt_4,skt_5,slp_0,slp_1,slp_2,slp_3,slp_4,slp_5
0,1.0,1948,1,3.2,False,KALAE,18.916176,-155.674994,295.396027,31.299995,...,23.385218,23.6341,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311
1,2.0,1948,1,5.95,True,MANUKA,19.10866,-155.825545,295.396027,31.299995,...,23.385218,23.6341,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311
2,2.2,1948,1,11.5,True,KAHUKU SHED 3,19.16474,-155.68228,295.396027,31.299995,...,23.385218,23.6341,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311
3,2.25,1948,1,5.515941,True,RESERVOIR (2940),19.160603,-155.822488,295.396027,31.299995,...,23.385218,23.6341,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311
4,2.26,1948,1,4.310617,True,CASTLE,19.225323,-155.778876,295.396027,31.299995,...,23.385218,23.6341,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311


In [22]:
# append elevation data
df_elevation = pd.read_excel(filename_stations, sheet_name="Header")
df_elevation.columns = map(str.lower, df_elevation.columns)
# inner join on skn: records whose station has no row in the Header sheet are dropped
df_elev_added = pd.merge(left=df_6grid_added, right=df_elevation[['skn', 'elevft']], left_on='skn', right_on='skn').rename(columns={"elevft": "elevation"})
# wet-season flag: 1 for months before May or after October, 0 otherwise
# (May - Oct is dry). Computed column-wise instead of with a row-by-row apply.
df_elev_added['season_wet'] = (
    (df_elev_added['month'] < 5) | (df_elev_added['month'] > 10)
).astype(int)

In [23]:
# write the assembled dataset to dest_filename (without the index column) and preview it
df_elev_added.to_csv(dest_filename, index=False)
df_elev_added.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,air2m,air1000_500,...,skt_4,skt_5,slp_0,slp_1,slp_2,slp_3,slp_4,slp_5,elevation,season_wet
0,1.0,1948,1,3.2,False,KALAE,18.916176,-155.674994,295.396027,31.299995,...,23.194658,23.052715,1014.175476,1013.94574,1014.0849,1015.194092,1014.699646,1014.406311,35.0,1
1,1.0,1949,1,8.7,False,KALAE,18.916176,-155.674994,295.52832,32.029999,...,23.426136,23.06262,1014.218628,1014.555359,1015.127014,1015.122925,1015.357971,1015.803894,35.0,1
2,1.0,1950,1,5.14,True,KALAE,18.916176,-155.674994,295.889526,31.689995,...,23.475718,23.184557,1012.599609,1013.097717,1013.64386,1012.946472,1013.287842,1013.578979,35.0,1
3,1.0,1951,1,4.28,True,KALAE,18.916176,-155.674994,295.649872,31.349998,...,23.691616,23.522514,1016.933472,1017.046753,1017.61969,1018.097595,1018.057739,1018.303101,35.0,1
4,1.0,1952,1,8.23,True,KALAE,18.916176,-155.674994,295.321808,29.190002,...,23.605747,23.20055,1014.928284,1015.308533,1016.047974,1016.068726,1016.284241,1016.759949,35.0,1


# For some ML experiments, only use stations with more than 750 samples and only use non-filled data

In [24]:
# resetting recommended, to refresh te memory. Otherwise kernel is likely to die
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [25]:
import numpy as np
import xarray as xr
import pandas as pd
import os
# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
# define filenames
from config import NC_FILE_NAMES, DF_LABELS
# Project root and the directory holding the processed datasets.
base_dir = '/mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/'
dest_dir = os.path.join(base_dir, 'processed_datasets/')
# Input: the full 6-grid dataset; output: its non-filled, min-750 subset.
filename_6grid, filename_750 = (
    os.path.join(dest_dir, basename)
    for basename in ('dataset_6grid.csv', 'dataset_6grid_nonfilled_min_750.csv')
)

In [27]:
# reload the 6-grid dataset from disk (the namespace was reset above)
df = pd.read_csv(filename_6grid)

In [28]:
def assign_fold(df, n_folds=5, min_data=None, nonfilled=False):
    """Assign a cross-validation fold index to every row, keeping each month intact.

    Months are taken in (year, month) order and cut into ``n_folds`` contiguous
    chunks of roughly equal sample count. All rows sharing a (year, month) get
    the same fold, so no month is split across folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``skn``, ``year`` and ``month``, plus ``filled``
        when ``nonfilled`` is true.
    n_folds : int, default 5
        Number of folds; fold indices are ``0 .. n_folds - 1``.
    min_data : int or None, default None
        If given, keep only stations (``skn``) with strictly more than
        ``min_data`` rows, counted after the ``nonfilled`` filter.
    nonfilled : bool, default False
        If true, drop every row whose ``filled`` value is ``True``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with four added columns: ``n_data`` (rows of that
        station), ``len`` (rows in that month), ``cumsum`` (running total of
        ``len`` up to and including that month) and ``fold``.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.

    Notes
    -----
    The fold is derived from the zero-based position of the month's last
    sample, ``(cumsum - 1) // n_samples_per_fold``. Dividing the inclusive
    ``cumsum`` directly, as done previously, yields the out-of-range index
    ``n_folds`` for the final month whenever the total sample count is a
    multiple of ``n_folds``. The two formulas differ only for months whose
    ``cumsum`` is an exact multiple of the per-fold size.
    """
    if n_folds < 1:
        raise ValueError(f'n_folds must be a positive integer, got {n_folds}')

    if nonfilled: # if nonfilled == True, remove any row with filled == True
        df = df[df['filled'] != True]

    # number of rows per station, optionally used to filter out sparse stations
    df_n_data = df.groupby('skn').size().reset_index(name='n_data')
    if min_data is not None:
        df_n_data = df_n_data[df_n_data['n_data'] > min_data]
    df = df.merge(df_n_data, on='skn', how='inner')

    # rows per month and their running total, in (year, month) order
    df_len_by_month = df.groupby(by=['year', 'month']).size().reset_index(name='len')
    df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
    n_samples_total = int(df_len_by_month['len'].sum())
    # at least 1 so the division below is defined even when nothing survived the filters
    n_samples_per_fold = max(1, int(np.ceil(n_samples_total / n_folds)))

    df_len_by_month['fold'] = (
        (df_len_by_month['cumsum'] - 1) // n_samples_per_fold
    ).astype(int)

    df_w_fold = pd.merge(left=df, right=df_len_by_month, on=['year', 'month'])

    return df_w_fold

In [29]:
# only use stations with more than 750 samples (assign_fold keeps n_data > min_data,
# a strict comparison), and only use non-filled data
df_nonfilled = assign_fold(df, min_data=750, nonfilled=True)

In [30]:
# preview the filtered dataset with the added n_data / len / cumsum / fold columns
df_nonfilled.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,air2m,air1000_500,...,slp_2,slp_3,slp_4,slp_5,elevation,season_wet,n_data,len,cumsum,fold
0,54.0,1948,1,5.37,False,HAWAII NAT PK HQ,19.429741,-155.257146,295.39603,31.299995,...,1014.0849,1015.194092,1014.699646,1014.406311,3970.0,1,778,21,21,0
1,79.0,1948,1,6.17,False,KULANI CAMP,19.548635,-155.300557,295.39603,31.299995,...,1014.0849,1015.194092,1014.699646,1014.406311,5170.0,1,758,21,21,0
2,338.0,1948,1,30.89,False,HALEAKALA RANGER,20.758467,-156.245523,295.39603,31.299995,...,1014.0849,1015.194092,1014.699646,1014.406311,7030.0,1,778,21,21,0
3,250.0,1948,1,5.27,False,ULUPALAKUA RANCH,20.648174,-156.398497,295.30087,31.37999,...,1014.0849,1015.194092,1014.699646,1014.406311,1900.0,1,775,21,21,0
4,267.0,1948,1,7.8,False,KULA SANATORIUM,20.70014,-156.355519,295.30087,31.37999,...,1014.0849,1015.194092,1014.699646,1014.406311,3005.0,1,772,21,21,0


In [31]:
# print the size of each fold as (rows, columns)
for fold_id in range(5):
    print(df_nonfilled[df_nonfilled['fold'] == fold_id].shape)

(3673, 126)
(3703, 126)
(3686, 126)
(3700, 126)
(3709, 126)


In [32]:
# write the non-filled, min-750 dataset (with fold assignments) to filename_750
df_nonfilled.to_csv(filename_750, index=False)

<hr><hr>

### Appendix: check consistency with the old version

In [34]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [1]:
import pandas as pd

In [2]:
# check 6grid datasets: load the file from the old pipeline and the new one
# NOTE(review): df_new is presumably the dataset_6grid.csv written earlier in this
# notebook, reached through a different mount path -- confirm
df_old = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/PI-CASC/processed_data/ml_filled_dataset.csv')
df_new = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid.csv')

In [3]:
# outer-join the two versions on (year, month, skn); the 'Exist' column records
# whether each key was found in the left file, the right file, or both
df_compare = df_old.merge(
    df_new,
    on=['year', 'month', 'skn'],
    how='outer',
    indicator='Exist',
)

In [12]:
# all rows should exist in both files: every key must be marked 'both'
# (a single distinct indicator value alone would also accept all 'left_only')
assert len(df_compare) > 0 and (df_compare['Exist'] == 'both').all(), \
    'some (year, month, skn) keys exist in only one of the two files'
assert df_old.shape[0] == df_new.shape[0]
for col in df_old.columns:
    if col in ['year', 'month', 'skn']: continue
    # after the merge, shared columns carry the suffix _x (old) and _y (new)
    old_values, new_values = df_compare[f'{col}_x'], df_compare[f'{col}_y']
    both_numeric = all(
        pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values)
        for values in (old_values, new_values)
    )
    if both_numeric:
        # numeric columns may differ by floating-point noise only
        matches = (old_values - new_values).abs() < 1e-10
    else:
        # strings, booleans and other non-numeric columns must be identical
        matches = old_values == new_values
    assert matches.all(), f'column {col!r} differs between the old and new datasets'

In [16]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [19]:
import pandas as pd
# check the non-filled, min-750 datasets: load the old-pipeline file and the new one
df_old = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/PI-CASC/processed_data/ml_nonfilled_min_750_dataset.csv')
# NOTE(review): this notebook writes 'dataset_6grid_nonfilled_min_750.csv'; the name
# read here before ('dataset_6grid_min_750.csv') is not produced by any cell, so the
# check could silently compare against a stale file -- confirm the intended file
df_new = pd.read_csv('/home/yusukemh/sadow_lts/personal/yusukemh/pi_casc/processed_datasets/dataset_6grid_nonfilled_min_750.csv')

In [20]:
# outer-join the two versions on (year, month, skn); the 'Exist' column records
# whether each key was found in the left file, the right file, or both
df_compare = df_old.merge(
    df_new,
    on=['year', 'month', 'skn'],
    how='outer',
    indicator='Exist',
)

In [21]:
# all rows should exist in both files: every key must be marked 'both'
# (a single distinct indicator value alone would also accept all 'left_only')
assert len(df_compare) > 0 and (df_compare['Exist'] == 'both').all(), \
    'some (year, month, skn) keys exist in only one of the two files'
assert df_old.shape[0] == df_new.shape[0]
for col in df_old.columns:
    if col in ['year', 'month', 'skn']: continue
    # after the merge, shared columns carry the suffix _x (old) and _y (new)
    old_values, new_values = df_compare[f'{col}_x'], df_compare[f'{col}_y']
    both_numeric = all(
        pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values)
        for values in (old_values, new_values)
    )
    if both_numeric:
        # numeric columns may differ by floating-point noise only
        matches = (old_values - new_values).abs() < 1e-10
    else:
        # strings, booleans and other non-numeric columns must be identical
        matches = old_values == new_values
    assert matches.all(), f'column {col!r} differs between the old and new datasets'