In [307]:
import numpy as np
# import netCDF4 as nc
import xarray as xr
import pandas as pd

# Root directory for the input files read in this notebook: SKNlocations.csv,
# FilledDataset2012.xlsx and the reanalysis_data/ folder.
# NOTE(review): absolute, machine-specific path -- consider making it configurable.
data_dir = '/mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/PI-CASC'
from config import FILE_NAMES, DF_LABELS

# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [369]:
# Load the SKN location table and normalise its column names.
df_locations = pd.read_csv(f'{data_dir}/SKNlocations.csv')
df_locations.columns = map(str.lower, df_locations.columns)
df_locations = df_locations.rename(columns={'lat_dd': 'lat', 'lon_dd': 'lon'})

# Convert the longitude from the (-180, 180) convention to (0, 360).
# The modulo gives the same result as "+ 360" for negative longitudes but
# leaves non-negative ones untouched instead of pushing them past 360.
df_locations['lon_updated'] = df_locations['lon'] % 360
# for each row, compute the coordinates of the closest grid point in the netCDF files
def closest_grid(lat, lon):
    """Return the (lat, lon) of the grid point nearest to the given location.

    To save computation, only a small window of 2.5-degree grid points around
    Hawaii is searched: latitudes 12.5 .. 25.0 and longitudes 200.0 .. 210.0
    (0-360 convention). "Nearest" means smallest Euclidean distance measured
    in degrees.

    Parameters
    ----------
    lat : float
        Latitude in decimal degrees.
    lon : float
        Longitude in decimal degrees, 0-360 convention.

    Returns
    -------
    tuple
        ``(grid_lat, grid_lon)``. If the location is exactly equidistant from
        several grid points, a single one is returned: the candidate with the
        smallest longitude, then the smallest latitude. An empty tuple is
        returned when either coordinate is NaN or infinite, because no
        nearest grid point exists in that case.
    """
    # argmin would silently pick index 0 for an all-NaN distance grid, so
    # reject unusable coordinates explicitly.
    if not (np.isfinite(lat) and np.isfinite(lon)):
        return ()

    # to save computation, only include grid points around Hawaii
    lat_grids = np.arange(12.5, 27.5, 2.5)
    lon_grids = np.arange(200, 212.5, 2.5)

    grid_lat, grid_lon = np.meshgrid(lat_grids, lon_grids)
    distance = np.sqrt((grid_lat - lat) ** 2 + (grid_lon - lon) ** 2)

    # argmin yields exactly one index (the first minimum), so ties can no
    # longer produce a tuple holding several grid points.
    nearest = np.unravel_index(np.argmin(distance), distance.shape)
    return (grid_lat[nearest], grid_lon[nearest])

# Tag every location with its nearest grid point; the coordinate tuple
# returned by closest_grid is stored in a single column.
df_locations['closest_grid'] = df_locations.apply(
    lambda station: closest_grid(station['lat'], station['lon_updated']),
    axis='columns',
)

In [371]:
df_locations.head()

Unnamed: 0,skn,name,lat,lon,lon_updated,closest_grid
0,1.0,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
1,1.1,MORSE FIELD,18.91368,-155.68055,204.31945,"(20.0, 205.0)"
2,1.2,KALAE S TRK STA,18.938669,-155.680549,204.319451,"(20.0, 205.0)"
3,2.0,MANUKA,19.10866,-155.825545,204.174455,"(20.0, 205.0)"
4,2.1,KAHUKU MAUKA 2.10,19.10889,-155.74667,204.25333,"(20.0, 205.0)"


In [264]:
# Read the rainfall sheet and lower-case its headers so they match the keys
# used below.
df_data = pd.read_excel(f"{data_dir}/FilledDataset2012.xlsx", sheet_name='Data_in')
df_data.columns = [str.lower(header) for header in df_data.columns]

# Reshape wide -> long: every column other than skn/year becomes one
# (month, data_in) row.
df_data = pd.melt(
    df_data,
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='data_in',
)

# Month abbreviation -> month number.
month_to_digit = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
df_data['month'] = df_data['month'].map(month_to_digit)

# Turn empty / whitespace-only cells into NaN so the downstream dropna can
# remove them.
df_data['data_in'] = df_data['data_in'].replace(r'^\s*$', np.nan, regex=True)

In [265]:
df_data.head()

Unnamed: 0,skn,year,month,data_in
0,1.0,1920,1,4.76
1,1.0,1921,1,7.78
2,1.0,1922,1,4.02
3,1.0,1923,1,11.47
4,1.0,1924,1,0.0


In [261]:
# Load the sheet describing how each rainfall value was obtained.
df_source = pd.read_excel(f"{data_dir}/FilledDataset2012.xlsx", sheet_name='Source')
df_source.columns = [str.lower(header) for header in df_source.columns]

# Reshape wide -> long, mirroring the layout of the rainfall table.
df_source = pd.melt(
    df_source,
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='filled',
)

# Source label -> True for filled values, False for nonfilled ones.
# Labels missing from this mapping come out of Series.map as NaN.
b_filled = dict.fromkeys(
    ('Fill_1', 'Fill_2', 'Fill_3', 'Fill_4', 'Fill_5', 'NRFill'),
    True,
)
b_filled.update(dict.fromkeys(
    (
        'State/NCDC', 'NCDC', 'State', 'Hydronet', 'RAWS',
        'SCAN', 'USGS', 'Hydronet/NCDC', 'HaleNet', 'HC&S',
        'AlanMair', 'USGS/State', 'USGS/NCDC', 'AlanMair/State',
    ),
    False,
))
df_source['filled'] = df_source['filled'].map(b_filled)
# month_to_digit is the abbreviation -> number mapping built in the rainfall cell.
df_source['month'] = df_source['month'].map(month_to_digit)

In [262]:
df_source.head()

Unnamed: 0,skn,year,month,filled
0,1.0,1920,1,True
1,1.0,1921,1,True
2,1.0,1922,1,True
3,1.0,1923,1,True
4,1.0,1924,1,True


In [266]:
# Sanity check: the melted rainfall and source tables must have identical
# dimensions before they are joined.
assert df_data.shape == df_source.shape

In [294]:
# Join each rainfall value to its source flag on (skn, year, month), then
# discard every row that contains a NaN in any column.
df_data_w_source = (
    df_data
    .merge(
        df_source,
        left_on=['skn', 'year', 'month'],
        right_on=['skn', 'year', 'month'],
    )
    .dropna()
)

In [295]:
df_data_w_source.head()

Unnamed: 0,skn,year,month,data_in,filled
0,1.0,1920,1,4.76,True
1,1.0,1921,1,7.78,True
2,1.0,1922,1,4.02,True
3,1.0,1923,1,11.47,True
4,1.0,1924,1,0.0,True


In [372]:
# Attach the location columns (name, coordinates, nearest grid point) to
# every rainfall record, matching on `skn`.
df_ready_for_cdf = df_data_w_source.merge(
    df_locations,
    left_on='skn',
    right_on='skn',
)
df_ready_for_cdf.head()

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,lon_updated,closest_grid
0,1.0,1920,1,4.76,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
1,1.0,1921,1,7.78,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
2,1.0,1922,1,4.02,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
3,1.0,1923,1,11.47,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"
4,1.0,1924,1,0.0,True,KALAE,18.916176,-155.674994,204.325006,"(20.0, 205.0)"


In [379]:
# Stack the distinct nearest-grid tuples into a 2-column array
# (column 0 = latitude, column 1 = longitude), then collect the distinct
# values along each axis.
unique_closest_coords = np.array(df_locations['closest_grid'].unique().tolist())
unique_lat, unique_lon = (
    np.unique(unique_closest_coords[:, column]) for column in (0, 1)
)

In [414]:
# Build one DataFrame per reanalysis file, each holding a single column named
# after its configured label and indexed by (lat, lon, time).
dfs = []
for filename, df_label in zip(FILE_NAMES, DF_LABELS):
    # Open the file in a context manager so its handle is released as soon as
    # the needed subset has been copied into a DataFrame.
    with xr.open_dataset(f"{data_dir}/reanalysis_data/{filename}") as dataset:
        # only load relevant portion: the grid points some location maps to
        df = dataset.loc[dict(lat=unique_lat, lon=unique_lon)].to_dataframe()
    if "level" in df.index.names:
        # if there is an extra level index, drop it so every frame shares the
        # same (lat, lon, time) index
        df = df.droplevel(level="level")
    df = df.reorder_levels(['lat', 'lon', 'time'])
    # NOTE(review): assumes each file yields exactly one data column;
    # otherwise this assignment raises a length-mismatch error.
    df.columns = [df_label]
    dfs.append(df)

In [415]:
# Place the per-variable frames side by side, aligned on their shared index,
# then turn that index back into ordinary columns.
df_cdf_combined = pd.concat(dfs, axis='columns')
df_cdf_combined = df_cdf_combined.reset_index()

In [417]:
# Calendar keys used to join the reanalysis values to the rainfall records.
df_cdf_combined['year'] = df_cdf_combined['time'].dt.year
df_cdf_combined['month'] = df_cdf_combined['time'].dt.month
# Pair the coordinates as (lat, lon) tuples so they can be matched against
# `closest_grid`. zip() builds the tuples straight from the two columns,
# avoiding a Python-level function call per row through apply(axis=1).
df_cdf_combined['grid'] = list(zip(df_cdf_combined['lat'], df_cdf_combined['lon']))

In [418]:
# Preview the columns produced by joining the rainfall records to the
# reanalysis values on (year, month, grid point). The reanalysis lat/lon
# columns are dropped first because the `grid` tuple already carries them.
df_ready_for_cdf.merge(
    df_cdf_combined.drop(columns=['lat', 'lon']),
    left_on=['year', 'month', 'closest_grid'],
    right_on=['year', 'month', 'grid'],
).columns

Index(['skn', 'year', 'month', 'data_in', 'filled', 'name', 'lat', 'lon',
       'lon_updated', 'closest_grid', 'time', 'air2m', 'air1000_500', 'hgt500',
       'hgt1000', 'omega500', 'pottemp1000-500', 'pottemp1000-850', 'pr_wtr',
       'shum-uwnd-700', 'shum-uwnd-925', 'shum-vwnd-700', 'shum-vwnd-950',
       'shum700', 'shum925', 'skt', 'slp', 'grid'],
      dtype='object')

In [303]:
# Inspect the records whose original (-180..180) longitude is -156 or less.
df_ready_for_cdf.loc[df_ready_for_cdf['lon'] <= -156]

Unnamed: 0,skn,year,month,data_in,filled,name,lat,lon,lon_updated,closest_grid
77780,63.0,2006,1,0.363448,False,Kaloko-Honokohau,19.672778,-156.020278,203.979722,"[20.0, 205.0]"
77781,63.0,2007,1,0.86,False,Kaloko-Honokohau,19.672778,-156.020278,203.979722,"[20.0, 205.0]"
77782,63.0,2008,1,0.31,False,Kaloko-Honokohau,19.672778,-156.020278,203.979722,"[20.0, 205.0]"
77783,63.0,2009,1,1.28,False,Kaloko-Honokohau,19.672778,-156.020278,203.979722,"[20.0, 205.0]"
77784,63.0,2010,1,0.2,False,Kaloko-Honokohau,19.672778,-156.020278,203.979722,"[20.0, 205.0]"
...,...,...,...,...,...,...,...,...,...,...
1241277,425.5,2012,11,3.791339,False,HAIKU 3.2 SE HI US,20.876100,-156.304700,203.695300,"[20.0, 202.5]"
1241278,425.5,2012,12,6.913386,False,HAIKU 3.2 SE HI US,20.876100,-156.304700,203.695300,"[20.0, 202.5]"
1241279,830.6,1957,10,2.38,False,GAGE 4 MINK,21.466829,-157.933919,202.066081,"[22.5, 202.5]"
1241287,324.3,1949,12,1.5,False,KULA,20.740136,-156.317187,203.682813,"[20.0, 202.5]"
