In [14]:
import numpy as np
import pandas as pd
import xarray as xr
import os
import time
from config import NC_FILE_NAMES, DF_LABELS, NC_ATTRIBUTES

In [2]:
# define filenames
from config import NC_FILE_NAMES, DF_LABELS
base_dir = '/mnt/lts/nfs_fs02/sadow_lab/personal/yusukemh/pi_casc/'
raw_data_dir = os.path.join(base_dir, 'raw_datasets/')
dest_dir = os.path.join(base_dir, 'processed_datasets/')

filename_skn_locations = os.path.join(raw_data_dir, 'SKNlocations.csv')
filename_stations = os.path.join(raw_data_dir, 'FilledDataset2012.xlsx')
# dest_filename = os.path.join(dest_dir, 'dataset_interp.csv')

In [32]:
df_rainfall = pd.read_excel(filename_stations, sheet_name="Data_in")
df_station_metadata = pd.read_excel(filename_stations, sheet_name="Header")

In [34]:
df_rainfall.columns = map(str.lower, df_rainfall.columns)
# melt cells into rows
df_rainfall = df_rainfall.melt(
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='data_in'
)

# convert string month to digit
month_to_digit = dict(
    jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
    jul=7, aug=8, sep=9, oct=10, nov=11, dec=12
)
df_rainfall.month = df_rainfall.month.map(month_to_digit)
# replace empty space to NaN so the downstream process can remove those values
df_rainfall['data_in'] = df_rainfall['data_in'].replace(r'^\s*$', np.nan, regex=True)

In [35]:
df_rainfall = df_rainfall.query('year >= 1948').sort_values(by=['skn', 'year', 'month']).dropna()

In [37]:
df_rainfall_w_coord = (
    df_rainfall.merge(
        right=df_station_metadata,
        left_on='skn',
        right_on='SKN'
    ).drop(columns="SKN", axis=1)
    .rename(columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elevation", "Name": "name"})
)

In [38]:
df_rainfall_w_coord.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,NumMos,MinYear,MaxYear
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949


In [40]:
# load the cdf files
# load only relevant portion of netCDF files
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 100

datasets = []
for filename, label, attribute in zip(NC_FILE_NAMES, DF_LABELS, NC_ATTRIBUTES):
    filename_reanalysis = os.path.join(raw_data_dir, f'reanalysis_data/{filename}')
    # Linear Interpolation
    ds = xr.open_dataset(filename_reanalysis).loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
            lat=np.linspace(
                lat_hawaii[0],
                lat_hawaii[-1],
                dim_interpolation
            ),
            lon=np.linspace(
                lon_hawaii[0],
                lon_hawaii[-1],
                dim_interpolation
            ),
            method="linear"
        ).rename({f"{attribute}": f"{label}"})
    if "level" in ds.dims:
        ds = ds.squeeze("level").drop("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

In [41]:
ds_combined = xr.merge([datasets[i]["dataset"] for i in range(len(datasets))])

In [42]:
start = time.time()
df_rainfall_w_coord["values"] = df_rainfall_w_coord[:].apply(lambda x: ds_combined.sel(time=f"{x.year}-{x.month}-01", lat=x.lat, lon=x.lon, method="nearest").to_array().to_numpy(), axis=1)
end = time.time()

elapsed = end - start
print("{:.3f}".format(elapsed))

2359.027


In [43]:
df_columns = pd.DataFrame(np.stack(df_rainfall_w_coord["values"].to_numpy()), columns=[item["label"] for item in datasets])
df_complete = df_rainfall_w_coord.merge(right=df_columns, left_index=True, right_index=True)

In [44]:
df_complete_cleaned = df_complete.drop(["values"], axis=1)

In [46]:
df_complete_cleaned.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,...,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.976694,28.678739,-7.07147,-60.664267,-3.335292,-27.741849,2.318151,10.345333,24.909237,1014.198085
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.568755,24.988922,0.450107,-38.805761,-3.49695,-13.066533,2.172999,8.830637,23.457119,1015.299784
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.402203,31.862511,-4.668788,-71.785439,0.292838,-8.205906,3.34715,9.370667,23.775679,1015.848733
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-4.269346,33.526352,3.466445,-41.395023,-1.941327,-4.107908,3.605182,10.080273,24.751422,1014.841114
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,...,-3.757156,35.827885,-13.206782,-92.122989,0.893711,-9.247211,3.55003,11.478545,25.866196,1015.936653


In [48]:
filename = os.path.join(dest_dir, 'dataset_interp_100.csv')
df_complete_cleaned.to_csv(filename, index=False)

# repeat above for 50

In [49]:
df_rainfall = pd.read_excel(filename_stations, sheet_name="Data_in")
df_station_metadata = pd.read_excel(filename_stations, sheet_name="Header")

df_rainfall.columns = map(str.lower, df_rainfall.columns)
# melt cells into rows
df_rainfall = df_rainfall.melt(
    id_vars=['skn', 'year'],
    var_name='month',
    value_name='data_in'
)

# convert string month to digit
month_to_digit = dict(
    jan=1, feb=2, mar=3, apr=4, may=5, jun=6,
    jul=7, aug=8, sep=9, oct=10, nov=11, dec=12
)
df_rainfall.month = df_rainfall.month.map(month_to_digit)
# replace empty space to NaN so the downstream process can remove those values
df_rainfall['data_in'] = df_rainfall['data_in'].replace(r'^\s*$', np.nan, regex=True)

df_rainfall = df_rainfall.query('year >= 1948').sort_values(by=['skn', 'year', 'month']).dropna()

In [50]:
df_rainfall_w_coord = (
    df_rainfall.merge(
        right=df_station_metadata,
        left_on='skn',
        right_on='SKN'
    ).drop(columns="SKN", axis=1)
    .rename(columns={"Lat_DD": "lat", "Lon_DD": "lon", "ElevFT": "elevation", "Name": "name"})
)

In [51]:
df_rainfall_w_coord.head()

Unnamed: 0,skn,year,month,data_in,name,lat,lon,elevation,Observer,Status2010,NumMos,MinYear,MaxYear
0,1.0,1948,1,3.2,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
1,1.0,1948,2,1.33,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
2,1.0,1948,3,2.31,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
3,1.0,1948,4,3.23,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949
4,1.0,1948,5,1.14,KALAE,18.916176,-155.674994,35.0,USCG,Discontinued,277,1924,1949


In [52]:
# load the cdf files
# load only relevant portion of netCDF files
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360
dim_interpolation = 50

datasets = []
for filename, label, attribute in zip(NC_FILE_NAMES, DF_LABELS, NC_ATTRIBUTES):
    filename_reanalysis = os.path.join(raw_data_dir, f'reanalysis_data/{filename}')
    # Linear Interpolation
    ds = xr.open_dataset(filename_reanalysis).loc[dict(lat=lat_hawaii, lon=lon_hawaii)].interp(
            lat=np.linspace(
                lat_hawaii[0],
                lat_hawaii[-1],
                dim_interpolation
            ),
            lon=np.linspace(
                lon_hawaii[0],
                lon_hawaii[-1],
                dim_interpolation
            ),
            method="linear"
        ).rename({f"{attribute}": f"{label}"})
    if "level" in ds.dims:
        ds = ds.squeeze("level").drop("level")
    datasets.append({
        "file": file_name,
        "dataset": ds,
        "label": label,
        "attribute": attribute
    })

In [53]:
ds_combined = xr.merge([datasets[i]["dataset"] for i in range(len(datasets))])

In [None]:
start = time.time()
df_rainfall_w_coord["values"] = df_rainfall_w_coord[:].apply(lambda x: ds_combined.sel(time=f"{x.year}-{x.month}-01", lat=x.lat, lon=x.lon, method="nearest").to_array().to_numpy(), axis=1)
end = time.time()

elapsed = end - start
print("{:.3f}".format(elapsed))

In [None]:
df_columns = pd.DataFrame(np.stack(df_rainfall_w_coord["values"].to_numpy()), columns=[item["label"] for item in datasets])
df_complete = df_rainfall_w_coord.merge(right=df_columns, left_index=True, right_index=True)

df_complete_cleaned = df_complete.drop(["values"], axis=1)
df_complete_cleaned.head()

In [None]:
# filename = os.path.join(dest_dir, 'dataset_interp_50.csv')
# df_complete_cleaned.to_csv(filename, index=False)