In [85]:
import numpy as np
import netCDF4 as nc
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr
import sys
# from mpl_toolkits.basemap import Basemap

In [86]:
base_dir = "../dataset"
# nc.Dataset(f"{base_dir}/")
df_locations = pd.read_csv(f"{base_dir}/SKNlocations.csv")
df_data = pd.read_excel(f"{base_dir}/FilledDataset2012.xlsx", sheet_name="Data_in")

In [87]:
df_data

Unnamed: 0,SKN,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1.0,1920,4.76,0.23,0.005188,0.43,0.30,,,0,0,0.87,,
1,1.0,1921,7.78,1.29,2.946572,0.58,0.27,,,0.58,0,0.44,,
2,1.0,1922,4.02,2.75,6.053961,1.63,0.03,,,0.04,0.74,0.95,,
3,1.0,1923,11.47,2.01,3.740279,4.18,0.27,,,1.01,4.94,2.21,,
4,1.0,1924,0,0.28,2.168428,3.73,0.62,,,0.32,0,2.23,,2.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197062,1147.0,2008,,,,,,,,,,,,
197063,1147.0,2009,,,,,,,,,,,,
197064,1147.0,2010,,,,,,,,,,,,
197065,1147.0,2011,,,,,,,,,,,,


In [88]:
df_locations.head()

Unnamed: 0,SKN,Name,Lat_DD,Lon_DD
0,1.0,KALAE,18.916176,-155.674994
1,1.1,MORSE FIELD,18.91368,-155.68055
2,1.2,KALAE S TRK STA,18.938669,-155.680549
3,2.0,MANUKA,19.10866,-155.825545
4,2.1,KAHUKU MAUKA 2.10,19.10889,-155.74667


In [89]:
df_data.head()

Unnamed: 0,SKN,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1.0,1920,4.76,0.23,0.005188,0.43,0.3,,,0.0,0.0,0.87,,
1,1.0,1921,7.78,1.29,2.946572,0.58,0.27,,,0.58,0.0,0.44,,
2,1.0,1922,4.02,2.75,6.053961,1.63,0.03,,,0.04,0.74,0.95,,
3,1.0,1923,11.47,2.01,3.740279,4.18,0.27,,,1.01,4.94,2.21,,
4,1.0,1924,0.0,0.28,2.168428,3.73,0.62,,,0.32,0.0,2.23,,2.48


## Conevrt df_data into cell-based matrix

In [90]:
X = []
for index, row in df_data.iterrows():
    if row.Year < 1948:
        # No need to keep data older than 1948 becase no data exists in netCDF files
        continue
    for i, cell in enumerate(row[2:]):
        X.append([row.SKN, row.Year, i + 1, cell])

In [91]:
df_data_by_cell = pd.DataFrame(X, columns = ["skn", "year", "month", "data_in"]).dropna()
df_data_by_cell = df_data_by_cell.replace(r'^\s*$', np.nan, regex=True).dropna()



In [92]:
df_data_by_cell.head()

Unnamed: 0,skn,year,month,data_in
0,1.0,1948,1,3.2
1,1.0,1948,2,1.33
2,1.0,1948,3,2.31
3,1.0,1948,4,3.23
4,1.0,1948,5,1.14


In [93]:
# Merge
df_data_w_coord = df_data_by_cell.merge(right=df_locations, left_on="skn", right_on="SKN")
df_data_w_coord.head()

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994
1,1.0,1948,2,1.33,1.0,KALAE,18.916176,-155.674994
2,1.0,1948,3,2.31,1.0,KALAE,18.916176,-155.674994
3,1.0,1948,4,3.23,1.0,KALAE,18.916176,-155.674994
4,1.0,1948,5,1.14,1.0,KALAE,18.916176,-155.674994


## Load the cdf files

In [94]:
# ds = xr.open_dataset(f"{base_dir}/air.2m.mon.mean.regridded.nc")
lat_hawaii = [15, 17.5, 20, 22.5, 25]
lon_hawaii = np.array([-162.5, -160, -157.5, -155, -152.5]) + 360

ds_air2m = xr.open_dataset(f"{base_dir}/air.2m.mon.mean.regridded.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_air1000_500 = xr.open_dataset(f"{base_dir}/air.1000-500.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_hgt500 = xr.open_dataset(f"{base_dir}/hgt500.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_hgt1000 = xr.open_dataset(f"{base_dir}/hgt1000.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_omega500 = xr.open_dataset(f"{base_dir}/omega500.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_pottemp_1000_500 = xr.open_dataset(f"{base_dir}/pottmp.1000-500.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_pottemp_1000_850 = xr.open_dataset(f"{base_dir}/pottmp.1000-850.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_pwtr = xr.open_dataset(f"{base_dir}/pwtr.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_u700 = xr.open_dataset(f"{base_dir}/shum_x_uwnd.700.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_u925 = xr.open_dataset(f"{base_dir}/shum_x_uwnd.925.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_v700 = xr.open_dataset(f"{base_dir}/shum_x_vwnd.700.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_v950 = xr.open_dataset(f"{base_dir}/shum_x_vwnd.925.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_shum700 = xr.open_dataset(f"{base_dir}/shum700.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_shum925 = xr.open_dataset(f"{base_dir}/shum925.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_skt = xr.open_dataset(f"{base_dir}/skt.mon.mean.regridded.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]
ds_slp = xr.open_dataset(f"{base_dir}/slp.mon.mean.nc").loc[dict(lat=lat_hawaii, lon=lon_hawaii)]

datasets = [ # list of tuples. (dataset object, label, attribute string in ds)
    (ds_air2m, "air2m", "air"),
    (ds_air1000_500, "air1000_500", "air"),
    (ds_hgt500, "hgt500", "hgt"),
    (ds_hgt1000, "hgt1000", "hgt"),
    (ds_omega500, "omega500", "omega"),
    (ds_pottemp_1000_500, "pottemp1000-500", "pottmp"),
    (ds_pottemp_1000_850, "pottemp1000-850", "pottmp"),
    (ds_pwtr, "pr_wtr", "pr_wtr"),
    (ds_u700, "shum-uwnd-700", "shum"),
    (ds_u925, "shum-uwnd-925", "shum"),
    (ds_v700, "shum-vwnd-700", "shum"),
    (ds_v950, "shum-vwnd-950", "shum"),
    (ds_shum700, "shum700", "shum"),
    (ds_shum925, "shum925", "shum"),
    (ds_skt, "skt", "skt"),
    (ds_slp, "slp", "slp")
]
# combine all the cdf data

In [95]:
# df_temp = pd.DataFrame()
# df_temp["air2m"] = df_data_w_coord.iloc[:10].apply(lambda x: datasets[0][0].loc[dict(time=f"{x['year']}-{x['month']}-01")].to_array(), axis=1)

In [None]:
df_data_xarray = pd.DataFrame()
for dataset in datasets:
    array = dataset[0]
    label = dataset[1]
    df_data_xarray[label] = df_data_w_coord.apply(lambda x: array.loc[dict(time=f"{x['year']}-{x['month']}-01")].to_array(), axis=1)

In [84]:
df_temp

Unnamed: 0,air2m,air1000_500,hgt500,hgt1000,omega500,pottemp1000-500,pottemp1000-850,pr_wtr,shum-uwnd-700,shum-uwnd-925,shum-vwnd-700,shum-vwnd-950,shum700,shum925,skt,slp
0,"[[[<xarray.DataArray ()>\narray(297.59113, dty...",[[[<xarray.DataArray (lon: 5)>\narray([30.5600...,[[[<xarray.DataArray (lon: 5)>\narray([5817.83...,[[[<xarray.DataArray (lon: 5)>\narray([107.580...,[[[<xarray.DataArray (lon: 5)>\narray([0.02499...,[[[<xarray.DataArray (lon: 5)>\narray([-27.681...,[[[<xarray.DataArray (lon: 5)>\narray([-3.4075...,"[[[<xarray.DataArray ()>\narray(33.23709, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([-3.2722...,[[[<xarray.DataArray (lon: 5)>\narray([-81.673...,[[[<xarray.DataArray (lon: 5)>\narray([ 0.5193...,[[[<xarray.DataArray (lon: 5)>\narray([-14.224...,[[[<xarray.DataArray (lon: 5)>\narray([2.59700...,[[[<xarray.DataArray (lon: 5)>\narray([11.4710...,"[[[<xarray.DataArray ()>\narray(25.64039, dtyp...","[[[<xarray.DataArray ()>\narray(1012.4068, dty..."
1,"[[[<xarray.DataArray ()>\narray(296.67743, dty...",[[[<xarray.DataArray (lon: 5)>\narray([30.1999...,[[[<xarray.DataArray (lon: 5)>\narray([5823.44...,[[[<xarray.DataArray (lon: 5)>\narray([119.241...,[[[<xarray.DataArray (lon: 5)>\narray([ 0.0489...,[[[<xarray.DataArray (lon: 5)>\narray([-28.010...,[[[<xarray.DataArray (lon: 5)>\narray([-4.2215...,"[[[<xarray.DataArray ()>\narray(27.41724, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([3.02897...,[[[<xarray.DataArray (lon: 5)>\narray([-55.066...,[[[<xarray.DataArray (lon: 5)>\narray([ 0.8236...,[[[<xarray.DataArray (lon: 5)>\narray([-7.3544...,[[[<xarray.DataArray (lon: 5)>\narray([2.65699...,[[[<xarray.DataArray (lon: 5)>\narray([ 9.1929...,"[[[<xarray.DataArray ()>\narray(23.903805, dty...","[[[<xarray.DataArray ()>\narray(1013.7411, dty..."
2,"[[[<xarray.DataArray ()>\narray(297.10855, dty...",[[[<xarray.DataArray (lon: 5)>\narray([31.64 ...,[[[<xarray.DataArray (lon: 5)>\narray([5822.09...,[[[<xarray.DataArray (lon: 5)>\narray([119.322...,[[[<xarray.DataArray (lon: 5)>\narray([-0.0330...,[[[<xarray.DataArray (lon: 5)>\narray([-26.317...,[[[<xarray.DataArray (lon: 5)>\narray([-4.3960...,"[[[<xarray.DataArray ()>\narray(35.41548, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([2.51263...,[[[<xarray.DataArray (lon: 5)>\narray([-80.430...,[[[<xarray.DataArray (lon: 5)>\narray([ 8.5979...,[[[<xarray.DataArray (lon: 5)>\narray([ 4.6943...,[[[<xarray.DataArray (lon: 5)>\narray([3.92599...,[[[<xarray.DataArray (lon: 5)>\narray([10.4319...,"[[[<xarray.DataArray ()>\narray(24.320047, dty...","[[[<xarray.DataArray ()>\narray(1013.81305, dt..."
3,"[[[<xarray.DataArray ()>\narray(297.90656, dty...",[[[<xarray.DataArray (lon: 5)>\narray([30.07 ...,[[[<xarray.DataArray (lon: 5)>\narray([5845.73...,[[[<xarray.DataArray (lon: 5)>\narray([116.466...,[[[<xarray.DataArray (lon: 5)>\narray([0.03599...,[[[<xarray.DataArray (lon: 5)>\narray([-28.403...,[[[<xarray.DataArray (lon: 5)>\narray([-4.0877...,"[[[<xarray.DataArray ()>\narray(35.977665, dty...",[[[<xarray.DataArray (lon: 5)>\narray([-5.4026...,[[[<xarray.DataArray (lon: 5)>\narray([-63.842...,[[[<xarray.DataArray (lon: 5)>\narray([-6.2063...,[[[<xarray.DataArray (lon: 5)>\narray([ -6.674...,[[[<xarray.DataArray (lon: 5)>\narray([4.465 ...,[[[<xarray.DataArray (lon: 5)>\narray([10.7660...,"[[[<xarray.DataArray ()>\narray(25.467556, dty...","[[[<xarray.DataArray ()>\narray(1013.4535, dty..."
4,"[[[<xarray.DataArray ()>\narray(298.55862, dty...",[[[<xarray.DataArray (lon: 5)>\narray([29.9199...,[[[<xarray.DataArray (lon: 5)>\narray([5860.93...,[[[<xarray.DataArray (lon: 5)>\narray([119.967...,[[[<xarray.DataArray (lon: 5)>\narray([0.02499...,[[[<xarray.DataArray (lon: 5)>\narray([-28.676...,[[[<xarray.DataArray (lon: 5)>\narray([-3.7547...,"[[[<xarray.DataArray ()>\narray(38.174515, dty...",[[[<xarray.DataArray (lon: 5)>\narray([-16.938...,[[[<xarray.DataArray (lon: 5)>\narray([-130.43...,[[[<xarray.DataArray (lon: 5)>\narray([-2.5948...,[[[<xarray.DataArray (lon: 5)>\narray([-16.271...,[[[<xarray.DataArray (lon: 5)>\narray([3.604 ...,[[[<xarray.DataArray (lon: 5)>\narray([12.914 ...,"[[[<xarray.DataArray ()>\narray(26.362017, dty...","[[[<xarray.DataArray ()>\narray(1013.726, dtyp..."
5,"[[[<xarray.DataArray ()>\narray(298.81512, dty...",[[[<xarray.DataArray (lon: 5)>\narray([29.54 ...,[[[<xarray.DataArray (lon: 5)>\narray([5862.9 ...,[[[<xarray.DataArray (lon: 5)>\narray([115.7 ...,[[[<xarray.DataArray (lon: 5)>\narray([0.02099...,[[[<xarray.DataArray (lon: 5)>\narray([-29.214...,[[[<xarray.DataArray (lon: 5)>\narray([-3.3201...,"[[[<xarray.DataArray ()>\narray(38.911, dtype=...",[[[<xarray.DataArray (lon: 5)>\narray([-16.664...,[[[<xarray.DataArray (lon: 5)>\narray([-89.064...,[[[<xarray.DataArray (lon: 5)>\narray([ 0.9872...,[[[<xarray.DataArray (lon: 5)>\narray([-13.142...,[[[<xarray.DataArray (lon: 5)>\narray([3.94899...,[[[<xarray.DataArray (lon: 5)>\narray([12.76 ...,"[[[<xarray.DataArray ()>\narray(26.557959, dty...","[[[<xarray.DataArray ()>\narray(1013.256, dtyp..."
6,"[[[<xarray.DataArray ()>\narray(299.46228, dty...",[[[<xarray.DataArray (lon: 5)>\narray([29.6600...,[[[<xarray.DataArray (lon: 5)>\narray([5867.38...,[[[<xarray.DataArray (lon: 5)>\narray([111.903...,[[[<xarray.DataArray (lon: 5)>\narray([-0.0010...,[[[<xarray.DataArray (lon: 5)>\narray([-29.215...,[[[<xarray.DataArray (lon: 5)>\narray([-3.1709...,"[[[<xarray.DataArray ()>\narray(40.62645, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([-23.352...,[[[<xarray.DataArray (lon: 5)>\narray([-126.56...,[[[<xarray.DataArray (lon: 5)>\narray([-1.5670...,[[[<xarray.DataArray (lon: 5)>\narray([-29.772...,[[[<xarray.DataArray (lon: 5)>\narray([3.82199...,[[[<xarray.DataArray (lon: 5)>\narray([14.1100...,"[[[<xarray.DataArray ()>\narray(27.35884, dtyp...","[[[<xarray.DataArray ()>\narray(1012.8287, dty..."
7,"[[[<xarray.DataArray ()>\narray(299.96225, dty...",[[[<xarray.DataArray (lon: 5)>\narray([30.07 ...,[[[<xarray.DataArray (lon: 5)>\narray([5870.16...,[[[<xarray.DataArray (lon: 5)>\narray([106.935...,[[[<xarray.DataArray (lon: 5)>\narray([-0.0080...,[[[<xarray.DataArray (lon: 5)>\narray([-28.825...,[[[<xarray.DataArray (lon: 5)>\narray([-3.6665...,"[[[<xarray.DataArray ()>\narray(43.137096, dty...",[[[<xarray.DataArray (lon: 5)>\narray([-23.541...,[[[<xarray.DataArray (lon: 5)>\narray([-103.05...,[[[<xarray.DataArray (lon: 5)>\narray([-4.7459...,[[[<xarray.DataArray (lon: 5)>\narray([-11.260...,[[[<xarray.DataArray (lon: 5)>\narray([4.69899...,[[[<xarray.DataArray (lon: 5)>\narray([14.254 ...,"[[[<xarray.DataArray ()>\narray(27.935108, dty...","[[[<xarray.DataArray ()>\narray(1012.21027, dt..."
8,"[[[<xarray.DataArray ()>\narray(299.4434, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([30.39 ...,[[[<xarray.DataArray (lon: 5)>\narray([5858.46...,[[[<xarray.DataArray (lon: 5)>\narray([105.666...,[[[<xarray.DataArray (lon: 5)>\narray([-0.0100...,[[[<xarray.DataArray (lon: 5)>\narray([-28.335...,[[[<xarray.DataArray (lon: 5)>\narray([-3.0480...,"[[[<xarray.DataArray ()>\narray(41.98933, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([-17.972...,[[[<xarray.DataArray (lon: 5)>\narray([-92.627...,[[[<xarray.DataArray (lon: 5)>\narray([-5.0457...,[[[<xarray.DataArray (lon: 5)>\narray([ -6.675...,[[[<xarray.DataArray (lon: 5)>\narray([4.17 ...,[[[<xarray.DataArray (lon: 5)>\narray([13.9080...,"[[[<xarray.DataArray ()>\narray(27.462713, dty...","[[[<xarray.DataArray ()>\narray(1012.0863, dty..."
9,"[[[<xarray.DataArray ()>\narray(298.8393, dtyp...",[[[<xarray.DataArray (lon: 5)>\narray([30.7999...,[[[<xarray.DataArray (lon: 5)>\narray([5853.48...,[[[<xarray.DataArray (lon: 5)>\narray([113.806...,[[[<xarray.DataArray (lon: 5)>\narray([-0.0020...,[[[<xarray.DataArray (lon: 5)>\narray([-27.684...,[[[<xarray.DataArray (lon: 5)>\narray([-3.5969...,"[[[<xarray.DataArray ()>\narray(38.536125, dty...",[[[<xarray.DataArray (lon: 5)>\narray([-12.085...,[[[<xarray.DataArray (lon: 5)>\narray([-102.92...,[[[<xarray.DataArray (lon: 5)>\narray([-0.1313...,[[[<xarray.DataArray (lon: 5)>\narray([-3.5447...,[[[<xarray.DataArray (lon: 5)>\narray([3.28400...,[[[<xarray.DataArray (lon: 5)>\narray([12.66 ...,"[[[<xarray.DataArray ()>\narray(26.630869, dty...","[[[<xarray.DataArray ()>\narray(1013.0435, dty..."


In [58]:
datasets[0][0].loc[dict(time="1948-1-1")].to_array().sel(lon=22.5, method="nearest")

In [None]:
df_temp.head()

In [18]:
list_of_df = []
for data in datasets:
    ds = data[0]
    df =  ds.to_dataframe()
    if "level" in df.index.names:
        df = df.droplevel(level="level")
    df = df.reorder_levels(["lat", "lon", "time"])
    list_of_df.append(df)

In [30]:
list_of_df[0].iloc[0]

air    297.591125
Name: (15.0, 197.5, 1948-01-01 00:00:00), dtype: float32

In [22]:
df_all_cdf_data = pd.concat(list_of_df, axis=1).reset_index()
df_all_cdf_data.head()

Unnamed: 0,lat,lon,time,air,air.1,hgt,hgt.1,omega,pottmp,pottmp.1,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp
0,15.0,197.5,1948-01-01,297.591125,30.560005,5817.838867,107.580643,0.024998,-27.681976,-3.407562,33.237091,-3.272206,-81.67347,0.519392,-14.224104,2.597,11.471001,25.64039,1012.406799
1,15.0,197.5,1948-02-01,296.677429,30.199997,5823.448242,119.241379,0.048998,-28.010559,-4.221558,27.41724,3.028978,-55.066113,0.823663,-7.354427,2.657,9.192999,23.903805,1013.741089
2,15.0,197.5,1948-03-01,297.108551,31.639999,5822.09668,119.322578,-0.033003,-26.31778,-4.396027,35.415482,2.512637,-80.430786,8.597947,4.694368,3.925999,10.431999,24.320047,1013.813049
3,15.0,197.5,1948-04-01,297.906555,30.07,5845.733398,116.466667,0.035997,-28.4039,-4.087799,35.977665,-5.40268,-63.842468,-6.206347,-6.674868,4.465,10.766001,25.467556,1013.453491
4,15.0,197.5,1948-05-01,298.558624,29.919991,5860.935547,119.967743,0.024998,-28.676941,-3.75473,38.174515,-16.938789,-130.431473,-2.594884,-16.271568,3.604,12.914,26.362017,1013.726013


In [23]:
df_all_cdf_data.shape

(21500, 19)