In [1]:
import numpy as np
import netCDF4 as nc
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr
import sys
# from mpl_toolkits.basemap import Basemap

In [2]:
# Folder that holds the station tables and the netCDF files read further down.
base_dir = "../dataset"

# Station metadata table (SKN, name, latitude, longitude).
df_locations = pd.read_csv(f"{base_dir}/SKNlocations.csv")

# Station observations in wide form: one row per (SKN, Year), one column per month.
df_data = pd.read_excel(
    f"{base_dir}/FilledDataset2012.xlsx",
    sheet_name="Data_in",
)

### Convert the range of Lon_DD from (-180, 180) to (0, 360)

In [3]:
# Normalise longitudes to the 0-360 convention used by the lat/lon grid.
# A modulo maps negative (western) longitudes to lon + 360 and leaves values that
# are already in [0, 360) untouched; unconditionally adding 360 would push any
# non-negative longitude past 360 and off the grid.
df_locations["Lon_DD_updated"] = np.mod(df_locations["Lon_DD"], 360)

### Compute the closest grid point (lat, lon) for every SKN station

In [4]:
# Display the station table, now including the Lon_DD_updated column.
df_locations

Unnamed: 0,SKN,Name,Lat_DD,Lon_DD,Lon_DD_updated
0,1.0,KALAE,18.916176,-155.674994,204.325006
1,1.1,MORSE FIELD,18.913680,-155.680550,204.319450
2,1.2,KALAE S TRK STA,18.938669,-155.680549,204.319451
3,2.0,MANUKA,19.108660,-155.825545,204.174455
4,2.1,KAHUKU MAUKA 2.10,19.108890,-155.746670,204.253330
...,...,...,...,...,...
2219,1143.0,FIELD 30,22.202190,-159.347203,200.652797
2220,1144.0,FIELD 7-A,22.186077,-159.318870,200.681130
2221,1145.0,PUU AUAU,22.182760,-159.332203,200.667797
2222,1146.0,Moloaa Dairy,22.183333,-159.337500,200.662500


In [5]:
# Regular 2.5-degree grid: latitudes run 90 -> -90, longitudes run 0 -> 357.5.
lat = np.arange(90, -91, -2.5)
lon = np.arange(0, 360, 2.5)


def closest_grid(target):
    """Return the grid node nearest to ``target``.

    Parameters
    ----------
    target : array-like of two floats
        ``(latitude, longitude)`` in degrees. Longitude is reduced modulo 360
        before the search, so both -180..180 and 0..360 inputs are accepted.

    Returns
    -------
    numpy.ndarray
        Float array ``[grid_lat, grid_lon]`` taken from the module-level
        ``lat`` / ``lon`` axes.

    Notes
    -----
    On a rectangular grid the Euclidean distance is minimised independently
    along each axis, so two 1-D ``argmin`` searches replace the exhaustive
    double loop over every node. The longitude gap is measured around the
    circle, so a target just below 360 snaps to the 0 meridian rather than to
    357.5. ``argmin`` returns the first minimum, so ties resolve to the higher
    latitude and the lower longitude, which is the order the axes are stored in.
    """
    target_lat, target_lon = np.asarray(target, dtype=float)

    lat_gap = np.abs(lat - target_lat)

    lon_gap = np.abs(lon - np.mod(target_lon, 360.0))
    # Distance the short way around the 0/360 seam.
    lon_gap = np.minimum(lon_gap, 360.0 - lon_gap)

    return np.array([lat[np.argmin(lat_gap)], lon[np.argmin(lon_gap)]])

def _station_grid_point(station):
    """Look up the nearest grid node for one station row."""
    station_position = np.array([station["Lat_DD"], station["Lon_DD_updated"]])
    return closest_grid(station_position)


# One [lat, lon] array per station, stored in the closest_grid column.
df_locations["closest_grid"] = df_locations.apply(_station_grid_point, axis=1)

In [6]:
# Preview the first five stations together with their closest_grid value.
df_locations.head(5)

Unnamed: 0,SKN,Name,Lat_DD,Lon_DD,Lon_DD_updated,closest_grid
0,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
1,1.1,MORSE FIELD,18.91368,-155.68055,204.31945,"[20.0, 205.0]"
2,1.2,KALAE S TRK STA,18.938669,-155.680549,204.319451,"[20.0, 205.0]"
3,2.0,MANUKA,19.10866,-155.825545,204.174455,"[20.0, 205.0]"
4,2.1,KAHUKU MAUKA 2.10,19.10889,-155.74667,204.25333,"[20.0, 205.0]"


### Convert df_data into a cell-based table (one row per station, year and month)

In [7]:
# Preview the wide table: one row per (SKN, Year) with one column per month.
df_data.head(5)

Unnamed: 0,SKN,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1.0,1920,4.76,0.23,0.005188,0.43,0.3,,,0.0,0.0,0.87,,
1,1.0,1921,7.78,1.29,2.946572,0.58,0.27,,,0.58,0.0,0.44,,
2,1.0,1922,4.02,2.75,6.053961,1.63,0.03,,,0.04,0.74,0.95,,
3,1.0,1923,11.47,2.01,3.740279,4.18,0.27,,,1.01,4.94,2.21,,
4,1.0,1924,0.0,0.28,2.168428,3.73,0.62,,,0.32,0.0,2.23,,2.48


In [8]:
# Flatten the wide table into [skn, year, month, value] records, month numbered 1-12.
# Rows from before 1948 are skipped because the netCDF files hold no data for them.
X = [
    [row.SKN, row.Year, month, cell]
    for index, row in df_data.iterrows()
    if not (row.Year < 1948)
    for month, cell in enumerate(row[2:], start=1)
]

In [9]:
# One row per (station, year, month); rows containing a missing value are discarded.
df_data_by_cell = pd.DataFrame(
    data=X,
    columns=["skn", "year", "month", "data_in"],
).dropna(axis=0, how="any")

In [10]:
# Treat empty / whitespace-only strings as missing, then drop the affected rows.
df_data_by_cell = (
    df_data_by_cell
    .replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    .dropna()
)

In [11]:
# Sanity check: after the blank-string cleanup no text values should remain in
# data_in, so this loop is expected to print nothing.
# isinstance also catches str subclasses such as numpy.str_, which an exact
# type comparison would let through.
for value in df_data_by_cell["data_in"]:
    if isinstance(value, str):
        print(value)

## Merge the observations with the station locations

In [12]:
# Attach the station metadata columns to every observation row (join on the SKN id).
df_data_w_coord = pd.merge(
    df_data_by_cell,
    df_locations,
    left_on="skn",
    right_on="SKN",
)

In [13]:
# Display the observations joined with the station metadata.
df_data_w_coord

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD,Lon_DD_updated,closest_grid
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
1,1.0,1948,2,1.33,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
2,1.0,1948,3,2.31,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
3,1.0,1948,4,3.23,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
4,1.0,1948,5,1.14,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]"
...,...,...,...,...,...,...,...,...,...,...
865556,1147.0,1973,11,13.58,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]"
865557,1147.0,1973,12,8.23,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]"
865558,1147.0,1974,1,7.51,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]"
865559,1147.0,1974,2,2.31,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]"


In [14]:
def _station_month_key(record):
    """Build the (grid lat, grid lon, first day of month) key for one observation row."""
    grid_point = record["closest_grid"]
    month_start = datetime(year=record["year"], month=record["month"], day=1)
    return (grid_point[0], grid_point[1], month_start)


df_data_w_coord["tuple"] = df_data_w_coord.apply(_station_month_key, axis=1)

In [15]:
# Display the frame again, now with the (lat, lon, month start) key in the tuple column.
df_data_w_coord

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD,Lon_DD_updated,closest_grid,tuple
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]","(20.0, 205.0, 1948-01-01 00:00:00)"
1,1.0,1948,2,1.33,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]","(20.0, 205.0, 1948-02-01 00:00:00)"
2,1.0,1948,3,2.31,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]","(20.0, 205.0, 1948-03-01 00:00:00)"
3,1.0,1948,4,3.23,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]","(20.0, 205.0, 1948-04-01 00:00:00)"
4,1.0,1948,5,1.14,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]","(20.0, 205.0, 1948-05-01 00:00:00)"
...,...,...,...,...,...,...,...,...,...,...,...
865556,1147.0,1973,11,13.58,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]","(22.5, 200.0, 1973-11-01 00:00:00)"
865557,1147.0,1973,12,8.23,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]","(22.5, 200.0, 1973-12-01 00:00:00)"
865558,1147.0,1974,1,7.51,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]","(22.5, 200.0, 1974-01-01 00:00:00)"
865559,1147.0,1974,2,2.31,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,200.663908,"[22.5, 200.0]","(22.5, 200.0, 1974-02-01 00:00:00)"


### Concatenate with netCDF data

In [16]:
# Distinct (lat, lon) grid nodes that at least one station was mapped to.
unique_rows = np.unique(np.vstack(df_data_w_coord["closest_grid"].tolist()), axis=0)
unique_lat = np.unique(unique_rows[:, 0])
unique_lon = np.unique(unique_rows[:, 1])


def _open_grid_subset(file_name):
    """Open ``file_name`` from ``base_dir`` and keep only the station grid nodes.

    The selection is label-based on the ``lat`` and ``lon`` coordinates, using
    the module-level ``unique_lat`` and ``unique_lon`` arrays.
    """
    return xr.open_dataset(f"{base_dir}/{file_name}").sel(lat=unique_lat, lon=unique_lon)


ds_air2m = _open_grid_subset("air.2m.mon.mean.regridded.nc")
ds_air1000_500 = _open_grid_subset("air.1000-500.mon.mean.nc")
ds_hgt500 = _open_grid_subset("hgt500.mon.mean.nc")
ds_hgt1000 = _open_grid_subset("hgt1000.mon.mean.nc")
ds_omega500 = _open_grid_subset("omega500.mon.mean.nc")
ds_pottemp_1000_500 = _open_grid_subset("pottmp.1000-500.mon.mean.nc")
ds_pottemp_1000_850 = _open_grid_subset("pottmp.1000-850.mon.mean.nc")
ds_pwtr = _open_grid_subset("pwtr.mon.mean.nc")
ds_u700 = _open_grid_subset("shum_x_uwnd.700.mon.mean.nc")
ds_u925 = _open_grid_subset("shum_x_uwnd.925.mon.mean.nc")
ds_v700 = _open_grid_subset("shum_x_vwnd.700.mon.mean.nc")
# NOTE: the variable is called ds_v950 but it loads the ".925" file; the name is
# kept unchanged so existing references keep working.
ds_v950 = _open_grid_subset("shum_x_vwnd.925.mon.mean.nc")
ds_shum700 = _open_grid_subset("shum700.mon.mean.nc")
ds_shum925 = _open_grid_subset("shum925.mon.mean.nc")
ds_skt = _open_grid_subset("skt.mon.mean.regridded.nc")
ds_slp = _open_grid_subset("slp.mon.mean.nc")

# List of tuples: (dataset object, attribute string in ds).
# Several entries share the same attribute string (air, hgt, pottmp, shum).
datasets = [
    (ds_air2m, "air"),
    (ds_air1000_500, "air"),
    (ds_hgt500, "hgt"),
    (ds_hgt1000, "hgt"),
    (ds_omega500, "omega"),
    (ds_pottemp_1000_500, "pottmp"),
    (ds_pottemp_1000_850, "pottmp"),
    (ds_pwtr, "pr_wtr"),
    (ds_u700, "shum"),
    (ds_u925, "shum"),
    (ds_v700, "shum"),
    (ds_v950, "shum"),
    (ds_shum700, "shum"),
    (ds_shum925, "shum"),
    (ds_skt, "skt"),
    (ds_slp, "slp"),
]

In [17]:
# Turn every dataset into a DataFrame indexed by (lat, lon, time).
list_of_df = []
for ds, variable_name in datasets:
    df = ds.to_dataframe()
    has_level = "level" in df.index.names
    # Drop the "level" index level when present so all frames share one index layout.
    df = df.droplevel(level="level") if has_level else df
    list_of_df.append(df.reorder_levels(["lat", "lon", "time"]))

In [18]:
# Place all variables side by side on the shared index, then expose
# lat / lon / time as ordinary columns.
df_all_cdf_data = pd.concat(list_of_df, axis="columns").reset_index()

In [19]:
def _netcdf_month_key(record):
    """Build the (lat, lon, first day of month) key for one netCDF-derived row."""
    timestamp = record["time"]
    month_start = datetime(year=timestamp.year, month=timestamp.month, day=1)
    return (record["lat"], record["lon"], month_start)


df_all_cdf_data["tuple"] = df_all_cdf_data.apply(_netcdf_month_key, axis=1)

In [20]:
# Preview the combined netCDF table together with its tuple key.
df_all_cdf_data.head(5)

Unnamed: 0,lat,lon,time,air,air.1,hgt,hgt.1,omega,pottmp,pottmp.1,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp,tuple
0,20.0,200.0,1948-01-01,295.726959,31.169991,5800.35498,122.967743,0.034998,-26.584778,-4.002899,28.530315,-4.261974,-43.965725,-3.64204,-22.181768,2.583,9.947001,24.336798,1014.175476,"(20.0, 200.0, 1948-01-01 00:00:00)"
1,20.0,200.0,1948-02-01,295.328339,31.490005,5794.379395,132.275864,0.047998,-26.071747,-4.346283,24.881031,1.629605,-26.913866,-1.099997,-9.842341,2.036999,8.709999,23.221535,1015.343079,"(20.0, 200.0, 1948-02-01 00:00:00)"
2,20.0,200.0,1948-03-01,295.800781,32.830002,5799.322754,139.967743,-0.006002,-24.555817,-4.327728,31.481606,-4.645377,-68.806618,0.16711,-9.625248,3.341999,9.530001,23.523169,1016.237976,"(20.0, 200.0, 1948-03-01 00:00:00)"
3,20.0,200.0,1948-04-01,296.61438,30.32,5830.533203,131.899994,0.017998,-27.778687,-4.285706,32.632004,2.259589,-38.778152,-1.355754,-2.5445,3.227999,10.177999,24.36927,1015.145142,"(20.0, 200.0, 1948-04-01 00:00:00)"
4,20.0,200.0,1948-05-01,297.482941,29.990005,5859.580566,142.7742,0.008999,-28.302948,-4.197418,33.806774,-13.79039,-85.865623,0.218375,-14.028182,3.119999,11.313,25.491318,1016.440674,"(20.0, 200.0, 1948-05-01 00:00:00)"


In [21]:
# Join every station observation with the netCDF values of its grid node and
# month, using the shared "tuple" key column.
df_complete = pd.merge(
    df_data_w_coord,
    df_all_cdf_data,
    how="inner",
    on="tuple",
)

In [22]:
# Preview the merged station + netCDF table.
df_complete.head()

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD,Lon_DD_updated,closest_grid,...,pottmp,pr_wtr,shum,shum.1,shum.2,shum.3,shum.4,shum.5,skt,slp
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994,204.325006,"[20.0, 205.0]",...,-3.926514,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849
1,2.0,1948,1,5.95,2.0,MANUKA,19.10866,-155.825545,204.174455,"[20.0, 205.0]",...,-3.926514,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849
2,2.2,1948,1,11.5,2.2,KAHUKU SHED 3,19.16474,-155.68228,204.31772,"[20.0, 205.0]",...,-3.926514,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849
3,2.25,1948,1,5.515941,2.25,RESERVOIR (2940),19.160603,-155.822488,204.177512,"[20.0, 205.0]",...,-3.926514,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849
4,2.26,1948,1,4.310617,2.26,CASTLE,19.225323,-155.778876,204.221124,"[20.0, 205.0]",...,-3.926514,29.034512,2.592494,-25.859348,0.589191,7.106411,2.945999,9.869999,23.385218,1014.0849


## Clean up a little

In [23]:
# Helper and duplicate-information columns that are no longer needed after the merge.
columns_to_discard = ["tuple", "time", "SKN", "closest_grid", "Name"]
df_complete_cleaned = df_complete.drop(columns=columns_to_discard)

In [24]:
# List the remaining columns. Note that several names repeat (air, hgt, pottmp, shum)
# because different netCDF files expose a variable with the same name.
df_complete_cleaned.columns

Index(['skn', 'year', 'month', 'data_in', 'Lat_DD', 'Lon_DD', 'Lon_DD_updated',
       'lat', 'lon', 'air', 'air', 'hgt', 'hgt', 'omega', 'pottmp', 'pottmp',
       'pr_wtr', 'shum', 'shum', 'shum', 'shum', 'shum', 'shum', 'skt', 'slp'],
      dtype='object')

In [25]:
# df_complete_cleaned = df_complete_cleaned.replace(r'^\s*$', np.nan, regex=True)

In [26]:
# Persist the final table next to the input files, without the row index.
df_complete_cleaned.to_csv(
    path_or_buf=f"{base_dir}/dataset.csv",
    index=False,
)