In [1]:
import numpy as np
import netCDF4 as nc
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from datetime import datetime
import xarray as xr
import sys
# from mpl_toolkits.basemap import Basemap

In [139]:
# Directory that holds the netCDF files opened further down in the notebook.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
base_dir = "/Users/yusukehatanaka/Desktop/StatisticalDownscaling"
# nc.Dataset(f"{base_dir}/")
# Station metadata; later cells read its SKN, Lat_DD and Lon_DD columns.
# NOTE(review): both tables are read relative to the working directory, not base_dir.
df_locations = pd.read_csv("SKNlocations.csv")
# Station observations from the "Data_in" sheet: one row per (SKN, Year) with a column per month.
df_data = pd.read_excel("FilledDataset2012.xlsx", sheet_name="Data_in")

### Compute the closest grid point (lat, lon) for every SKN station

In [140]:
# Grid axes in 2.5-degree steps: latitude runs from 90 down to -90,
# longitude from 0 up to 357.5 (0-360 convention).
lat = np.arange(90, -91, -2.5)
lon = np.arange(0, 360, 2.5)
def closest_grid(target, lat_grid=None, lon_grid=None):
    """Return the grid point nearest to ``target``.

    The grid is the Cartesian product of ``lat_grid`` and ``lon_grid``, so the
    nearest point is found by picking the nearest latitude and the nearest
    longitude independently instead of scanning every (lat, lon) pair.

    Parameters
    ----------
    target : sequence of two floats
        ``(latitude, longitude)`` in degrees, longitude in the -180..180
        convention.
    lat_grid : array-like, optional
        Grid latitudes. Defaults to the module-level ``lat`` array.
    lon_grid : array-like, optional
        Grid longitudes in the 0..360 convention. Defaults to the module-level
        ``lon`` array.

    Returns
    -------
    numpy.ndarray
        ``[latitude, longitude]`` of the nearest grid point. Grid longitudes
        above 180 are reported as negative values (e.g. 205 -> -155).

    Notes
    -----
    Longitude distance is measured around the circle, so a target just east
    of -180 is matched to the 180 meridian rather than to a column several
    degrees away. On an exact tie the first grid value in array order wins.
    """
    if lat_grid is None:
        lat_grid = lat
    if lon_grid is None:
        lon_grid = lon
    lat_axis = np.asarray(lat_grid, dtype=float)
    lon_axis = np.asarray(lon_grid, dtype=float)

    # Express grid longitudes in the same -180..180 convention as the target.
    lon_signed = np.where(lon_axis > 180, lon_axis - 360, lon_axis)

    target_lat = float(target[0])
    target_lon = float(target[1])

    lat_gap = np.abs(lat_axis - target_lat)
    # Shortest angular separation, accounting for the wrap at +/-180 degrees.
    lon_gap = np.abs((lon_signed - target_lon + 180.0) % 360.0 - 180.0)

    return np.array([lat_axis[np.argmin(lat_gap)], lon_signed[np.argmin(lon_gap)]])

# Attach to every station the grid point returned by closest_grid for its (Lat_DD, Lon_DD).
df_locations["closest_grid"] = df_locations.apply(lambda x: closest_grid(np.array([x["Lat_DD"], x["Lon_DD"]])), axis=1)

In [141]:
# Preview the first five stations together with their assigned grid point.
df_locations.head(5)

Unnamed: 0,SKN,Name,Lat_DD,Lon_DD,closest_grid
0,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
1,1.1,MORSE FIELD,18.91368,-155.68055,"[20.0, -155.0]"
2,1.2,KALAE S TRK STA,18.938669,-155.680549,"[20.0, -155.0]"
3,2.0,MANUKA,19.10866,-155.825545,"[20.0, -155.0]"
4,2.1,KAHUKU MAUKA 2.10,19.10889,-155.74667,"[20.0, -155.0]"


### Convert df_data into cell-based matrix

In [142]:
# Preview the wide layout: one row per (SKN, Year), one column per month.
df_data.head(5)

Unnamed: 0,SKN,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,1.0,1920,4.76,0.23,0.005188,0.43,0.3,,,0.0,0.0,0.87,,
1,1.0,1921,7.78,1.29,2.946572,0.58,0.27,,,0.58,0.0,0.44,,
2,1.0,1922,4.02,2.75,6.053961,1.63,0.03,,,0.04,0.74,0.95,,
3,1.0,1923,11.47,2.01,3.740279,4.18,0.27,,,1.01,4.94,2.21,,
4,1.0,1924,0.0,0.28,2.168428,3.73,0.62,,,0.32,0.0,2.23,,2.48


In [143]:
# Flatten the wide table into [SKN, Year, month number, value] records.
# Years before 1948 are skipped because no data exists for them in the netCDF files.
X = [
    [record.SKN, record.Year, month_number, value]
    for _, record in df_data.iterrows()
    if not record.Year < 1948
    for month_number, value in enumerate(record[2:], start=1)
]

In [144]:
# Long-format table of observations; rows with any missing field are discarded.
df_data_by_cell = pd.DataFrame(
    data=X,
    columns=["skn", "year", "month", "data_in"],
).dropna()

In [145]:
# Inspect the long-format table (one row per station, year and month).
df_data_by_cell

Unnamed: 0,skn,year,month,data_in
0,1.0,1948,1,3.2
1,1.0,1948,2,1.33
2,1.0,1948,3,2.31
3,1.0,1948,4,3.23
4,1.0,1948,5,1.14
...,...,...,...,...
1652350,1147.0,1973,11,13.58
1652351,1147.0,1973,12,8.23
1652352,1147.0,1974,1,7.51
1652353,1147.0,1974,2,2.31


In [146]:
# Join every observation with its station record (coordinates and closest grid point).
df_data_w_coord = pd.merge(
    df_data_by_cell,
    df_locations,
    left_on="skn",
    right_on="SKN",
)

In [147]:
# Inspect the observations after the station columns were merged in.
df_data_w_coord

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD,closest_grid
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
1,1.0,1948,2,1.33,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
2,1.0,1948,3,2.31,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
3,1.0,1948,4,3.23,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
4,1.0,1948,5,1.14,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]"
...,...,...,...,...,...,...,...,...,...
865564,1147.0,1973,11,13.58,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,"[22.5, -160.0]"
865565,1147.0,1973,12,8.23,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,"[22.5, -160.0]"
865566,1147.0,1974,1,7.51,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,"[22.5, -160.0]"
865567,1147.0,1974,2,2.31,1147.0,FLD 960-MOLOAA,22.174441,-159.336092,"[22.5, -160.0]"


### Concatenate with netCDF data

In [232]:
# Distinct grid cells that have at least one station observation attached.
# Read from df_data_w_coord (built above); df_complete is only created in a
# later cell, so referencing it here failed on a fresh kernel.
unique_rows = np.unique(np.vstack(df_data_w_coord["closest_grid"].tolist()), axis=0)
unique_lat = np.unique(unique_rows[:, 0])
# closest_grid keeps longitudes in the -180..180 convention while the netCDF
# grids are indexed on 0..360. The modulo shifts only negative values, which
# matches get_value_from_cdf; an unconditional "+ 360" would push any
# non-negative longitude out of range.
unique_lon = np.unique(np.mod(unique_rows[:, 1], 360))


def _open_station_cells(file_name):
    """Open one netCDF file under base_dir, restricted to the station grid cells."""
    return xr.open_dataset(f"{base_dir}/{file_name}").loc[dict(lat=unique_lat, lon=unique_lon)]


ds_air2m = _open_station_cells("air.2m.mon.mean.regridded.nc")
ds_air1000_500 = _open_station_cells("air.1000-500.mon.mean.nc")
ds_hgt500 = _open_station_cells("hgt500.mon.mean.nc")
ds_hgt1000 = _open_station_cells("hgt1000.mon.mean.nc")
ds_omega500 = _open_station_cells("omega500.mon.mean.nc")
ds_pottemp_1000_500 = _open_station_cells("pottmp.1000-500.mon.mean.nc")
ds_pottemp_1000_850 = _open_station_cells("pottmp.1000-850.mon.mean.nc")
ds_pwtr = _open_station_cells("pwtr.mon.mean.nc")
ds_u700 = _open_station_cells("shum_x_uwnd.700.mon.mean.nc")
ds_u925 = _open_station_cells("shum_x_uwnd.925.mon.mean.nc")
ds_v700 = _open_station_cells("shum_x_vwnd.700.mon.mean.nc")
# NOTE(review): the variable is named ds_v950 but the file is the 925 one;
# the name is kept so other cells that may reference it keep working.
ds_v950 = _open_station_cells("shum_x_vwnd.925.mon.mean.nc")
ds_shum700 = _open_station_cells("shum700.mon.mean.nc")
ds_shum925 = _open_station_cells("shum925.mon.mean.nc")
ds_skt = _open_station_cells("skt.mon.mean.regridded.nc")
ds_slp = _open_station_cells("slp.mon.mean.nc")

datasets = [ # list of tuples. (dataset object, attribute string in ds)
    (ds_air2m, "air"),
    (ds_air1000_500, "air"),
    (ds_hgt500, "hgt"),
    (ds_hgt1000, "hgt"),
    (ds_omega500, "omega"),
    (ds_pottemp_1000_500, "pottmp"),
    (ds_pottemp_1000_850, "pottmp"),
    (ds_pwtr, "pr_wtr"),
    (ds_u700, "shum"),
    (ds_u925, "shum"),
    (ds_v700, "shum"),
    (ds_v950, "shum"),
    (ds_shum700, "shum"),
    (ds_shum925, "shum"),
    (ds_skt, "skt"),
    (ds_slp, "slp")
]
# combine all the cdf data

In [275]:
# Turn every dataset into a DataFrame indexed by (lat, lon, time) so the
# frames can be placed side by side afterwards.
# NOTE(review): several datasets share a variable name (air, hgt, pottmp,
# shum), so the frames carry duplicate column labels.
list_of_df = []
for dataset, _variable in datasets:
    frame = dataset.to_dataframe()
    has_level = "level" in frame.index.names
    # Drop the "level" index level when present so every frame has the same index levels.
    frame = frame.droplevel(level="level") if has_level else frame
    list_of_df.append(frame.reorder_levels(["lat", "lon", "time"]))

In [277]:
# Place all per-dataset frames side by side, aligned on their shared index.
df_all_cdf_data = pd.concat(
    list_of_df,
    axis="columns",
)

In [278]:
# Inspect the combined netCDF table; note the repeated column labels.
df_all_cdf_data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,air,air,hgt,hgt,omega,pottmp,pottmp,pr_wtr,shum,shum,shum,shum,shum,shum,skt,slp
lat,lon,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
20.0,200.0,1948-01-01,295.726959,31.169991,5800.354980,122.967743,0.034998,-26.584778,-4.002899,28.530315,-4.261974,-43.965725,-3.642040,-22.181768,2.583000,9.947001,24.336798,1014.175476
20.0,200.0,1948-02-01,295.328339,31.490005,5794.379395,132.275864,0.047998,-26.071747,-4.346283,24.881031,1.629605,-26.913866,-1.099997,-9.842341,2.036999,8.709999,23.221535,1015.343079
20.0,200.0,1948-03-01,295.800781,32.830002,5799.322754,139.967743,-0.006002,-24.555817,-4.327728,31.481606,-4.645377,-68.806618,0.167110,-9.625248,3.341999,9.530001,23.523169,1016.237976
20.0,200.0,1948-04-01,296.614380,30.320000,5830.533203,131.899994,0.017998,-27.778687,-4.285706,32.632004,2.259589,-38.778152,-1.355754,-2.544500,3.227999,10.177999,24.369270,1015.145142
20.0,200.0,1948-05-01,297.482941,29.990005,5859.580566,142.774200,0.008999,-28.302948,-4.197418,33.806774,-13.790390,-85.865623,0.218375,-14.028182,3.119999,11.313000,25.491318,1016.440674
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22.5,205.0,2019-04-01,296.210876,31.573330,,,,-26.221405,-5.053345,,-5.940984,-56.059456,0.248092,1.399008,,,23.859089,1018.635071
22.5,205.0,2019-05-01,297.478241,29.271776,,,,-29.295868,-5.337433,,-4.876544,-39.390335,0.661427,6.641896,,,24.972092,1016.271057
22.5,205.0,2019-06-01,298.358185,30.306662,,,,-28.208984,-5.502258,,-6.774798,-67.765121,1.480210,22.082863,,,25.836149,1016.319092
22.5,205.0,2019-07-01,298.914978,30.695160,,,,-27.871124,-5.584778,,-16.634422,-84.586983,-0.712057,8.399054,,,26.519691,1016.296082


In [157]:
def get_value_from_cdf(index, year, month, lat, lon):
    """Fetch the value(s) of one dataset for a given month and grid point.

    Parameters
    ----------
    index : int
        Position of the (dataset, variable name) pair in ``datasets``.
    year, month
        Combined into the time label ``"{year}-{month}-01"``.
    lat, lon
        Grid point labels; a negative ``lon`` is shifted by 360 so that it
        matches the 0..360 longitude axis of the datasets.

    Returns
    -------
    The ``.values`` of the selected variable at that time and grid point.
    """
    entry = datasets[index]
    source = entry[0]
    variable = entry[1]
    # Datasets are indexed on 0..360 longitudes; move western values into that range.
    grid_lon = lon + 360 if lon < 0 else lon
    selector = dict(time=f"{year}-{month}-01", lat=lat, lon=grid_lon)
    return source[variable].loc[selector].values

In [158]:
# Spot check: dataset index 1, January 1978, grid point (lat 20, lon -155).
get_value_from_cdf(1, 1978, 1, 20, -155)

array([30.11], dtype=float32)

In [226]:
# Scratch: slp as a DataFrame indexed by (lat, lon, time), used in the concat experiment below.
a = ds_slp.to_dataframe().reorder_levels(["lat", "lon", "time"])

In [230]:
# Scratch: give skt the same (lat, lon, time) index order as `a` before concatenating,
# so that pd.concat lines the rows up.
b = ds_skt.to_dataframe().reorder_levels(["lat", "lon", "time"])
c = pd.concat([a, b], axis=1)

In [231]:
# Inspect the two-dataset concat experiment.
c

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,slp,skt
lat,lon,time,Unnamed: 3_level_1,Unnamed: 4_level_1
20.0,200.0,1948-01-01,1014.175476,24.336798
20.0,200.0,1948-02-01,1015.343079,23.221535
20.0,200.0,1948-03-01,1016.237976,23.523169
20.0,200.0,1948-04-01,1015.145142,24.369270
20.0,200.0,1948-05-01,1016.440674,25.491318
...,...,...,...,...
22.5,205.0,2019-04-01,1018.635071,23.859089
22.5,205.0,2019-05-01,1016.271057,24.972092
22.5,205.0,2019-06-01,1016.319092,25.836149
22.5,205.0,2019-07-01,1016.296082,26.519691


In [224]:
# NOTE(review): scratch cell; its saved output appears to come from an earlier,
# misaligned version of `c` — consider deleting it.
c

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,slp,skt
lat,lon,time,Unnamed: 3_level_1,Unnamed: 4_level_1
20.0,200.0,1948-01-01,1014.175476,
20.0,200.0,1948-02-01,1015.343079,
20.0,200.0,1948-03-01,1016.237976,
20.0,200.0,1948-04-01,1015.145142,
20.0,200.0,1948-05-01,1016.440674,
...,...,...,...,...
22.5,205.0,2019-03-01,,23.138935
22.5,205.0,2019-04-01,,23.859089
22.5,205.0,2019-05-01,,24.972092
22.5,205.0,2019-06-01,,25.836149


In [187]:
# NOTE(review): scratch cell; its saved output appears to come from an earlier,
# misaligned version of `c` — consider deleting it.
c
# df_data_w_coord = df_data_by_cell.merge(right=df_locations, left_on="skn", right_on="SKN")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,slp,skt
lat,lon,time,Unnamed: 3_level_1,Unnamed: 4_level_1
20.0,200.0,1948-01-01,1014.175476,
20.0,200.0,1948-02-01,1015.343079,
20.0,200.0,1948-03-01,1016.237976,
20.0,200.0,1948-04-01,1015.145142,
20.0,200.0,1948-05-01,1016.440674,
...,...,...,...,...
205.0,22.5,2019-03-01,,23.138935
205.0,22.5,2019-04-01,,23.859089
205.0,22.5,2019-05-01,,24.972092
205.0,22.5,2019-06-01,,25.836149


In [182]:
# Scratch check: select a single skt entry by time, lat and lon labels.
ds_skt["skt"].loc[dict(time="1948-01-01", lat=20, lon=200)]

In [189]:
# NOTE(review): scratch cell; the saved output shows `b` with a (lon, lat, time)
# index order, i.e. apparently from before reorder_levels was applied.
b

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,skt
lon,lat,time,Unnamed: 3_level_1
200.0,20.0,1948-01-01,24.336798
200.0,20.0,1948-02-01,23.221535
200.0,20.0,1948-03-01,23.523169
200.0,20.0,1948-04-01,24.369270
200.0,20.0,1948-05-01,25.491318
...,...,...,...
205.0,22.5,2019-03-01,23.138935
205.0,22.5,2019-04-01,23.859089
205.0,22.5,2019-05-01,24.972092
205.0,22.5,2019-06-01,25.836149


In [162]:
# Start from the observations with station coordinates and append one column
# per netCDF dataset, named "0", "1", ... after the dataset's position in `datasets`.
df_complete = df_data_w_coord.copy(deep=True)
original_len = df_complete.shape[0]
# Progress is reported as "<i>/<last index>"; derived from `datasets` instead
# of a hard-coded 15 so it stays correct when datasets are added or removed.
last_dataset = len(datasets) - 1
for i in range(len(datasets)):
    # Many rows share the same (year, month, grid cell). Remember the result of
    # each distinct lookup so the dataset is queried once per combination rather
    # than once per row. Rows with the same key therefore share one result object.
    seen = {}

    def lookup_row(x, dataset_index=i, seen=seen):
        """Return the dataset value for the row's month and closest grid point."""
        key = (x.year, x.month, x.closest_grid[0], x.closest_grid[1])
        if key not in seen:
            seen[key] = get_value_from_cdf(
                index=dataset_index,
                year=key[0],
                month=key[1],
                lat=key[2],
                lon=key[3]
            )
        return seen[key]

    df_complete[f"{i}"] = df_complete.apply(lookup_row, axis=1)
    sys.stdout.write('\r')
    sys.stdout.write(f"{i}/{last_dataset}")
    sys.stdout.flush()

KeyError: 202.5

In [161]:
# Inspect the observations with the per-dataset columns ("0" .. "15") appended.
df_complete

Unnamed: 0,skn,year,month,data_in,SKN,Name,Lat_DD,Lon_DD,closest_grid,0,...,6,7,8,9,10,11,12,13,14,15
0,1.0,1948,1,3.2,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",295.3960266113281,...,[-3.9265137],29.034512,[2.5924935],[-25.859348],[0.58919084],[7.1064115],[2.9459991],[9.869999],23.385217666625977,1014.0849
1,1.0,1948,2,1.33,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",294.8040771484375,...,[-4.4982605],27.199657,[7.456773],[-19.957973],[5.228316],[15.838112],[2.8570004],[9.155001],22.271116256713867,1015.4769
2,1.0,1948,3,2.31,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",294.97821044921875,...,[-4.2774353],31.619999,[-0.2868866],[-75.18379],[0.8606598],[8.627603],[3.5860004],[10.271],22.55307388305664,1016.62714
3,1.0,1948,4,3.23,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",295.7965393066406,...,[-4.1464233],32.350327,[0.5276785],[-44.07128],[-1.1609464],[8.856949],[3.5179996],[10.671],23.421499252319336,1015.43164
4,1.0,1948,5,1.14,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",296.50006103515625,...,[-4.1314087],31.28193,[-11.55371],[-103.50909],[-3.5949812],[-6.2010508],[2.2609997],[11.924999],24.446609497070312,1016.5783
5,1.0,1948,6,0.67,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",297.2108154296875,...,[-3.8223877],34.812664,[-10.615497],[-79.34083],[-4.27654],[8.870415],[3.032999],[12.32],25.08786964416504,1015.2937
6,1.0,1948,7,0.35,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",297.7796630859375,...,[-3.8492126],35.435802,[-19.49642],[-108.20165],[-3.3088934],[-4.5083513],[3.2439995],[13.26],25.83987236022949,1014.948
7,1.0,1948,8,0.35,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",298.2498474121094,...,[-4.241516],40.685806,[-18.627869],[-108.21126],[-3.9659607],[11.213643],[4.0059986],[14.017],26.21640396118164,1014.0429
8,1.0,1948,9,0.48,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",297.9986267089844,...,[-3.8659058],39.66933,[-21.5059],[-96.0872],[1.2806104],[7.082904],[4.4160004],[13.364],26.05163955688477,1013.91693
9,1.0,1948,10,0.0,1.0,KALAE,18.916176,-155.674994,"[20.0, -155.0]",297.44439697265625,...,[-3.4046936],34.54774,[-12.073367],[-96.39089],[-2.8106227],[4.0853386],[2.4440002],[12.767],25.36591148376465,1015.3613
