In [2]:
# basic imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

#tensorflow
import tensorflow as tf
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout

import tensorflow_probability as tfp
tfd = tfp.distributions

# others
from copy import deepcopy
from xgboost import XGBRegressor
import xarray as xr

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES, BEST_MODEL_COLUMNS, ISLAND_RANGES

2022-05-03 09:04:43.107711: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/apps/software/tools/nmap/7.80/lib
2022-05-03 09:04:43.107748: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
  from pandas import MultiIndex, Int64Index


In [5]:
def _open_from_base(file_name):
    """Open `file_name`, located directly under BASE_DIR, with xarray."""
    return xr.open_dataset(f"{BASE_DIR}/{file_name}")


# One handle per monthly-mean file; the files are opened in the order listed.
ds_air2m = _open_from_base("air.2m.mon.mean.regridded.nc")
ds_air1000_500 = _open_from_base("air.1000-500.mon.mean.nc")
ds_hgt500 = _open_from_base("hgt500.mon.mean.nc")
ds_hgt1000 = _open_from_base("hgt1000.mon.mean.nc")
ds_omega500 = _open_from_base("omega500.mon.mean.nc")
ds_pottemp_1000_500 = _open_from_base("pottmp.1000-500.mon.mean.nc")
ds_pottemp_1000_850 = _open_from_base("pottmp.1000-850.mon.mean.nc")
ds_pwtr = _open_from_base("pwtr.mon.mean.nc")
ds_u700 = _open_from_base("shum_x_uwnd.700.mon.mean.nc")
ds_u925 = _open_from_base("shum_x_uwnd.925.mon.mean.nc")
ds_v700 = _open_from_base("shum_x_vwnd.700.mon.mean.nc")
# NOTE(review): the variable is called ds_v950 but the file opened is the 925 one.
ds_v950 = _open_from_base("shum_x_vwnd.925.mon.mean.nc")
ds_shum700 = _open_from_base("shum700.mon.mean.nc")
ds_shum925 = _open_from_base("shum925.mon.mean.nc")
ds_skt = _open_from_base("skt.mon.mean.regridded.nc")
ds_slp = _open_from_base("slp.mon.mean.nc")

# Each entry pairs a dataset object with the name of the variable to read from it.
datasets = [
    # surface air temperature (2 m)
    (ds_air2m, "air"),
    # air temperature difference
    (ds_air1000_500, "air"),
    # geopotential height (500 hPa)
    (ds_hgt500, "hgt"),
    # geopotential height (1000 hPa)
    (ds_hgt1000, "hgt"),
    # omega
    (ds_omega500, "omega"),
    # potential temperature difference 1000-500
    (ds_pottemp_1000_500, "pottmp"),
    # potential temperature difference 1000-850
    (ds_pottemp_1000_850, "pottmp"),
    # precipitable water
    (ds_pwtr, "pr_wtr"),
    # zonal moisture (u) transport, 700 and 925 files
    (ds_u700, "shum"),
    (ds_u925, "shum"),
    # meridional moisture (v) transport, 700 and 925 files
    (ds_v700, "shum"),
    (ds_v950, "shum"),
    # specific humidity: 700 hPa
    (ds_shum700, "shum"),
    # specific humidity: 925 hPa
    (ds_shum925, "shum"),
    # skin temperature
    (ds_skt, "skt"),
    # sea level pressure
    (ds_slp, "slp"),
]
# combine all the cdf data

In [8]:
# Display the dataset object of the 13th entry in `datasets`
# (index 12 is the (ds_shum700, "shum") pair in the list built above).
datasets[12][0]

In [9]:
# Read the SKN locations table that sits under BASE_DIR.
skn_locations_csv = f"{BASE_DIR}/SKNlocations.csv"
df_locations = pd.read_csv(skn_locations_csv)

In [18]:
# Show the smallest and largest value of the Lat_DD column as a (min, max) pair.
lat_dd = df_locations['Lat_DD']
(lat_dd.min(), lat_dd.max())

(18.91367961, 22.23135314)

In [5]:
# Base names of the 16 reanalysis variables, in the order they are expanded below.
reanalysis_data = [
    'air2m',
    'air1000_500',
    'hgt500',
    'hgt1000',
    'omega500',
    'pottemp1000-500',
    'pottemp1000-850',
    'pr_wtr',
    'shum-uwnd-700',
    'shum-uwnd-925',
    'shum-vwnd-700',
    'shum-vwnd-950',
    'shum700',
    'shum925',
    'skt',
    'slp',
]

# Expand every base name to "<name>_<k>" for k = 0..5. The suffix index is the
# outer loop, so all "_0" columns come first, then all "_1" columns, and so on.
columns = [
    f"{base_name}_{suffix}"
    for suffix in range(6)
    for base_name in reanalysis_data
]

# Extra columns appended after the expanded reanalysis names.
columns += ['data_in', 'lat', 'lon', 'elevation', 'season_wet', 'season_dry']

# Print all names on one line, space-separated, with a trailing space.
print(*columns, end=' ')

air2m_0 air1000_500_0 hgt500_0 hgt1000_0 omega500_0 pottemp1000-500_0 pottemp1000-850_0 pr_wtr_0 shum-uwnd-700_0 shum-uwnd-925_0 shum-vwnd-700_0 shum-vwnd-950_0 shum700_0 shum925_0 skt_0 slp_0 air2m_1 air1000_500_1 hgt500_1 hgt1000_1 omega500_1 pottemp1000-500_1 pottemp1000-850_1 pr_wtr_1 shum-uwnd-700_1 shum-uwnd-925_1 shum-vwnd-700_1 shum-vwnd-950_1 shum700_1 shum925_1 skt_1 slp_1 air2m_2 air1000_500_2 hgt500_2 hgt1000_2 omega500_2 pottemp1000-500_2 pottemp1000-850_2 pr_wtr_2 shum-uwnd-700_2 shum-uwnd-925_2 shum-vwnd-700_2 shum-vwnd-950_2 shum700_2 shum925_2 skt_2 slp_2 air2m_3 air1000_500_3 hgt500_3 hgt1000_3 omega500_3 pottemp1000-500_3 pottemp1000-850_3 pr_wtr_3 shum-uwnd-700_3 shum-uwnd-925_3 shum-vwnd-700_3 shum-vwnd-950_3 shum700_3 shum925_3 skt_3 slp_3 air2m_4 air1000_500_4 hgt500_4 hgt1000_4 omega500_4 pottemp1000-500_4 pottemp1000-850_4 pr_wtr_4 shum-uwnd-700_4 shum-uwnd-925_4 shum-vwnd-700_4 shum-vwnd-950_4 shum700_4 shum925_4 skt_4 slp_4 air2m_5 air1000_500_5 hgt500_5 hgt1

In [6]:
# Columns requested from every split file: the feature columns plus the
# year/month/skn keys. 'data_in' is listed again here although `columns`
# already contains it; the list is passed to read_csv unchanged.
split_usecols = columns + ['year', 'month', 'skn', 'data_in']

# Read train.csv, valid.csv and test.csv (in that order) from BASE_DIR.
df_train, df_valid, df_test = (
    pd.read_csv(f"{BASE_DIR}/{split_name}.csv", usecols=split_usecols)
    for split_name in ("train", "valid", "test")
)

# Stack the three splits row-wise; the original row indices are kept as-is.
df_combined = pd.concat([df_train, df_valid, df_test])