In [1]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
import sherpa

# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES


In [2]:
# Load the dataset
# Metadata frame comes from the "Header" sheet of the workbook.
df_metadata = pd.read_excel(f"{BASE_DIR}/FilledDataset2012.xlsx", sheet_name="Header")

# Columns of dataset.csv that are discarded right after loading.
original_discarded_columns = ["data_in", "lat", "lon", "Lon_DD_updated", "Lat_DD", "Lon_DD"]

# Raw CSV header -> descriptive column name.
# NOTE(review): keys such as "air.1" / "shum.5" look like the suffixes pandas adds to
# repeated CSV headers — confirm against dataset.csv.
# Entries that map a name onto itself ("pr_wtr", "skt", "slp") leave the column unchanged.
original_column_names = {
    "air": "air2m",
    "air.1": "air1000_500",
    "hgt": "hgt500",
    "hgt.1": "hgt1000",
    "omega": "omega500",
    "pottmp": "pottemp1000-500",
    "pottmp.1": "pottemp1000-850",
    "pr_wtr": "pr_wtr",
    "shum": "shum-uwnd-700",
    "shum.1": "shum-uwnd-925",
    "shum.2": "shum-vwnd-700",
    "shum.3": "shum-vwnd-950",
    "shum.4": "shum700",
    "shum.5": "shum925",
    "skt": "skt",
    "slp": "slp",
}

df_data_original = (
    pd.read_csv(f"{BASE_DIR}/dataset.csv")
    .drop(columns=original_discarded_columns)
    .rename(columns=original_column_names)
)

# The 16 variable columns that receive a per-dataset prefix so they stay
# distinguishable after the frames are merged.
_PREFIXED_VARIABLES = [
    "air2m", "air1000_500", "hgt500", "hgt1000", "omega500",
    "pottemp1000-500", "pottemp1000-850", "pr_wtr", "shum-uwnd-700",
    "shum-uwnd-925", "shum-vwnd-700", "shum-vwnd-950", "shum700",
    "shum925", "skt", "slp",
]

# Columns removed from every file except the base (i100) one.
_DROPPED_INFO_COLUMNS = [
    "data_in", "elevation", "lat", "lon", "name", "Observer",
    "NumMos", "MinYear", "MaxYear", "Status2010",
]


def _prefixed(prefix):
    """Build the rename mapping ``{column: prefix + column}`` for every prefixed variable.

    Parameters
    ----------
    prefix : str
        Text prepended to each column name, e.g. ``"i100_"``.

    Returns
    -------
    dict
        One entry per name in ``_PREFIXED_VARIABLES``.
    """
    return {column: f"{prefix}{column}" for column in _PREFIXED_VARIABLES}


# this will be the base dataset that all others merge onto
df_interp_100 = (
    pd.read_csv(f"{BASE_DIR}/dataset_interp_100.csv")
    .rename(columns=_prefixed("i100_"))
)

df_interp_50 = (
    pd.read_csv(f"{BASE_DIR}/dataset_interp_50.csv")
    .drop(columns=_DROPPED_INFO_COLUMNS)
    .rename(columns=_prefixed("i50_"))
)

# NOTE(review): the file is called "dataset_5girds.csv" while the variable says
# "6grids" — confirm which one is right before renaming either.
# NOTE(review): the unprefixed variable columns are dropped here. The original code
# then chained a rename of those same columns to "i6grids_*", which could never match
# anything (rename ignores missing keys), so it has been removed. Confirm whether the
# columns were meant to be kept and prefixed instead of dropped.
df_6grids = (
    pd.read_csv(f"{BASE_DIR}/dataset_5girds.csv")
    .drop(columns=_DROPPED_INFO_COLUMNS + _PREFIXED_VARIABLES)
)

In [3]:
# Join keys shared by all four frames.
merge_keys = ["skn", "year", "month"]

# Start from the i100 frame and fold the others in one at a time.
# merge() defaults to an inner join, so only key combinations present in
# every frame survive.
df_combined = df_interp_100
for frame_to_join in (df_data_original, df_interp_50, df_6grids):
    df_combined = df_combined.merge(right=frame_to_join, on=merge_keys)

In [12]:
# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_SEED = 42
SPLIT_NAMES = ["train", "valid", "test"]
# Probability that a (year, month) group lands in each split. The draw is per
# group, not per row, so realised row proportions only approximate these values.
SPLIT_PROBABILITIES = [0.6, 0.2, 0.2]

split_rng = np.random.default_rng(SPLIT_SEED)

# Lists of per-month frames; a later cell concatenates each list into one frame.
df_train = []
df_valid = []
df_test = []
split_buckets = {"train": df_train, "valid": df_valid, "test": df_test}

# Each (year, month) group is assigned as a whole, so all rows of a given month
# end up in exactly one split.
for _, group in df_combined.groupby(by=["year", "month"]):
    # Without `size`, choice() returns a single label rather than a 1-element array.
    label = split_rng.choice(SPLIT_NAMES, p=SPLIT_PROBABILITIES)
    split_buckets[str(label)].append(group)

# Row counts per split.
train, valid, test = (
    sum(len(group) for group in bucket)
    for bucket in (df_train, df_valid, df_test)
)

In [13]:
# Number of rows assigned to each split, in the order (train, valid, test)
train, valid, test

(541928, 162636, 160997)

In [14]:
# Stitch each split's list of per-month groups into a single frame with a fresh
# 0..n-1 index. ignore_index=True discards the groups' original row labels
# directly, instead of materialising them as an "index" column via reset_index()
# and dropping it again (which would delete a genuine data column if one were
# ever named "index").
# NOTE: df_train / df_valid / df_test are rebound from lists to DataFrames here,
# so the split cell must be re-run before this cell can be run a second time.
df_train = pd.concat(df_train, ignore_index=True)
df_valid = pd.concat(df_valid, ignore_index=True)
df_test = pd.concat(df_test, ignore_index=True)

In [15]:
# Persist each split next to the source data, without writing the row index.
for split_frame, csv_path in (
    (df_train, f"{BASE_DIR}/train.csv"),
    (df_valid, f"{BASE_DIR}/valid.csv"),
    (df_test, f"{BASE_DIR}/test.csv"),
):
    split_frame.to_csv(csv_path, index=False)