In [1]:
# basic libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_predict, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR
from sklearn.cluster import KMeans

#others
from xgboost import XGBRegressor
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker
import time
import xarray as xr
import sherpa
import time
from scipy.spatial import Delaunay
from scipy import interpolate
from copy import deepcopy

# enable autoreload
%load_ext autoreload
%autoreload 2
# Variables from config file
from config import BASE_DIR, FILE_NAMES, LABELS, ATTRIBUTES, BEST_MODEL_COLUMNS, ISLAND_RANGES
from math import pi as PI

In [2]:
# Feature columns: a private copy of LABELS (so the config list is not mutated)
# followed by four extra columns.
columns = deepcopy(LABELS)
columns += ["season_wet", "elevation", "lat", "lon"]
print(*columns, end=' ')

# load datasets
# Every split is read with the same column subset: the feature columns plus
# the year/month/skn/data_in columns used by the following cells.
columns_to_read = columns + ['year', 'month', 'skn', 'data_in']
csv_paths = (f"{BASE_DIR}/train.csv", f"{BASE_DIR}/valid.csv", f"{BASE_DIR}/test.csv")
df_train, df_valid, df_test = [
    pd.read_csv(csv_path, usecols=columns_to_read) for csv_path in csv_paths
]
# Stack the three splits; the original row indices of each file are kept.
df_combined = pd.concat([df_train, df_valid, df_test])

air2m air1000_500 hgt500 hgt1000 omega500 pottemp1000-500 pottemp1000-850 pr_wtr shum-uwnd-700 shum-uwnd-925 shum-vwnd-700 shum-vwnd-950 shum700 shum925 skt slp season_wet elevation lat lon 

In [3]:
# Split the stations by the number of samples available.
# A station (`skn`) with fewer than `threshold` rows in the combined data is
# labelled class 0; every other station is labelled class 1.
threshold = 400
# One row per station with its row count in `n_samples`.
df_split = df_combined.groupby('skn').size().reset_index(name="n_samples")
# Vectorised comparison instead of a row-wise apply(axis=1).
df_split['class'] = (df_split['n_samples'] >= threshold).astype("int64")
# Drop the columns left by a previous execution of this cell before merging.
# Without this, re-running the cell yields suffixed n_samples_x/class_x
# columns and later lookups of df_combined['class'] fail.
df_combined = (
    df_combined
    .drop(columns=["n_samples", "class"], errors="ignore")
    .merge(right=df_split, on="skn")
)

In [65]:
# Assign every (year, month) group of the class-1 stations to exactly one of
# train / valid / test, drawn with probabilities 0.6 / 0.2 / 0.2.
np.random.seed(40)
split_names = ["train", "valid", "test"]
groups_by_split = {split_name: [] for split_name in split_names}
rows_by_split = {split_name: 0 for split_name in split_names}

high_sample_rows = df_combined[df_combined['class']==1]
for year_month, month_rows in high_sample_rows.groupby(by=["year", "month"]):
    # One draw per group, in group order, so the seeded sequence is unchanged.
    drawn = np.random.choice(a=split_names, size=1, replace=True, p=[0.6, 0.2, 0.2])
    chosen_split = str(drawn[0])
    rows_by_split[chosen_split] += len(month_rows)
    groups_by_split[chosen_split].append(month_rows)

# Expose the results under the names the following cells expect:
# row counters per split and lists of per-group DataFrames.
train, valid, test = [rows_by_split[split_name] for split_name in split_names]
df_train, df_valid, df_test = [groups_by_split[split_name] for split_name in split_names]
# Number of (year, month) groups in each split (not the number of rows).
print(len(df_train), len(df_valid), len(df_test))

462 162 156


In [66]:
# Stitch the per-(year, month) groups of each split into a single DataFrame.
# ignore_index=True renumbers the rows 0..n-1 directly, instead of turning the
# old index into an "index" column and dropping it again (which would also
# delete a genuine column named "index" if the data ever had one).
df_train = pd.concat(df_train, ignore_index=True)
df_valid = pd.concat(df_valid, ignore_index=True)
df_test = pd.concat(df_test, ignore_index=True)
df_train.shape, df_valid.shape, df_test.shape

((461374, 26), (158908, 26), (159178, 26))

In [67]:
# Persist the splits of the high-sample stations.
high_dir = f"{BASE_DIR}/split_on_n_samples/high"
# to_csv does not create missing parent directories, so make sure the target
# exists; exist_ok=True keeps this a no-op when it is already there.
os.makedirs(high_dir, exist_ok=True)
df_train.to_csv(f"{high_dir}/train.csv", index=False)
df_valid.to_csv(f"{high_dir}/valid.csv", index=False)
df_test.to_csv(f"{high_dir}/test.csv", index=False)

In [68]:
# stations with lower number of samples
# Assign every (year, month) group of the class-0 stations to exactly one of
# train / valid / test, drawn with probabilities 0.6 / 0.2 / 0.2.
# NOTE(review): this split is seeded with 5 while the class-1 split uses 40,
# so the same (year, month) can land in different splits for the two station
# classes - confirm that is intended.
np.random.seed(5)
split_names = ["train", "valid", "test"]
groups_by_split = {split_name: [] for split_name in split_names}
rows_by_split = {split_name: 0 for split_name in split_names}

low_sample_rows = df_combined[df_combined['class']==0]
for year_month, month_rows in low_sample_rows.groupby(by=["year", "month"]):
    # One draw per group, in group order, so the seeded sequence is unchanged.
    drawn = np.random.choice(a=split_names, size=1, replace=True, p=[0.6, 0.2, 0.2])
    chosen_split = str(drawn[0])
    rows_by_split[chosen_split] += len(month_rows)
    groups_by_split[chosen_split].append(month_rows)

# Row counters per split, kept under their original names.
train, valid, test = [rows_by_split[split_name] for split_name in split_names]
# Number of (year, month) groups in each split (not the number of rows).
print(*(len(groups_by_split[split_name]) for split_name in split_names))

# Build the final frames from separate accumulator lists, so df_train/df_valid/
# df_test are only ever DataFrames here. ignore_index=True renumbers the rows
# 0..n-1 without creating and dropping a temporary "index" column.
df_train = pd.concat(groups_by_split["train"], ignore_index=True)
df_valid = pd.concat(groups_by_split["valid"], ignore_index=True)
df_test = pd.concat(groups_by_split["test"], ignore_index=True)
df_train.shape, df_valid.shape, df_test.shape

478 149 153


((52789, 26), (16343, 26), (16969, 26))

In [69]:
# Persist the splits of the low-sample stations.
low_dir = f"{BASE_DIR}/split_on_n_samples/low"
# to_csv does not create missing parent directories, so make sure the target
# exists; exist_ok=True keeps this a no-op when it is already there.
os.makedirs(low_dir, exist_ok=True)
df_train.to_csv(f"{low_dir}/train.csv", index=False)
df_valid.to_csv(f"{low_dir}/valid.csv", index=False)
df_test.to_csv(f"{low_dir}/test.csv", index=False)