In [2]:
import netCDF4
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from netCDF4 import Dataset as netCDFDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd

In [13]:
labels = netCDFDataset("../data/labels/GTD_1979-2019_JJAextd_8.nc", mode="r").variables[
    "blocking"
][:]
data = netCDFDataset(
    "../data/geopotential_height_500hPa_era5_6hourly_z0001_daymean_2019_beginAdjust_1x1_final.nc",
    mode="r",
).variables["z_0001"][:]

# Split the dataset into training and validation sets
train_data, val_data, train_labels, val_labels = train_test_split(
    data, labels, test_size=0.1
)

# Reshape the input data to be flattened
train_data_flat = train_data.reshape(
    len(train_data), int(train_data.size / len(train_data))
)
val_data_flat = val_data.reshape(len(val_data), int(val_data.size / len(val_data)))

print("train_data: " + str(train_data.shape))
print("val_data: " + str(val_data.shape))

print("train_data_flat: " + str(train_data_flat.shape))
print("val_data_flat: " + str(val_data_flat.shape))

train_data: (3616, 5, 45, 100)
val_data: (402, 5, 45, 100)
train_data_flat: (3616, 22500)
val_data_flat: (402, 22500)


In [3]:
ukesm_labels = netCDFDataset("../data/labels/GTD_UKESM1-0-LL_piControl_1960-2060_JJAextd.nc", mode="r").variables[
    "blocking"
][:]
ukesm_data = netCDFDataset(
    "../data/500zg_day_UKESM1-0-LL_piControl_r1i1p1f2_gn_19600101-20601230_NHML_JJAextd.nc_1x1_final.nc",
    mode="r",
).variables["z_0001"][:]

# Split the dataset into training and validation sets
ukesm_train_data, ukesm_val_data, ukesm_train_labels, ukesm_val_labels = train_test_split(
    ukesm_data, ukesm_labels, test_size=0.1
)

# Reshape the input data to be flattened
ukesm_train_data_flat = ukesm_train_data.reshape(
    len(ukesm_train_data), int(ukesm_train_data.size / len(ukesm_train_data))
)
ukesm_val_data_flat = ukesm_val_data.reshape(len(ukesm_val_data), int(ukesm_val_data.size / len(ukesm_val_data)))

print("train_data: " + str(ukesm_train_data.shape))
print("val_data: " + str(ukesm_val_data.shape))

print("train_data_flat: " + str(ukesm_train_data_flat.shape))
print("val_data_flat: " + str(ukesm_val_data_flat.shape))

train_data: (8908, 5, 45, 100)
val_data: (990, 5, 45, 100)
train_data_flat: (8908, 22500)
val_data_flat: (990, 22500)


In [6]:
rf = RandomForestClassifier(verbose=1)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True],
    "max_features": ["log2"],
}

rf_classifier = RandomForestClassifier()

grid_search = GridSearchCV(
    estimator=rf_classifier, param_grid=param_grid, cv=5, scoring=["f1", "precision", "recall"], refit="f1", n_jobs=-1
)

In [16]:
print("training model (era5)")
grid_search.fit(train_data_flat, train_labels)
print("training finished")

training model (era5)
training finished


In [17]:
ukesm_val_predictions = grid_search.predict(ukesm_val_data_flat)
val_predictions = grid_search.predict(val_data_flat)
print("era5 score: " + str(f1_score(val_labels, val_predictions)))
print("ukesm score: " + str(f1_score(ukesm_val_labels, ukesm_val_predictions)))

era5 score: 0.8339483394833948
ukesm score: 0.7640845070422536


In [9]:
# Access grid search results
results_df = pd.DataFrame(grid_search.cv_results_)

# Save results to a CSV file
results_df.to_csv('grid_search_results.csv', index=False)