In [63]:
import netCDF4
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from netCDF4 import Dataset as netCDFDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
import xarray as xr
import numpy as np

In [64]:
# TensorBoard writer for this run; the log directory is tagged with a
# minute-resolution timestamp taken at cell execution time.
current_datetime = datetime.now()
run = f"{current_datetime:%Y-%m-%d %H:%M}"
writer = SummaryWriter(log_dir=f"../runs/validation/rf/{run}")

In [65]:
# --- ERA5: load labels and input fields, then build the train/validation split ---
ERA5_LABELS_PATH = "../data/labels/GTD_1979-2019_JJAextd_8.nc"
ERA5_DATA_PATH = "../data/geopotential_height_500hPa_era5_6hourly_z0001_daymean_2019_beginAdjust_1x1_final.nc"

# `[:]` reads the whole variable into memory, so each file handle can be
# closed as soon as the read has finished (the original left them open).
with netCDFDataset(ERA5_LABELS_PATH, mode="r") as labels_file:
    labels = labels_file.variables["blocking"][:]
with netCDFDataset(ERA5_DATA_PATH, mode="r") as data_file:
    data = data_file.variables["z_0001"][:]

# Second, xarray-backed view of the same file, used for its decoded time axis.
# This handle is intentionally left open because xarray reads variables lazily.
xr_data = xr.open_dataset(
    xr.backends.NetCDF4DataStore(netCDFDataset(ERA5_DATA_PATH, mode="r")),
    decode_times=True,
)

time = xr_data.time

# Split the dataset into training and validation sets.
# A fixed random_state makes the split (and every score derived from it)
# reproducible across kernel restarts.
# NOTE(review): this is a random split of time-indexed samples, so neighbouring
# time steps can land on both sides of the split - confirm this is intended.
train_data, val_data, train_labels, val_labels, train_time, val_time = train_test_split(
    data, labels, time, test_size=0.1, random_state=42
)

# Flatten every sample to a 1-D feature vector; -1 lets NumPy infer the width
# instead of computing it through a float division.
train_data_flat = train_data.reshape(len(train_data), -1)
val_data_flat = val_data.reshape(len(val_data), -1)

print("train_data: " + str(train_data.shape))
print("val_data: " + str(val_data.shape))
print("train_time" + str(train_time.shape))

print("train_data_flat: " + str(train_data_flat.shape))
print("val_data_flat: " + str(val_data_flat.shape))

train_data: (3616, 5, 45, 100)
val_data: (402, 5, 45, 100)
train_time(3616,)
train_data_flat: (3616, 22500)
val_data_flat: (402, 22500)


In [66]:
# --- UKESM: load labels and input fields, then build the train/validation split ---
UKESM_LABELS_PATH = "../data/labels/GTD_UKESM1-0-LL_piControl_1960-2060_JJAextd.nc"
UKESM_DATA_PATH = "../data/500zg_day_UKESM1-0-LL_piControl_r1i1p1f2_gn_19600101-20601230_NHML_JJAextd_1x1_final.nc"

# `[:]` reads the whole variable into memory, so each file handle can be
# closed as soon as the read has finished (the original left them open).
with netCDFDataset(UKESM_LABELS_PATH, mode="r") as ukesm_labels_file:
    ukesm_labels = ukesm_labels_file.variables["blocking"][:]
with netCDFDataset(UKESM_DATA_PATH, mode="r") as ukesm_data_file:
    ukesm_data = ukesm_data_file.variables["z_0001"][:]

# Split the dataset into training and validation sets.
# A fixed random_state makes the split (and the reported score) reproducible.
ukesm_train_data, ukesm_val_data, ukesm_train_labels, ukesm_val_labels = train_test_split(
    ukesm_data, ukesm_labels, test_size=0.1, random_state=42
)

# Flatten every sample to a 1-D feature vector; -1 lets NumPy infer the width
# instead of computing it through a float division.
ukesm_train_data_flat = ukesm_train_data.reshape(len(ukesm_train_data), -1)
ukesm_val_data_flat = ukesm_val_data.reshape(len(ukesm_val_data), -1)

print("train_data: " + str(ukesm_train_data.shape))
print("val_data: " + str(ukesm_val_data.shape))

print("train_data_flat: " + str(ukesm_train_data_flat.shape))
print("val_data_flat: " + str(ukesm_val_data_flat.shape))

train_data: (8908, 5, 45, 100)
val_data: (990, 5, 45, 100)
train_data_flat: (8908, 22500)
val_data_flat: (990, 22500)


In [67]:
# Stand-alone verbose forest. It is NOT the estimator handed to the grid search
# below; it is kept only so that `rf` stays defined for any cell that reads it.
rf = RandomForestClassifier(verbose=1)

# Hyper-parameter grid explored by the search (2 * 1 * 2 * 3 * 1 * 1 = 12 combinations).
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True],
    "max_features": ["log2"],
}

# Seed the forest so that bootstrap sampling and feature sub-sampling - and
# therefore the CV scores and the refit model - are reproducible between runs.
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold CV that records F1, precision and recall for every combination and
# refits the best-F1 configuration on the full training data.
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring=["f1", "precision", "recall"],
    refit="f1",
    n_jobs=-1,
)

In [68]:
# Run the hyper-parameter search on the flattened ERA5 training split.
print("training model (era5)")
grid_search.fit(X=train_data_flat, y=train_labels)
print("training finished")

training model (era5)
training finished


In [69]:
# Predict with the refit best estimator on both validation splits
# (the model was fitted on ERA5 only).
ukesm_val_predictions = grid_search.predict(X=ukesm_val_data_flat)
val_predictions = grid_search.predict(X=val_data_flat)

# Report the F1 score for each dataset.
era5_f1 = f1_score(y_true=val_labels, y_pred=val_predictions)
print("era5 score: " + str(era5_f1))
ukesm_f1 = f1_score(y_true=ukesm_val_labels, y_pred=ukesm_val_predictions)
print("ukesm score: " + str(ukesm_f1))

era5 score: 0.7901234567901235
ukesm score: 0.7304347826086957


In [9]:
# Tabulate the cross-validation results of the grid search, one row per
# parameter combination.
results_df = pd.DataFrame(data=grid_search.cv_results_)

# Persist the table as CSV (without the index column) in the working directory.
results_df.to_csv(path_or_buf='grid_search_results.csv', index=False)

In [70]:
# numpy (np) and torch are already imported in the notebook's first cell.

def _as_unit_range_image(sample):
    """Turn one validation sample into a single-channel image scaled to [0, 1].

    All leading axes of `sample` are stacked along the image height and the
    last axis becomes the width, so a (5, 45, 100) sample gives (1, 225, 100).
    Values are min-max rescaled per sample because `SummaryWriter.add_image`
    multiplies float input by 255 and clips it, which saturates raw fields.
    A constant (or NaN-containing) sample yields an all-zero image.
    """
    image = torch.as_tensor(np.asarray(sample, dtype=np.float32))
    image = image.reshape(1, -1, image.shape[-1])
    lowest = image.min()
    span = image.max() - lowest
    if span > 0:
        return (image - lowest) / span
    return torch.zeros_like(image)


# Element-wise error masks over the ERA5 validation split.
false_positives = (val_predictions == 1) & (val_labels == 0)
false_negatives = (val_predictions == 0) & (val_labels == 1)

# Count directly with NumPy instead of round-tripping the masks through torch.
fp_mask = np.asarray(false_positives, dtype=bool)
fn_mask = np.asarray(false_negatives, dtype=bool)
count_false_positives = int(np.count_nonzero(fp_mask))
count_false_negatives = int(np.count_nonzero(fn_mask))


print("false_positives: " + str(count_false_positives))
print("false_negatives: " + str(count_false_negatives))

# NOTE(review): val_time comes out of train_test_split, so element 0 is
# presumably not the chronologically first validation sample - confirm the label.
print("begin val: " + str(val_time[0]))

# Log every misclassified validation sample as an image, tagged by error type
# and by the sample's timestamp.
for idx, (is_fp, is_fn) in enumerate(zip(fp_mask, fn_mask)):
    if not (is_fp or is_fn):
        continue
    category = "false-positive" if is_fp else "false-negative"
    timestamp = np.datetime_as_string(val_time.data[idx], unit="s")
    writer.add_image(f"{category}/rf_{timestamp}", _as_unit_range_image(val_data[idx]))

# Make sure the buffered image events are written to disk.
writer.flush()

false_positives: 15
false_negatives: 36
begin val: <xarray.DataArray 'time' ()>
array('2003-08-16T09:00:00.000000000', dtype='datetime64[ns]')
Coordinates:
    time     datetime64[ns] 2003-08-16T09:00:00
Attributes:
    standard_name:  time
    long_name:      time
    bounds:         time_bnds
    axis:           T
