In [1]:
import netCDF4
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from netCDF4 import Dataset as netCDFDataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
import pandas as pd

In [3]:
# Fixed seed so the train/validation split (and every score derived from it)
# is identical after a kernel restart.
SPLIT_SEED = 42

# Read each array inside a context manager so the NetCDF file handle is closed
# as soon as the values have been copied into memory (the original left both
# datasets open for the lifetime of the kernel).
with netCDFDataset(
    "../data/labels/GTD_1979-2019_JJAextd_8.nc", mode="r"
) as labels_file:
    labels = labels_file.variables["blocking"][:]

with netCDFDataset(
    "../data/geopotential_height_500hPa_era5_6hourly_z0001_daymean_final.nc",
    mode="r",
) as data_file:
    # Keep only indices 550..949 of the last axis.
    # NOTE(review): presumably a longitude window -- confirm against the grid.
    data = data_file.variables["z_0001"][:, :, :, 550:950]

# Split the dataset into training (90%) and validation (10%) sets.
train_data, val_data, train_labels, val_labels = train_test_split(
    data, labels, test_size=0.1, random_state=SPLIT_SEED
)

# Flatten every sample to a 1-D feature vector; -1 lets NumPy derive the
# feature count exactly instead of going through float division.
train_data_flat = train_data.reshape(len(train_data), -1)
val_data_flat = val_data.reshape(len(val_data), -1)

print(f"train_data: {train_data.shape}")
print(f"val_data: {val_data.shape}")

print(f"train_data_flat: {train_data_flat.shape}")
print(f"val_data_flat: {val_data_flat.shape}")

train_data: (3616, 5, 281, 400)
val_data: (402, 5, 281, 400)
train_data_flat: (3616, 562000)
val_data_flat: (402, 562000)


In [4]:
# NOTE: `rf` is not passed to the grid search below; it is kept only so that
# any other cell still referring to it keeps working.
rf = RandomForestClassifier(verbose=1)

# Hyper-parameter grid: 2 * 1 * 2 * 3 * 1 * 1 = 12 candidates.
# "sqrt" was previously also tried for max_features; only "log2" is searched now.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True],
    "max_features": ["log2"],
}

# Seed the forest so that cross-validation scores and the selected best model
# are reproducible from run to run (an unseeded forest differs on every fit).
rf_classifier = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search over `param_grid`, ranked by F1, using all
# available cores.
grid_search = GridSearchCV(
    estimator=rf_classifier, param_grid=param_grid, cv=5, scoring="f1", n_jobs=-1
)

In [None]:
# Run the full cross-validated search on the flattened training samples;
# the two messages bracket the (long-running) fit.
print("training model")
grid_search.fit(train_data_flat, y=train_labels)
print("training finished")

training model


In [7]:
# Predict on the held-out validation samples and report the F1 score.
val_predictions = grid_search.predict(val_data_flat)
val_f1 = f1_score(y_true=val_labels, y_pred=val_predictions)
print(val_f1)

0.8408163265306122


In [8]:
# Tabulate the cross-validation results collected by the grid search ...
results_df = pd.DataFrame(data=grid_search.cv_results_)

# ... and persist them as CSV, leaving out the DataFrame index column.
results_df.to_csv(path_or_buf='grid_search_results.csv', index=False)