## Yield Prediction Model Search

In [1]:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

In [2]:
import numpy as np
import pandas as pd

In [3]:
FEATURES = ("grid_cc", "grid_exg", "grid_ch", "grid_cv")

### load data

In [4]:
# modify here with your dataset
def load_data(data_dir):
    data = dict()

    file_names = [
        file_name for file_name in os.listdir(data_dir) if not file_name.startswith(".")
    ]
    for file_name in file_names:
        feature, file_type = os.path.splitext(file_name)
        file_dir = os.path.join(data_dir, file_name)
        if file_type[1:].lower() == "csv":
            data[feature] = pd.read_csv(file_dir)
        else:
            data[feature] = pd.read_pickle(file_dir)

    return data

In [5]:
# data_2020 = load_data("./splined_data/2020/")
# data_2021 = load_data("./splined_data/2021/")
# data_2022 = load_data("./splined_data/2022/")

### create training set and validation set

In [10]:
def generate_train_test(train_datasets, test_dataset, features):
    x_trainset = []
    y_trainset = []
    # generate trainset
    for train_data in train_datasets:
        tmp = np.concatenate([train_data[feature] for feature in features], axis=1)
        x_trainset.append(tmp)
        y_trainset.append(train_data["yield"])
    x_trainset = np.concatenate(x_trainset, axis=0)
    y_trainset = np.concatenate(y_trainset, axis=0).reshape(-1, 1)
    assert x_trainset.shape[0] == y_trainset.shape[0]

    # generate testset
    y_testset = test_dataset["yield"].values.reshape(-1, 1)
    x_testset = np.concatenate([test_dataset[feature] for feature in features], axis=1)
    assert x_trainset.shape[1] == x_testset.shape[1]
    
    return x_trainset, x_testset, y_trainset, y_testset

In [25]:
# use all four features to train the model
features = ("grid_cc", "grid_exg", "grid_ch", "grid_cv")

# 2020, 2021 for training
# 2022 for test
train_datasets = [data_2020, data_2021]

x_trainset, x_testset, y_trainset, y_testset = generate_train_test(train_datasets,  data_2022, features)

### training_set, validation_set, and test_set split

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [13]:
def split_train_valid_test(x_trainset, y_trainset, x_testset, y_testset, test_size=0.3):
    # split data
    x_train, x_valid, y_train, y_valid = train_test_split(
        x_trainset, y_trainset, test_size=test_size, random_state=42
    )
    x_valid, x_test, y_valid, y_test = train_test_split(
        x_valid, y_valid, test_size=0.5, random_state=43
    )

    # standardize data
    standardize_scaler = StandardScaler()
    x_train = standardize_scaler.fit_transform(x_train)
    x_valid = standardize_scaler.transform(x_valid)
    x_test = standardize_scaler.transform(x_test)
    x_testset = standardize_scaler.transform(x_testset)

    return x_train, x_valid, x_test, x_testset, y_train, y_valid, y_test, y_testset

In [14]:
x_train, x_valid, x_test, x_testset, y_train, y_valid, y_test, y_testset = split_train_valid_test(x_trainset, y_trainset, x_testset, y_testset)

### define model (MLP)

In [15]:
import keras_tuner
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers

In [16]:
args = {
    "dropout": [0.2, 0.6],
    "learning_rate": [1e-5, 1e-2],
    "num_layer": [2, 15],
    "num_units": [32, 2048, 32],
    "activation": ["relu", "tanh"],
}

In [17]:
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Flatten())

    dropout_rate = hp.Float(
        "dr", min_value=args["dropout"][0], max_value=args["dropout"][1], sampling="log"
    )
    learning_rate = hp.Float(
        "lr",
        min_value=args["learning_rate"][0],
        max_value=args["learning_rate"][1],
        sampling="log",
    )

    for i in range(
        hp.Int(
            "num_layer", min_value=args["num_layer"][0], max_value=args["num_layer"][1]
        )
    ):
        model.add(
            layers.Dense(
                units=hp.Int(
                    f"units_{i}",
                    min_value=args["num_units"][0],
                    max_value=args["num_units"][1],
                    step=args["num_units"][2],
                ),
                activation=hp.Choice("activation", args["activation"]),
            )
        )
        if hp.Boolean("dropout"):
            model.add(layers.Dropout(rate=dropout_rate))
    # output layer
    model.add(layers.Dense(1))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mean_squared_error",
        metrics=[tfa.metrics.r_square.RSquare()],
    )
    return model

In [18]:
build_model(keras_tuner.HyperParameters())

<keras.engine.sequential.Sequential at 0x7efc587dbd60>

### model search

In [19]:
tuner_args = {
    "max_trials": 2,
    "executions_per_trial": 1,
    "project_name": "cc_exg_ch_cv",
}

In [20]:
# use random search
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective=keras_tuner.Objective("val_r_square", direction="max"),
    max_trials=tuner_args["max_trials"],
    executions_per_trial=tuner_args["executions_per_trial"],
    overwrite=True,
    directory="model_search",
    project_name=tuner_args["project_name"],
)

### train model

In [21]:
tuner.search(x_train, y_train, epochs=5, validation_data=(x_valid,y_valid))

Trial 2 Complete [00h 00m 06s]
val_r_square: 0.7563138008117676

Best val_r_square So Far: 0.7563138008117676
Total elapsed time: 00h 00m 14s
INFO:tensorflow:Oracle triggered exit


### save model

In [22]:
models = tuner.get_best_models(num_models=2)
best_model  = models[0]

# generate model
input_size = x_train.shape[1]
best_model.build(input_shape=(None, input_size))

best_model.save("saved_model/best_model")



INFO:tensorflow:Assets written to: saved_model/best_model/assets


INFO:tensorflow:Assets written to: saved_model/best_model/assets


### yield predictin

In [23]:
from sklearn.metrics import mean_squared_error

In [24]:
preds = best_model(x_testset)

# calculate RMSE, R2 
rmse = mean_squared_error(y_testset, preds)
print(f"rmse: {rmse:.2f}")

rmse: 1.99
