In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pprint import pp

In [None]:
# Master switch for writing artifacts to disk. When True, later cells save the
# evaluation table (.xlsx), the per-target plots (.png) and the pickled results
# (.pkl), and reload the pickle as a check; when False nothing is written.
SAVE_DATA = False

### Read data


In [None]:
# Load the data table from the Excel workbook, using its "exp" column as the row index.
df = pd.read_excel("data.xlsx", index_col="exp")
# Preview the first rows to sanity-check the load.
df.head()

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts) to spot
# missing values or unexpected types before modelling.
df.info()

### Extract data


In [None]:
# Number of trailing columns of `df` that hold the targets; every column before
# them is treated as an input feature.
N_TARGETS = 3

_X = df.iloc[:, :-N_TARGETS]  # features: everything except the last N_TARGETS columns
_Y = df.iloc[:, -N_TARGETS:]  # targets: the last N_TARGETS columns

# Fail early if the table is too narrow to provide both features and targets.
assert _X.shape[1] > 0 and _Y.shape[1] == N_TARGETS, (
    "df must have more than N_TARGETS columns"
)

print(_X.shape)
print(_Y.shape)

### Split data


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
_X_train, _X_test, _Y_train, _Y_test = train_test_split(
    _X,
    _Y,
    test_size=0.3,
    random_state=0,
)

# Report the shape of each partition (X train, X test, Y train, Y test).
for _part in (_X_train, _X_test, _Y_train, _Y_test):
    print(_part.shape)

### Scale data


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both partitions, so the test rows do not influence the scaling.
scX = StandardScaler().fit(_X_train)
X_train, X_test = scX.transform(_X_train), scX.transform(_X_test)

# Same treatment for the targets, with a separate scaler.
scY = StandardScaler().fit(_Y_train)
Y_train, Y_test = scY.transform(_Y_train), scY.transform(_Y_test)

### Choose model


In [None]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

# SVR predicts a single target, so it is wrapped in MultiOutputRegressor, which
# fits one independent copy of the base estimator per target column.
reg = MultiOutputRegressor(SVR())

### Set up hyperparameter search


In [None]:
# Pretty-print every parameter of the wrapped model. Parameters of the inner SVR
# appear with the "estimator__" prefix, which is the naming the search grid must use.
pp(reg.get_params())

# Candidate values for the SVR regularisation parameter C: one value per decade
# from 1e-3 to 1, with no repeated entries so every fit explores a distinct setting.
param_grid = [{"estimator__C": [0.001, 0.01, 0.1, 1]}]

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` with 3-fold cross-validation, scored by R^2
# and run in parallel (n_jobs=-1).
# Available scoring names are listed at
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
# ("neg_mean_squared_error" is an alternative to "r2").

gs = GridSearchCV(reg, param_grid, scoring="r2", cv=3, n_jobs=-1)


### Train model


In [None]:
# Run the grid search on the standardised training features and targets.
gs.fit(X_train, Y_train)

In [None]:
# Tabulate the cross-validation results, best-ranked parameter setting first.
cv_results = pd.DataFrame(gs.cv_results_)
cv_results = cv_results.sort_values("rank_test_score")
display(cv_results)

In [None]:
# Check the best estimator: show the full parameter set of the model selected by
# the grid search.
gs.best_estimator_.get_params()

### Predict results


In [None]:
# gs.predict uses the best model found by the search.
# Predictions are produced for both partitions so train and test error can be compared.
Y_train_pred, Y_test_pred = gs.predict(X_train), gs.predict(X_test)

for _pred in (Y_train_pred, Y_test_pred):
    print(_pred.shape)

### Evaluate model performance


In [None]:
from datetime import datetime

# Timestamp (minute resolution) embedded in the names of the files saved by this
# run (evaluation table, plots, pickle).
dt = datetime.now().strftime("%Y-%m-%d_%H-%M")

In [None]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_percentage_error,
    r2_score,
)


def eval_perf(y_true, y_pred):
    """Score predictions against the ground truth.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, same shape as ``y_true``.

    Returns:
        A 3-tuple ``(mse, mape, r2)``: mean squared error, mean absolute
        percentage error and R^2 score, in that order.
    """
    metric_fns = (mean_squared_error, mean_absolute_percentage_error, r2_score)
    return tuple(fn(y_true=y_true, y_pred=y_pred) for fn in metric_fns)


def print_perf(data):
    """Print every entry of a metrics mapping as ``key : value``, one per line.

    Args:
        data: Mapping from metric name to metric value.
    """
    for metric_name in data:
        print(metric_name, ":", data[metric_name])


# Build the evaluation table: one row of train/test metrics per target column,
# followed by one aggregate row computed over all targets at once.
#
# NOTE(review): Y_train / Y_test are the standardised targets, so every metric here
# is in standardised units. MAPE divides by the true values, which are centred on
# zero after standardisation, so it can become very large. Consider evaluating on
# scY.inverse_transform(...) if interpretable values are needed. TODO confirm intent.
data_arr = []
for i in range(_Y.shape[1]):
    mse_train, mape_train, r2_train = eval_perf(
        y_true=Y_train[:, i], y_pred=Y_train_pred[:, i]
    )
    mse_test, mape_test, r2_test = eval_perf(
        y_true=Y_test[:, i], y_pred=Y_test_pred[:, i]
    )

    data = {
        "Y": f"Y-{i + 1}",
        "MSE Train": mse_train,
        "MSE Test": mse_test,
        "MAPE Train": mape_train,
        "MAPE Test": mape_test,
        "R2 Train": r2_train,
        "R2 Test": r2_test,
    }
    data_arr.append(data)

# Aggregate metrics over all target columns together.
mse_train, mape_train, r2_train = eval_perf(y_true=Y_train, y_pred=Y_train_pred)
mse_test, mape_test, r2_test = eval_perf(y_true=Y_test, y_pred=Y_test_pred)

data = {
    "Y": "Y-All",
    "MSE Train": mse_train,
    "MSE Test": mse_test,
    "MAPE Train": mape_train,
    "MAPE Test": mape_test,
    "R2 Train": r2_train,
    "R2 Test": r2_test,
}
data_arr.append(data)

# Print only after `data` has been rebuilt, so the aggregate row is shown rather
# than the last per-target row left over from the loop.
print_perf(data)

df_eval = pd.DataFrame(data_arr)
display(df_eval)

if SAVE_DATA:
    filename = f"eval_{dt}.xlsx"
    df_eval.to_excel(filename, index=False)

In [None]:
from sklearn.metrics import PredictionErrorDisplay

# One figure per target column: prediction-error plot for the training set on the
# left and for the test set on the right, with shared axes so the two panels are
# directly comparable.
for i in range(_Y.shape[1]):
    fig, axes = plt.subplots(
        nrows=1,
        ncols=2,
        figsize=(10, 5),
        constrained_layout=True,
        sharex=True,
        sharey=True,
    )
    # Identify the target with the same label used in the evaluation table.
    fig.suptitle(f"Y-{i + 1}")

    display_train = PredictionErrorDisplay(
        y_true=Y_train[:, i], y_pred=Y_train_pred[:, i]
    )
    display_train.plot(ax=axes[0])
    axes[0].set_title("Train")

    display_test = PredictionErrorDisplay(
        y_true=Y_test[:, i], y_pred=Y_test_pred[:, i]
    )
    display_test.plot(ax=axes[1])
    axes[1].set_title("Test")

    if SAVE_DATA:
        filename = f"res_plot_{dt}_{i}.png"
        fig.savefig(filename, dpi=300)

    plt.show()
    # Release the figure so figures do not accumulate across loop iterations.
    plt.close(fig)

### Save data


In [None]:
import pickle

# Path of the pickle written below; the reload check in the next cell reads the same name.
filename = f"data_{dt}.pkl"

data_save = {
    # Store the fitted model selected by the grid search. `reg` itself is never
    # fitted: GridSearchCV fits clones of the estimator it is given.
    "model": gs.best_estimator_,
    "desc": "This is the saved data",
    # The model was trained on standardised data, so the scalers are needed to
    # prepare new inputs and to map predictions back to the original units.
    "scaler_X": scX,
    "scaler_Y": scY,
    "Y_train": Y_train,
    "Y_train_pred": Y_train_pred,
    "Y_test": Y_test,
    # Key capitalisation kept as-is so existing readers of the pickle keep working.
    "Y_test_Pred": Y_test_pred,
    "eval": df_eval,
}


# Save the model and results
if SAVE_DATA:
    with open(filename, "wb") as file:
        pickle.dump(data_save, file)

### Test loading data


In [None]:
# Round-trip check: reload the pickle written by the save cell and print it to
# confirm that it deserialises.
# pickle.load can execute arbitrary code, so only use it on files this notebook
# produced itself, never on untrusted files.
if SAVE_DATA:
    with open(filename, "rb") as file:
        data_load = pickle.load(file)

    print(data_load)