In [2]:
%load_ext lab_black
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import glob
import tqdm
import umap
import lightgbm as lgb
import numpy as np

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

import matplotlib.pyplot as plt

from utils import DATA_SETS, gather_df, prepare_df, model_function

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [None]:
import optuna
import json

# Folder that holds the run log and the prediction files of every run.
result_folder = "sim-res-final"

# JSON log with one entry per finished run. Start from an empty log and
# replace it with the stored one when a previous session already wrote it,
# so new runs are appended instead of overwriting earlier results.
info_file_name = "./{}/info.json".format(result_folder)
info = []
if os.path.exists(info_file_name):
    with open(info_file_name) as info_file:
        info = json.load(info_file)

# Hyper-parameter search: one Optuna study per (dataset, target column,
# prediction horizon). After each study the model is re-fitted with the best
# parameters, its metrics are appended to the run log and its validation/test
# predictions are written to feather files keyed by a random run id.

# Create the output folder up front so a missing directory fails (or is fixed)
# before any study has been run, not after it.
os.makedirs(result_folder, exist_ok=True)

for dataset in DATA_SETS:
    df = gather_df(dataset, True)
    target_cols = [c for c in df.columns if "target" in c]

    # Size of location_array_W, which is variable per dataset. A column is
    # counted once for every keyword it contains, which equals the sum of the
    # three per-keyword column counts. It only depends on the dataset, so it
    # is computed once here instead of once per study.
    n_feats = sum(
        keyword in c
        for keyword in ("ws10m_max", "rainfall", "temperature")
        for c in df.columns
    )
    # Optuna parameter names of the location weights, in positional order.
    weight_names = ["W_{}".format(i) for i in range(n_feats)]

    for target_col in target_cols:
        for pred_ahead in [14, 28, 56]:

            def objective(trial, return_dataframes=False):
                """Fit one model with trial-suggested settings and score it.

                Parameters
                ----------
                trial : optuna trial
                    Used to sample the location weights and model options.
                return_dataframes : bool, default False
                    When True, return the prediction frames instead of the
                    score.

                Returns
                -------
                float or tuple
                    The MAE of the validation predictions, or
                    ``(dfp_val, dfp_test)`` when ``return_dataframes`` is set.
                """
                location_array_W = [
                    trial.suggest_float(name, 0, 1) for name in weight_names
                ]
                dfp_val, dfp_test, lgb_model = model_function(
                    dataset,
                    location_array_W,
                    pred_ahead,
                    target_col,
                    extended_data=trial.suggest_categorical(
                        "extended_data", [True, False]
                    ),
                    impute_missing=trial.suggest_categorical(
                        "impute_missing", [True, False]
                    ),
                    do_extract=trial.suggest_categorical("do_extract", [True, False]),
                    shift_features=trial.suggest_categorical(
                        "shift_features", [True, False]
                    ),
                    use_early_stopping=trial.suggest_categorical(
                        "use_early_stopping", [True, False]
                    ),
                    lgb_boosting_type=trial.suggest_categorical(
                        "lgb_boosting_type", ["gbdt", "goss", "dart"]
                    ),
                    lgb_num_leaves=trial.suggest_categorical(
                        "lgb_num_leaves", [10, 31, 50]
                    ),
                    # suggest_float replaces the deprecated suggest_uniform and
                    # matches how the location weights are sampled above.
                    lgb_learning_rate=trial.suggest_float(
                        "lgb_learning_rate", 0.01, 0.3
                    ),
                    lgb_max_depth=trial.suggest_int(
                        "lgb_max_depth", -1, 10
                    ),  # -1 here means infinite
                )
                if return_dataframes:
                    return dfp_val, dfp_test
                # we optimize on the MAE of the validation dataset
                return np.mean(np.abs(dfp_val.p - dfp_val.y))

            study = optuna.create_study()
            study.optimize(objective, n_trials=100)

            # Read the best parameters once; the weights are looked up by name
            # so their position never depends on dict iteration order.
            best_params = study.best_params
            location_array_W_optim = [best_params[name] for name in weight_names]
            # Every remaining parameter is named after the model_function
            # keyword argument it was sampled for.
            model_kwargs = {
                name: value
                for name, value in best_params.items()
                if name not in weight_names
            }

            # the outputting dataframe with optimal parameters
            dfp_val, dfp_test, lgb_model = model_function(
                dataset,
                location_array_W_optim,
                pred_ahead,
                target_col,
                **model_kwargs,
            )

            # Explicit 64-bit dtype: the upper bound does not fit a 32-bit
            # default integer. int() keeps the id JSON-serialisable.
            run_id = int(np.random.randint(1, 100000000000000, dtype=np.int64))

            err_test = dfp_test.p - dfp_test.y
            err_val = dfp_val.p - dfp_val.y

            # float() guards json.dump against non-float64 numpy scalars.
            mae_test = float(np.mean(np.abs(err_test)))
            rmse_test = float(np.sqrt(np.mean(err_test ** 2)))

            mae_val = float(np.mean(np.abs(err_val)))
            rmse_val = float(np.sqrt(np.mean(err_val ** 2)))

            info.append(
                {
                    "run_id": run_id,
                    "best_params": best_params,
                    "location_weights": location_array_W_optim,
                    "dataset": dataset,
                    "target_col": target_col,
                    "pred_ahead": pred_ahead,
                    "mae_test": mae_test,
                    "rmse_test": rmse_test,
                    "mae_val": mae_val,
                    "rmse_val": rmse_val,
                    "dfp_test_start": str(dfp_test.index[0]),
                    "dfp_test_length": len(dfp_test.index),
                    # absolute error divided by |mean of the `original` column|
                    "mae_normalized_val": float(
                        (np.abs(err_val) / np.abs(dfp_val.original.mean())).mean()
                    ),
                    "mae_normalized_test": float(
                        (np.abs(err_test) / np.abs(dfp_test.original.mean())).mean()
                    ),
                }
            )

            # Rewrite the whole log after every study so finished runs survive
            # an interruption of the remaining ones.
            with open(info_file_name, "w") as f:
                json.dump(info, f)
            dfp_val.reset_index().to_feather(
                "./{}/{}-validation.feather".format(result_folder, run_id)
            )
            dfp_test.reset_index().to_feather(
                "./{}/{}-test.feather".format(result_folder, run_id)
            )

In [42]:
# Reload the run log and re-fit the model of its first entry from the stored
# configuration.
df_info = pd.read_json("./sim-res-final/info.json")
run = df_info.iloc[0]

# model_function keyword arguments whose tuned values live in best_params.
tuned_names = [
    "extended_data",
    "impute_missing",
    "do_extract",
    "shift_features",
    "use_early_stopping",
    "lgb_boosting_type",
    "lgb_num_leaves",
    "lgb_learning_rate",
    "lgb_max_depth",
]
model_kwargs = {name: run.best_params[name] for name in tuned_names}

dfp_val, dfp_test, lgb_model = model_function(
    run.dataset,
    run.location_weights,
    run.pred_ahead,
    run.target_col,
    **model_kwargs,
)

# Pair each feature name with its importance, then display the five rows with
# the highest importance (ascending sort, reversed, first five).
df_imp = pd.DataFrame(
    {
        "importance": lgb_model.feature_importances_,
        "feature_name": lgb_model.feature_name_,
    }
)
df_imp.sort_values("importance").iloc[::-1].head(5)

Unnamed: 0,importance,feature_name
0,100,target_depth_to_groundwater_lt2
22,74,_ps
2,51,depth_to_groundwater_pag
12,46,week
4,40,depth_to_groundwater_diec


In [19]:
# Table of the fitted model's features and their importances.
df_imp = pd.DataFrame(
    dict(
        importance=lgb_model.feature_importances_,
        feature_name=lgb_model.feature_name_,
    )
)
# Ascending sort, keep the last five rows and flip them: the five most
# important features, highest first.
df_imp.sort_values("importance").tail(5).iloc[::-1]

Unnamed: 0,importance,feature_name
0,100,target_depth_to_groundwater_lt2
22,74,_ps
2,51,depth_to_groundwater_pag
12,46,week
4,40,depth_to_groundwater_diec


    location_weight                            location_name
7          0.001784                             croce_arcana
20         0.002015  rainfall_tereglio_coreglia_antelminelli
19         0.004533                    rainfall_croce_arcana
8          0.182114           tereglio_coreglia_antelminelli
24         0.217798              temperature_ponte_a_moriano
    location_weight             location_name
6          0.590280                 calavorno
14         0.645115      rainfall_monte_serra
16         0.784542  rainfall_borgo_a_mozzano
11         0.803401       lucca_orto_botanico
12         0.815247        rainfall_gallicano


Unnamed: 0,location_weight,location_name
7,0.001784,croce_arcana
20,0.002015,rainfall_tereglio_coreglia_antelminelli
19,0.004533,rainfall_croce_arcana
8,0.182114,tereglio_coreglia_antelminelli
24,0.217798,temperature_ponte_a_moriano
18,0.221245,rainfall_calavorno
5,0.290177,piaggione
22,0.291734,temperature_orentano
17,0.302676,rainfall_piaggione
21,0.305023,rainfall_fabbriche_di_vallico


In [42]:
# Persist the prediction frames currently held in dfp_val / dfp_test.
# They come from re-fitting `run`, so the files are keyed by that run's own
# id. The bare `run_id` variable is only defined by the search loop and, after
# a top-to-bottom execution, holds the id of the last search run instead.
# NOTE(review): this writes to ./sim-res, not ./sim-res-final where the search
# loop stores its files - confirm the folder is intended.
dfp_val.reset_index().to_feather(
    "./sim-res/{}-validation.feather".format(run.run_id)
)
dfp_test.reset_index().to_feather("./sim-res/{}-test.feather".format(run.run_id))

# info.append({
#     'run_id': run_id,
#     'best_params': best_params,
#     'location_weights': location_array_W_optim,
#     'dataset': dataset,
#     'target_col': target_col,
#     'pred_ahead': pred_ahead
# })
# with open(info_file_name, 'w') as f:
#     json.dump(info, f)