In [6]:
!pip install nb_black lightgbm pyarrow umap-learn BorutaShap

Collecting BorutaShap
  Downloading BorutaShap-1.0.15-py3-none-any.whl (13 kB)
Installing collected packages: BorutaShap
Successfully installed BorutaShap-1.0.15


In [1]:
%load_ext lab_black
%reload_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import glob
import tqdm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import STL
import umap
import lightgbm as lgb

from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.impute import KNNImputer, SimpleImputer

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, f_regression, SelectFromModel

In [2]:
def _load_related_frames(columns):
    """Load one ./data/nasa-power frame per location named by a rain/temperature column.

    A location is derived from a column name by stripping the ``rainfall_`` or
    ``temperature_`` prefix. Each location file is read at most once, even when
    several columns refer to it. Every loaded frame is indexed by its
    ``index_col`` column and its remaining columns are renamed to
    ``<location>_<lowercased original name>``.

    Returns:
        list of DataFrames, one per location whose file exists (possibly empty).
    """
    frames = {}
    for col in columns:
        if "rain" in col:
            location = col.replace("rainfall_", "")
        elif "temperature" in col:
            location = col.replace("temperature_", "")
        else:
            continue
        if location in frames:
            # Already loaded for another column of the same location; the
            # later groupby(...).max() would collapse the duplicate anyway.
            continue
        filename = "./data/nasa-power/{}.feather".format(location)
        if not os.path.exists(filename):
            print("not found: {}".format(col))
            continue
        df_related = pd.read_feather(filename)
        df_related = df_related.set_index(df_related.index_col)
        df_related = df_related.drop("index_col", axis=1)
        df_related.columns = [
            "{}_{}".format(location, c.lower()) for c in df_related.columns
        ]
        frames[location] = df_related
    return list(frames.values())


def gather_df(
    dataset_name,
    detrend_features=True,
    shift_features=True,
    shift_features_longterm=True,
    load_related_data=True,
):
    """Load a preprocessed dataset and derive additional feature columns.

    Args:
        dataset_name: Stem of the file under ``./data/kaggle-preprocessed/``
            (``<dataset_name>.feather``).
        detrend_features: If True, add ``<col>_trend``, ``<col>_resid`` and
            ``<col>_seasonal`` columns from an additive seasonal decomposition
            with period 52 for every column not listed in ``ignore_cols``.
        shift_features: If True, add ``<col>_shift_<i>`` columns: plain lags
            for i in 1..4, and a 5-row rolling mean lagged by i rows for
            i in 5, 10, 15.
        shift_features_longterm: If True, add ``<col>_shift_longterm_<i>``
            columns for i in 1..11: a 30-row rolling mean lagged by ``i * 30``
            rows.
        load_related_data: If True, left-join the matching
            ``./data/nasa-power/<location>.feather`` frames on the index and
            drop every column whose name contains ``_index``.

    Returns:
        DataFrame indexed by the file's ``index_col`` column, with all columns
        cast to float64 and the selected target columns prefixed ``target_``.

    Raises:
        FileNotFoundError: If the preprocessed file does not exist.
    """
    fname = "./data/kaggle-preprocessed/{}.feather".format(dataset_name)
    if not os.path.exists(fname):
        raise FileNotFoundError("preprocessed file doesnt exist: {}".format(fname))
    df = pd.read_feather(fname)
    # set_index with a Series keeps the source column, hence the explicit drop.
    df = df.set_index(df.index_col)
    df = df.drop("index_col", axis=1).drop("Date", axis=1)

    if load_related_data:
        related_datas = _load_related_frames(df.columns)
        # pd.concat raises on an empty list, so only merge when something loaded.
        if related_datas:
            df_related = pd.concat(related_datas)
            # Collapse rows sharing an index value into one row per index value.
            df_related = df_related.groupby(df_related.index).max()
            df = pd.merge(
                df, df_related, how="left", left_index=True, right_index=True
            )
        df = df.drop([col for col in df.columns if "_index" in col], axis=1)

    # np.float was removed from NumPy; np.float64 is the dtype it aliased.
    df = df.astype(np.float64)

    # Calendar columns that are not decomposed. The original list contained
    # " day_of_year" with a leading space, which could never match a column.
    ignore_cols = ["year", "month", "week", "day", "day_of_year"]

    df = df.rename(
        columns={
            "flow_rate_lupa": "target_flow_rate_lupa",
            "depth_to_groundwater_cos": "target_depth_to_groundwater_cos",
            "depth_to_groundwater_pozzo_9": "target_depth_to_groundwater_pozzo_9",
            "flow_rate_madonna_di_canneto": "target_flow_rate_madonna_di_canneto",
        }
    )

    if dataset_name == "aquifer_luco":
        # For this dataset these three columns lose their target_ prefix.
        df = df.rename(
            columns={
                "target_depth_to_groundwater_pozzo_1": "depth_to_groundwater_pozzo_1",
                "target_depth_to_groundwater_pozzo_3": "depth_to_groundwater_pozzo_3",
                "target_depth_to_groundwater_pozzo_4": "depth_to_groundwater_pozzo_4",
            }
        )

    if detrend_features:
        # df.columns is captured once, so columns added below are not revisited.
        for col in tqdm.tqdm(df.columns):
            if df[col].dtype != np.float64 or col in ignore_cols:
                continue
            decomp = seasonal_decompose(
                df[col].ffill().fillna(0),
                period=52,  # `freq` is the deprecated spelling of this argument
                model="additive",
                extrapolate_trend="freq",
            )
            df[f"{col}_trend"] = decomp.trend
            df[f"{col}_resid"] = decomp.resid
            df[f"{col}_seasonal"] = decomp.seasonal

    if shift_features:
        for col in df.columns:
            if "shift" in col:
                continue
            for i in range(1, 5):
                df["{}_shift_{}".format(col, i)] = df[col].shift(i)
            for i in range(5, 20, 5):
                df["{}_shift_{}".format(col, i)] = df[col].rolling(5).mean().shift(i)

    if shift_features_longterm:
        for col in tqdm.tqdm(df.columns):
            # Parenthesised to match Python's precedence for `A and B or C`.
            if ("shift" not in col and df[col].dtype == np.float64) or (
                col in ignore_cols
            ):
                for i in range(1, 12):
                    df["{}_shift_longterm_{}".format(col, i)] = (
                        df[col].rolling(30).mean().shift(i * 30)
                    )

    return df


def get_target_df(target):
    """Decompose `target` with STL and return the parts next to the raw series.

    The series passed to STL is forward-filled, with any remaining gaps set to
    0; the ``target`` column of the result is the unmodified input.

    Returns:
        DataFrame with columns ``season``, ``trend``, ``resid`` and ``target``.
    """
    filled = target.ffill().fillna(0)
    fit = STL(filled, seasonal=13).fit()
    parts = {
        "season": fit.seasonal,
        "trend": fit.trend,
        "resid": fit.resid,
        "target": target,
    }
    return pd.DataFrame(parts)

In [None]:
# Walk-forward evaluation of LightGBM forecasts for every target column.
# For each dataset / target / forecast shift, the frame is split at growing
# cut points; a model is fit on the rows before the cut and scored on the
# first EVAL_HORIZON rows after it, once on all features ("all") and once on
# the imputed + model-selected features ("pipeline").
MIN_TRAIN_ROWS = 500  # first split position and minimum number of labelled rows
SPLIT_STEP = 100  # distance between consecutive split positions
EVAL_HORIZON = 100  # number of test rows scored per split
FORECAST_SHIFTS = [30, 60]  # how many rows ahead the target is taken from
RESULTS_PATH = "./simres-alldata-detrend.json"

results = []
data_sets = [
    "aquifer_auser",
    "water_spring_amiata",
    "aquifer_petrignano",
    "aquifer_doganella",
    "aquifer_luco",
    "river_arno",
    "lake_bilancino",
    "water_spring_lupa",
    "water_spring_madonna_di_canneto",
]
for dataset in data_sets:
    df = gather_df(
        dataset,
        detrend_features=True,
        shift_features=False,
        shift_features_longterm=False,
        load_related_data=True,
    )

    for target_col in [c for c in df.columns if "target" in c]:
        for shift in FORECAST_SHIFTS:
            # Label for row t is the target value `shift` rows later. This is
            # exactly what get_target_df(...).target returned, without paying
            # for an STL fit whose components were discarded.
            y = df[target_col].shift(-shift)

            pipeline = Pipeline(
                steps=[
                    ("impute", KNNImputer()),
                    ("best", SelectFromModel(lgb.LGBMRegressor())),
                ],
            )

            for split in range(MIN_TRAIN_ROWS, len(df.index), SPLIT_STEP):
                X_train = df.iloc[:split]
                X_test = df.iloc[split:]
                y_train = y.iloc[:split]
                y_test = y.iloc[split:]

                # Rows whose label is missing cannot be used for fitting.
                x_filtered = ~pd.isna(y_train)
                if x_filtered.sum() < MIN_TRAIN_ROWS:
                    continue
                y_train = y_train[x_filtered]

                X_train_piped = pipeline.fit_transform(X_train[x_filtered], y_train)
                X_test_piped = pipeline.transform(X_test)

                for feature_type, train_features, test_features in (
                    ("all", X_train[x_filtered], X_test),
                    ("pipeline", X_train_piped, X_test_piped),
                ):
                    feat_model = lgb.LGBMRegressor()
                    feat_model.fit(train_features, y_train)

                    p = feat_model.predict(test_features)
                    dfp = pd.DataFrame({"p": p, "y": y_test}).iloc[:EVAL_HORIZON]
                    # Series.mean skips NaN, so rows without a label are ignored.
                    errors = dfp.p - dfp.y
                    mae = errors.abs().mean()
                    # RMSE squares each error before averaging; the previous
                    # expression squared the mean error, yielding |mean error|.
                    rmse = np.sqrt((errors ** 2).mean())

                    results.append(
                        {
                            "dataset": dataset,
                            "split": split,
                            "mae": mae,
                            "rmse": rmse,
                            "type": feature_type,
                            "target_col": target_col,
                            "shift": shift,
                        }
                    )

                # Checkpoint after every split so partial results are kept
                # if the run is interrupted.
                pd.DataFrame(results).to_json(RESULTS_PATH)

  decomp = seasonal_decompose(
100%|██████████| 223/223 [00:00<00:00, 289.53it/s]
