In [2]:
%load_ext lab_black
import pandas as pd
import numpy as np
import os
import glob

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [22]:
# Standardize every raw Kaggle CSV into a pickled DataFrame:
#   * index by the parsed "Date" column (day-first) and sort chronologically,
#   * add calendar features (year, month, ISO week, day, day of year),
#   * prefix the prediction-target columns with "target_",
#   * lower-case all column names.

RAW_CSV_GLOB = "./data/kaggle-original/*.csv"
PREPROCESSED_PATH = "./data/kaggle-preprocessed/{}.pkl"

# Columns that are prediction targets. Names are stored without surrounding
# whitespace and matched against the stripped CSV header, so a header with a
# stray trailing space (the original list spelled "..._Pozzo_9 " that way) and
# one without it are both recognized.
TARGET_COLUMNS = frozenset(
    [
        "Depth_to_Groundwater_SAL",
        "Depth_to_Groundwater_COS",
        "Depth_to_Groundwater_LT2",
        "Flow_Rate_Bugnano",
        "Flow_Rate_Arbure",
        "Flow_Rate_Ermicciolo",
        "Flow_Rate_Galleria_Alta",
        "Depth_to_Groundwater_P24",
        "Depth_to_Groundwater_P25",
        "Depth_to_Groundwater_Pozzo_1",
        "Depth_to_Groundwater_Pozzo_2",
        "Depth_to_Groundwater_Pozzo_3",
        "Depth_to_Groundwater_Pozzo_4",
        "Depth_to_Groundwater_Pozzo_5",
        "Depth_to_Groundwater_Pozzo_6",
        "Depth_to_Groundwater_Pozzo_7",
        "Depth_to_Groundwater_Pozzo_8",
        "Depth_to_Groundwater_Pozzo_9",
        "Depth_to_Groundwater_Podere_Casetta",
        "Hydrometry_Nave_di_Rosano",
        "Lake_Level",
        "Flow_Rate",
    ]
)


def dataset_name_from_path(path):
    """Return the file name of `path` up to its first dot, casing preserved.

    Uses os.path.basename instead of splitting on "/" so that paths returned by
    glob on Windows (backslash separated) are handled as well.
    """
    return os.path.basename(path).split(".")[0]


def preprocess_frame(df):
    """Return a standardized copy of a raw frame indexed by a DatetimeIndex.

    Adds the calendar feature columns, sorts by index, prefixes every column
    whose stripped name is in TARGET_COLUMNS with "target_" and lower-cases all
    column names. The input frame is not modified.
    """
    df = df.copy()
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["week"] = df.index.isocalendar().week
    df["day"] = df.index.day
    df["day_of_year"] = df.index.dayofyear

    df = df.sort_index()

    # Build the whole rename mapping once instead of renaming in place per
    # column. The original header text is kept after the prefix (only the
    # membership test ignores surrounding whitespace).
    renames = {
        column: "target_{}".format(column)
        for column in df.columns
        if column.strip() in TARGET_COLUMNS
    }
    df = df.rename(columns=renames)
    df.columns = df.columns.str.lower()
    return df


for file in glob.glob(RAW_CSV_GLOB):
    df = pd.read_csv(file, parse_dates=["Date"], dayfirst=True, index_col=["Date"])
    df = preprocess_frame(df)
    target_name = dataset_name_from_path(file)
    df.to_pickle(PREPROCESSED_PATH.format(target_name.lower()))

In [39]:
def gather_df(dataset_name):
    """Load a preprocessed dataset and enrich it with weather and lag features.

    Steps:
      1. Read "./data/kaggle-preprocessed/<dataset_name>.pkl".
      2. For every column containing "rain" or "temperature", derive a location
         name by removing the "rainfall_" / "temperature_" prefix and, when
         "./data/nasa-power/<location>.pkl" exists, load it. Its columns are
         renamed to "<location>_<lowercased column>".
      3. Left-join the combined weather frames onto the dataset by index.
      4. For every column containing "target_", add a 10-row rolling mean and
         std, the value 365 rows earlier, and the 10-row rolling mean 365 rows
         earlier.

    Parameters
    ----------
    dataset_name : str
        Base name (without extension) of the preprocessed pickle.

    Returns
    -------
    pandas.DataFrame
        The dataset with the extra weather and target-derived columns.

    Raises
    ------
    FileNotFoundError
        If the preprocessed pickle does not exist.
    """
    fname = "./data/kaggle-preprocessed/{}.pkl".format(dataset_name)
    if not os.path.exists(fname):
        raise FileNotFoundError("preprocessed file doesn't exist: {}".format(fname))
    # NOTE(review): read_pickle can execute arbitrary code; only load pickles
    # produced by this project.
    df = pd.read_pickle(fname)

    # A location usually appears under both a rainfall_ and a temperature_
    # column; keep one frame per location so each pickle is read only once.
    related_by_location = {}
    for col in df.columns:
        if "rain" in col:
            location = col.replace("rainfall_", "")
        elif "temperature" in col:
            location = col.replace("temperature_", "")
        else:
            continue
        if location in related_by_location:
            continue
        filename = "./data/nasa-power/{}.pkl".format(location)
        if not os.path.exists(filename):
            print("not found: {}".format(col))
            continue
        df_related = pd.read_pickle(filename)
        df_related.columns = [
            "{}_{}".format(location, c.lower()) for c in df_related.columns
        ]
        related_by_location[location] = df_related

    # pd.concat raises ValueError on an empty list, so only merge when at
    # least one weather file was found.
    if related_by_location:
        df_related = pd.concat(list(related_by_location.values()))
        # Row-wise concat yields one row per (frame, date); collapse them to
        # one row per index value, taking the max of the non-null entries.
        df_related = df_related.groupby(df_related.index).max()
        df = pd.merge(df, df_related, how="left", left_index=True, right_index=True)

    # Snapshot the target columns first: the derived columns added below also
    # contain "target_" and must not be processed themselves.
    target_cols = [col for col in df.columns if "target_" in col]
    for col in target_cols:
        rolling_mean = df[col].rolling(10).mean()
        df["{}_10_mean".format(col)] = rolling_mean
        df["{}_10_std".format(col)] = df[col].rolling(10).std()
        # shift(365) moves by rows, not by calendar time.
        # NOTE(review): "1y ago" presumably relies on one row per day with no
        # gaps -- confirm against the data.
        df["{}_1y_ago".format(col)] = df[col].shift(365)
        df["{}_1y_ago_10_mean".format(col)] = rolling_mean.shift(365)

    return df

In [40]:
# Build the enriched frame for the "river_arno" dataset; as the last expression
# of the cell, the returned DataFrame is displayed as the cell output.
gather_df("river_arno")

Unnamed: 0_level_0,rainfall_le_croci,rainfall_cavallina,rainfall_s_agata,rainfall_mangona,rainfall_s_piero,rainfall_vernio,rainfall_stia,rainfall_consuma,rainfall_incisa,rainfall_montevarchi,...,firenze_ws10m,firenze_ws10m_max,firenze_ws10m_min,firenze_ws50m,firenze_ws50m_max,firenze_ws50m_min,target_hydrometry_nave_di_rosano_10_mean,target_hydrometry_nave_di_rosano_10_std,target_hydrometry_nave_di_rosano_1y_ago,target_hydrometry_nave_di_rosano_1y_ago_10_mean
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1998-01-01,,,,,,,,,,,...,,,,,,,,,,
1998-01-02,,,,,,,,,,,...,,,,,,,,,,
1998-01-03,,,,,,,,,,,...,,,,,,,,,,
1998-01-04,,,,,,,,,,,...,,,,,,,,,,
1998-01-05,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-26,0.0,0.0,0.0,0.0,0.0,,,,,,...,1.87,3.02,0.42,2.70,5.44,0.60,1.444,0.127906,1.19,1.193
2020-06-27,0.0,0.0,0.0,0.0,0.0,,,,,,...,2.09,2.99,0.59,3.00,5.19,1.01,1.396,0.114717,1.10,1.173
2020-06-28,0.0,0.0,0.0,0.0,0.0,,,,,,...,2.35,4.67,0.82,3.16,5.61,1.30,1.362,0.079274,1.11,1.168
2020-06-29,0.0,0.0,0.0,0.0,0.0,,,,,,...,4.31,7.52,2.00,6.02,9.38,3.41,1.330,0.077460,1.07,1.153
