In [86]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [87]:
import pandas as pd
import plotly.express as px
from config.config import ESTACIONS_DIR, OBSERVACIONS_DIR, OBSERVACIONS_FILTRAT_DIR
import plotly.graph_objects as go
from pathlib import Path
import numpy as np

In [88]:
# Load the station catalogue. index_col=5 turns the sixth CSV column into the
# index; the preview below shows that column is named "nom" and holds station
# codes such as "a01", which later cells use as lookup keys.
# NOTE(review): ESTACIONS_DIR is read as a single CSV file despite the "_DIR"
# suffix — confirm against config.config.
estacions = pd.read_csv(ESTACIONS_DIR, index_col=5)
estacions.head()

Unnamed: 0_level_0,Estació,UTM X,UTM Y,ID,amunt_em
nom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
a01,Aforament - Abrera,409650,4596023,1,no
a02,Aforament - Anglès (riera d'Osor),469379,4645530,2,si
a03,Aforament - Balsareny,407058,4634787,3,no
a04,Aforament - Berga (Olvan),407241,4659812,4,no
a05,Aforament - Boadella d'Empordà,488441,4686953,5,no


In [89]:
# Stations whose observations are partially or totally correct: overlay the raw
# series (red) with the filtered series (blue) for a five-station preview.
# glob() yields files in filesystem order, so the listing is sorted first to
# make the five-station preview the same on every run and machine.
for observacions_filtrat_path in sorted(OBSERVACIONS_FILTRAT_DIR.glob("*.csv"))[:5]:
    name_file = observacions_filtrat_path.name
    # .stem drops only the trailing extension; str.replace('.csv', '') would
    # also remove a ".csv" that happened to occur inside the file name.
    nom = observacions_filtrat_path.stem

    # Single .loc[row, column] lookup instead of chained indexing.
    estacio = estacions.loc[nom, 'Estació']
    observacions_filtrat = pd.read_csv(observacions_filtrat_path)
    # The raw file shares its name with the filtered file.
    observacions = pd.read_csv(Path(OBSERVACIONS_DIR, name_file))

    fig = go.Figure()

    # The raw trace is added first, so the filtered trace is drawn on top of it.
    fig.add_trace(
        go.Scatter(
            x=observacions["Date"],
            y=observacions["Flow"],
            name="Observacions incorrectes",
            mode="lines",
            line_color="red",
        )
    )
    fig.add_trace(
        go.Scatter(
            x=observacions_filtrat["Date"],
            y=observacions_filtrat["Flow"],
            name="Observacions correctes",
            mode="lines",
            line_color="blue",
        )
    )
    fig.update_layout(
        title=estacio + '--' + nom,
        xaxis_title="Date",
        yaxis_title="Flow",
    )
    fig.show()

In [90]:
# File names (extension included) found in the filtered and the raw directories.
estacions_filtrades = [csv_path.name for csv_path in OBSERVACIONS_FILTRAT_DIR.glob("*.csv")]
totes_estacions = [csv_path.name for csv_path in OBSERVACIONS_DIR.glob("*.csv")]
# Discarded stations: a raw file exists but no filtered file of the same name.
estacions_descartades = set(totes_estacions).difference(estacions_filtrades)
estacions_descartades

{'a02.csv',
 'a06.csv',
 'a10.csv',
 'a14.csv',
 'a15.csv',
 'a27.csv',
 'a32.csv',
 'a56.csv',
 'a63.csv',
 'a64.csv'}

In [91]:
# For each discarded station, read its raw file from OBSERVACIONS_DIR and plot it.
# The set is iterated in sorted order so the figures appear in the same
# sequence on every run; plain set iteration order for strings is not stable
# across kernel restarts.
for nom_estacio in sorted(estacions_descartades):
    # "a02.csv" -> "a02": the station code used as key in the estacions index.
    codi_estacio = Path(nom_estacio).stem
    observacions = pd.read_csv(Path(OBSERVACIONS_DIR, nom_estacio))
    # Single .loc[row, column] lookup instead of chained indexing.
    estacio = estacions.loc[codi_estacio, 'Estació']
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=observacions["Date"],
            y=observacions["Flow"],
            name="Observacions",
            mode="lines",
        )
    )
    fig.update_layout(
        # Title uses the station code without ".csv", matching the titles of
        # the filtered-station plots.
        title=estacio + '--' + codi_estacio,
        xaxis_title="Date",
        yaxis_title="Flow",
    )
    fig.show()

In [92]:
# Min and max date across all the datasets in OBSERVACIONS_DIR.
# Each CSV is parsed once, loading only the "Date" column, instead of reading
# every file in full twice (once for the minimum, once for the maximum).
# Dates are compared as strings, which orders correctly for ISO "YYYY-MM-DD"
# values such as the ones printed below.
date_ranges = [
    (dates.min(), dates.max())
    for dates in (
        pd.read_csv(path, usecols=["Date"])["Date"]
        for path in OBSERVACIONS_DIR.glob("*.csv")
    )
]
min_date = min(first for first, _ in date_ranges)
max_date = max(last for _, last in date_ranges)

print(f"Min date: {min_date}")
print(f"Max date: {max_date}")

Min date: 2001-01-01
Max date: 2021-02-16


In [93]:
# Total number of missing "Flow" readings over every file in OBSERVACIONS_DIR.
null_values = sum(
    pd.read_csv(csv_path)["Flow"].isna().sum()
    for csv_path in OBSERVACIONS_DIR.glob("*.csv")
)
null_values

167746

In [97]:
# Samples removed by filtering: values null in the filtered file but not in the raw one.
def number_of_bad_samples(file):
    """Return how many more null "Flow" values the filtered file has than its raw twin.

    Args:
        file: Path of a filtered observations CSV. The raw CSV with the same
            file name is read from OBSERVACIONS_DIR.

    Returns:
        The difference (filtered null count - raw null count). If that
        difference is negative, an "Error" line naming the file is printed
        and 0 is returned instead.
    """
    raw_path = Path(OBSERVACIONS_DIR, file.name)

    # The result is a difference of null counts, not a row-by-row comparison.
    nulls_in_filtered = pd.read_csv(file)["Flow"].isna().sum()
    nulls_in_raw = pd.read_csv(raw_path)["Flow"].isna().sum()
    extra_nulls = nulls_in_filtered - nulls_in_raw

    if extra_nulls >= 0:
        return extra_nulls

    # The filtered file has fewer nulls than the raw one: report it and count nothing.
    print(f"Error: {file.name}")
    return 0


# Total of bad samples over every filtered file.
sum(map(number_of_bad_samples, OBSERVACIONS_FILTRAT_DIR.glob("*.csv")))


9068

In [104]:
def count_non_null_values(file):
    """Return the number of non-null "Flow" values in the CSV at ``file``."""
    flow = pd.read_csv(file)["Flow"]
    return flow.count()

# Total non-null "Flow" readings held in the raw files of the discarded stations.
sum(
    count_non_null_values(Path(OBSERVACIONS_DIR, fitxer))
    for fitxer in estacions_descartades
)


42555

In [95]:
# Reading list / candidate approaches for time-series anomaly detection:
# - https://towardsdatascience.com/anomaly-detection-time-series-4c661f6f165f
# - https://facebook.github.io/prophet/docs/seasonality,_holiday_effects,_and_regressors.html
# - https://github.com/uber/orbit
# - https://towardsdatascience.com/tods-detecting-outliers-from-time-series-data-2d4bd2e91381
# - SARIMA (seasonal ARIMA)


def errors_from_file(file, **kwargs):
    """Build a Prophet model from one observations CSV and return its error metrics.

    Args:
        file: Path of a CSV with "Date" and "Flow" columns; they are renamed
            to "ds" and "y" before the frame is handed to the model.
        **kwargs: Forwarded unchanged to ``Prophet.Prophet``.

    Returns:
        Tuple ``(nash, pbias)`` holding the results of the model's ``nash()``
        and ``pbias()`` methods.
        NOTE(review): presumably Nash-Sutcliffe efficiency and percent bias —
        confirm against the ``Prophet`` module, which is not imported in the
        visible cells.
    """
    column_map = {'Date': 'ds', 'Flow': 'y'}
    series_df = pd.read_csv(file).rename(columns=column_map)
    model = Prophet.Prophet(series_df, **kwargs)

    # Tuple elements are evaluated left to right: nash() runs before pbias().
    return model.nash(), model.pbias()


def errors_from_config(**kwargs):
    """Average the per-file error metrics over every filtered observations CSV.

    ``errors_from_file`` is run in parallel on each ``*.csv`` file in
    OBSERVACIONS_FILTRAT_DIR, and the two metrics it returns are averaged.

    Args:
        **kwargs: Forwarded unchanged to ``errors_from_file`` for every file.

    Returns:
        Tuple ``(mean_nash, mean_pbias)``: the arithmetic means of the first
        and second value returned by ``errors_from_file``.

    Raises:
        ValueError: If OBSERVACIONS_FILTRAT_DIR contains no CSV files.

    NOTE(review): ``Parallel`` and ``delayed`` (joblib-style API) must be in
    scope; they are not imported in the visible import cell.
    """
    # Sorted so the accumulation order, and therefore the floating-point
    # rounding of the sums, is the same on every run.
    files = sorted(OBSERVACIONS_FILTRAT_DIR.glob("*.csv"))
    if not files:
        # Without this guard the tuple unpacking below fails with an opaque
        # "not enough values to unpack" ValueError.
        raise ValueError(f"No CSV files found in {OBSERVACIONS_FILTRAT_DIR}")

    # n_jobs=-1 asks for all available CPUs, so the multiprocessing module
    # does not need to be in scope.
    results = Parallel(n_jobs=-1)(
        delayed(errors_from_file)(file, **kwargs) for file in files
    )
    nash, pbias = zip(*results)

    return sum(nash) / len(nash), sum(pbias) / len(pbias)

def hyperparameter_opt(fitted_model, **kwargs):
    """Evaluate one hyperparameter configuration and return its mean errors.

    Args:
        fitted_model: Accepted so existing callers keep working, but not used.
            NOTE(review): the original passed it positionally to
            ``errors_from_config``, which takes keyword arguments only —
            confirm whether it should be forwarded under a specific keyword.
        **kwargs: Hyperparameters forwarded unchanged to ``errors_from_config``.

    Returns:
        Tuple ``(nash, pbias)`` exactly as returned by ``errors_from_config``.
    """
    # errors_from_config(**kwargs) accepts no positional arguments, so passing
    # fitted_model positionally raised TypeError on every call.
    return errors_from_config(**kwargs)
