# Create artifact from time series dataframe
Reads a `.tsf`, `.csv` or pickle file containing a time series, converts it into a pandas DataFrame and uploads it to Weights & Biases (W&B) as an artifact.

## Set-up
Initial notebook setup and specific debugging and pre-configured cases selection
### VsCode update patch
Initial notebook setup when using VSCode

In [None]:
# sys.argv is inspected to detect how the kernel was launched.
import sys
# Apply the patch only when the '--vscode' flag is present in the arguments.
if '--vscode' in sys.argv:
    print("Executing inside vscode")
    import nbs_pipeline.utils.vscode  as vs
    # Swap DisplayHandle.update for the project's patched implementation.
    vs.DisplayHandle.update = vs.update_patch

### Debugging variables
- `print_flag`. If `True`, debugging messages are printed by the functions that support it (e.g. `get_enc_embeddings`).
- `reset_kernel`. If `True` it resets the kernel by the end of the execution. Use only in case that memory management is needed.

In [None]:
# Enables the debug prints/displays guarded by `if print_flag` in later cells.
print_flag = True
# When True, the last cell of the notebook terminates the kernel process.
reset_kernel=False

## Preconfigured cases selection
- `pre_configured_case`. If `True`, a preconfigured case will be selected, forcing the artifact to get the expected configuration based on the information in `config\*.yml` and `utils\config.py`.
- `case_id`. If `pre_configured_case` is `True`, it selects the configuration of the preconfigured sample with this id. The available preconfigured samples are shown in the next cell.
- `frequency_factor`. If `pre_configured_case` is `True`, the frequency will be resampled to `config.freq*frequency_factor`.
- `frequency_factor_change_alias`. If `pre_configured_case` is `True` and `frequency_factor != 1`, the dataset alias will be modified by adding the new frequency as a suffix.

In [None]:
# Project configuration helpers, aliased as cfg_ for the rest of the notebook.
import utils.config as cfg_
# Show the preconfigured cases that `case_id` (next cell) can refer to.
cfg_.show_available_configs()

In [None]:
# When True, the loaded config is overridden with the preconfigured case below.
pre_configured_case = True
# Identifier of the preconfigured case forwarded to force_artifact_config_sd2a.
case_id = 8
# Forwarded to force_artifact_config_sd2a; 1 presumably keeps the configured frequency.
frequency_factor = 1
# Forwarded to force_artifact_config_sd2a together with frequency_factor.
frequency_factor_change_alias = True

## Main code


In [None]:
import pandas as pd
import numpy as np
from fastcore.all import *
import wandb
from dvats.load import TSArtifact, infer_or_inject_freq
import pickle
import matplotlib
import matplotlib.pyplot as plt
from tsai.data.external import convert_tsf_to_dataframe
from tsai.utils import stack_pad

### Path and Artifact configurations
This notebook gets configuration from `config\base.yaml` and `config\01-dataset_artifact.yaml`

In [None]:
# User home directory.
# NOTE(review): `Path` is not imported explicitly; presumably it comes from
# `from fastcore.all import *` — confirm.
base_path = Path.home()

In [None]:
# Load the artifact configuration (silently), then optionally override it with
# the preconfigured case selected above.
config = cfg_.get_artifact_config_sd2a(print_flag = False)
if pre_configured_case: 
    # `config` is passed in and the return value is ignored, so the helper is
    # presumably expected to update it in place — TODO confirm.
    cfg_.force_artifact_config_sd2a(
        config = config, 
        id = case_id, 
        print_flag = print_flag, 
        both = print_flag, 
        frequency_factor = frequency_factor, 
        frequency_factor_change_alias = frequency_factor_change_alias
    )

### Data Extraction

The data is assumed to come as a dataframe, either as a pickle file or
as a csv file. It can also come as a `.tsf` file.

#### Check file content (if wanted)

In [None]:
# Optional sanity check: peek at the raw file and try to parse it as a .tsf.
if print_flag:
    # Expand a leading '~' so the path can be opened directly.
    fpath=os.path.expanduser(config.data_fpath)
    print(fpath)
    try: 
        with open(fpath, 'r') as file:
            # Echo the first 13 lines of the file.
            for _ in range(13):
                line = file.readline()
                print(line, end='')
        data, _, _, _, _ = convert_tsf_to_dataframe(fpath)
        print("Timestamp", data.start_timestamp)
    except Exception as e:
        # Best-effort check: any failure is reported and the notebook continues.
        print("Error while converting file. Maybe not a tsf: ", e)

#### Extract data

In [None]:
# Load the raw data into `df`, choosing the loader from the file extension.
import os  # explicit import: do not rely on a star import to provide `os`

# Resolve a leading '~' once so every loader receives the same concrete path.
data_fpath = os.path.expanduser(str(config.data_fpath))
# Extension without the leading dot, lower-cased so 'CSV'/'Tsf' are accepted;
# it is '' when the file name has no extension.
ext = os.path.splitext(data_fpath)[1].lstrip('.').lower()

if ext == 'pickle':
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(data_fpath)

elif ext in ['csv','txt']:
    # csv_config carries the keyword arguments for pandas; tolerate it being unset.
    df = pd.read_csv(data_fpath, **(config.csv_config or {}))

elif ext == 'tsf':
    data, _, _, _, _ = convert_tsf_to_dataframe(data_fpath)
    # The start date comes from the file itself and overrides the configured one.
    config.update({'start_date': data.start_timestamp[0]}, allow_val_change=True)
    date_format = config.date_format
    # Pad the series to a common length and transpose the result before
    # building the dataframe.
    df = pd.DataFrame(stack_pad(data.series_value).T)

else:
    # ValueError is still an Exception, so existing handlers keep working.
    raise ValueError(
        f'The data file path has an unsupported extension: {ext!r} ({config.data_fpath})'
    )


In [None]:
# Debug summary of the freshly loaded dataframe: shape and first rows.
if print_flag:
    print(f'File loaded successfully')
    print(df.shape)
    display(df.head())

In [None]:
# Bare expression: the notebook renders the dataframe as the cell output.
df

#### Set the time column (if any) as index

In [None]:
# Build a DatetimeIndex from the configured time column(s), install it as the
# dataframe index and drop the source column(s).
if config.time_col is not None:
    if print_flag: print("time_col: "+str(config.time_col))

    if isinstance(config.time_col, int):
        if print_flag: print("Op 1: time_col int")
        # A single positional column holds the timestamps.
        datetime = df.iloc[:, config.time_col]

    elif isinstance(config.time_col, list):
        if print_flag: print("Op 2: time_col list")
        # Several positional columns: join their string values with '-' per row.
        datetime = df.iloc[:, config.time_col].apply(lambda x: x.astype(str).str.cat(sep='-'), axis=1)

    else:
        # Without this guard `datetime` would be unbound (NameError) or, worse,
        # silently reused from a previous execution of this cell.
        raise TypeError(
            "config.time_col must be an int or a list of ints, got "
            + type(config.time_col).__name__
        )

    index = pd.DatetimeIndex(datetime)

    if config.date_offset:
        index += config.date_offset

    # drop=False has no effect when an Index object is passed; the source
    # column(s) are removed explicitly below.
    df = df.set_index(index, drop=False)

    # Delete the timestamp column(s) now that they live in the index.
    col_name = df.columns[config.time_col]

    if print_flag: print("... drop Timestamp col " + str(col_name))

    df = df.drop(col_name, axis=1)

if print_flag: display(df.head())

#### Set dataframe frequency

In [None]:
# Give the dataframe index a frequency using the project helper; the configured
# frequency, start date and date format are forwarded to it.
df = infer_or_inject_freq(
    df, 
    injected_freq=config.freq, 
    start_date=config.start_date, 
    format=config.date_format
)
if print_flag: print(df.index.freq)

#### Select only the needed variables

In [None]:
# Subset of variables: keep only the configured columns (selected by position).
# A falsy data_cols (None or an empty list) keeps every column.
if config.data_cols:
    if print_flag: print("data_cols: ", config.data_cols)
    df = df.iloc[:, config.data_cols]

if print_flag: print(f'Num. variables: {len(df.columns)}')

#### Ensure data integrity

In [None]:
# Duplicated rows
if print_flag: print("df shape before dropping duplicates", df.shape)
# The previous `df.drop_duplicates()` call discarded its result, so nothing was
# ever dropped. Assigning it back as-is would be wrong for a time series: it
# compares values only and would delete legitimate repeated observations.
# A row is therefore treated as a duplicate only when BOTH its timestamp and
# all of its values repeat an earlier row.
if df.index.duplicated().any():
    keyed = df.copy()
    # Make the index part of the comparison by exposing it as a regular column.
    keyed.insert(0, "__row_index__", df.index.to_numpy())
    df = df.loc[~keyed.duplicated().to_numpy()]
if print_flag: print("df shape after dropping duplicates", df.shape)
# Any index value still repeated now carries conflicting values: refuse to continue.
if df.index.duplicated().any():
    raise ValueError("Duplicated index names")

In [None]:
# Replace the dataset's missing-value marker by np.nan (in place).
# NOTE(review): the truthiness test skips the replacement when the marker is a
# falsy value such as 0 — confirm that this is intended.
if config.missing_values_constant:
    df.replace(config.missing_values_constant, np.nan, inplace=True)

#### Show time series plot

In [None]:
# Local switch for the overview plot of all variables.
show_time_serie_flag = True
if show_time_serie_flag:
    # Show time series plot
    fig, ax = plt.subplots(1, figsize=(15,5), )
    cmap = matplotlib.colormaps.get_cmap('viridis')
    #df.plot(color=cmap(0.05), ax=ax) # or use colormap=cmap
    # One line per column, coloured along the viridis colormap.
    df.plot(colormap=cmap, ax=ax) # or use colormap=cmap
    # rect = Rectangle((5000, -4.2), 3000, 8.4, facecolor='lightgrey', alpha=0.5)
    # ax.add_patch(rect)
    plt.tight_layout()
    plt.legend()
    # The result of plt.show() is handed to display().
    display(plt.show())

### Add Lag features

In [None]:
# Show the first 13 rows before the derived features are added.
df.head(13)

In [None]:
from tsai.data.core import TSTensor
from tsai.data.preprocessing  import TSRollingMean
import torch 

In [None]:
# Use this when only N plain backward steps (lags) are wanted as extra features.
def add_lagged_features(df, total_extra_features = 12, save_index = True):
    """Build a lag table from the FIRST column of `df`.

    The result has columns 't-N', ..., 't-1', 't', where 't-k' is the first
    column shifted down by k rows (the first k entries are NaN) and 't' is
    the unshifted first column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; only its first column is used.
    total_extra_features : int
        Number of lagged columns (N) to create.
    save_index : bool
        If True the result reuses `df.index`; otherwise it gets a default index.

    Returns
    -------
    pandas.DataFrame
        The lag table described above.
    """
    # Reframe as supervised learning: one row per time step with its history.
    lagged = pd.DataFrame(index=df.index) if save_index else pd.DataFrame()
    # Oldest lag first so that the columns read left to right in time order.
    for lag in reversed(range(1, total_extra_features + 1)):
        lagged['t-' + str(lag)] = df.shift(lag).values[:, 0]
    lagged['t'] = df.values[:, 0]
    return lagged

In [None]:
def dataFrame2TSTensor(df, print_flag = False):
    """Convert a dataframe into a single-sample TSTensor.

    Each column becomes one variable (row) of a 2-D array, which is then given
    a leading axis of size 1, so the tensor is built from an array of shape
    (1, n_columns_used, len(df)).

    Parameters
    ----------
    df : pandas.DataFrame
        Data to convert. A column whose label equals a truthy `df.index.name`
        is skipped, since it merely repeats the index.
    print_flag : bool
        If True, print the intermediate and final shapes.

    Returns
    -------
    TSTensor
        Tensor wrapping the (1, n_columns_used, len(df)) array.
    """
    index_name = df.index.name
    # Filter instead of list.remove(): the index name usually is NOT a column
    # (e.g. after the time column was moved into the index and dropped), and
    # remove() would raise ValueError in that case.
    cols = [
        col for col in df.columns.tolist()
        if not (index_name and col == index_name)
    ]
    arr = np.vstack([df[col].values for col in cols])
    if print_flag: 
        print(arr.shape)
    # Add the leading "sample" axis before wrapping; the array is wrapped once.
    df_tensor = TSTensor(np.expand_dims(arr, axis=0))
    if print_flag:
        print(df_tensor.shape)
        print("(1, nvars, length) = (1,", len(cols), ",", df.shape[0],")")
    return df_tensor

### TODO: Check that the resulting shape is indeed (1, num_vars, df.shape[0])

In [None]:
# Rolling means computed with several window sizes.
def add_rolling_means(
    df, 
    config,
    window_init = 1, #First window mean size
    window_end = 2, #Exclusive upper bound for the window size
    window_step = 1, #Windows are window_init + k*step while below window_end
    replace = False, #True to return only the new columns, False to append them to a copy of df
    print_flag = False,
    save_index = True
):
    """Add rolling-mean columns for the selected variables of `df`.

    For every window size in `range(window_init, window_end, window_step)`
    (so `window_end` itself is excluded) a `TSRollingMean` transform is applied
    to the tensor version of `df`, and one column named
    'RollingMean - var <column> - window - <size>' is produced per selected
    variable.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    config : object
        Only `config.data_cols` is read: the positional indices of the
        variables to roll. A falsy value (None or an empty list) selects
        every column of `df`.
    window_init, window_end, window_step : int
        Range of window sizes, with `window_end` exclusive.
    replace : bool
        If True the result holds only the rolling-mean columns; otherwise they
        are appended to a copy of `df`.
    print_flag : bool
        If True, print progress information.
    save_index : bool
        Only used when `replace` is True: reuse `df.index` for the result.

    Returns
    -------
    pandas.DataFrame
        The dataframe with the rolling-mean columns. If the window range is
        empty, the starting frame is returned without new columns.
    """
    # NOTE(review): these are positional indices into `df`; if `df` was already
    # subset with config.data_cols they may no longer match — confirm with callers.
    sel_vars = config.data_cols if config.data_cols else list(range(df.shape[1]))

    if print_flag:
        print("-----> Convert df to tensor <-----")

    if replace:
        df_roll = pd.DataFrame(index=df.index) if save_index else pd.DataFrame()
    else:
        df_roll = df.copy()
        if print_flag: print("df_roll starts as original ~",  df_roll.shape)

    t = dataFrame2TSTensor(df, print_flag)
    # Number of variables in the input tensor; loop-invariant.
    t_nvars = t.shape[1]

    if print_flag:
        print("-----> Apply rolling means to tensor <-----", t.shape)

    concatenated_columns = []
    for i in range(window_init, window_end, window_step):
        if print_flag:
            print("\t ---> Window", i, "<----")
            print("\t Vars: ", sel_vars)

        t_roll_i = TSRollingMean(sel_vars=sel_vars, window=i)(t)

        # The rolled variables are read from the channels that follow the
        # original ones (presumably TSRollingMean appends them — TODO confirm).
        for offset, var in enumerate(sel_vars):
            count = t_nvars + offset
            # str(): column labels may be non-string (e.g. the integer labels
            # of a dataframe built without column names).
            columnname = 'RollingMean - var ' + str(df.columns[var]) + " - window - " + str(i)
            if print_flag: 
                print("\t\t---> Saving column | ", columnname, " | <----")
                print("\t\t t_roll_i ~", t_roll_i.shape)
                print("\t\t count", count)
                print("\t\t t_roll_i[:,count] ~", t_roll_i[:,count].shape)
                print("\t\t df_roll length", df_roll.shape[0])
            concatenated_columns.append(pd.Series(t_roll_i[:, count][0], name=columnname))

    if not concatenated_columns:
        # Empty window range or no variables: pd.concat([]) would raise.
        if print_flag:
            print("-----> No rolling-mean column was generated <-----")
        return df_roll

    if print_flag: 
        print("-----> Save the result <-----")

    df_concat = pd.concat(concatenated_columns, axis=1)

    if print_flag: 
        print("-----> df concat <-----")
        display(df_concat.head())

    # Assign by position (.values) so the new columns adopt df_roll's index.
    df_roll[df_concat.columns] = df_concat.values

    if print_flag: 
        print("-----> df roll <-----")
        print(df_roll.shape)
        display(df_roll.head())

    return df_roll

In [None]:
# Append rolling-mean features to a copy of df. The window range is
# range(2, 4, 1), i.e. window sizes 2 and 3.
df_mod = add_rolling_means(
    df, config, 
    window_init = 2, 
    window_end = 4, 
    window_step = 1, 
    replace = False, 
    print_flag = False, 
    save_index = True
)
display(df_mod.head(5))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from matplotlib.gridspec import GridSpec
import pyscamp as scamp

In [None]:
def plot_mps(ts, mp, m, print_flag = False):
    """Plot a time series above a heat strip of its matrix profile.

    Parameters
    ----------
    ts : sequence
        Time series values, drawn as a line in the top panel.
    mp : numpy.ndarray
        Profile values; cast to float and drawn as a one-row image whose
        horizontal extent is (0, len(ts)) and vertical extent is (0, m).
    m : number
        Upper bound of the vertical extent of the image.
    print_flag : bool
        If True, print `ts` and the shape of `mp` before plotting.
    """
    figure = plt.figure(figsize=(10, 6))
    layout = GridSpec(2, 1, height_ratios=[1, 4])
    if print_flag:
        print(ts)
        print(mp.shape)

    # Top panel: the raw time series.
    series_ax = figure.add_subplot(layout[0])
    series_ax.plot(ts, label="Serie Temporal")
    series_ax.set_title("Serie Temporal")
    series_ax.legend()

    # Bottom panel: the profile as a single-row heat map sharing the x axis.
    profile_ax = figure.add_subplot(layout[1], sharex=series_ax)
    profile_row = mp.astype(float).reshape(-1, 1).T
    profile_ax.imshow(
        profile_row,
        aspect='auto',
        origin='lower',
        cmap='hot',
        extent=(0, len(ts), 0, m),
    )
    profile_ax.set_title("Matrix profile plot")

    plt.tight_layout()
    display(plt.show())

In [None]:
import pyscamp as scamp

In [None]:
# Matrix profile (self-join) of each selected variable, added as a new column.
def add_selfjoin_profile(
    df, 
    config,
    m, #Should be inside config
    replace = False, #True to return only the new columns, False to append them to a copy of df
    print_flag = False,
    save_index = True
):
    """Add one 'Matrix Profile - var <column>' column per selected variable.

    For each selected column `scamp.selfjoin(ts, m)` is computed and its first
    return value is right-padded with zeros up to `len(df)`, so that it can be
    stored next to the original rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    config : object
        Only `config.data_cols` is read: the positional indices of the
        variables to process. A falsy value (None or an empty list) selects
        every column of `df`.
    m : int
        Second argument of `scamp.selfjoin` (presumably the subsequence
        length — TODO confirm).
    replace : bool
        If True the result holds only the profile columns; otherwise they are
        appended to a copy of `df`.
    print_flag : bool
        If True, print progress information and plot every profile.
    save_index : bool
        Only used when `replace` is True: reuse `df.index` for the result.

    Returns
    -------
    pandas.DataFrame
        The dataframe with the matrix-profile columns.
    """
    # NOTE(review): these are positional indices into `df`; if `df` was already
    # subset with config.data_cols they may no longer match — confirm with callers.
    sel_vars = config.data_cols if config.data_cols else list(range(df.shape[1]))

    has_gpu_support = scamp.gpu_supported()
    if print_flag: 
        # Previously a bare expression, which prints nothing inside a function.
        print("has_gpu_support: ", has_gpu_support)

    if replace:
        df_mp = pd.DataFrame(index=df.index) if save_index else pd.DataFrame()
    else:
        df_mp = df.copy()
        if print_flag: print("df_mp starts as original ~",  df_mp.shape)

    concatenated_columns = []
    for var in sel_vars: 
        ts = df.iloc[:,var].values
        # str(): column labels may be non-string (e.g. integer labels).
        var_label = str(df.columns[var])
        if print_flag:
            print("var: ", var)
            print("ts.len: ", len(ts))
            print("m: ", m)
        mp_scamp, _ = scamp.selfjoin(ts, m)
        # Pad the tail with zeros so the profile has as many entries as df has rows.
        mp_scamp = np.pad(mp_scamp, (0, df.shape[0] - len(mp_scamp)), 'constant', constant_values=(0))

        columnname = 'Matrix Profile - var ' + var_label
        concatenated_columns.append(pd.Series(mp_scamp, name=columnname))

        if print_flag: 
            print("---> MP plot var " + var_label + " <---")
            plot_mps(ts, mp_scamp, m, print_flag = print_flag)
            print("\t\t---> Saving column | ", columnname, " | <----")
            print("\t\t mp_var ~", mp_scamp.shape)

    if not concatenated_columns:
        # No variable selected: pd.concat([]) would raise.
        return df_mp

    df_concat = pd.concat(concatenated_columns, axis=1)
    # Assign by position (.values) so the new columns adopt df_mp's index.
    df_mp[df_concat.columns] = df_concat.values

    return df_mp

In [None]:
# Number of rows of the series.
df.shape[0]

In [None]:
# Compute the matrix-profile columns alone (replace=True, same index as df) ...
df_mod_ = add_selfjoin_profile(
    df, config, 
    m = 2000,
    replace = True, 
    print_flag = True, 
    save_index = True
)
# ... and append them column-wise to the rolling-mean features.
df_mod = pd.concat([df_mod, df_mod_], axis=1)
display(df_mod_.head(5))

In [None]:
# First rows of the full feature table (original + rolling means + profiles).
df_mod.head(5)

### Data Transformation

__Handle Missing Values, Resample and Normalize__

> In this second part, Time Series Artifact (TSArtifact) object can be created and missing values handling techniques, resampling and normalization can be applied.
> 
> These techniques should be applied to the three subsets, which must be created beforehand: training, validation and testing.

#### Training data

##### Build dataframe

In [None]:
def build_train_df(config, df, print_flag = False): 
    """Select the training rows of `df` according to the configuration.

    The training range is resolved in this order:

    1. `config.range_training` as a non-empty list: used directly as the set
       of index values to keep.
    2. `config.range_training` as a non-empty dict: expanded with
       `pd.date_range(start, end, freq=freq)` from its 'start', 'end' and
       'freq' keys.
    3. a truthy `config.test_split`: the first
       `ceil(len(df) * (1 - test_split))` index values.
    4. otherwise no range: the whole dataframe is the training set.

    Parameters
    ----------
    config : object
        Must expose `range_training` and `test_split`.
    df : pandas.DataFrame
        Full dataset.
    print_flag : bool
        Currently unused; kept for interface compatibility.

    Returns
    -------
    tuple
        `(df_training, rg_training)`, where `rg_training` is the resolved
        range, or None when the whole dataframe is used.
    """
    import math  # local import: do not rely on a star import to provide `math`

    rg = config.range_training
    # An empty list/dict means "no explicit range": fall through to the next
    # option instead of selecting nothing (list) or raising KeyError (dict).
    if isinstance(rg, list) and rg:
        rg_training = rg

    elif isinstance(rg, dict) and rg:
        rg_training = pd.date_range(rg['start'], rg['end'], freq=rg['freq'])

    elif config.test_split:
        rg_training = df.index[:math.ceil(len(df) * (1-config.test_split))]

    else:
        rg_training = None

    df_training = df[df.index.isin(rg_training)] if rg_training is not None else df
    return df_training, rg_training    

In [None]:
# Training subset of the feature table and the range that selected it.
df_training, rg_training = build_train_df(config, df_mod)

In [None]:
# First rows of the training subset.
df_training.head()

##### Build training artifact

In [None]:
# Wrap the training dataframe into a TSArtifact; the missing-values technique,
# resampling frequency and normalization flag come from the configuration.
training_artifact = TSArtifact.from_df(
    df_training, 
    name=config.artifact_name, 
    missing_values_technique=config.missing_values_technique,
    resampling_freq=config.resampling_freq, 
    normalize=config.normalize_training, 
    path=str(Path.home()/config.wandb_artifacts_path)
)
if print_flag: display(training_artifact.metadata)

In [None]:
# Sanity check: the training subset must not contain repeated index values.
if df_training.index.duplicated().any():
    raise ValueError("Duplicated index names")

#### Testing data

##### Build dataframe & artifact

In [None]:
def build_test_df(config, df, print_flag = False): 
    """Select the testing rows of `df` and wrap them into a TSArtifact.

    The testing range is resolved in this order:

    1. `config.range_testing` as a non-empty list: used directly as the set
       of index values to keep.
    2. `config.range_testing` as a non-empty dict: expanded with
       `pd.date_range(start, end, freq=freq)` from its 'start', 'end' and
       'freq' keys.
    3. a truthy `config.test_split`: the index values from position
       `ceil(len(df) * (1 - test_split))` to the end.
    4. otherwise there is no test set.

    Parameters
    ----------
    config : object
        Must expose `range_testing` and `test_split`; when a test set exists,
        also `artifact_name`, `missing_values_technique`, `resampling_freq`
        and `wandb_artifacts_path`.
    df : pandas.DataFrame
        Full dataset.
    print_flag : bool
        If True, report the configuration values when there is no test set.

    Returns
    -------
    tuple
        `(df_testing, testing_artifact)`; an empty dataframe and None when
        there is no test set.

    Raises
    ------
    TypeError
        If `config.range_testing` is set to something other than a list or a
        dict and no `test_split` is available to fall back on.
    """
    import math  # local import: do not rely on a star import to provide `math`

    # Testing data
    rg = config.range_testing
    # An empty list/dict means "no explicit range": fall through to the next
    # option instead of selecting nothing (list) or raising KeyError (dict).
    if isinstance(rg, list) and rg:
        rg_testing = rg

    elif isinstance(rg, dict) and rg:
        rg_testing = pd.date_range(rg['start'], rg['end'], freq=rg['freq'])

    elif config.test_split:
        rg_testing = df.index[math.ceil(len(df) * (1 - config.test_split)):]

    elif rg:
        # Previously this case reached `df.index.isin(None)`, which fails with
        # a TypeError unrelated to the real cause; fail with a clear message.
        raise TypeError(
            "config.range_testing must be a list or a dict, got "
            + type(rg).__name__
        )

    else:
        if print_flag: print("rg "+ str(rg) + " | test_split "+ str(config.test_split))
        return pd.DataFrame(), None

    df_testing = df[df.index.isin(rg_testing)]
    # The testing artifact is built without normalization (normalize=False).
    testing_artifact = TSArtifact.from_df(
        df_testing,
        name=config.artifact_name, 
        missing_values_technique=config.missing_values_technique,
        resampling_freq=config.resampling_freq, 
        normalize=False,
        path=str(Path.home()/config.wandb_artifacts_path)
    )
    display(testing_artifact.metadata)
    if df_testing.index.duplicated().any():
        print("Duplicated values in dataframe index.")
    else:
        print("No duplicated values in dataframe index")
    return df_testing, testing_artifact

In [None]:
# Testing subset and its artifact (None when no test set is configured).
df_testing, testing_artifact = build_test_df(config, df_mod, False)

#### Training + Testing data

##### Build dataframe & artifact

In [None]:
# Training + Testing data
# Optionally build a third artifact with both subsets stacked row-wise.
if(config.joining_train_test):
    print("joining_train_test: "+ str(config.joining_train_test))
    df_train_test = pd.concat([df_training, df_testing])
    # The joined artifact is built without normalization (normalize=False).
    train_test_artifact = TSArtifact.from_df(
        df_train_test,
        name=config.artifact_name, 
        missing_values_technique=config.missing_values_technique,
        resampling_freq=config.resampling_freq, 
        normalize=False,
        path=str(Path.home()/config.wandb_artifacts_path)
    )
    # Overlapping training and testing ranges would show up as repeated index values.
    if df_train_test.index.duplicated().any():
        print("Duplicated values in dataframe index.")
    else:
        print("No duplicated values in dataframe index.")
    display(train_test_artifact.metadata)
else:
    train_test_artifact = None

### Storing artifacts

For experiment tracking and hyperparameter tuning we will use the tool **Weights & Biases**. 

> 
Before running this part of the notebook, make sure you have the `$WANDB_API_KEY`, `$WANDB_ENTITY` and `$WANDB_PROJECT` environment variables defined with your API key, entity and project names (run `echo $WANDB_API_KEY` in a terminal to check it; the same applies to the other variables). If the API key is not set, run `wandb login [API_KEY]` in a terminal. You can find your API key [here](https://wandb.ai/authorize) or in the settings of your W&B account. Run `export WANDB_ENTITY=entity_name` and/or `export WANDB_PROJECT=project_name` in a terminal to set the other two.
> 
> <span style="color:red"> TODO: Modify config.ipynb so it gets wandb config from base.yml </span>.

In [None]:
import os
# Expose this notebook's file path through the WANDB_NOTEBOOK_NAME environment variable.
path = os.path.expanduser("~/work/nbs_pipeline/")
name="01_dataset_artifact"
os.environ["WANDB_NOTEBOOK_NAME"] = path+name+".ipynb"
# The W&B run is named after the notebook.
runname=name
print("runname: "+runname)

In [None]:
# W&B runs online only when the configuration asks for it; otherwise it is disabled.
mode = 'online' if config.use_wandb else 'disabled'

# Make the run that will produce the artifact
with wandb.init(job_type='create_dataset', resume=True, mode=mode, config=config, name=runname) as run:
    if testing_artifact: 
        # A test set exists: log each subset under its own alias.
        run.log_artifact(training_artifact, aliases=['train'])
        run.log_artifact(testing_artifact, aliases=['test'])
        
        if train_test_artifact:
            # Optional joined train+test artifact.
            run.log_artifact(train_test_artifact, aliases=['all'])
    
    else:
        # No test set: the training artifact is logged as the whole dataset.
        run.log_artifact(training_artifact, aliases=['all'])

In [None]:
# NOTE(review): the run was opened with `with wandb.init(...) as run`, so this
# explicit finish is presumably redundant — confirm.
run.finish()

In [None]:
from dvats.imports import beep
# Signal the end of the execution: a message plus the project's beep helper.
print("Execution ended")
beep(1)

In [None]:
# Optional memory cleanup: terminate the kernel process immediately with exit
# status 0. os._exit skips the interpreter's normal shutdown handlers.
if reset_kernel:
    import os
    os._exit(00)