In [None]:
import sys
sys.path.append("..")
%load_ext autoreload
%autoreload 2

### Create artifact from time series dataframe

In [None]:
import pandas as pd
import numpy as np
from fastcore.all import *
import wandb
from timecluster_extension.load import TSArtifact
import pickle
import matplotlib.pyplot as plt

### Notebook config

In [None]:
# Every tunable setting of this notebook lives in this single object.
config = AttrDict(
    # --- Weights & Biases ---
    use_wandb=False,  # True runs wandb in 'online' mode, False in 'offline' mode
    wandb_entity='pacmel',
    wandb_project='timecluster_extension',
    # --- Input data ---
    base_path=Path.home()/'data/PACMEL-2019/JNK/',  # folder where your data lives
    data_file_name='jnk_before_handling_missing.pickle',  # data file name (must be a pickle file)
    variables_file_name='mining-mapping.csv',  # CSV read below to look up each variable's 'Type'
    # --- Output artifact ---
    wandb_artifacts_path='data/PACMEL-2019/wandb_artifacts',  # output folder for the TSArtifacts (joined to Path.home() below)
    artifact_name='JNK',  # name given to the artifacts created below
    joining_validation_test=True,  # True to also create an artifact joining validation and testing data
    # --- Days of the month assigned to each subset ---
    range_training=range(1, 16),  # days 1..15 (15 days)
    range_validation=range(16, 21),  # days 16..20 (5 days)
    range_testing=range(21, 30),  # days 21..29 (9 days)
    # --- Preprocessing ---
    resampling_freq='5s',  # resampling frequency
    missing_values_technique='linear_interpolation'  # technique used to handle missing values
)

### Run

For experiment tracking and hyperparameter management we will use **Weights & Biases**.

Before running this part of the notebook, make sure you have the `$WANDB_API_KEY` environment variable defined with your API key (run `echo $WANDB_API_KEY` in a terminal to see it). If not, run `wandb login [API_KEY]` in a terminal. You can find your API key [here](https://wandb.ai/authorize) or in the settings of your W&B account.

In [None]:
# IMPORTANT! --> pass mode='disabled' to wandb.init(...) to test the W&B API
# without uploading anything to the cloud.
wandb_mode = 'online' if config.use_wandb else 'offline'
run = wandb.init(
    entity=config.wandb_entity,
    project=config.wandb_project,
    job_type='create_dataset',
    resume=True,
    mode=wandb_mode,
    config=config,
)
# From this point on, `config` refers to the run's wandb.config object.
config = wandb.config

[34m[1mwandb[0m: W&B syncing is set to `offline` in this directory.  Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.


#### Data Extraction

The data is assumed to come as a dataframe serialized in a pickle file

In [None]:
# Load the pickled data file. The context manager guarantees the file handle
# is closed even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code -- only load files from
# a trusted source.
with open(f'{config.base_path}/{config.data_file_name}', 'rb') as f:
    data_file = pickle.load(f)
print('File loaded successfully')

File loaded successfully


In [None]:
# The main time series dataframe is the first element of the loaded object
main_df = data_file[0]
# Set the frequency of the index to 's' (pandas offset alias for seconds)
main_df.index.freq = 's'
print(f'Num. variables: {len(main_df.columns)}')

Num. variables: 16


In [None]:
#slow
# Plot every variable in its own subplot, two columns wide, sharing the x axis.
# layout=(-1,2) lets pandas derive the number of rows from the number of
# columns in main_df instead of hard-coding 8 rows, which only fits 16 variables.
main_df.plot(subplots=True, figsize=(15,15), layout=(-1,2),  sharex=True, colormap='viridis')
plt.tight_layout()

#### Data Transformation

##### 1. Previous transformations
In this part, filters or transformations can be defined depending on the variables' origin or time-dependent features, among others.

In [None]:
# Look up the declared type of every column of main_df in the variables file,
# which is indexed by its 'Variable name' column.
var_df = pd.read_csv(
    f'{config.base_path}/{config.variables_file_name}',
    index_col='Variable name',
)
data_var_type = {
    col: var_df.loc[col, 'Type']
    for col in main_df.columns
}
data_var_type

{'RCD_AverageThree-phaseCurrent': 'double',
 'LCD_AverageThree-phaseCurrent': 'double',
 'LP_AverageThree-phaseCurrent': 'double',
 'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)': 'double',
 'RHD_RightHaulageDrive(tractor)Temperature(gearbox)': 'double',
 'LA_LeftArmTemperature': 'double',
 'RA_RightArmTemperature': 'double',
 'SM_DailyRouteOfTheShearer': 'double',
 'SM_TotalRoute': 'double',
 'LHD_EngineCurrent': 'double',
 'RHD_EngineCurrent': 'double',
 'RCD_BearingTemperature': 'double',
 'SM_ShearerSpeed': 'double',
 'SM_ShearerLocation': 'double',
 'SM_ShearerMoveInLeft': 'bool',
 'SM_ShearerMoveInRight': 'bool'}

In [None]:
# Keep only the columns whose declared type is 'double'
double_var_list = [
    name
    for name, kind in data_var_type.items()
    if kind == 'double'
]
double_columns = main_df.columns.intersection(double_var_list)
df = main_df[double_columns]
print(f'Num. variables: {len(df.columns)}')

Num. variables: 14


##### 2. Handle Missing Values, Resample and Normalize
In this second part, Time Series Artifact (TSArtifact) objects can be created, and missing-value handling techniques, resampling and normalization can be applied. These techniques should be applied to the three subsets, which must be created beforehand: training, validation and testing.

In [None]:
# Training data: rows whose day of the month is in config.range_training
rg = config.range_training
df_training = df.query('index.dt.day.isin(@rg)')
artifacts_dir = str(Path.home()/config.wandb_artifacts_path)
training_artifact = TSArtifact.from_df(
    df_training,
    name=config.artifact_name,
    missing_values_technique=config.missing_values_technique,
    resampling_freq=config.resampling_freq,
    normalize=True,
    path=artifacts_dir,
)
training_artifact.metadata

{'TS': {'sd': '2019-06-01 00:00:00',
  'ed': '2019-06-15 23:59:59',
  'created': 'from-df',
  'n_vars': 14,
  'handle_missing_values_technique': 'linear_interpolation',
  'has_missing_values': 'False',
  'n_samples': 259200,
  'freq': '<5 * Seconds>',
  'vars': ['RCD_AverageThree-phaseCurrent',
   'LCD_AverageThree-phaseCurrent',
   'LP_AverageThree-phaseCurrent',
   'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)',
   'RHD_RightHaulageDrive(tractor)Temperature(gearbox)',
   'LA_LeftArmTemperature',
   'RA_RightArmTemperature',
   'SM_DailyRouteOfTheShearer',
   'SM_TotalRoute',
   'LHD_EngineCurrent',
   'RHD_EngineCurrent',
   'RCD_BearingTemperature',
   'SM_ShearerSpeed',
   'SM_ShearerLocation'],
  'normalization': {'means': {'RCD_AverageThree-phaseCurrent': 39.716197145061734,
    'LCD_AverageThree-phaseCurrent': 38.54135300925926,
    'LP_AverageThree-phaseCurrent': 2.0873996913580246,
    'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)': 58.599024305555545,
    'RHD_Right

In [None]:
# Validation data: rows whose day of the month is in config.range_validation
rg = config.range_validation
df_validation = df.query('index.dt.day.isin(@rg)')
artifacts_dir = str(Path.home()/config.wandb_artifacts_path)
validation_artifact = TSArtifact.from_df(
    df_validation,
    name=config.artifact_name,
    missing_values_technique=config.missing_values_technique,
    resampling_freq=config.resampling_freq,
    normalize=False,
    path=artifacts_dir,
)
validation_artifact.metadata

{'TS': {'sd': '2019-06-16 00:00:00',
  'ed': '2019-06-20 23:59:59',
  'created': 'from-df',
  'n_vars': 14,
  'handle_missing_values_technique': 'linear_interpolation',
  'has_missing_values': 'False',
  'n_samples': 86400,
  'freq': '<5 * Seconds>',
  'vars': ['RCD_AverageThree-phaseCurrent',
   'LCD_AverageThree-phaseCurrent',
   'LP_AverageThree-phaseCurrent',
   'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)',
   'RHD_RightHaulageDrive(tractor)Temperature(gearbox)',
   'LA_LeftArmTemperature',
   'RA_RightArmTemperature',
   'SM_DailyRouteOfTheShearer',
   'SM_TotalRoute',
   'LHD_EngineCurrent',
   'RHD_EngineCurrent',
   'RCD_BearingTemperature',
   'SM_ShearerSpeed',
   'SM_ShearerLocation'],
  'hash': '-5501534258397058373'}}

In [None]:
# Testing data: rows whose day of the month is in config.range_testing
rg = config.range_testing
df_testing = df.query('index.dt.day.isin(@rg)')
artifacts_dir = str(Path.home()/config.wandb_artifacts_path)
testing_artifact = TSArtifact.from_df(
    df_testing,
    name=config.artifact_name,
    missing_values_technique=config.missing_values_technique,
    resampling_freq=config.resampling_freq,
    normalize=False,
    path=artifacts_dir,
)
testing_artifact.metadata

{'TS': {'sd': '2019-06-21 00:00:00',
  'ed': '2019-06-29 23:59:59',
  'created': 'from-df',
  'n_vars': 14,
  'handle_missing_values_technique': 'linear_interpolation',
  'has_missing_values': 'False',
  'n_samples': 155520,
  'freq': '<5 * Seconds>',
  'vars': ['RCD_AverageThree-phaseCurrent',
   'LCD_AverageThree-phaseCurrent',
   'LP_AverageThree-phaseCurrent',
   'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)',
   'RHD_RightHaulageDrive(tractor)Temperature(gearbox)',
   'LA_LeftArmTemperature',
   'RA_RightArmTemperature',
   'SM_DailyRouteOfTheShearer',
   'SM_TotalRoute',
   'LHD_EngineCurrent',
   'RHD_EngineCurrent',
   'RCD_BearingTemperature',
   'SM_ShearerSpeed',
   'SM_ShearerLocation'],
  'hash': '-6632507113984032164'}}

In [None]:
# Validation + Testing data, built only when the config asks for the joined artifact
if config.joining_validation_test:
    val_test_range = [*config.range_validation, *config.range_testing]
    df_val_test = df.query('index.dt.day.isin(@val_test_range)')
    artifacts_dir = str(Path.home()/config.wandb_artifacts_path)
    val_test_artifact = TSArtifact.from_df(
        df_val_test,
        name=config.artifact_name,
        missing_values_technique=config.missing_values_technique,
        resampling_freq=config.resampling_freq,
        normalize=False,
        path=artifacts_dir,
    )
    print(val_test_artifact.metadata)

{'TS': {'sd': '2019-06-16 00:00:00', 'ed': '2019-06-29 23:59:59', 'created': 'from-df', 'n_vars': 14, 'handle_missing_values_technique': 'linear_interpolation', 'has_missing_values': 'False', 'n_samples': 241920, 'freq': '<5 * Seconds>', 'vars': ['RCD_AverageThree-phaseCurrent', 'LCD_AverageThree-phaseCurrent', 'LP_AverageThree-phaseCurrent', 'LHD_LeftHaulageDrive(tractor)Temperature(gearbox)', 'RHD_RightHaulageDrive(tractor)Temperature(gearbox)', 'LA_LeftArmTemperature', 'RA_RightArmTemperature', 'SM_DailyRouteOfTheShearer', 'SM_TotalRoute', 'LHD_EngineCurrent', 'RHD_EngineCurrent', 'RCD_BearingTemperature', 'SM_ShearerSpeed', 'SM_ShearerLocation'], 'hash': '-3445972596578682469'}}


#### Data Loading

In [None]:
# Log the three per-subset artifacts to the run, in creation order
for artifact in (training_artifact, validation_artifact, testing_artifact):
    run.log_artifact(artifact)

# The joined validation + testing artifact only exists when it was requested
if config.joining_validation_test:
    run.log_artifact(val_test_artifact)

In [None]:
# Mark the W&B run started above as finished
run.finish()

[34m[1mwandb[0m: You can sync this run to the cloud by running:
[34m[1mwandb[0m: [33mwandb sync /home/victor/work/nbs/wandb/offline-run-20210527_172157-32t2j9by[0m
