In [38]:
import pyreadr
import numpy as np
import xarray as xr
from bcolz import carray #Can also use ctable
import pandas as pd
import shutil
import opendatasets as ods

# Download the data from Kaggle

You'll need to sign up with Kaggle, then go to the account page and create an API token before running this.

In [2]:
ods.download("https://www.kaggle.com/averkij/tennessee-eastman-process-simulation-dataset")

Skipping, found downloaded files in ".\tennessee-eastman-process-simulation-dataset" (use force=True to force download)


## File structure

This dataset consists of four files:
* TEP_FaultFree_Testing.RData (47.3 MB)
* TEP_FaultFree_Training.RData (24.7 MB)
* TEP_Faulty_Testing.RData (836.9 MB)
* TEP_Faulty_Training.RData (494.1 MB)

The "FaultFree" files contain simulation runs that demonstrate completely normal behaviour. The "Faulty" files contain simulations where a fault is introduced either one hour (training data) or eight hours (testing data) into the simulation. Simulations in the training files ran for 500 time steps (25 hours), while simulations in the test sets are larger (960 samples, 48 hours)

Columns 4 to 55 contain the actual measurements, while column 1 contains the fault number from 0 to 20, where 0 means no fault. To keep this simple, I'm going to convert this to 0 or 1 (no fault or a fault).

## Data structure

Column two contains `simulationRun`, a number from 1 to 500 in the training data, that determines what random seed was used to make that simulation. Importantly, multiple simulations using **the same `simulationRun` value** do exist. This happens in the "Faulty" files, where the simulation is run once for each fault. In the "FaultFree" files, there's only one simulation per `simulationRun`.

This does mean that the first hour of a simulation in the training data appears 21 times (once for the fault-free simulation and 20 times for the fault simulations). I'm going to solve this the easy way by dropping the first hour of each training simulation and eight hours of each testing simulation.

## Loading RData files

This format is used by the R community, but for our purposes we need something that (a) works in Python and (b) doesn't need to be loaded entirely into RAM.

The `pyreadr` module loads RData frames into Pandas dataframes. Unfortunately, it loads the entire dataset into RAM. Here's an example of loading an object called `fault_free_training`.

In [16]:
r_data = pyreadr.read_r('tennessee-eastman-process-simulation-dataset/TEP_FaultFree_Training.RData', use_objects=['fault_free_training'])['fault_free_training']
r_data

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,0.0,1.0,1,0.25038,3674.0,4529.0,9.2320,26.889,42.402,2704.3,...,53.744,24.657,62.544,22.137,39.935,42.323,47.757,47.510,41.258,18.447
1,0.0,1.0,2,0.25109,3659.4,4556.6,9.4264,26.721,42.576,2705.0,...,53.414,24.588,59.259,22.084,40.176,38.554,43.692,47.427,41.359,17.194
2,0.0,1.0,3,0.25038,3660.3,4477.8,9.4426,26.875,42.070,2706.2,...,54.357,24.666,61.275,22.380,40.244,38.990,46.699,47.468,41.199,20.530
3,0.0,1.0,4,0.24977,3661.3,4512.1,9.4776,26.758,42.063,2707.2,...,53.946,24.725,59.856,22.277,40.257,38.072,47.541,47.658,41.643,18.089
4,0.0,1.0,5,0.29405,3679.0,4497.0,9.3381,26.889,42.650,2705.1,...,53.658,28.797,60.717,21.947,39.144,41.955,47.645,47.346,41.507,18.461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,0.0,500.0,496,0.29325,3640.1,4473.0,9.1949,26.867,42.379,2700.2,...,53.429,29.249,60.773,21.532,40.451,34.064,48.953,48.291,40.812,18.756
249996,0.0,500.0,497,0.29134,3625.7,4506.2,9.2109,26.889,42.291,2700.6,...,53.830,28.975,61.517,21.750,42.762,42.645,51.055,48.589,40.933,19.360
249997,0.0,500.0,498,0.29438,3600.2,4478.3,9.1957,26.820,42.448,2700.3,...,54.163,28.676,61.656,21.487,42.109,39.770,46.770,48.648,41.465,19.344
249998,0.0,500.0,499,0.25269,3683.5,4486.4,9.2832,27.188,42.757,2697.4,...,53.453,24.889,61.564,21.392,39.334,42.274,43.623,48.797,39.835,18.512


In [15]:
del r_data

### What do we want to achieve?

A useful fault detector takes in a window of data (or one sample) and tells you whether the fault is happening at that point in time. Sometimes you can predict that a fault will happen, but that's not the focus here. The data needs to be rearranged to make it easy to see where one simulation ends and another starts. To do this, I'll reshape the data into `[num_simulations, num_samples, num_features]`. It also needs to be converted from a dataframe to an array that lives on disk. Reshaping the array will make it easier to get all the rows for individual simulations. Bcolz will provide the disk-backed array.

In [1]:
# File name, object name, and finally the starting sample of the data to be extracted.
training_data_paths = [
    ('tennessee-eastman-process-simulation-dataset/TEP_FaultFree_Training.RData', 'fault_free_training', -1),
    ('tennessee-eastman-process-simulation-dataset/TEP_Faulty_Training.RData', 'faulty_training', 20)
]

testing_data_paths = [
    ('tennessee-eastman-process-simulation-dataset/TEP_FaultFree_Testing.RData', 'fault_free_testing', -1),
    ('tennessee-eastman-process-simulation-dataset/TEP_Faulty_Testing.RData', 'faulty_testing', 8*20)
]

In [50]:
def data_to_bcolz(data, data_output_name, labels_output_name):
    '''
    Convert the dataframe into a bcolz array, splitting the features and labels into separate arrays.
    '''
    pass

def load_data(spec, data_output_name, labels_output_name):
    output = None
    labels = None
    feature_len = 52
    for file, key, fault_start in spec:
        print(f'Read {file}')
        data = pyreadr.read_r(file, use_objects=[key])[key]
        print('Convert to bcolz.ctable')
        sim_length = data['sample'].max()
        num_simulations = data.shape[0] / sim_length
        if output is None:
            shutil.rmtree(f'{data_output_name}.bcolz', ignore_errors=True)
            shutil.rmtree(f'{label_output_name}.bcolz', ignore_errors=True)
#             output = ctable.fromdataframe(data, rootdir=f'{data_output_name}.bcolz')
#             labels = ctable.fromdataframe(data, rootdir=f'{label_output_name}.bcolz')
            output = carray()
        else:
            shutil.rmtree(f'{key}_data.bcolz', ignore_errors=True)
            shutil.rmtree(f'{key}_labels.bcolz', ignore_errors=True)
            output.append(ctable.fromdataframe(data, rootdir=f'{key}_data.bcolz'))
            output.append(ctable.fromdataframe(data, rootdir=f'{key}_labels.bcolz'))
            shutil.rmtree(f'{key}_data.bcolz', ignore_errors=False)
            shutil.rmtree(f'{key}_labels.bcolz', ignore_errors=False)
        print(f'Done {file}')
        del data
    return output, labels

In [51]:
training_data = load_data(training_data_paths, 'training_data')
training_data

Read tennesee-eastman-archive/TEP_FaultFree_Training.RData
Convert to bcolz.ctable
Done tennesee-eastman-archive/TEP_FaultFree_Training.RData
Read tennesee-eastman-archive/TEP_Faulty_Training.RData
Convert to bcolz.ctable
Done tennesee-eastman-archive/TEP_Faulty_Training.RData


ctable((57750000,), [('faultNumber', '<f8'), ('simulationRun', '<f8'), ('sample', '<i4'), ('xmeas_1', '<f8'), ('xmeas_2', '<f8'), ('xmeas_3', '<f8'), ('xmeas_4', '<f8'), ('xmeas_5', '<f8'), ('xmeas_6', '<f8'), ('xmeas_7', '<f8'), ('xmeas_8', '<f8'), ('xmeas_9', '<f8'), ('xmeas_10', '<f8'), ('xmeas_11', '<f8'), ('xmeas_12', '<f8'), ('xmeas_13', '<f8'), ('xmeas_14', '<f8'), ('xmeas_15', '<f8'), ('xmeas_16', '<f8'), ('xmeas_17', '<f8'), ('xmeas_18', '<f8'), ('xmeas_19', '<f8'), ('xmeas_20', '<f8'), ('xmeas_21', '<f8'), ('xmeas_22', '<f8'), ('xmeas_23', '<f8'), ('xmeas_24', '<f8'), ('xmeas_25', '<f8'), ('xmeas_26', '<f8'), ('xmeas_27', '<f8'), ('xmeas_28', '<f8'), ('xmeas_29', '<f8'), ('xmeas_30', '<f8'), ('xmeas_31', '<f8'), ('xmeas_32', '<f8'), ('xmeas_33', '<f8'), ('xmeas_34', '<f8'), ('xmeas_35', '<f8'), ('xmeas_36', '<f8'), ('xmeas_37', '<f8'), ('xmeas_38', '<f8'), ('xmeas_39', '<f8'), ('xmeas_40', '<f8'), ('xmeas_41', '<f8'), ('xmv_1', '<f8'), ('xmv_2', '<f8'), ('xmv_3', '<f8'), ('xm

In [11]:
testing_data = load_data(testing_data_paths, 'testing_data')
testing_data

Read tennesee-eastman-archive/TEP_FaultFree_Testing.RData
Read tennesee-eastman-archive/TEP_Faulty_Testing.RData


ctable((10080000,), [('faultNumber', '<i4'), ('simulationRun', '<f8'), ('sample', '<i4'), ('xmeas_1', '<f8'), ('xmeas_2', '<f8'), ('xmeas_3', '<f8'), ('xmeas_4', '<f8'), ('xmeas_5', '<f8'), ('xmeas_6', '<f8'), ('xmeas_7', '<f8'), ('xmeas_8', '<f8'), ('xmeas_9', '<f8'), ('xmeas_10', '<f8'), ('xmeas_11', '<f8'), ('xmeas_12', '<f8'), ('xmeas_13', '<f8'), ('xmeas_14', '<f8'), ('xmeas_15', '<f8'), ('xmeas_16', '<f8'), ('xmeas_17', '<f8'), ('xmeas_18', '<f8'), ('xmeas_19', '<f8'), ('xmeas_20', '<f8'), ('xmeas_21', '<f8'), ('xmeas_22', '<f8'), ('xmeas_23', '<f8'), ('xmeas_24', '<f8'), ('xmeas_25', '<f8'), ('xmeas_26', '<f8'), ('xmeas_27', '<f8'), ('xmeas_28', '<f8'), ('xmeas_29', '<f8'), ('xmeas_30', '<f8'), ('xmeas_31', '<f8'), ('xmeas_32', '<f8'), ('xmeas_33', '<f8'), ('xmeas_34', '<f8'), ('xmeas_35', '<f8'), ('xmeas_36', '<f8'), ('xmeas_37', '<f8'), ('xmeas_38', '<f8'), ('xmeas_39', '<f8'), ('xmeas_40', '<f8'), ('xmeas_41', '<f8'), ('xmv_1', '<f8'), ('xmv_2', '<f8'), ('xmv_3', '<f8'), ('xm

In [26]:
training_data[training_data.cols.names[3:]][[10, 20, 30]]

array([(0.2348 , 3677.4, 4489.8, 9.3199, 26.695, 42.014, 2703.9, 75.193, 120.39, 0.35435, 80.241, 48.487, 2632.9, 26.304, 48.203, 3102.2, 23.319, 65.774, 230.92, 341.2 , 94.645, 77.569, 32.148, 8.9493, 26.111, 6.7796, 18.826, 1.6824, 32.876, 13.811, 23.921, 1.2803, 18.58 , 2.2525, 4.8822, 2.2569, 0.014489 , 0.81503, 0.1046  , 54.279, 44.352, 62.645, 54.542, 23.133, 61.425, 21.93 , 42.262, 33.648, 42.375, 47.328, 40.344, 17.198),
       (0.27833, 3649.7, 4479.9, 9.3486, 26.387, 42.564, 2701.5, 75.073, 120.4 , 0.33729, 80.384, 50.172, 2630.3, 27.059, 50.066, 3097.6, 22.868, 65.649, 226.86, 341.45, 94.518, 77.321, 32.294, 9.0822, 26.056, 6.9624, 18.749, 1.6767, 33.065, 13.966, 23.52 , 1.3563, 18.466, 2.2213, 4.8492, 2.232 , 0.018032 , 0.87043, 0.10962 , 53.559, 43.529, 63.137, 53.947, 27.761, 60.589, 21.743, 39.398, 38.607, 46.686, 46.688, 41.585, 18.294),
       (0.22515, 3689.6, 4525.4, 9.4095, 27.133, 42.395, 2698.9, 75.073, 120.41, 0.32419, 80.437, 50.174, 2627.2, 24.056, 51.358, 3096

In [27]:
del training_data
del testing_data