In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Add labels
- read `all.h5`
- detect spew events
- label pre-spew and post-spew periods
- label invalid data
- write to `labelled.h5`

In [21]:
# Select matplotlib's interactive "notebook" backend for any figures drawn in this session.
%matplotlib notebook

In [22]:
# Load the frame stored under the 'data' key of the input HDF5 file.
data = pd.read_hdf('/mnt/raw/clean/all.h5', key='data')

In [23]:
# Raw column identifier -> readable name used by the labelling cells below.
columns = {"3311WI671.PV": "Spewput", "3311HS181A.PV": "Throughput"}

In [24]:
# Apply the readable column names. Rebinding `data` to the renamed frame
# (instead of inplace=True) keeps the cell free of in-place mutation; labels
# that are not in `columns` are left unchanged by rename().
data = data.rename(columns=columns)

In [25]:
# Spacing between consecutive rows; used to convert time windows into a
# number of samples.  NOTE(review): assumes rows are evenly spaced - confirm.
SAMPLE_FREQ = pd.to_timedelta('00:00:05')

In [26]:
# Data is invalid when throughput is below the threshold, and also for every
# row whose centred window contains at least one such below-threshold row.
INVALID_THRESHOLD = 1600
INVALID_WINDOW = pd.Timedelta('00:20:00')

# Window length in rows. Series.rolling needs an integer window, and
# Timedelta / Timedelta would yield a float, so use floor division.
invalid_window_samples = INVALID_WINDOW // SAMPLE_FREQ

below_threshold = data.Throughput < INVALID_THRESHOLD

# pd.rolling_sum() has been removed from pandas; Series.rolling(...).sum() is
# the replacement. The boolean mask is cast to int so it can be summed.
# min_periods=1 makes the partial windows at the start/end of the data count
# whatever rows they do contain; with the default (a full window required)
# those rows become NaN, compare as False, and low-throughput periods near
# the edges of the data would never be flagged as invalid.
invalid = (
    below_threshold.astype(int)
    .rolling(window=invalid_window_samples, center=True, min_periods=1)
    .sum()
    > 0
)

In [32]:
SPEW_THRESHOLD = 300

# SAMPLE_FREQ is defined once in the earlier constants cell; it is not
# redefined here so the two window computations cannot drift apart.
SPEW_RECOVERY_TIME = pd.Timedelta('00:45:00')
PRE_SPEW_WARNING = pd.Timedelta('00:03:00')

# Window lengths in rows (integers, as Series.rolling requires).
recovery_samples = SPEW_RECOVERY_TIME // SAMPLE_FREQ
warning_samples = PRE_SPEW_WARNING // SAMPLE_FREQ

# Row-level flag: spew output is above the threshold right now.
spewing = data.Spewput > SPEW_THRESHOLD

# POST_SPEW: the trailing recovery window (current row included) holds more
# than one spewing row, i.e. a spew was already under way before this row.
# pd.rolling_sum() has been removed from pandas, so Series.rolling is used.
# min_periods=1 evaluates the partial window at the start of the data instead
# of yielding NaN (which would compare as False and turn every spewing row in
# that stretch into its own event).
afterspew = (
    spewing.astype(int)
    .rolling(window=recovery_samples, min_periods=1)
    .sum()
    > 1
)

# SPEW_EVENT: a spewing row that is not preceded by other spewing rows inside
# the recovery window, i.e. the first row of a new spew.
spewevent = spewing & ~afterspew

# Look ahead by reversing the series, taking a trailing rolling sum, and
# reversing back: true when an event occurs in this row or in the following
# rows of the warning window.
event_ahead = (
    spewevent[::-1]
    .astype(int)
    .rolling(window=warning_samples, min_periods=1)
    .sum()[::-1]
    > 0
)

# PRE_SPEW: an event is coming up, and this row is neither the event itself
# nor part of a post-spew period.
prespew = event_ahead & ~spewevent & ~afterspew


In [33]:
# Attach each boolean label series to the frame as a new column, in this order.
label_columns = {
    "SPEW_EVENT": spewevent,
    "PRE_SPEW": prespew,
    "POST_SPEW": afterspew,
    "INVALID": invalid,
}
for label_name, label_values in label_columns.items():
    data[label_name] = label_values

In [34]:
# Persist the labelled frame under the 'data' key of the output HDF5 file.
data.to_hdf(r"/mnt/raw/clean/labelled.h5", key='data')