# Look at some data

In [1]:
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as dates
import numpy as np
%matplotlib widget

In [2]:
def comp(df_true, df_test):
    s_true = df_true.sum()
    s_test = df_test.sum()
    err = s_test - s_true
    err_rel = err/s_true
    print("err")
    print(err)
    print("err rel")
    print(err_rel)

In [3]:
df = pd.read_csv("log.2021-05-22.2021-05-28", delim_whitespace=True,
    index_col=0, parse_dates=True, header=None,
    names=['time','angle','volume_ul'], skiprows=0, memory_map=True,
    engine="c")

print(df.shape)

(604531, 2)


In [4]:
UL_PER_GALLON = 3785411.784
df['volume_gal']=df['volume_ul']/UL_PER_GALLON

In [5]:
df=df.drop(columns=['angle', 'volume_ul'])
df = df.resample('S').sum()

# Noise

In [6]:
plt.subplots()
plt.xlim([dates.datestr2num('2021-05-26 16:30'), dates.datestr2num('2021-05-26 16:50')])
plt.ylim([-0.001,0.001])
plt.scatter(x=df.index, y=df['volume_gal'].to_numpy(), s=1)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.collections.PathCollection at 0x7fe5104b6820>

The shutoff events produce significant backwards spikes, which i think is real.  The angle measure is up to 40k which is >2 turns, but mostly under 10k i.e. half a turn.

The noise at zero is mostly small, like 100 on the angle scale (2 degrees), but there are periods with more noise, like up to 1000 (20 degrees), which seems like a lot for a flow that's really zero, and then at some times of day it's much higher, like 40k, several turns. In terms of volume it's negligible, but what is it?

I don't think it's magnetometer noise, it has a negative trend during some hours of the day. The meter isn't *accurate* for such low flows but it does *work* so i think it's real.

I think it's the __water heater.__  It doesn't exist in the early morning, but it turns on after showers.  The expansion has to go *somewhere*.

In [7]:
df_med = df.rolling(3, center=True).median().fillna(0)
comp(df, df_med)
plt.subplots()
plt.plot(df.index, df['volume_gal'].to_numpy(), label="true")
plt.plot(df_med.index, df_med['volume_gal'].to_numpy(), label="median")
plt.legend()

err
volume_gal   -2.556514
dtype: float64
err rel
volume_gal   -0.000583
dtype: float64


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.legend.Legend at 0x7fe517d957f0>

# Discretization

In [8]:
df_med_disc = 2 * (df_med * 0.5).round(2)
print(df_med_disc)
comp(df, df_med_disc)
plt.subplots()
plt.plot(df.index, df['volume_gal'].to_numpy(), label="true")
plt.plot(df_med_disc.index, df_med_disc['volume_gal'].to_numpy(), label='disc')
plt.legend()
plt.grid()

                     volume_gal
time                           
2021-05-22 00:00:00         0.0
2021-05-22 00:00:01         0.0
2021-05-22 00:00:02        -0.0
2021-05-22 00:00:03        -0.0
2021-05-22 00:00:04        -0.0
...                         ...
2021-05-28 23:59:55        -0.0
2021-05-28 23:59:56        -0.0
2021-05-28 23:59:57        -0.0
2021-05-28 23:59:58        -0.0
2021-05-28 23:59:59         0.0

[604800 rows x 1 columns]
err
volume_gal    34.439377
dtype: float64
err rel
volume_gal    0.007848
dtype: float64


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Holy mackerel, this is pretty good, <1% error.

In [9]:
print(df_med_disc.nunique())

volume_gal    11
dtype: int64


In [10]:
deltas = df_med_disc[df_med_disc['volume_gal'].shift() != df_med_disc['volume_gal']].diff().fillna(0)
print(len(deltas))
print(deltas.nunique())

3731
volume_gal    30
dtype: int64


So this is a vocabulary with 30 unique tokens, and a week is <4000 tokens long, so a day would be in the ballpark for a transformer model.