In [1]:
# Standard library
import warnings
import math

# Third party
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

import matplotlib.pyplot as plt
%matplotlib inline

from scipy.io import wavfile

## Load dataset

In [2]:
df = pd.read_csv(f"../data/VTech/_VTech_merged.csv")

## Dataset information

In [3]:
print(f"Number of workers: {len(df['worker'].unique())}")
for worker, worker_df in df.groupby("worker"):
    print(f"\tWorker {worker}: {len(worker_df['day'].unique())} days"
          f", {len(worker_df['trial'].unique())} trials"
          f", {worker_df['mode'].diff().abs().sum():3.0f} transitions"
          f", {len(worker_df):10,} samples")
print(f"Total number of transitions: {df['mode'].diff().abs().sum()}")


Number of workers: 4
	Worker 1: 1 days, 5 trials,  28 transitions,  1,448,215 samples
	Worker 2: 3 days, 4 trials,  68 transitions,  1,351,866 samples
	Worker 3: 1 days, 2 trials,  59 transitions,    859,697 samples
	Worker 4: 1 days, 6 trials, 110 transitions,  1,646,663 samples
Total number of transitions: 266.0


## Building the TF Dataset

Stack consecutive `WINDOW_SIZE` samples with a sliding window, use the least sample for the target variable.
Do not overlap different workers/trials.
The resulting samples in the dataset are: `(WINDOW_SIZE, N_FEATURES) -> (1,)`

In [4]:
first_feature = "orientation_T8_q0"
WINDOW_SIZE = 50
    
def make_windowed_dataset(ds, window_size, shift=1):
    windows = ds.window(window_size, shift=shift)

    def sub_to_batch(sub):
        return sub.batch(window_size, drop_remainder=True)

    def add_labels(batch):
        #return batch[:, :-1], batch[-1, -1]
        return batch[:, :-1], batch[0, -1]
    
    windows = windows.flat_map(sub_to_batch)
    return windows.map(add_labels)


ds = None
for trial_id, trial_data in df.groupby(["worker", "trial"]):
    ds_trial = make_windowed_dataset(
        tf.data.Dataset.from_tensor_slices(trial_data.loc[:,first_feature:].values),
        window_size=WINDOW_SIZE
    )
    if ds is None:
        ds = ds_trial
    else:
        ds = ds.concatenate(ds_trial)

## Model 

In [5]:
normalization = tf.keras.layers.experimental.preprocessing.Normalization()
normalization.adapt(ds.take(100000).map(lambda x, y: x))


model = keras.Sequential(
    [
        keras.layers.Input(shape=(WINDOW_SIZE, 66)),
        normalization,
        keras.layers.Flatten(),
        keras.layers.Dense(16, activation="relu", name="layer1"),
        keras.layers.Dense(4, activation="relu", name="layer2"),
        keras.layers.Dense(1, activation= "sigmoid" ),
    ]
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
normalization (Normalization (None, 50, 66)            133       
_________________________________________________________________
flatten (Flatten)            (None, 3300)              0         
_________________________________________________________________
layer1 (Dense)               (None, 16)                52816     
_________________________________________________________________
layer2 (Dense)               (None, 4)                 68        
_________________________________________________________________
dense (Dense)                (None, 1)                 5         
Total params: 53,022
Trainable params: 52,889
Non-trainable params: 133
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.binary_crossentropy, 
              metrics=['accuracy'])
model.fit(ds.shuffle(buffer_size=2**18).batch(128), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

In [None]:
y = np.array([y for _, y in ds])
ypred = model.predict(ds.batch(128)).flatten()
yhat = np.asarray(ypred > 0.5, dtype=np.float)
plt.plot(p[:4000])

In [None]:
plt.plot(y[:100000])
plt.plot(yhat[:100000])

In [None]:
np.sum(y == yhat) / len(y)

In [None]:
wav = np.vstack((y, ypred, yhat)).T
wavfile.write("vtech_preds_late.wav", 25, wav)