In [1]:
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
from tqdm import tqdm_notebook

Using TensorFlow backend.


In [2]:
train = pd.read_hdf('train_test.hdf', 'train')
test = pd.read_hdf('train_test.hdf', 'test')

ts_columns = ['motion', 'temperature', 
              'sweat_10', 'sweat_11', 'sweat_12', 'sweat_13', 'sweat_14', 'sweat_15', 'sweat_16',
              'sweat_r0', 'sweat_r1', 'sweat_r2', 'sweat_r3', 'sweat_r4', 'sweat_r5', 'sweat_r6']

# Make TFRecords

In [3]:
def value_features(values):
    if type(values[0]) == np.float64:
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))
    elif type(values[0]) == np.float:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[np.float64(v) for v in values]))
    if type(values[0]) == np.int64:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
    elif type(values[0]) == str:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[v.encode() for v in values]))

def value_feature(value):
    if type(value) == np.float64:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))
    elif type(value) == np.float:
        return tf.train.Feature(float_list=tf.train.FloatList(value=[np.float64(value)]))
    elif type(value) == np.int64:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    elif type(value) == int:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    elif type(value) == str:
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value.encode()]))

In [4]:
info = pd.read_excel('Infos.xlsx').set_index(['id', 'study'])
info.sex.fillna('NaN', inplace=True)
info.self_size.fillna('NaN', inplace=True)
info.shirt_size.fillna('NaN', inplace=True)

In [5]:
def make_tfrecords(data, filepath, is_train=True):
    writer = tf.python_io.TFRecordWriter(filepath)
    for _, g in tqdm_notebook(data.groupby(level=[0,1,2,3])):
        cur_info = info.loc[_[0], _[2]]
        feature = {'id': value_feature(_[0]), 'boardmac': value_feature(_[1]), 
                   'study': value_feature(_[2]), 'period': value_feature(_[3])}
        feature.update({column: value_feature(cur_info[column])
                   for column in ['sex', 'age', 'self_size', 'shirt_size', 'deodorant_left', 'deodorant_right']})
        feature.update({'sequence_length': value_feature(g.shape[0])})
        feature.update({column: value_features(g[column].values) for column in (ts_columns if is_train else ['motion', 'temperature'])})
        feature.update({'timestamp': value_features([np.int64(dt.timestamp()) for dt in g['datetime'].dt.tz_localize('UTC').dt.to_pydatetime()])})
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())

In [None]:
make_tfrecords(train, 'train.tfrecords')

In [13]:
make_tfrecords(test, 'test.tfrecords', is_train=False)

A Jupyter Widget




# Baseline Model

In [6]:
from functools import partial

class Baseline:
    
    def __init__(self, filepath, batch_size, is_train=True):
        with tf.Graph().as_default() as graph:

            dataset = tf.data.TFRecordDataset(filepath).map(partial(self._parse_function, train=is_train))
            if is_train:
                dataset = dataset.shuffle(256)
            dataset = dataset.padded_batch(batch_size, padded_shapes=dataset.output_shapes)
            self.iterator = dataset.make_initializable_iterator()
            self.example = self.iterator.get_next()

            rnn_cell = keras.layers.LSTMCell(units=14)
            rnn = keras.layers.RNN(rnn_cell, return_sequences=True)
            t = tf.cast(self.example['timestamp'], tf.float32)
            t_diff = t[:,1:] - t[:, :-1]
            x = tf.stack([t_diff, self.example['motion'][:, 1:] / 960.0, self.example['temperature'][:, 1:] / 85.125], 2)
            self.y_pred = rnn(x)
            if is_train:
                y_true = tf.stack([self.example[c][:, 1:] / 738600.0
                                   for c in [c for c in ts_columns if c.startswith('sweat')]], 2)

                self.loss = tf.reduce_mean(tf.reduce_mean(keras.losses.mean_squared_error(y_true, self.y_pred), 1), 0)

                opt = tf.train.AdamOptimizer()
                self.opt_op = opt.minimize(self.loss)

            self.saver = tf.train.Saver()

            self.graph = graph

    @classmethod
    def _parse_function(self, example_proto, train=True):
        features = {
            'id': tf.FixedLenFeature((), tf.int64),
            'boardmac': tf.FixedLenFeature((), tf.string),
            'study': tf.FixedLenFeature((), tf.int64),
            'period': tf.FixedLenFeature((), tf.int64),
            'sex': tf.FixedLenFeature((), tf.string),
            'age': tf.FixedLenFeature((), tf.float32),
            'self_size': tf.FixedLenFeature((), tf.string),
            'shirt_size': tf.FixedLenFeature((), tf.string),
            'deodorant_left': tf.FixedLenFeature((), tf.int64),
            'deodorant_right': tf.FixedLenFeature((), tf.int64),
            'sequence_length': tf.FixedLenFeature((), tf.int64),
            'timestamp': tf.FixedLenSequenceFeature((), tf.int64, allow_missing=True)
        }
        if train:
            features.update({column: tf.FixedLenSequenceFeature((), tf.float32, allow_missing=True)
                             for column in ts_columns})
        else:
            features.update({column: tf.FixedLenSequenceFeature((), tf.float32, allow_missing=True)
                             for column in ['motion', 'temperature']})

        return tf.parse_single_example(example_proto, features)

## Train

In [29]:
NUM_EPOCHS = 5
model = Baseline('train.tfrecords', 171, is_train=True)
pb = tqdm_notebook(range(NUM_EPOCHS))

with tf.Session(graph=model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    #model.saver.restore(sess, 'model/model.ckpt')
    
    for epoch in pb:
        sess.run(model.iterator.initializer)
        while True:
            try:
                cur_loss, _ = sess.run([model.loss, model.opt_op])
                pb.set_description("Current loss: %f" % cur_loss)
            except:
                break
        model.saver.save(sess, 'model/model.ckpt')

A Jupyter Widget




## Test

In [7]:
sweat_columns = [c for c in ts_columns if c.startswith('sweat')]

In [8]:
model = Baseline('test.tfrecords', 47, is_train=False)
predictions = []
with tf.Session(graph=model.graph) as sess:
    sess.run(tf.global_variables_initializer())
    model.saver.restore(sess, 'model/model.ckpt')
    
    sess.run(model.iterator.initializer)
    while True:
        try:
            examples, preds = sess.run([model.example, model.y_pred])
            for user_ID, boardmac, study, period, timestamp, seq_len, pred in zip(examples['id'], examples['boardmac'], examples['study'], examples['period'], 
                   examples['timestamp'], examples['sequence_length'], preds):
                rows = {'user_ID': user_ID, 'boardmac': boardmac, 'study': study, 'period': period}
               
                rows.update({
                    'timestamp': timestamp[:seq_len]
                })
                rows.update({
                    c: np.pad(p[:seq_len], (max(seq_len - p.shape[0], 0), 0), 'constant') for p, c in zip(np.transpose(pred),sweat_columns) 
                })
                predictions.append(pd.DataFrame(rows))
        except:
            break

INFO:tensorflow:Restoring parameters from model/model.ckpt


# Submission

In [17]:
submission = pd.concat(predictions)

In [19]:
submission['datetime'] = pd.to_datetime(submission['timestamp'], unit='s')
submission['boardmac'] = submission.boardmac.str.decode('UTF')

In [20]:
submission = submission.set_index(['user_ID', 'boardmac', 'study', 'period', 'datetime'])[sweat_columns]

In [12]:
submission.to_csv('submission.csv')

In [13]:
true_test = pd.read_hdf('private_data.hdf', 'true_test').set_index('datetime', append=True)[sweat_columns]

In [25]:
assert np.all(submission.index == true_test.index)

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
mean_squared_error(true_test, submission)

553764573.16888249