In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
import sys
import dateutil.parser

from __future__ import print_function

In [2]:
# Put the directory two levels above the current working directory at the front of
# the module search path (presumably the folder that contains the UKMovementSensing
# package -- verify against the repository layout).
sys.path.insert(0, os.path.abspath(os.path.join(os.pardir, os.pardir)))

from UKMovementSensing import dataprep

In [3]:
# Load the notebook settings by executing config.py in this namespace.
# The cells below use names that are not defined anywhere in this notebook
# (annotations_path, wearcodes_path, accelerometer_5sec_path, subset_path);
# presumably config.py provides them -- verify against that file.
# `execfile` only exists in Python 2; reading the file and exec-ing it works on
# both Python 2 and 3, and the `with` block makes sure the file is closed again.
with open('config.py') as config_file:
    exec(compile(config_file.read(), 'config.py', 'exec'))

## Run the preprocessing script

This section runs all the preprocessing steps. The same can be done by running the script `modelgen.py` directly from the command line, as follows:

`python modelgen.py file_path annotations_path wearcodes_path output_path`

In [4]:
# Process the annotations located via annotations_path. Judging by the recorded
# output of this cell, this step also reports the number of missing start/end
# times, slots with multiple end times, and (accSmallID, day) combinations that
# do not have 144 slots.
print('Process annotations...')
annotations = dataprep.process_annotations(annotations_path)
# Combine the processed annotations with the wear codes located via wearcodes_path.
# NOTE(review): the exact join logic lives in dataprep.join_wearcodes -- check there.
print('Join wearcodes...')
annotations_codes = dataprep.join_wearcodes(wearcodes_path, annotations)

Process annotations...
Number of missing start and end times: 0 0
Multiple end times per slot. first 10:
slot
1                        [04:10:00, 10:10:00, 08:10:00]
2              [04:20:00, 10:20:00, 08:30:00, 04:10:00]
3     [04:30:00, 10:30:00, 09:00:00, 04:40:00, 04:20...
4     [04:40:00, 10:40:00, 09:10:00, 04:50:00, 04:30...
5     [04:50:00, 10:50:00, 09:20:00, 05:00:00, 04:40...
6     [05:00:00, 05:10:00, 11:00:00, 10:00:00, 04:50...
7     [05:10:00, 05:20:00, 11:10:00, 12:40:00, 05:00...
8     [05:20:00, 05:30:00, 11:20:00, 12:50:00, 05:10...
9     [05:30:00, 05:40:00, 11:30:00, 13:00:00, 05:20...
10    [05:40:00, 05:50:00, 11:40:00, 13:10:00, 05:30...
Name: end_time_time, dtype: object
Not all days have 144 slots! first 5:
accSmallID  day
7a1fd48f    1      137
7a2bbb74    1      118
            2       90
7a2e1cd4    1      138
7a44256e    2       72
Name: slot, dtype: int64
Join wearcodes...


In [None]:
# Run the remaining preprocessing on the accelerometer data in a single call.
# NOTE(review): going by the function name, files are handled one at a time;
# presumably input is read from accelerometer_5sec_path and results are written
# to subset_path -- confirm in dataprep.process_data_onebyone.
print('Process data one by one...')
dataprep.process_data_onebyone(annotations_codes, accelerometer_5sec_path,  subset_path)

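Alternatively, run each step separately for all the accelerometer files. This keeps the intermediate results (`dfs` and `subsets`) in memory so that they can be inspected; the "Investigate accelerometer data" and "Investigate subsequences" code blocks below require them, so uncomment the next cell before running those.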

In [6]:
# NOTE: this cell is disabled. It is the only place where `dfs` and `subsets` are
# created, and the inspection cells further down read both of them, so uncomment
# the lines below before running those cells.
# print('Process data...')
# dfs = dataprep.process_data(annotations_codes, accelerometer_5sec_path)
# print('Save merged...')
# dataprep.save_merged(dfs, merged_path)
# print('Take subsequences...')
# subsets = dataprep.take_subsequences(dfs)
# print('Switch positions...')
# subsets = dataprep.switch_positions(subsets)
# print('Save subsequences...')
# dataprep.save_subsequences(subsets, subset_path)

## Investigate annotations

In [7]:
# Show the dimensions (rows, columns) of the processed annotations, followed by
# the first rows as the cell's displayed value.
print(annotations.shape)
annotations.head()

(41242, 7)


Unnamed: 0,slot,day,activity,label,start_time,end_time,accSmallID
0,1,1,1,Sleeping and resting (including sick in bed),2015-03-11 04:00:00+00:00,2015-03-11 04:10:00+00:00,7a0f03e6
1,2,1,1,Sleeping and resting (including sick in bed),2015-03-11 04:10:00+00:00,2015-03-11 04:20:00+00:00,7a0f03e6
2,3,1,1,Sleeping and resting (including sick in bed),2015-03-11 04:20:00+00:00,2015-03-11 04:30:00+00:00,7a0f03e6
3,4,1,1,Sleeping and resting (including sick in bed),2015-03-11 04:30:00+00:00,2015-03-11 04:40:00+00:00,7a0f03e6
4,5,1,1,Sleeping and resting (including sick in bed),2015-03-11 04:40:00+00:00,2015-03-11 04:50:00+00:00,7a0f03e6


In [None]:
# To check: do we have any gaps? A gap is a pair of consecutive rows that belong
# to the same accelerometer ID and the same day, where the end_time of the earlier
# row differs from the start_time of the later row.
# The processed annotations have no 'serflag' / 'tud_day' columns (see the output
# of annotations.head() above); the matching columns are 'accSmallID' and 'day'.
previous = annotations[['accSmallID', 'day', 'end_time']].shift(1)
same_person_and_day = ((annotations['accSmallID'] == previous['accSmallID'])
                       & (annotations['day'] == previous['day']))
is_gap = same_person_and_day & (annotations['start_time'] != previous['end_time'])
# Use row positions rather than index labels, so the check does not depend on
# `annotations` having a default 0..n-1 index. Position 0 can never be flagged,
# because its shifted (previous) values are missing and compare as unequal.
for pos in np.flatnonzero(is_gap.values):
    print(annotations.iloc[[pos - 1, pos]][['start_time', 'end_time']])
    print(annotations['end_time'].iloc[pos - 1] - annotations['start_time'].iloc[pos])

## Investigate join with wearcodes

In [None]:
# Show the dimensions (rows, columns) of the annotations after joining the wear
# codes, followed by the first rows as the cell's displayed value.
print(annotations_codes.shape)
annotations_codes.head()

## Investigate accelerometer data

In [None]:
# Pick one entry of `dfs` to look at; its key unpacks into (binfile, day).
# `dfs` is only created by the disabled "Process data..." cell above, so that
# cell has to be uncommented and run first.
# list(...) is needed because the result of dict.keys() cannot be indexed on Python 3.
example_key = list(dfs.keys())[0]
binfile, day = example_key
df = dfs[example_key]
df.head()

In [None]:
# Print the time zone attached to the first index entry of the example frame.
# NOTE(review): assumes df has a datetime index whose entries expose `.tz`
# (which would be None for timezone-naive timestamps) -- confirm.
t = df.index[0]
print(t.tz)

## Investigate subsequences

In [None]:
# Create boxplots of the angles for each sequence.
# We expect x (and thus anglex) to be distributed either mostly on the negative or mostly on the positive half
# `subsets` is only created by the disabled "Take subsequences..." cell above, so
# that cell has to be uncommented and run first.
maxplotsnr = 15
# list(...) is needed because the result of dict.values() cannot be sliced on Python 3.
plotsets = list(subsets.values())[:maxplotsnr]
# Create one row of axes per sequence that is actually plotted (at least one, since
# subplots() rejects zero rows) and keep the height per plot the same as for a
# full set of `maxplotsnr` plots.
nrows = max(len(plotsets), 1)
# squeeze=False always returns a 2-D array of axes, also when there is only one row.
fig, axes = plt.subplots(nrows, figsize=(10, 50.0 * nrows / maxplotsnr), squeeze=False)
for i, dataset in enumerate(plotsets):
    # Leave out rows with activity code 1.0 (NOTE(review): presumably 'act' holds the
    # annotation activity code, where 1 is "Sleeping and resting" -- confirm).
    non_sleeping_indices = dataset['act'] != 1.0
    non_sleeping = dataset[non_sleeping_indices]
    print(np.median(non_sleeping['anglex']), np.median(non_sleeping['angley']))
    axes[i, 0].boxplot([non_sleeping['anglex'], non_sleeping['angley'], non_sleeping['anglez']], labels=['x', 'y', 'z'])