In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os
import sys
import dateutil.parser

from __future__ import print_function

In [3]:
sys.path.insert(0, os.path.abspath('..'))

from UKMovementSensing import dataprep

In [4]:
data_root_path = "/media/sf_VBox_Shared/London/UCL_data_Jan2017/" 
accelerometer_5sec_path = os.path.join(data_root_path, 'accelerometer_5second/')
subsets_path = os.path.join(data_root_path, 'subsets/')

In [73]:
def process_data(filepath, subsets_path):
    """
    Load all accelerometer data in the specified directory,
    and join with annotations.
    Parameters
    ----------
    annotations_codes : Pandas DataFrame
        DataFrame with annotations and wearcodes (result from join_wearcodes)
    filepath : str
        Directory of the accelerometer data files
    Returns
    -------
    dict holding all the merged dataframe
    """
    for fn in os.listdir(filepath):
        dfs = {}
        # Load data
        data = pd.read_csv(os.path.join(filepath, fn),
                           index_col='timestamp', parse_dates=[0],
                           infer_datetime_format=True)
        data.index = data.index.tz_localize('Europe/London')
        data = data.dropna()
        data['filename'] = fn
        # Keep all data frames
        dfs[(fn, 1)] = data

        # Take subsequences
        subsets = dataprep.take_subsequences(dfs)

        # Switch positions
        subsets_switched = dataprep.switch_positions(subsets)

        # Save file
        dataprep.save_subsequences(subsets, subsets_path)
    return dfs

In [74]:
%pdb on
process_data(accelerometer_5sec_path, subsets_path)

Automatic pdb calling has been turned ON
switched dataset with median 6.721257
switched dataset with median 3.868726
switched dataset with median 25.470679
switched dataset with median 34.190342
switched dataset with median 19.850993
switched dataset with median 24.903292
switched dataset with median 10.000154
switched dataset with median 38.213411
switched dataset with median 36.847239
switched dataset with median 33.555606
switched dataset with median 14.732124
switched dataset with median 29.725724
switched dataset with median 8.816002
switched dataset with median 22.524306
switched dataset with median 27.502124
switched dataset with median 26.822321
switched dataset with median 16.291079
switched dataset with median 18.934421
switched dataset with median 28.640689
switched dataset with median 37.213232
switched dataset with median 14.682658
switched dataset with median 16.398199
switched dataset with median 21.263895
switched dataset with median 1.199933
switched dataset with media

KeyboardInterrupt: 

> [1;32m/home/dafne/anaconda2/envs/ucl2/lib/python2.7/site-packages/pandas/io/parsers.py[0m(1332)[0;36mread[1;34m()[0m
[1;32m   1330 [1;33m[1;33m[0m[0m
[0m[1;32m   1331 [1;33m            [1;32melse[0m[1;33m:[0m[1;33m[0m[0m
[0m[1;32m-> 1332 [1;33m                [1;32mraise[0m[1;33m[0m[0m
[0m[1;32m   1333 [1;33m[1;33m[0m[0m
[0m[1;32m   1334 [1;33m        [1;31m# Done with first read, next time raise StopIteration[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> exit()


In [85]:
%pdb off

Automatic pdb calling has been turned OFF


In [6]:
annotations = pd.read_csv('/media/sf_VBox_Shared/London/UCL_data_Jan2017/test_tud.csv')

In [7]:
annotations = dataprep.remove_invalid_annotations(annotations)

# Convert timestamps to datetime
annotations['start_time'] = [dataprep.parse_time(s) for s in annotations['start_time']]
annotations['end_time'] = [dataprep.parse_time(s) for s in annotations['end_time']]

Number of missing start and end times: 0 0


In [8]:
annotations = annotations.sort_values(['accSmallID', 'day', 'slot'])
annotations.index = list(range(annotations.shape[0]))

In [12]:
differences = [x-y for x,y in zip(annotations['end_time'], annotations['start_time'])]
diff_indices = [x != pd.Timedelta(10, unit='m') for x in differences]

In [13]:
differences

[Timedelta('0 days 00:10:00'),
 Timedelta('0 days 00:10:00'),
 Timedelta('0 days 00:10:00'),
 Timedelta('0 days 00:10:00')]

In [123]:
time_a = dataprep.parse_time("2015-03-29T00:50:00+0000")
time_b = dataprep.parse_time("2015-03-29T02:00:00+0100")

In [16]:
%pdb on
annotations = dataprep.process_annotations("/media/sf_VBox_Shared/London/UCL_data_Jan2017/tud.csv")

Automatic pdb calling has been turned ON
Number of missing start and end times: 0 0
Multiple end times per slot. first 10:
slot
1                        [04:10:00, 10:10:00, 08:10:00]
2              [04:20:00, 10:20:00, 08:30:00, 04:10:00]
3     [04:30:00, 10:30:00, 09:00:00, 04:40:00, 04:20...
4     [04:40:00, 10:40:00, 09:10:00, 04:50:00, 04:30...
5     [04:50:00, 10:50:00, 09:20:00, 05:00:00, 04:40...
6     [05:00:00, 05:10:00, 11:00:00, 10:00:00, 04:50...
7     [05:10:00, 05:20:00, 11:10:00, 12:40:00, 05:00...
8     [05:20:00, 05:30:00, 11:20:00, 12:50:00, 05:10...
9     [05:30:00, 05:40:00, 11:30:00, 13:00:00, 05:20...
10    [05:40:00, 05:50:00, 11:40:00, 13:10:00, 05:30...
Name: end_time_time, dtype: object
Not all days have 144 slots! first 5:
accSmallID  day
7a1fd48f    1      137
7a2bbb74    1      118
            2       90
7a2e1cd4    1      138
7a44256e    2       72
Name: slot, dtype: int64
