In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd

# Motion data loading

In [21]:
# Tab-separated motion log without a header row; the first column becomes the index.
motion_path = "motion.tsv"
df_motion = pd.read_csv(
    motion_path,
    sep="\t",
    header=None,
    index_col=0,
)

In [22]:
# Preview the first rows as loaded (the index still holds the raw timestamp strings).
df_motion.head()

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-23T05:45:16.325908+00:00,0,0,0,0,0
2017-05-23T05:45:33.516809+00:00,1,0,0,0,0
2017-05-23T05:50:12.286779+00:00,0,0,0,0,0
2017-05-23T05:50:18.664725+00:00,1,0,0,0,0
2017-05-23T05:59:42.986589+00:00,0,0,0,0,0


# Motion data formatting

In [23]:
# Convert the string index to datetimes. `infer_datetime_format` is deprecated
# since pandas 2.0 (format inference is now the default behaviour), so it is
# omitted here; the parsed values are unchanged.
# NOTE(review): recent pandas versions presumably keep the "+00:00" offset and
# return a tz-aware (UTC) index, unlike the naive timestamps in the saved
# output below — confirm which one the rest of the analysis expects.
df_motion.index = pd.to_datetime(df_motion.index)

In [24]:
# Preview again to check the index after the datetime conversion.
df_motion.head()

Unnamed: 0_level_0,1,2,3,4,5
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-23 05:45:16.325908,0,0,0,0,0
2017-05-23 05:45:33.516809,1,0,0,0,0
2017-05-23 05:50:12.286779,0,0,0,0,0
2017-05-23 05:50:18.664725,1,0,0,0,0
2017-05-23 05:59:42.986589,0,0,0,0,0


In [7]:
# TODO: the motion log carries a lot of unnecessary information; see whether the
# amount of data can be reduced, for example by merging consecutive rows.
# Column 2 is used as the walking signal (per the variable name) —
# presumably a 0/1 state column; TODO confirm against the data description.
df_walking = df_motion.loc[:, 2]

## Time span

In [8]:
# Length of the look-back window over which activity is aggregated.
analysis_time_span = pd.Timedelta(hours=48)
# First instant for which a complete look-back window is available.
first_date = analysis_time_span + df_walking.index[0]

In [9]:
# Display the first output timestamp (first record + analysis_time_span).
first_date

Timestamp('2017-05-25 05:45:16.325908')

In [10]:
# Step between two consecutive values of the output series.
time_serie_frequency = pd.Timedelta(minutes=15)
# Whole number of steps that fit between first_date and the last record.
remaining_span = df_walking.index[-1] - first_date
number_of_values = int(remaining_span / time_serie_frequency)
# Timestamp reached after that whole number of steps.
last_date = first_date + number_of_values * time_serie_frequency
print(last_date)

2017-06-02 23:45:16.325908


# First task

In [11]:
# One slot per 15-minute step: x[i] will hold the minutes of activity over the
# analysis_time_span window ending at first_date + i * time_serie_frequency.
x = np.zeros(number_of_values)

In [12]:
# TODO: iterating over the Series row by row is not very efficient. Think of a vectorized alternative.
def time_in_activity_between(s, start, stop):
    """Compute the time spent in a certain activity between start and stop time.

    The series ``s`` is read as a log of state records: a value of 1 means the
    activity is running from that timestamp on, a value of 0 means it is not.
    The record just before ``start`` is looked up so that an activity already
    running when the window opens is counted from ``start``; an activity still
    running when the window closes is counted up to ``stop``. Records whose
    timestamp equals ``start`` or ``stop`` are part of the window.

    Arguments:
        - s (pd.Series): the series containing the activity state with the time
        in index. The index is expected to be sorted in increasing order.
        - start (pd.Timestamp): the beginning of the period of interest.
        - stop (pd.Timestamp): the end of the period of interest.

    Returns:
        - float: the time spent in activity expressed in minutes. When there is
        no record before ``start``, the initial state is taken as inactive.
    """
    one_minute = pd.Timedelta(minutes=1)
    s_interest = s.loc[start:stop]

    # Number of records strictly before `start`. Using a sorted search instead
    # of `get_loc` keeps this an integer even with duplicated timestamps, and
    # handles the "no earlier record" case without raising.
    n_before = s.index.searchsorted(start, side="left")

    # `active_since` is None while inactive, otherwise the instant from which
    # the current activity stretch is counted.
    active_since = None
    if n_before > 0 and s.iloc[n_before - 1] == 1:
        # The activity was already running when the window opened.
        active_since = start

    res = 0.0
    for time, state in s_interest.items():
        if state == 1:
            # Repeated 1s extend the current stretch; only the first one opens it.
            if active_since is None:
                active_since = time
        elif state == 0:
            if active_since is not None:
                res += (time - active_since) / one_minute
                active_since = None

    # Account for an activity still running at the end of the window.
    if active_since is not None:
        res += (stop - active_since) / one_minute
    return res

In [13]:
# Seed the series with the activity time over the full window ending at first_date.
x[0] = time_in_activity_between(
    s=df_walking,
    start=first_date - analysis_time_span,
    stop=first_date,
)

In [14]:
# Slide the window one step at a time: rather than recomputing the whole
# span, add the 15 minutes entering at the end of the window and subtract
# the 15 minutes leaving at its beginning.
for i in range(1, number_of_values):
    step_start_offset = (i - 1) * time_serie_frequency
    step_stop_offset = i * time_serie_frequency
    time_to_add = time_in_activity_between(
        s=df_walking,
        start=first_date + step_start_offset,
        stop=first_date + step_stop_offset,
    )
    time_to_remove = time_in_activity_between(
        s=df_walking,
        start=first_date - analysis_time_span + step_start_offset,
        stop=first_date - analysis_time_span + step_stop_offset,
    )
    x[i] = x[i - 1] + time_to_add - time_to_remove

In [16]:
# print(x)

# Glucose data loading

In [19]:
# Comma-separated glucose readings without a header row; the first column becomes the index.
glucose_path = "blood-glucose.csv"
df_glucose = pd.read_csv(
    glucose_path,
    sep=",",
    header=None,
    index_col=0,
)

# Glucose data formatting

In [25]:
# Convert the string index to datetimes. `infer_datetime_format` is deprecated
# since pandas 2.0 (format inference is now the default behaviour), so it is
# omitted here; the parsed values are unchanged.
df_glucose.index = pd.to_datetime(df_glucose.index)

## Missing value inference

I chose a simple way to infer the missing values: linear interpolation as offered by pandas.

In [34]:
# Fill the gaps in place; "linear" is the pandas default, spelled out to match the text above.
df_glucose.interpolate(method="linear", inplace=True)

In [46]:
# Quick fix: overwrite rows 0-41 with the value found at row 42.
# NOTE(review): presumably rows 0-41 are leading NaNs that `interpolate` left
# untouched and row 42 is the first valid reading — confirm; if so,
# `df_glucose.bfill()` would avoid the hard-coded position.
filling_value = df_glucose.iloc[42]
# Calling float() directly on a one-element Series is deprecated in recent
# pandas; extract the scalar explicitly before converting.
df_glucose.iloc[0:42, :] = float(filling_value.iloc[0])

In [47]:
# Show only the first rows (the ones touched by the fill above) rather than
# dumping the whole frame into the notebook output.
df_glucose.head(10)

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
2017-05-23 00:00:00,76.0
2017-05-23 00:15:00,76.0
2017-05-23 00:30:00,76.0
2017-05-23 00:45:00,76.0
2017-05-23 01:00:00,76.0
2017-05-23 01:15:00,76.0
2017-05-23 01:30:00,76.0
2017-05-23 01:45:00,76.0
2017-05-23 02:00:00,76.0
2017-05-23 02:15:00,76.0
