In [1]:
import sys
sys.path.append("../..")
import pyadlml
import numpy as np
import pandas as pd

In [2]:
data_path = "/home/chris/code/adlml/datasets/mitlab/"
sub1_act = data_path + "subject1/" + "Activities.csv"
sub1_dev = data_path + "subject1/" + "sensors.csv"
sub1_data = data_path + "subject1/" + "activities_data.csv"
sub2_act = data_path + "subject2/" + "Activities.csv"
sub2_dev = data_path + "subject2/" + "sensors.csv"
sub2_data = data_path + "subject2/" + "activities_data.csv"

In [3]:
from pyadlml.dataset.devices import _check_devices_sequ_order
from pyadlml.dataset.activities import _is_activity_overlapping, correct_activity_overlap
from pyadlml.dataset.mitlab import read_data, _load_device_map, _load_activity_map

In [None]:
df_sub1_dev = _load_device_map(sub1_dev)
df_sub1_act = _load_activity_map(sub1_act)
df_sub2_dev = _load_device_map(sub2_dev)
df_sub2_act = _load_activity_map(sub2_act)

subj1_df_dev, subj1_df_act = read_data(sub1_data, df_sub1_dev, df_sub1_act)
subj2_df_dev, subj2_df_act = read_data(sub2_data, df_sub2_dev, df_sub2_act)

In [None]:
END_TIME = 'end_time'
START_TIME = 'start_time'

def _is_activity_overlapping(df, shift=1):
    """ checks if preceeding activities overlap each other
    """
    assert shift >=1
    df = df.sort_values(by=START_TIME)
    mask = (df[START_TIME].shift(-shift) - df[END_TIME]) < pd.Timedelta('0ms')
    return not df[mask].empty


In [6]:
def _get_overlapping_activities(df, shift=1):
    """ gets all activities that have an overlap
    """
    assert shift >=1
    
    df = df.copy()
    df = df.sort_values(by=START_TIME)
    df = df.reset_index(drop=True)
    
    # get all activities that are have in an overlap
    mask = (df[START_TIME].shift(-shift) - df[END_TIME]) < pd.Timedelta('0ms')
    
    # as start_time is shifted upwards to select the right corresp. overlap 
    # shift the mask 'shift' steps downards  
    mask = mask.shift(+shift) | mask
    return df[mask] 

In [10]:
from pyadlml.dataset.activities import _merge_ints
def _correct_overlapping_activities(area_to_correct):
    """
    
    """
    assert len(area_to_correct) >= 2
    
    result = _create_activity_df()
    stack = area_to_correct.copy().reset_index(drop=True)
    #print('area_to_correct: ', area_to_correct)
    
    while True:
        #print('~'*50)            
        #print('stack:\n', stack, '\n')
        # pop first and second item from stack if they overlap otherwise 
        # append to result until two items overlap
        while True:                
            current_row = stack.iloc[0]
            ov = stack.iloc[1]

            # if they don't overlap push onto result otherwise proceed with merging 
            # procedure
            int1 = pd.Interval(current_row.start_time, current_row.end_time)
            int2 = pd.Interval(ov.start_time, ov.end_time)
            if int1.overlaps(int2):
                stack = stack.iloc[2:]
                break
            # the case when the last two activities on stack don't overlap
            elif stack.iloc[2:].empty:
                result = result.append(stack)
                return result
            else:
                result = result.append(current_row)
                stack = stack.iloc[1:]

 
        new_rows = _merge_ints(current_row, ov)

        if stack.empty: 
            result = result.append(new_rows)
            return result
        else:
            result = result.append(new_rows.iloc[0,:])

        new_rows = new_rows.iloc[1:]
        stack = stack.append(new_rows)
        stack = stack.sort_values(by='start_time')
    return result


In [30]:
from pyadlml.dataset.activities import _create_activity_df
def _join_to_interval(x):
    return pd.Interval(x[0], x[1])

def correct_activity_overlap(df):
    """ solve the merge overlapping interval problem
        worst runtime is O(n^2)
        average runtime is O()
        
    Parameters
    ----------
    df : pd.DataFrame
        Activity dataframe with the columns
        
    Returns
    -------
    df : pd.DataFrame 
        corrected activity dataframe
    corrections : list
        a list of tuples with the areas that had to be corrected and the corrections
    """
    
    # 1. sor the given list of time intervals in ascending order of
    #    starting time
    df = df.copy()
    df = df.sort_values(START_TIME)
    df = df.reset_index(drop=True)
    
    res = _create_activity_df()
    corrections = [] # list of tuples
    
    # get all activities that have in an overlap with a direct preceding interval
    mask = (df[START_TIME].shift(-1) - df[END_TIME]) < pd.Timedelta('0ms')
    idxs_succ_overlaps = list(df[mask].index)
    
    print('len: df', len(df)) # debug
    print(idxs_succ_overlaps) # debug
    i_l, i = 0, 0
    while i < len(idxs_succ_overlaps):
        print('#'*20)
        i_h = idxs_succ_overlaps[i]
        print('res: ', i_l, ' to ', i_h)
        res = res.append(df.iloc[i_l:i_h,:])
        
        # get index of first element that start_time is lesser than the end time
        # this marks the point where we can copy again indices
        i_l = i_h
        i_h = list(df[df[START_TIME] > df.iloc[i_h,:].end_time].index)[0]-1
        
        # only for last iteration
        if i == len(idxs_succ_overlaps)-1:
            area_to_correct = df.iloc[i_l:i_h+1,:]
            result = _correct_overlapping_activities(area_to_correct)                             
            corrections.append((area_to_correct, result))
            res = res.append(result)
            break
            
        # if the index exceed the next pair of overlapping indicies, extent the range to
        # include them
        if i_h >= idxs_succ_overlaps[i+1]:
            while idxs_succ_overlaps[i+1] <= i_h:                
                i_h = list(df[df[START_TIME] > df.iloc[i_h,:].end_time].index)[0]-1
                i += 1
        area_to_correct = df.iloc[i_l:i_h+1,:]
        result = _correct_overlapping_activities(area_to_correct)                             
        corrections.append((area_to_correct, result))        
        res = res.append(result)        
        i += 1
        print('i_l: ', i_l, 'i_h: ', i_h)
        
    res = res.sort_values(by=START_TIME)
    res = res.reset_index(drop=True)
    return res, corrections

In [35]:
#for i in range(1, 5):
    #print('sub1 act overlap?: ', _is_activity_overlapping(subj1_df_act, shift=i))
df, cor_lst = correct_activity_overlap(subj1_df_act)
#print('sub2 act overlap?: ', _is_activity_overlapping(subj2_df_act))    
#print('sub2 dev order?: ', _check_devices_sequ_order(subj2_df_dev))
#assert len(df) >= len(subj1_df_act)
for a, b in cor_lst:
    print(a)
    print('=>')
    print(b)
    print('~'*20)

len: df 295
[142, 243, 244, 284]
####################
res:  0  to  142
i_l:  142 i_h:  143
####################
res:  142  to  243
i_l:  243 i_h:  245
####################
res:  243  to  284
             start_time            end_time  activity
142 2003-04-04 06:39:31 2003-04-04 07:05:33   Bathing
143 2003-04-04 07:05:31 2003-04-04 07:09:40  Dressing
=>
               start_time            end_time  activity
0 2003-04-04 06:39:31.000 2003-04-04 07:05:31   Bathing
1 2003-04-04 07:05:31.001 2003-04-04 07:09:40  Dressing
~~~~~~~~~~~~~~~~~~~~
             start_time            end_time   activity
243 2003-04-09 20:02:09 2003-04-09 20:18:45   Grooming
244 2003-04-09 20:07:09 2003-04-09 21:21:01      Other
245 2003-04-09 21:13:07 2003-04-09 21:25:58  Toileting
=>
               start_time            end_time   activity
0 2003-04-09 20:02:09.000 2003-04-09 20:07:09   Grooming
0 2003-04-09 20:07:09.001 2003-04-09 21:13:07      Other
1 2003-04-09 21:13:07.001 2003-04-09 21:25:58  Toileting
~~~~

In [13]:
subj1_df_act.iloc[140:].head(n=50)

Unnamed: 0,start_time,end_time,activity
140,2003-04-04 06:10:03,2003-04-04 06:11:05,Doing laundry
141,2003-04-04 06:12:04,2003-04-04 06:12:49,Toileting
142,2003-04-04 06:39:31,2003-04-04 07:05:33,Bathing
143,2003-04-04 07:05:31,2003-04-04 07:09:40,Dressing
144,2003-04-04 07:10:39,2003-04-04 07:21:18,Preparing breakfast
145,2003-04-04 07:21:28,2003-04-04 07:24:54,Grooming
146,2003-04-04 10:54:09,2003-04-04 11:39:59,Preparing lunch
147,2003-04-04 12:27:33,2003-04-04 12:30:05,Doing laundry
148,2003-04-04 12:30:17,2003-04-04 12:31:10,Toileting
149,2003-04-04 13:44:59,2003-04-04 13:55:42,Cleaning


In [715]:
def load(sub1_dev, sub1_act, sub1_data, sub2_dev, sub2_act, sub2_data):
    pass

        

    #    subj1_df_act = correct_activity_overlap(subj1_df_act)

In [8]:
load(sub1_dev, sub1_act, sub1_data, sub2_dev, sub2_act, sub2_data)

sub1 act overlap?:  True
sub2 act overlap?:  True


ValueError: 11 start_time        2003-05-01 17:22:49
end_time          2003-05-01 17:22:51
device        115 - Kitchen Microwave
Name: 24, dtype: object start_time        2003-05-01 11:05:39
end_time          2003-05-01 11:05:41
device        115 - Kitchen Microwave
Name: 30, dtype: object