In [1]:
import os
import numpy as np
import pandas as pd
import datetime
from glob import glob
import bisect
from bisect import bisect_left
from tqdm import tqdm

### Example code:
1. Convert local time string to timestamp
2. Match personal-daybreak days in study to timestamp
3. Match nearest-time sensor log to timestamp
4. X-min window of sensor data aggregates before/after timestamp

### 1. Convert local time string to timestamp


In [4]:
'''
Input: dataframe, column name of local time, column name of timestamp
Output: dataframe
'''

time_offset_dict = {
    "CDT": "UTC-05",
    "CST": "UTC-06",
    "MDT": "UTC-06",
    "MST": "UTC-07",
    "PDT": "UTC-07",
    "PST": "UTC-08",
    "EDT": "UTC-04",
    "EST": "UTC-05",
    "AKDT": "UTC-08",
    "AKST": "UTC-09",
    "HDT": "UTC-09",
    "HST": "UTC-10"
}

def get_time_offset(time_zone_abbr):
    time_delta = datetime.timedelta(hours=0)
    sign = 0

    if time_zone_abbr in time_offset_dict:
        time_offset = time_offset_dict[time_zone_abbr]
        time_delta, sign = parse_time_offset(time_offset)

    return time_delta, sign

def convert_local_time_to_timestamp(local_time):
    if len(local_time.split(" ")) < 3:
        timestamp_str = "unknown time zone"
    else:
        time_zone = local_time.split(" ")[2]
        time_delta, sign = get_time_offset(time_zone)
        if time_zone == "unknownTZ" or sign == 0:
            timestamp_str = "unknown time zone"
        else:
            local_time = local_time.split(" ")[0] + " " + local_time.split(" ")[1]
            if "." in local_time:
                datetime_time = datetime.datetime.strptime(local_time, "%Y-%m-%d %H:%M:%S.%f")
            else:
                datetime_time = datetime.datetime.strptime(local_time, "%Y-%m-%d %H:%M:%S")

            datetime_time_tz = datetime_time - sign * time_delta
            timestamp_str = datetime_time_tz.replace(tzinfo=datetime.timezone.utc).timestamp()  # float

    return timestamp_str

def parse_time_offset(time_offset):
    sign_str = time_offset.strip('UTC')[0]
    if sign_str == "-":
        sign = -1
    elif sign_str == "+":
        sign = 1
    else:
        sigh = 0

    time_offset_int = int(time_offset.strip('UTC')[1:])
    time_delta = datetime.timedelta(hours=time_offset_int)

    return time_delta, sign

In [26]:
'''
Example
'''

local_time = "2021-06-04 07:00:00 EDT"
convert_local_time_to_timestamp(local_time)

1622804400.0

### 2. Match personal-daybreak days in study to timestamp

In [7]:
'''
Input: personal daybreak daily report, participant_id_text, local time w/o timezone
Output: personal-daybreak days in study
'''

# Read personal daybreak file
personal_daily_report_file_path = "/home/li.jix/repo/TIME/day_definition/adaptive_daily_report.csv"
df_personal = pd.read_csv(personal_daily_report_file_path)
df_personal_lite = df_personal[["participant_id_text","participant_id_numeric","days_in_study","last_breaktime","current_breaktime"]]


# Without timezone info, we don't know the unix time equivalent, but it's ok to assume the converted datetime are in the same time zone and compare among them
def convert_local_time_to_datetime(local_time):
    if pd.isna(local_time):
        return local_time
    
    split = local_time.split(" ")
    if len(split) > 2:
        local_time = split[0] + " " + split[1]
    
    if "/" in local_time:
        local_time_dt = datetime.datetime.strptime(local_time, '%m/%d/%Y %H:%M:%S')
    elif "-" in local_time:
        local_time_dt = datetime.datetime.strptime(local_time, '%Y-%m-%d %H:%M:%S')
    return local_time_dt
        

df_personal_lite["last_breaktime_dt"] = [convert_local_time_to_datetime(x) for x in df_personal_lite["last_breaktime"]]
df_personal_lite["current_breaktime_dt"] = [convert_local_time_to_datetime(x) for x in df_personal_lite["current_breaktime"]]



def take_closest(mySeries, myNumber):

    myList = list(mySeries)
    pos = bisect_left(myList, myNumber)-1
    if pos < 0:
        return np.nan
    elif pos == len(myList)-1:
        return np.nan
    else:
        return mySeries.index[pos]

def get_personal_daybreak_days_in_study(df_personal_lite, participant_id_text, time_to_check):
    df_personal_lite_pid = df_personal_lite[df_personal_lite.participant_id_text == participant_id_text]
    last_breaktime_dt_series = df_personal_lite_pid.last_breaktime_dt
    idx_before = take_closest(last_breaktime_dt_series, time_to_check)
    if pd.isna(idx_before):
        days_in_study = np.nan
    else:
        days_in_study = df_personal_lite_pid.loc[idx_before,"days_in_study"]
    return days_in_study

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
'''
Example
'''

local_time = "2021-06-04 07:00:00"
participant_id_text = "afflictedrevenueepilepsy"
local_time_dt = convert_local_time_to_datetime(local_time)
days_in_study_personal_break = get_personal_daybreak_days_in_study(df_personal_lite, participant_id_text, local_time_dt)
print(days_in_study_personal_break)

2

### 3. Match nearest-time sensor log to timestamp

In [9]:
'''
Input: target_file_pattern, pid, datetime_match_to_list
Output: matched data column, matched time column
'''
def extract_participant_id(intermediate_participant_path):

    participant_id = intermediate_participant_path.split(os.sep)[-1]

    if not participant_id.endswith("@timestudy_com"):
        raise Exception("Wrong format for input folder path. Needs to be '..\\username@timestudy_com'")

    return participant_id


def get_min_diff(prompt_datetime, matched_datetime):
    min_diff = abs((prompt_datetime - matched_datetime).total_seconds() / 60.0)
    return min_diff

def validate_dates_before_after(intermediate_participant_path, date_in_study):
    validated_date_list = []

    # target date
    date_folder_path = intermediate_participant_path + os.sep + date_in_study
    target_date_log_paths = sorted(glob(os.path.join(date_folder_path, target_file_pattern)))  # file name
    if len(target_date_log_paths) == 0:
        print("No battery daily file on {}".format(date_in_study))
    else:

        # 1 day before target date
        date_format = "%Y-%m-%d"
        one_date_before_datetime = datetime.datetime.strptime(date_in_study, date_format).date() - timedelta(days=1)
        one_date_before = one_date_before_datetime.strftime(date_format)
        date_folder_path = intermediate_participant_path + os.sep + one_date_before
        one_day_before_log_paths = sorted(glob(os.path.join(date_folder_path, target_file_pattern)))  # file name
        if len(one_day_before_log_paths) != 0:
            validated_date_list.append(one_date_before)

        # target date
        validated_date_list.append(date_in_study)

        # 1 day after target date
        date_format = "%Y-%m-%d"
        one_date_after_datetime = datetime.datetime.strptime(date_in_study, date_format).date() + timedelta(days=1)
        one_date_after = one_date_after_datetime.strftime(date_format)
        date_folder_path = intermediate_participant_path + os.sep + one_date_after
        one_day_after_log_paths = sorted(glob(os.path.join(date_folder_path, target_file_pattern)))  # file name
        if len(one_day_after_log_paths) != 0:
            validated_date_list.append(one_date_after)

    return validated_date_list

def clean_dataframe(df):
    df.reset_index(inplace=True, drop=True)
    dropped_rows = []
    for idx in df.index:
        local_time_str = df["LOG_TIME"][idx]
        if (local_time_str == "-1") or len(local_time_str.split(' ')) > 3 or len(local_time_str.split('-')[0]) > 4 or len(local_time_str.split(' ')) < 2:
            dropped_rows.append(idx)
    df = df.drop(dropped_rows)

    df.reset_index(inplace=True, drop=True)
    return df

def combine_intermediate_file(intermediate_participant_path):
    df_logs_combined = pd.DataFrame()
    participant_id = extract_participant_id(intermediate_participant_path)
    
    for file in glob(os.path.join(intermediate_participant_path,target_file_pattern)):
        df_logs_combined = pd.read_csv(file)
        

        converter = lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f")
        df_logs_combined = df_logs_combined.dropna(subset=['LOG_TIME'])
        df_logs_combined = clean_dataframe(df_logs_combined)

        try:
            df_logs_combined["LOG_TIME"] = [x.split(" ")[0] + " " + x.split(" ")[1] for x in
                                              list(df_logs_combined["LOG_TIME"])]

            df_logs_combined['LOG_TIMESTAMP'] = pd.Series(map(converter, df_logs_combined["LOG_TIME"]))
            # print(df_logs_combined['Local_Timestamp'])
            df_logs_combined['Date'] = df_logs_combined['LOG_TIMESTAMP'].dt.date
        except IndexError:
            raise Exception(
                "IndexError: list index out of range (battery_level) : " + intermediate_participant_path + os.sep + date_in_study + str(
                    list(df_logs_combined["LOG_TIME"])))
        except Exception:
            raise Exception("Exception (battery_level) : " + intermediate_participant_path + os.sep + date_in_study)

    return df_logs_combined


def find_closest_time(prompt_time, subset_time_list):
    i = bisect.bisect_left(subset_time_list, prompt_time)
    closet_time = min(subset_time_list[max(0, i - 1): i + 2], key=lambda t: abs(prompt_time - t))
    return closet_time


def match_feature(datetime_match_to_list, df_logs_combined):
    print("     --- start matching")
    matched_battery_level_list = []
    matched_charging_status_list = []
    matched_time_list = []

    for idx in range(len(datetime_match_to_list)):
        prompt_time = datetime_match_to_list[idx]
        # prompt_date = prompt_time.date()

        if df_logs_combined.shape[0] == 0:
            battery_level = "NF"
            charging_status = "NF"
            closest_time = "NF"
        else:
            subset_time_list = list(df_logs_combined["LOG_TIMESTAMP"])

            closest_time = find_closest_time(prompt_time, subset_time_list)

            # check if matched time is 5 minutes away from prompt time
            if get_min_diff(prompt_time, closest_time) < 5:
                battery_level = list(df_logs_combined[df_logs_combined['LOG_TIMESTAMP'] == closest_time][
                                         "Percentage"])[0]
                charging_status = list(df_logs_combined[df_logs_combined['LOG_TIMESTAMP'] == closest_time][
                                           "isCharging"])[0]
            else:
                battery_level = "NF"
                charging_status = "NF"


        matched_time_list.append(closest_time)
        matched_battery_level_list.append(battery_level)
        matched_charging_status_list.append(charging_status)

    return matched_battery_level_list, matched_charging_status_list, matched_time_list

In [55]:
'''
Example
'''

# Input
target_file_pattern = 'phone_battery.csv' # sensor intermediate data file to match datetime with
intermediate_data_path = "/work/mhealthresearchgroup/TIME_STD/time_study_preprocess/intermediate_file/"
pid = "arrivejanitoruniformly@timestudy_com"
intermediate_participant_path = os.path.join(intermediate_data_path, pid)
date_in_study = "2021-03-12"
datetime_match_to_list = [convert_local_time_to_datetime("2021-03-12 07:00:00"), convert_local_time_to_datetime("2021-03-12 08:00:00")] # the list of time you want sensor data match to

# Read, parse and combine related intermediate file
df_logs_combined = combine_intermediate_file(intermediate_participant_path)

# Get the battery, charging status matched to the datetime list
if df_logs_combined.shape[0] > 0:
    # Match the combined parsed intermediate file with prompt feature data frame
    battery_level_column, charging_status_column, match_time = match_feature(datetime_match_to_list, df_logs_combined)
else:
    battery_level_column = ["NF"] * len(datetime_match_to_list)
    charging_status_column = ["NF"] * len(datetime_match_to_list)
    match_time = ["NF"] * len(datetime_match_to_list)
    

  


     --- start matching


In [56]:
print(battery_level_column)

[100, 100]


In [57]:
print(charging_status_column)

[True, True]


In [58]:
print(match_time)

[Timestamp('2021-03-12 07:00:18.291000'), Timestamp('2021-03-12 07:59:57.801000')]


### 4. X-min window of sensor data aggregates before/after timestamp

In [10]:

'''
Input: target_file_pattern, pid, datetime_match_to_list
Output: matched data column, matched time column
'''

def combine_intermediate_file(intermediate_participant_path):
    df_logs_combined = pd.DataFrame()
    participant_id = extract_participant_id(intermediate_participant_path)

    for file in glob(os.path.join(intermediate_participant_path,target_file_pattern)):
        df_logs_combined = pd.read_csv(file)
    

        converter = lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f") if ("." in x) else datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")

        df_logs_combined = df_logs_combined.dropna(subset=['LOG_TIME'])
        df_logs_combined.reset_index(inplace=True, drop=True)
        df_logs_combined["LOG_TIME"] = [x.split(" ")[0] + " " + x.split(" ")[1] for x in
                                                 list(df_logs_combined["LOG_TIME"])]

        df_logs_combined['LOG_TIMESTAMP'] = pd.Series(map(converter, df_logs_combined["LOG_TIME"]))


    return df_logs_combined


def find_closest_time(prompt_time, subset_time_list):
    pos = bisect.bisect_left(subset_time_list, prompt_time)
    # closet_time = min(subset_time_list[max(0, i - 1): i + 2], key=lambda t: -(prompt_time - t))

    return pos


def get_mims_summary(sec_before, prompt_time, closest_times, df_logs_combined):
    mims_summary = 0
    num_readings = 0
    start_time = None
    for matched_idx in closest_times:
        closest_time = df_logs_combined.loc[matched_idx, "LOG_TIMESTAMP"]
        if get_min_diff(prompt_time, closest_time) <= (sec_before // 60 + 1):
            mims_min = df_logs_combined.loc[matched_idx, "MIMS_UNIT"]
            if mims_min != -0.01:  # mims value -0.01 means cannot be computed
                mims_summary += mims_min
                num_readings += 1
            start_time = closest_time
        else:
            if start_time is None:
                mims_summary = "OB"
            break
    return mims_summary, num_readings, start_time


def match_feature(datetime_match_to_list, df_logs_combined):
    print("     --- start matching")
    # Aggregating 1-10 minutes before prompt time
    sec_before_list = [60 * x for x in list(range(1, 11))] 

    mims_summary_list = []
    start_time_list = []
    readings_list = []

    for idx in range(len(datetime_match_to_list)):
        matched_mims_summary_list = []
        matched_start_time_list = []
        matched_readings_list = []

        prompt_time = datetime_match_to_list[idx]
        # prompt_date = prompt_time.date()

        if df_logs_combined.shape[0] == 0:
            matched_mims_summary_list = [np.nan] * len(sec_before_list)
            matched_readings_list = [np.nan] * len(sec_before_list)
            matched_start_time_list = [np.nan] * len(sec_before_list)
        else:
            subset_time_list = list(df_logs_combined["LOG_TIMESTAMP"])
            pos = find_closest_time(prompt_time, subset_time_list)

            for sec_before in sec_before_list:
                closest_times = []
                for i in range(sec_before):
                    closest_times.append(pos - 1 - i)
                closest_times = [x for x in closest_times if x >= 0]
                mims_summary, readings, start_time = get_mims_summary(sec_before, prompt_time, closest_times,
                                                                      df_logs_combined)
                matched_mims_summary_list.append(mims_summary)
                matched_readings_list.append(readings)
                matched_start_time_list.append(start_time)
        mims_summary_list.append(matched_mims_summary_list)
        readings_list.append(matched_readings_list)
        start_time_list.append(matched_start_time_list)

    mims_summary_df = pd.DataFrame(list(map(np.ravel, mims_summary_list)))
    readings_df = pd.DataFrame(list(map(np.ravel, readings_list)))
    start_time_df = pd.DataFrame(list(map(np.ravel, start_time_list)))

    df = pd.DataFrame()
    for sec_before in sec_before_list:
        col = int(sec_before // 60 - 1)
        df = pd.concat([df, mims_summary_df[col].rename("mims_summary_" + str(sec_before // 60) + "min")], axis=1)
        df = pd.concat([df, readings_df[col].rename("num_readings_" + str(sec_before // 60) + "min")], axis=1)
        df = pd.concat([df, start_time_df[col].rename("start_time_" + str(sec_before // 60) + "min")], axis=1)

    return df

In [11]:
'''
Example
'''


# Input
target_file_pattern = 'watch_accelerometer_mims.csv' # sensor intermediate data file to match datetime with
intermediate_data_path = "/work/mhealthresearchgroup/TIME_STD/time_study_preprocess/intermediate_file/"
pid = "arrivejanitoruniformly@timestudy_com"
intermediate_participant_path = os.path.join(intermediate_data_path, pid)

datetime_match_to_list = [convert_local_time_to_datetime("2021-03-12 07:00:00"), convert_local_time_to_datetime("2021-03-12 08:00:00")] # the list of time you want sensor data match to

# Read, parse and combine related intermediate file
df_logs_combined = combine_intermediate_file(intermediate_participant_path)

# Match the combined parsed intermediate file with prompt feature data frame
df = match_feature(datetime_match_to_list, df_logs_combined)



  from ipykernel import kernelapp as app


     --- start matching


In [13]:
df

Unnamed: 0,mims_summary_1min,num_readings_1min,start_time_1min,mims_summary_2min,num_readings_2min,start_time_2min,mims_summary_3min,num_readings_3min,start_time_3min,mims_summary_4min,...,start_time_7min,mims_summary_8min,num_readings_8min,start_time_8min,mims_summary_9min,num_readings_9min,start_time_9min,mims_summary_10min,num_readings_10min,start_time_10min
0,0.0,60,2021-03-12 06:58:59.002,0.0,120,2021-03-12 06:57:59.002,0.0,180,2021-03-12 06:56:59.002,0.0,...,2021-03-12 06:52:59.002,11.814441,480,2021-03-12 06:51:59.002,11.814441,540,2021-03-12 06:50:59.002,11.814441,600,2021-03-12 06:49:59.002
1,0.0,60,2021-03-12 07:58:59.000,0.0,120,2021-03-12 07:57:59.000,0.0,180,2021-03-12 07:56:59.000,0.0,...,2021-03-12 07:52:59.000,0.0,480,2021-03-12 07:51:59.000,0.0,540,2021-03-12 07:50:59.000,0.0,600,2021-03-12 07:49:59.000
