In [1]:
from Funcs.Utility import *
import numpy as np
import pandas as pd
from typing import Dict, Callable, Union, Tuple, List, Optional, Iterable, Any
from datetime import timedelta as td
from scipy import stats
import ray
import warnings
import time

In [2]:
def _safe_na_check(_v):
    _is_nan_inf = False
    
    try:
        _is_nan_inf = np.isnan(_v) or np.isinf(_v)
    except:
        _is_nan_inf = False
    
    return _is_nan_inf or _v is None

In [3]:
# Load the pre-processed sensor data; the cell output below shows a dict that
# maps a data-type key (e.g. 'CAL') to a Series indexed by (pcode, timestamp).
# NOTE(review): `os`, `load` and `PATH_INTERMEDIATE` are not imported explicitly
# above this cell -- presumably they come from `from Funcs.Utility import *`
# (an explicit `import os` only appears in a later cell); confirm.
DATA = load(os.path.join(PATH_INTERMEDIATE, 'proc.pkl'))

In [4]:
DATA

{'CAL': pcode  timestamp                       
 P126   2021-12-06 15:40:18.350000+09:00    1.173864
        2021-12-06 15:40:48.350000+09:00    2.311666
        2021-12-06 15:41:18.350000+09:00    2.311666
        2021-12-06 15:41:48.350000+09:00    2.311666
        2021-12-06 15:42:18.350000+09:00    2.311666
                                              ...   
 P083   2021-12-28 21:59:26.709000+09:00    0.543229
        2021-12-28 21:59:56.709000+09:00    0.543229
        2021-12-28 22:00:26.709000+09:00    0.543229
        2021-12-28 22:00:56.709000+09:00    0.543229
        2021-12-28 22:01:00.800000+09:00    0.074078
 Name: calories, Length: 490040, dtype: float32,
 'APP_DUR_UNKNOWN': pcode  timestamp                       
 P126   2021-12-06 15:37:11.006000+09:00     0.695
        2021-12-06 15:37:25.888000+09:00    14.881
        2021-12-06 15:37:26.557000+09:00     0.664
        2021-12-06 15:37:29.454000+09:00     2.894
        2021-12-06 15:37:32.659000+09:00     3.172
     

In [5]:
# Inspect the 'BT_DeviceType' series.
# NOTE(review): the variable is still called `notification_vis_data` although it
# holds DATA['BT_DeviceType']; the name looks like a leftover from inspecting
# another key.
notification_vis_data = DATA['BT_DeviceType']

# Display the first few rows to understand its structure
print(notification_vis_data.head())

# Find all unique values in 'BT_DeviceType'
unique_feature_types = notification_vis_data.unique()

# Display the unique values
print("Unique feature types in 'BT_DeviceType':")
print(unique_feature_types)

pcode  timestamp                       
P126   2021-12-06 15:31:48.595000+09:00         LE
       2021-12-06 15:31:48.596000+09:00         LE
       2021-12-06 15:31:52.507000+09:00         LE
       2021-12-06 15:31:52.510000+09:00         LE
       2021-12-06 15:31:55.739000+09:00    CLASSIC
Name: deviceType, dtype: object
Unique feature types in 'BT_DeviceType':
['LE' 'CLASSIC' 'UNDEFINED' 'DUAL']


Extraction functions

In [6]:
def _extract_numeric_feature(d_key, d_val) -> Dict:
    feature = {}
    
    # Ensure the input is a NumPy array
    v = np.asarray(d_val)
    
    # Check if the data is numeric
    if not np.issubdtype(v.dtype, np.number):
        raise ValueError(f"Input data for {d_key} must be numeric.")
    
    # Handle NaNs and infinities
    if not np.all(np.isfinite(v)):
        raise ValueError(f"Input data for {d_key} contains NaNs or infinities.")
    
    # Calculate histogram
    hist, _ = np.histogram(v, bins='doane', density=False)
    
    # Calculate standard deviation
    std = np.sqrt(np.var(v, ddof=1)) if len(v) > 1 else 0
    
    # Normalize values
    v_norm = (v - np.mean(v)) / std if std != 0 else np.zeros(len(v))
    
    # Populate feature dictionary
    feature[f'{d_key}#AVG'] = np.mean(v) # Sample mean
    feature[f'{d_key}#STD'] = std # Sample standard deviation
    feature[f'{d_key}#SKW'] = stats.skew(v, bias=False) if std != 0 else 0 # Sample skewness
    feature[f'{d_key}#KUR'] = stats.kurtosis(v, bias=False) if std != 0 else 0 # Sample kurtosis
    feature[f'{d_key}#ASC'] = np.sum(np.abs(np.diff(v))) # Abstract sum of changes
    feature[f'{d_key}#BEP'] = stats.entropy(hist) # Binned entropy
    feature[f'{d_key}#MED'] = np.median(v) # Median
    feature[f'{d_key}#TSC'] = np.sqrt(np.sum(np.power(np.diff(v_norm), 2))) # Timeseries complexity
    
    return feature

In [7]:
def _extract_categorical_feature(cats, d_key, d_val) -> Dict:
    feature = {}
    v = d_val
    cnt = v.value_counts()
    val, sup = cnt.index, cnt.values
    hist = {k: v for k, v in zip(val, sup)}

    # Information Entropy
    feature[f'{d_key}#ETP#'] = stats.entropy(sup / len(v))
    # Abs. Sum of Changes
    feature[f'{d_key}#ASC#'] = np.sum(v.values[1:] != v.values[:-1])
    if len(cats) == 2: # Dichotomous categorical data
        c = cats[0]
        feature[f'{d_key}#RLV_SUP'] = hist[c] / len(v) if c in hist else 0
    else:
        for c in cats:
            feature[f'{d_key}#RLV_SUP={c}'] = hist[c] / len(v)  if c in hist else 0
            
    return feature

In [8]:
def _extract_timeWindow_feature(is_numeric, cats, d_key, d_val) -> Dict:
    feature = {}
    v = d_val
    if is_numeric:
        feature = _extract_numeric_feature(d_key, v)
    else:
        feature =_extract_categorical_feature(cats, d_key, v)
    return feature

In [9]:
def _extract_timeWindow_feature(is_numeric, cats, d_key, d_val) -> Dict:
    feature = {}
    v = d_val
    
    if d_key in ['SCR_EVENT']:
        # Extract features specifically for screen events
        s_on = v[v == 'SCREEN_ON'].index
        s_off = v[v == 'SCREEN_OFF'].index
        duration, onset, midpoint = calculate_sleep_duration(s_on, s_off, theta)
        
        if duration:
            feature['Sleep#Duration'] = duration
            onset_hour = onset.hour
            if onset_hour >= 21:
                feature['Sleep#Onset'] = onset_hour - 21
            else:
                feature['Sleep#Onset'] = onset_hour + 3
            feature['Sleep#Midpoint'] = midpoint.hour + midpoint.minute / 60
        else:
            feature['Sleep#Duration'] = 0
            feature['Sleep#Onset'] = 0
            feature['Sleep#Midpoint'] = 0
    else:
        if is_numeric:
            feature = _extract_numeric_feature(d_key, v)
        else:
            feature = _extract_categorical_feature(cats, d_key, v)

    return feature

In [10]:
# This function is based on "Towards Circadian Computing: 'Early to Bed and
# Early to Rise' Makes Some of Us Unhealthy and Sleep Deprived".
# Minimum gap (seconds) between a SCREEN_OFF event and the event preceding it
# for that SCREEN_OFF to be kept as a candidate boundary.
theta=30
def calculate_sleep_duration(s_on, s_off, theta):
    """Estimate the longest night-time non-usage period from screen events.

    The on/off timestamps are merged into one timeline. Only SCREEN_OFF
    events that occur more than ``theta`` seconds after the preceding event
    are kept. Each kept event whose local hour (in ``DEFAULT_TZ``) is >= 21
    or < 7 starts a candidate period that lasts until the next kept event;
    the longest candidate is returned.

    Args:
        s_on: Series/Index of SCREEN_ON timestamps.
        s_off: Series/Index of SCREEN_OFF timestamps.
        theta: Gap threshold in seconds (see above).

    Returns:
        Tuple ``(duration_seconds, onset, midpoint)`` of the longest
        candidate, or ``(None, None, None)`` when there is none.
    """
    # One timeline holding both event kinds. A timestamp present in both
    # inputs yields a single row labelled SCREEN_ON (outer merge on time).
    on_events = pd.DataFrame({'timestamp': s_on, 'event': 'SCREEN_ON'})
    off_events = pd.DataFrame({'timestamp': s_off, 'event': 'SCREEN_OFF'})
    df = pd.merge(on_events, off_events, how='outer', on='timestamp')
    df['event'] = df['event_x'].fillna(df['event_y'])
    df = df.drop(columns=['event_x', 'event_y']).sort_values('timestamp')
    df = df.assign(
        timestamp=lambda x: pd.to_datetime(x['timestamp'], unit='ms', utc=True).dt.tz_convert(DEFAULT_TZ)
    )

    # Keep SCREEN_OFF events that come more than `theta` seconds after the
    # previous event (the first row has no predecessor and never qualifies).
    gap_seconds = df['timestamp'].diff() / pd.Timedelta(seconds=1)
    keep = (df['event'] == 'SCREEN_OFF') & (gap_seconds > theta)
    boundaries = list(df.loc[keep, 'timestamp'])

    # Candidate periods must start between 9PM and 7AM (next day).
    # Collected in plain lists instead of growing Series with pd.concat.
    candidates = []
    for start, following in zip(boundaries, boundaries[1:]):
        if start.hour >= 21 or start.hour < 7:
            seconds = (following - start).total_seconds()
            if seconds > 0:
                candidates.append((seconds, start))

    if not candidates:
        return None, None, None

    # max() returns the first maximal element, i.e. the earliest of ties.
    duration, onset = max(candidates, key=lambda item: item[0])
    midpoint = onset + pd.to_timedelta(duration / 2, unit="s")
    return duration, onset, midpoint

In [11]:
epoch_names = {
    0: 'Dawn',
    1: 'Morning',
    2: 'Afternoon',
    3: 'LateAfternoon',
    4: 'Evening',
    5: 'Night'
}
def _extract(
        pid: str,
        data: Dict[str, pd.Series],
        label: pd.Series,
        label_values: List[str],
#        window_data: Dict[str, Union[int, Callable[[pd.Timestamp], int]]],
#        window_label: Dict[str, Union[int, Callable[[pd.Timestamp], int]]],
        categories: Dict[str, Optional[List[any]]] = None,
        constant_features: Dict[str, any] = None,
        resample_s: Dict[str, float] = None
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    _s = time.time()
    log(f"Begin feature extraction on {pid}'s data.")
    categories = categories or dict()
    constant_features = constant_features or dict()
    resample_s = resample_s or dict()
    X, y, date_times = [], [], []
#    count = 0
    for timestamp in label.index:
        row = dict()
        #Find the start of today and yesterday for extracting today epoch features and yesterday epoch features
        start_of_today = datetime(timestamp.year, timestamp.month, timestamp.day, tzinfo=timestamp.tzinfo)
        start_of_today = pd.Timestamp(start_of_today.date(), tz=DEFAULT_TZ)
        start_of_yesterday = timestamp - pd.Timedelta(days=1)
        start_of_yesterday = pd.Timestamp(start_of_yesterday.date(), tz=DEFAULT_TZ)
        label_cur = label.at[timestamp]
        t = timestamp - td(milliseconds=1)

        #Yesterday and Today 3-hour epochs
        yesterday_time_windows_epoch = []
        for i in range(6):
            start = start_of_yesterday + pd.Timedelta(hours=i*3 + 6 )
            end = start_of_yesterday + pd.Timedelta(hours=(i+1)*3 +6)
            if start <= t:
                yesterday_time_windows_epoch.append((start, min(end, t)))
            else:
                break
        today_time_windows_epoch = []
        for i in range(6):
            start = start_of_today + pd.Timedelta(hours=i*3 +6)
            end = start_of_today + pd.Timedelta(hours=(i+1)*3 + 6)
            if start <= t:
                today_time_windows_epoch.append((start, min(end, t)))
            else:
                break
        
        # Features relevant to participants' info
        for d_key, d_val in constant_features.items():
            row[d_key] = d_val
        # Features from sensor data
        for d_key, d_val in data.items():
            is_numeric = d_key not in categories
            cats = categories.get(d_key) or list()
            d_val = d_val.sort_index()
            # Features relevant to latest value of a given data
            # These features are extracted only for bounded categorical data and numerical data.
            if is_numeric or cats:
                try:
                    v = d_val.loc[:t].iloc[-1]
                except (KeyError, IndexError):
                    v = 0
                if is_numeric:
                    row[f'{d_key}#VAL'] = v
                else:
                    for c in cats:
                        row[f'{d_key}#VAL={c}'] = v == c
            # Features relevant to duration since the latest state change.
            # These features are only for categorical data.
            # In addition, duration since a given state is set recently is considered,
            # that are available only at bounded categorical data.
            if not is_numeric:
                try:
                    v = d_val.loc[:t]
                    row[f'{d_key}#DSC'] = (t - v.index[-1]).total_seconds() if len(v) else -1.0
                    for c in cats:
                        v_sub = v.loc[lambda x: x == c].index
                        row[f'{d_key}#DSC={c}'] = (t - v_sub[-1]).total_seconds() if len(v_sub) else -1.0
                except (KeyError, IndexError):
                    row[f'{d_key}#DSC'] = 0
                    for c in cats:
                        row[f'{d_key}#DSC={c}'] = 0
#             No resampling
            d_val_res =d_val

           # Features extracted from 15-min immediate past time-windows
            w_val = 15 * 60
            try:
                v = d_val_res.loc[t - td(seconds=w_val):t]
            except (KeyError, IndexError):
                continue
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                new_row = {f'{k}#ImmediatePast_15': v for k, v in _extract_timeWindow_feature(is_numeric, cats, d_key, v).items()}
                row.update(new_row)

            #############################################################    
            #Features extracted from yesterday epoch time windows
            for count, (start, end) in enumerate(yesterday_time_windows_epoch):
                # Get data for the current yesterday epoch time window
                try:
                    v = d_val_res.loc[start:end]
                except (KeyError, IndexError):
                    continue
                epoch_name = epoch_names.get(count)

                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    new_row = {f'{k}#Yesterday{epoch_name}': v for k, v in _extract_timeWindow_feature(is_numeric, cats, d_key, v).items()}
                    row.update(new_row)
                    
            #Features extracted from today epoch time windows until current time
            for count, (start, end) in enumerate(today_time_windows_epoch):
                # Get data for the current time window
                try:
                    v = d_val_res.loc[start:end]
                except (KeyError, IndexError):
                    continue
                epoch_name = epoch_names.get(count)

                with warnings.catch_warnings():
                    warnings.simplefilter('ignore')
                    new_row = {f'{k}_Today{epoch_name}': v for k, v in _extract_timeWindow_feature(is_numeric, cats, d_key, v).items()}
                    row.update(new_row)
        #Sleep feature extracted from last night's data
        onset_min = start_of_yesterday + pd.Timedelta(hours=21)
        onset_max = start_of_today + pd.Timedelta(hours=14)
        s_on =data['SCR_EVENT'].loc[data['SCR_EVENT']=='ON']
        s_off =data['SCR_EVENT'].loc[data['SCR_EVENT']=='OFF']
        duration, onset, midpoint =calculate_sleep_duration(s_on.loc[onset_min:onset_max].reset_index()['timestamp'], s_off.loc[onset_min:onset_max].reset_index()['timestamp'], theta)
        if duration:
            row['Sleep#Duration'] = duration
            onset_hour = onset.hour
            if onset_hour >=21:
                row['Sleep#Onset'] = onset_hour - 21
            else:
                row['Sleep#Onset'] = onset_hour + 3
        else:
            row['Sleep#Duration'] = 0
            row['Sleep#Onset'] = 0
            
        # Features relevant to time
        day_of_week = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'][t.isoweekday() - 1]
        is_weekend = 'Y' if t.isoweekday() > 5 else 'N'
        hour = t.hour

        if 6 <= hour < 9:
            hour_name = 'Dawn'
        elif 9 <= hour < 12:
            hour_name = 'MORNING'
        elif 12 <= hour < 15:
            hour_name = 'AFTERNOON'
        elif 15 <= hour < 18:
            hour_name = 'LATE_AFTERNOON'
        elif 18 <= hour < 21:
            hour_name = 'EVENING'
        elif 21 <= hour < 24:
            hour_name = 'NIGHT'
        else:
            hour_name = 'MIDNIGHT'
            
        for d in ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN']:
            row[f'Time#DOW={d}'] = d == day_of_week
        for d in ['Y', 'N']:
            row[f'Time#WKD={d}'] = d == is_weekend
        for d in ['DAWN', 'MORNING', 'AFTERNOON', 'LATE_AFTERNOON', 'EVENING', 'NIGHT', 'MIDNIGHT']:
            row[f'Time#HRN={d}'] = d == hour_name
        

        try:
            last_label = label.loc[label[:t].index.max()]
        except (KeyError, IndexError):
            last_label = 0
        row[f'ESM#LastLabel'] = last_label

#############################################################################################
        #The following code is designed for fixed threshold
        # Label values extracted from yesterday epochs
        for count, (start, end) in enumerate(yesterday_time_windows_epoch):
            try:
                v = label.loc[start:end]
                epoch_name = epoch_names.get(count)
                if len(label_values) <= 2: # Binary classification
                    row[f'ESM#LIK#Yesterday{epoch_name}'] = np.sum(v == label_values[0]) / len(v) if len(v) > 0 else 0
                else:
                    for l in label_values:
                        row[f'ESM#LIK#Yesterday{epoch_name}'] = np.sum(v == l) / len(v) if len(v) > 0 else 0
            except (KeyError, IndexError):
                epoch_name = epoch_names.get(count)
                if len(label_values) <= 2:
                    row[f'ESM#LIK#Yesterday{epoch_name}'] = 0
                else:
                    for l in label_values:
                        row[f'ESM#LIK#Yesterday{epoch_name}'] = 0
        # Label values extracted from today epochs
        for count, (start, end) in enumerate(today_time_windows_epoch):
            try:
                v = label.loc[start:end]
                epoch_name = epoch_names.get(count)
                if len(label_values) <= 2: # Binary classification
                    row[f'ESM#LIK#Today{epoch_name}'] = np.sum(v == label_values[0]) / len(v) if len(v) > 0 else 0
                else:
                    for l in label_values:
                        row[f'ESM#LIK#Today{epoch_name}'] = np.sum(v == l) / len(v) if len(v) > 0 else 0
            except (KeyError, IndexError):
                epoch_name = epoch_names.get(count)
                if len(label_values) <= 2:
                    row[f'ESM#LIK#Today{epoch_name}'] = 0
                else:
                    for l in label_values:
                        row[f'ESM#LIK#Today{epoch_name}'] = 0
        row = {
            k: 0 if _safe_na_check(v) else v
            for k, v in row.items()
        }

        X.append(row)
        y.append(label_cur)
        date_times.append(timestamp)
    
    log(f"Complete feature extraction on {pid}'s data ({time.time() - _s:.2f} s).")
    
    #Without normalization for each user
    X = pd.DataFrame(X)
    
    # Debugging: Inspect X DataFrame and y list
#     print(f"Contents of X DataFrame: {X.head()}")
#     print(f"Shape of X DataFrame: {X.shape}")
#     print(f"Contents of y before conversion: {y}")
#     print(f"Type of y: {type(y)}")
    
    for i, element in enumerate(y):
        print(f"Element {i}: {element}, Type: {type(element)}, Length: {len(element) if hasattr(element, '__len__') else 'N/A'}")
    
    y = np.array([element if not hasattr(element, '__len__') else element[0] for element in y])

#     y = np.asarray(y)
    group = np.repeat(pid, len(y))
    date_times =  np.asarray(date_times)
    
    # Debugging: Verify shapes
#     print(f"Shape of y after conversion: {y.shape}")
#     print(f"Shape of group: {group.shape}")
#     print(f"Shape of date_times: {date_times.shape}")

    return X, y, group, date_times

In [12]:
def extract(
        pids: Iterable[str],
        data: Dict[str, pd.Series],
        label: pd.Series,
        label_values: List[str],
        categories: Dict[str, Optional[List[Any]]] = None,
        constat_features: Dict[str, Dict[str, Any]] = None,
        resample_s: Dict[str, float] = None,
        with_ray: bool=False
):
    """Run ``_extract`` for every participant and stack the results.

    Args:
        pids: Participant codes to process.
        data: Stream name -> Series indexed by (pcode, timestamp).
        label: Label Series indexed by (pcode, timestamp).
        label_values: Possible label values (forwarded to ``_extract``).
        categories: Stream name -> category values (forwarded).
        constat_features: Participant code -> constant features. When given,
            every pid must be present (KeyError otherwise); when omitted, no
            constant features are added.
        resample_s: Forwarded to ``_extract``.
        with_ray: Run the per-participant jobs as Ray remote tasks.

    Returns:
        Tuple ``(X, y, group, t_norm, date_times)`` where ``t_norm`` is the
        number of seconds between each label time and midnight of the day of
        the earliest label.

    Raises:
        EnvironmentError: If ``with_ray`` is True but Ray is not initialized.
    """
    if with_ray and not ray.is_initialized():
        raise EnvironmentError('Ray should be initialized if "with_ray" is set as True.')

    func = ray.remote(_extract).remote if with_ray else _extract
    jobs = []
    for pid in pids:
        d = dict()
        for k, v in data.items():
            try:
                series = v.loc[(pid, )]
            except (KeyError, IndexError):
                # This participant has no data for this stream.
                continue
            if k.startswith('LOC_'):
                # LOC_* streams carry epoch-millisecond indices.
                series.index = pd.to_datetime(series.index, unit='ms', utc=True).tz_convert(DEFAULT_TZ)
            # 'LOC_SPEED' is exposed to the extractor as 'SPEED'.
            d['SPEED' if k == 'LOC_SPEED' else k] = series
        job = func(
            pid=pid, data=d, label=label.loc[(pid, )],
            label_values=label_values,
            categories=categories,
            constant_features=constat_features[pid] if constat_features is not None else None,
            resample_s=resample_s
        )
        jobs.append(job)

    if with_ray:
        jobs = ray.get(jobs)

    X = pd.concat([x for x, _, _, _ in jobs], axis=0, ignore_index=True)
    y = np.concatenate([x for _, x, _, _ in jobs], axis=0)
    group = np.concatenate([x for _, _, x, _ in jobs], axis=0)
    date_times = np.concatenate([x for _, _, _, x in jobs], axis=0)

    # Cells that hold a sequence are replaced by the sequence's length.
    sequence_types = (list, tuple, np.ndarray)
    for column in X.columns:
        if X[column].apply(lambda x: isinstance(x, sequence_types)).any():
            X[column] = X[column].apply(lambda x: len(x) if isinstance(x, sequence_types) else x)

    # Seconds elapsed since midnight of the earliest label's day.
    t_s = date_times.min().normalize().timestamp()
    t_norm = np.asarray([x.timestamp() - t_s for x in date_times])

    # Object/bool columns become bool (missing -> False); everything else
    # becomes float32 (missing -> 0.0).
    C, DTYPE = X.columns, X.dtypes
    is_flag = (DTYPE == object) | (DTYPE == bool)
    is_value = (DTYPE != object) & (DTYPE != bool)

    X = X.fillna({
        **{c: False for c in C[is_flag]},
        **{c: 0.0 for c in C[is_value]},
    }).astype({
        **{c: 'bool' for c in C[is_flag]},
        **{c: 'float32' for c in C[is_value]},
    })

    return X, y, group, t_norm, date_times

In [13]:
import os
import cloudpickle
import pandas as pd

# Possible label values; with two values, likelihood features are computed
# for the first one only.
LABEL_VALUES = [1, 0]

# Per-stream resampling period in seconds (forwarded to the extractor).
RESAMPLE_S = {
    'CAL': 1.0,  # 1 second
    'APP_DUR_UNKNOWN': 1.0,  # 1 second
    'BAT_LEV': 1.0,  # 1 second
    'MSG_RCV': 60.0,  # 1 minute
    'DATA_RCV': 10.0,  # 10 seconds
    'HEARTRATE': 1.0,  # 1 second
}


# Category values of every categorical stream.
# NOTE: each value must be separated by a comma. Adjacent string literals
# without commas (as in a pasted numpy array repr) are concatenated by Python
# into a single string, which would silently turn a list into one bogus
# category.
CATEGORIES = {
    'APP_CAT': ['UNKNOWN', 'SYSTEM', 'ENTER', 'SOCIAL', 'HEALTH', 'WORK', 'INFO'],
    'BAT_PLG': ['UNDEFINED', 'AC', 'USB', 'WIRELESS'],
    'BAT_STA': ['DISCHARGING', 'CHARGING', 'FULL', 'NOT_CHARGING'],
    'CALL_CNT': ['UNKNOWN', 'MOBILE', 'OTHER', 'WORK', 'HOME', 'UNDEFINED', 'MAIN'],
    'LOC_LABEL': ['none', 'work', 'home'],
    'RING': ['VIBRATE', 'SILENT', 'NORMAL'],
    'CHG': ['DISCONNECTED', 'CONNECTED'],
    'PWR': ['ACTIVATE', 'DEACTIVATE'],
#     'ONOFF': ['SHUTDOWN'],
    'Notification_VIS': ['PRIVATE', 'SECRET', 'PUBLIC'],
    'Notification_CAT': [
        'UNDEFINED', 'STATUS', 'MESSAGE', 'SERVICE', 'PROGRESS', 'SYSTEM',
        'REMINDER', 'ALARM', 'TRANSPORT', 'EMAIL', 'CALL', 'EVENT', 'PROMO',
        'ERROR', 'RECOMMENDATION', 'SOCIAL', 'NAVIGATION',
    ],
    'Dozemode': ['ACTIVATE', 'DEACTIVATE'],
    'SCR_EVENT': ['SCREEN_OFF', 'SCREEN_ON', 'USER_PRESENT'],
    'BT_BondState': ['NONE', 'BONDED', 'BONDING'],
    'BT_DeviceType': ['LE', 'CLASSIC', 'UNDEFINED', 'DUAL'],
    'BT_classType': [
        'UNDEFINED', 'PHONE_SMART', 'WEARABLE_WRIST_WATCH', 'COMPUTER_LAPTOP',
        'AUDIO_VIDEO_SET_TOP_BOX', 'AUDIO_VIDEO_HANDSFREE',
        'AUDIO_VIDEO_VIDEO_DISPLAY_AND_LOUDSPEAKER',
        'AUDIO_VIDEO_WEARABLE_HEADSET', 'PHONE_CELLULAR', 'COMPUTER_DESKTOP',
        'COMPUTER_HANDHELD_PC_PDA', 'AUDIO_VIDEO_UNCATEGORIZED',
        'AUDIO_VIDEO_LOUDSPEAKER', 'AUDIO_VIDEO_HEADPHONES',
        'AUDIO_VIDEO_HIFI_AUDIO', 'AUDIO_VIDEO_CAR_AUDIO', 'HEALTH_PULSE_OXIMETER',
        'HEALTH_UNCATEGORIZED', 'COMPUTER_UNCATEGORIZED', 'WEARABLE_JACKET',
        'HEALTH_PULSE_RATE', 'AUDIO_VIDEO_PORTABLE_AUDIO', 'PHONE_MODEM_OR_GATEWAY',
        'HEALTH_BLOOD_PRESSURE', 'COMPUTER_WEARABLE', 'HEALTH_DATA_DISPLAY',
        'TOY_ROBOT', 'COMPUTER_PALM_SIZE_PC_PDA', 'COMPUTER_SERVER',
    ],
}

# Participant information table (one row per participant code).
user_info_file = os.path.join(PATH_INTERMEDIATE,'UserInfo.csv')
userinfo = pd.read_csv(user_info_file)

# Upper-case alias -> source column of the user info table. The aliases are
# added next to the original columns (the originals are kept).
_PINFO_ALIASES = {
    'AGE': 'age',
    'GEN': 'gender',
    'BFI_OPN': 'openness',
    'BFI_CON': 'conscientiousness',
    'BFI_NEU': 'neuroticism',
    'BFI_EXT': 'extraversion',
    'BFI_AGR': 'agreeableness',
    'GHQ': 'GHQ12',
    'PSS': 'PSS10',
    'CESD': 'CESD-R',
    'SE': 'self-efficacy',
    'OPT': 'optimism',
    'HOPE': 'hope',
    'RES': 'resiliency',
}
PINFO = userinfo.set_index('pcode')
PINFO = PINFO.assign(**{alias: PINFO[source] for alias, source in _PINFO_ALIASES.items()})

# One-hot encode the non-numeric columns and turn the table into
# {pcode: {'PIF#<column>': value}}.
PINFO = pd.get_dummies(PINFO, prefix_sep='=', dtype=bool).to_dict('index')
PINFO = {
    pcode: {f'PIF#{column}': value for column, value in info.items()}
    for pcode, info in PINFO.items()
}

# Pre-processed sensor data and the label table indexed by (pcode, timestamp).
DATA = load(os.path.join(PATH_INTERMEDIATE, 'proc.pkl'))
LABELS_PROC = pd.read_csv(os.path.join(PATH_INTERMEDIATE, 'labels_1h_esmsyn.csv'), index_col=['pcode','timestamp'],parse_dates=True)

In [14]:
import warnings
from pandas.errors import PerformanceWarning

# Suppress pandas PerformanceWarning and RuntimeWarning while this cell runs.
warnings.simplefilter('ignore', PerformanceWarning)
warnings.simplefilter('ignore', RuntimeWarning)


# Arguments that are identical for every label; only pids/label change per run.
extract_kwargs = dict(
    data=DATA,
    label_values=LABEL_VALUES,
    categories=CATEGORIES,
    constat_features=PINFO,
    resample_s=RESAMPLE_S,
    with_ray=True,
)

with on_ray():
    # One extraction pass and one output pickle per label column.
    # Original author's note: in preprocessing, the dynamic threshold gave
    # better class balance.
    for l in ('stress_binary_personal', 'step_count_binary_personal'):
        labels = LABELS_PROC[l]
        # Only participants that actually have labels are processed.
        pids = labels.index.get_level_values('pcode').unique()
        feat = extract(pids=pids, label=labels, **extract_kwargs)
        dump(feat, os.path.join(PATH_INTERMEDIATE, f'{l}-15min.pkl'))

2024-07-25 14:42:41,915	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


[2m[36m(_extract pid=2302962)[0m [24-07-25 14:42:44] Begin feature extraction on P001's data.
[2m[36m(_extract pid=2302953)[0m [24-07-25 14:42:49] Begin feature extraction on P010's data.[32m [repeated 6x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(_extract pid=2302954)[0m [24-07-25 14:42:55] Begin feature extraction on P016's data.[32m [repeated 5x across cluster][0m
[2m[36m(_extract pid=2302952)[0m [24-07-25 14:45:07] Complete feature extraction on P014's data (134.59 s).
[2m[36m(_extract pid=2302955)[0m [24-07-25 14:42:59] Begin feature extraction on P021's data.[32m [repeated 4x across cluster][0m
[2m[36m(_extract pid=2302952)[0m Element 0: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302952)[0m Element 1: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(

[2m[36m(_extract pid=2302962)[0m [24-07-25 14:45:16] Complete feature extraction on P001's data (152.47 s).
[2m[36m(_extract pid=2302952)[0m [24-07-25 14:45:07] Begin feature extraction on P022's data.
[2m[36m(_extract pid=2302962)[0m Element 0: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 1: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 2: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 3: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 4: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 5: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 6: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 7: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302962)[0m Element 8: 0, Type: <class '

[2m[36m(_extract pid=2302953)[0m [24-07-25 14:45:22] Complete feature extraction on P010's data (152.72 s).
[2m[36m(_extract pid=2302953)[0m Element 195: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 196x across cluster][0m
[2m[36m(_extract pid=2302953)[0m [24-07-25 14:45:22] Begin feature extraction on P024's data.
[2m[36m(_extract pid=2302957)[0m [24-07-25 14:45:27] Complete feature extraction on P009's data (158.79 s).
[2m[36m(_extract pid=2302957)[0m Element 196: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 197x across cluster][0m
[2m[36m(_extract pid=2302965)[0m [24-07-25 14:45:27] Complete feature extraction on P008's data (160.19 s).
[2m[36m(_extract pid=2302957)[0m [24-07-25 14:45:27] Begin feature extraction on P025's data.
[2m[36m(_extract pid=2302951)[0m Element 196: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 590x across cluster][0m
[2m[36m(_extract pid=2302951)[0m [24-07-25 14:45:32] Complete feature e

[2m[36m(_extract pid=2302964)[0m [24-07-25 14:45:44] Complete feature extraction on P018's data (168.34 s).
[2m[36m(_extract pid=2302964)[0m Element 205: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 206x across cluster][0m
[2m[36m(_extract pid=2302964)[0m [24-07-25 14:45:44] Begin feature extraction on P030's data.
[2m[36m(_extract pid=2302955)[0m [24-07-25 14:46:05] Complete feature extraction on P021's data (185.83 s).
[2m[36m(_extract pid=2302955)[0m [24-07-25 14:46:05] Begin feature extraction on P033's data.
[2m[36m(_extract pid=2302966)[0m [24-07-25 14:46:05] Complete feature extraction on P002's data (200.49 s).
[2m[36m(_extract pid=2302960)[0m Element 215: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 954x across cluster][0m
[2m[36m(_extract pid=2302960)[0m [24-07-25 14:46:08] Begin feature extraction on P037's data.[32m [repeated 3x across cluster][0m
[2m[36m(_extract pid=2302959)[0m [24-07-25 14:46:18] Complete feature

[2m[36m(_extract pid=2302956)[0m [24-07-25 14:50:48] Begin feature extraction on P070's data.
[2m[36m(_extract pid=2302960)[0m [24-07-25 14:50:50] Complete feature extraction on P051's data (139.52 s).[32m [repeated 2x across cluster][0m
[2m[36m(_extract pid=2302960)[0m Element 192: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 390x across cluster][0m
[2m[36m(_extract pid=2302960)[0m [24-07-25 14:50:51] Begin feature extraction on P071's data.
[2m[36m(_extract pid=2302964)[0m [24-07-25 14:50:58] Complete feature extraction on P056's data (130.30 s).
[2m[36m(_extract pid=2302964)[0m [24-07-25 14:50:59] Begin feature extraction on P072's data.
[2m[36m(_extract pid=2302955)[0m [24-07-25 14:51:04] Complete feature extraction on P050's data (156.13 s).
[2m[36m(_extract pid=2302964)[0m Element 186: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 187x across cluster][0m
[2m[36m(_extract pid=2302955)[0m [24-07-25 14:51:05] Begin feature ex

[2m[36m(_extract pid=2302962)[0m [24-07-25 14:55:32] Begin feature extraction on P108's data.
[2m[36m(_extract pid=2302964)[0m [24-07-25 14:55:34] Complete feature extraction on P088's data (143.73 s).[32m [repeated 2x across cluster][0m
[2m[36m(_extract pid=2302964)[0m 
[2m[36m(_extract pid=2302964)[0m 
[2m[36m(_extract pid=2302964)[0m 
[2m[36m(_extract pid=2302964)[0m Element 177: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 383x across cluster][0m
[2m[36m(_extract pid=2302964)[0m [24-07-25 14:55:35] Begin feature extraction on P109's data.
[2m[36m(_extract pid=2302957)[0m [24-07-25 14:55:38] Begin feature extraction on P110's data.
[2m[36m(_extract pid=2302963)[0m [24-07-25 14:56:08] Complete feature extraction on P092's data (162.28 s).[32m [repeated 2x across cluster][0m
[2m[36m(_extract pid=2302957)[0m Element 224: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 225x across cluster][0m
[2m[36m(_extract pid=2302963)[0

[2m[36m(_extract pid=2302964)[0m [24-07-25 15:01:35] Begin feature extraction on P013's data.[32m [repeated 8x across cluster][0m
[2m[36m(_extract pid=2302966)[0m [24-07-25 15:01:41] Begin feature extraction on P020's data.[32m [repeated 6x across cluster][0m
[2m[36m(_extract pid=2302960)[0m [24-07-25 15:04:06] Complete feature extraction on P009's data (153.70 s).
[2m[36m(_extract pid=2302952)[0m [24-07-25 15:01:42] Begin feature extraction on P021's data.
[2m[36m(_extract pid=2302960)[0m Element 0: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302960)[0m Element 1: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302960)[0m Element 2: 1, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302960)[0m Element 3: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302960)[0m Element 4: 0, Type: <class 'numpy.int64'>, Length: N/A
[2m[36m(_extract pid=2302960)[0m Element 5: 0, Type: <class 'num

[2m[36m(_extract pid=2302956)[0m [24-07-25 15:04:11] Complete feature extraction on P016's data (152.64 s).[32m [repeated 5x across cluster][0m
[2m[36m(_extract pid=2302956)[0m Element 188: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 964x across cluster][0m
[2m[36m(_extract pid=2302956)[0m [24-07-25 15:04:11] Begin feature extraction on P027's data.[32m [repeated 5x across cluster][0m
[2m[36m(_extract pid=2302961)[0m [24-07-25 15:04:18] Complete feature extraction on P010's data (165.13 s).
[2m[36m(_extract pid=2302953)[0m [24-07-25 15:04:32] Complete feature extraction on P015's data (174.96 s).
[2m[36m(_extract pid=2302961)[0m Element 195: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 196x across cluster][0m
[2m[36m(_extract pid=2302961)[0m [24-07-25 15:04:19] Begin feature extraction on P028's data.
[2m[36m(_extract pid=2302953)[0m [24-07-25 15:04:33] Begin feature extraction on P029's data.
[2m[36m(_extract pid=2302952)[0

[2m[36m(_extract pid=2302957)[0m [24-07-25 15:10:43] Begin feature extraction on P077's data.
[2m[36m(_extract pid=2302956)[0m [24-07-25 15:10:53] Complete feature extraction on P059's data (119.05 s).
[2m[36m(_extract pid=2302957)[0m Element 256: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 257x across cluster][0m
[2m[36m(_extract pid=2302956)[0m [24-07-25 15:10:54] Begin feature extraction on P078's data.
[2m[36m(_extract pid=2302960)[0m [24-07-25 15:11:10] Complete feature extraction on P058's data (136.64 s).
[2m[36m(_extract pid=2302956)[0m Element 190: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 191x across cluster][0m
[2m[36m(_extract pid=2302960)[0m [24-07-25 15:11:11] Begin feature extraction on P079's data.
[2m[36m(_extract pid=2302954)[0m [24-07-25 15:11:40] Complete feature extraction on P065's data (154.60 s).
[2m[36m(_extract pid=2302960)[0m Element 180: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 18

[2m[36m(_extract pid=2302954)[0m [24-07-25 15:15:39] Begin feature extraction on P121's data.
[2m[36m(_extract pid=2302960)[0m [24-07-25 15:15:54] Complete feature extraction on P101's data (130.85 s).
[2m[36m(_extract pid=2302954)[0m Element 166: 1, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 167x across cluster][0m
[2m[36m(_extract pid=2302960)[0m [24-07-25 15:15:54] Begin feature extraction on P122's data.
[2m[36m(_extract pid=2302958)[0m [24-07-25 15:16:15] Complete feature extraction on P109's data (107.80 s).
[2m[36m(_extract pid=2302960)[0m Element 159: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 160x across cluster][0m
[2m[36m(_extract pid=2302958)[0m [24-07-25 15:16:15] Begin feature extraction on P123's data.
[2m[36m(_extract pid=2302959)[0m [24-07-25 15:16:23] Complete feature extraction on P102's data (140.77 s).
[2m[36m(_extract pid=2302958)[0m Element 147: 0, Type: <class 'numpy.int64'>, Length: N/A[32m [repeated 14