In [1]:
import json
from pathlib import Path
import pandas as pd
from functools import reduce
from itertools import repeat
import numpy as np
%config Completer.use_jedi = False
from tqdm import tqdm

In [2]:
# Gather the label CSVs of one questionnaire type. The identifier is matched as
# a plain substring of each path string.
session_identifier = "karolinska"
files = list(map(str, Path("sleep_alc_labels/").iterdir()))

# Sorted so the listing order is deterministic.
session_files = sorted(filter(lambda path: session_identifier in path, files))
print(*session_files, sep="\n")

sleep_alc_labels/001_1_a_karolinska.csv
sleep_alc_labels/001_2_s_karolinska.csv
sleep_alc_labels/001_3_b_karolinska.csv
sleep_alc_labels/002_1_b_karolinska.csv
sleep_alc_labels/002_2_a_karolinska.csv
sleep_alc_labels/002_3_s_karolinska.csv
sleep_alc_labels/003_1_b_karolinska.csv
sleep_alc_labels/003_2_s_karolinska.csv
sleep_alc_labels/003_3_a_karolinska.csv
sleep_alc_labels/004_1_s_karolinska.csv
sleep_alc_labels/004_2_a_karolinska.csv
sleep_alc_labels/004_3_b_karolinska.csv
sleep_alc_labels/005_1_s_karolinska.csv
sleep_alc_labels/005_2_b_karolinska.csv
sleep_alc_labels/005_3_a_karolinska.csv
sleep_alc_labels/008_1_b_karolinska.csv
sleep_alc_labels/008_2_a_karolinska.csv
sleep_alc_labels/008_3_s_karolinska.csv
sleep_alc_labels/009_1_b_karolinska.csv
sleep_alc_labels/011_1_s_karolinska.csv
sleep_alc_labels/011_2_b_karolinska.csv
sleep_alc_labels/011_3_a_karolinska.csv
sleep_alc_labels/014_1_b_karolinska.csv


In [3]:
# List the per-recording feature files and peek at the first record of the
# first file to see the per-frame layout.
feature_files = sorted(str(entry) for entry in Path("potsdam_aeye_112020/").iterdir())
with open(feature_files[0]) as fp:
    data = json.load(fp)
print(data[0])

{'eye_closure': {'combined': 0.26446233811198994, 'left_image': 0.24725983222991488, 'right_image': 0.281664843994065}, 'eye_state': {'combined': 0, 'left_image': 0, 'right_image': 0}, 'index': 0}


In [4]:
def print_nan_intersections(df: pd.DataFrame):
    """Print, per column, how many of its NaN rows are NaN in no other column.

    Args:
        df: Frame to inspect. Rows are identified by their index labels.

    Prints one line per column; returns nothing.
    """
    # Index labels of the NaN rows, per column.
    nan_indices = {
        column: set(df.index[df[column].isna().to_numpy()])
        for column in df.columns
    }

    for column, col_indices in nan_indices.items():
        # set().union(...) also works when there are no other columns; a bare
        # reduce() over an empty sequence raised TypeError for one-column frames.
        other_indices = set().union(
            *(indices for other, indices in nan_indices.items() if other != column)
        )
        unique_nans = col_indices - other_indices
        print(f"Column {column} has {len(unique_nans)} nans that appear in no other column.")
    

In [178]:
def add_karolinska_file_to_feature_df(filepath: str, feature_df: pd.DataFrame):
    """Join per-frame Karolinska ratings from a label CSV onto ``feature_df``.

    The CSV must provide the columns ``frame_begin``, ``frame_end`` and
    ``response_karolinska``, one row per rating. Two columns are added:

    * ``karolinska_response_nearest_interpolation``: every frame gets the
      response of one rating. The switch from one rating to the next happens
      at the truncated midpoint between this rating's ``frame_end`` and the
      next rating's ``frame_begin``.
    * ``karolinska_response_linear_interpolation``: responses are linearly
      interpolated between the centres of consecutive rating windows, held
      constant before the first and after the last centre, and rounded to the
      nearest integer.

    Args:
        filepath: Path of the label CSV.
        feature_df: Frame indexed by frame number; frames ``0`` up to and
            including ``max(feature_df.index)`` are labelled.

    Returns:
        ``feature_df`` left-joined with the two response columns.

    Raises:
        ValueError: If the CSV contains no rating rows.
    """
    ratings = pd.read_csv(filepath)
    if ratings.empty:
        raise ValueError(f"Response file {filepath} contains no ratings.")

    # Assumes ratings are listed in chronological order and carry no missing
    # values -- TODO confirm against the label files.
    responses = ratings["response_karolinska"].to_numpy()
    frame_begin = ratings["frame_begin"].to_numpy()
    frame_end = ratings["frame_end"].to_numpy()

    n_frames = max(feature_df.index) + 1
    frames = np.arange(n_frames)

    # Nearest neighbour: a frame belongs to rating k when exactly k switch
    # points are <= the frame number.
    switch_points = ((frame_end[:-1] + frame_begin[1:]) / 2).astype(int)
    nearest = responses[np.searchsorted(switch_points, frames, side="right")]

    # Linear: the previous implementation interpolated each rating with
    # itself, which merely reproduced the nearest-neighbour column. Values are
    # rounded because callers store this column as an integer dtype.
    window_centres = (frame_begin + frame_end) / 2
    linear = np.rint(np.interp(frames, window_centres, responses)).astype(int)

    target_df = pd.DataFrame(
        {
            "karolinska_response_nearest_interpolation": nearest,
            "karolinska_response_linear_interpolation": linear,
        },
        index=frames,
    )
    return feature_df.join(target_df)
    
    
def add_perclos_features_to_df(feature_df: pd.DataFrame, interval_in_sec: int = 60, fps: int = 30, closed_threshold: float =  .8):
    """Add eye-closed flags and their trailing-window count to ``feature_df``.

    Two columns are written to ``feature_df`` in place:

    * ``perclos_closed_combined``: ``combined_eye_closure >= closed_threshold``.
      A NaN closure value compares as not closed.
    * ``perclos_combined_60s_interval``: number of closed frames (a count, not
      a proportion) among the ``interval_in_sec * fps`` frames ending at and
      including the current frame. Rows without a full window are NaN. The
      column name stays the same whatever ``interval_in_sec`` is.

    Args:
        feature_df: Frame with a ``combined_eye_closure`` column.
        interval_in_sec: Window length in seconds.
        fps: Frames per second used to convert the window to frames.
        closed_threshold: Closure value from which a frame counts as closed.

    Returns:
        The same ``feature_df`` object with the two columns added.
    """
    # add perclos column
    feature_df["perclos_closed_combined"] = feature_df["combined_eye_closure"] >= closed_threshold

    # add perclos interval
    num_interval_frames = interval_in_sec * fps
    closed = feature_df["perclos_closed_combined"].to_numpy().astype(float)

    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    closed_counts = np.full(closed.shape, np.nan)
    # Guard short recordings: with fewer frames than the window, "valid"
    # convolution swaps its operands and returns an array of the wrong length.
    if len(closed) >= num_interval_frames:
        closed_counts[num_interval_frames - 1:] = np.convolve(
            closed, np.ones(num_interval_frames), "valid"
        )

    feature_df["perclos_combined_60s_interval"] = closed_counts
    return feature_df

def add_blink_features(feature_df: pd.DataFrame, interval_in_sec: int = 60, fps: int = 30):
    """Add blink statistics over a trailing window to ``feature_df``.

    A blink is a run of consecutive ``True`` values in the
    ``perclos_closed_combined`` column, which must already exist. For every
    row ``index >= interval_in_sec * fps`` the window is the
    ``interval_in_sec * fps`` frames *before* that row; runs are cut at the
    window edges. Earlier rows stay NaN.

    Columns written to ``feature_df`` in place (the names stay the same
    whatever ``interval_in_sec`` is):

    * ``max_blink_duration_60s_interval``
    * ``min_blink_duration_60s_interval``
    * ``mean_blink_duration_60s_interval``
    * ``blink_counts_60s_interval``

    Durations are in frames. A window with no closed frame yields 0 for all
    four values.

    Returns:
        The same ``feature_df`` object with the four columns added.
    """
    num_interval_frames = interval_in_sec * fps
    closed = feature_df["perclos_closed_combined"].to_numpy()
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    all_blink_properties = np.full(closed.shape + (4,), np.nan)

    # NOTE(review): this window ends one frame before `index`, whereas the
    # window of perclos_combined_60s_interval includes the current frame.
    # Left unchanged here -- confirm which alignment is intended.
    for index in tqdm(range(num_interval_frames, len(closed))):
        window = closed[index - num_interval_frames:index]
        if not np.any(window):
            # No blink at all: every statistic is 0, including the count. The
            # earlier 4-element placeholder array reported a count of 4.
            all_blink_properties[index, :] = 0
            continue

        # Run-length encoding: mark the window start if it begins closed,
        # every state change, and the window end. Differences between
        # consecutive marks are run lengths, and every second one (starting
        # with the first) belongs to a closed run.
        run_edges = np.flatnonzero(
            np.concatenate(([window[0]], window[:-1] != window[1:], [True]))
        )
        blink_durations = np.diff(run_edges)[::2]
        all_blink_properties[index, :] = (
            blink_durations.max(),
            blink_durations.min(),
            blink_durations.mean(),
            len(blink_durations),
        )

    feature_df["max_blink_duration_60s_interval"] = all_blink_properties[:, 0]
    feature_df["min_blink_duration_60s_interval"] = all_blink_properties[:, 1]
    feature_df["mean_blink_duration_60s_interval"] = all_blink_properties[:, 2]
    feature_df["blink_counts_60s_interval"] = all_blink_properties[:, 3]

    return feature_df
    

def session_file_to_df(filepath: str, filepath_response: str) -> pd.DataFrame:
    """Build the per-frame feature table for one recording.

    Reads the JSON feature file (a list of records with ``eye_closure`` and
    ``eye_state`` mappings), adds metadata parsed from the file name, the
    PERCLOS and blink features, and the Karolinska responses, then casts the
    columns to compact dtypes.

    Args:
        filepath: JSON feature file whose stem has the form
            ``<subject_id>_<session_id>_<session_type>``.
        filepath_response: Karolinska label CSV belonging to the same session.

    Returns:
        Frame indexed by a ``(filename, frame)`` MultiIndex. Rows containing
        NaN are kept.
    """
    print(f"Extracting file {filepath} and response file: {filepath_response}.")
    with open(filepath) as fp:
        data = json.load(fp)

    # join data
    df_eye_closure = pd.DataFrame([item["eye_closure"] for item in data])
    df_eye_closure[df_eye_closure < 0] = 0 # some values are negative and need to be set to zero
    df_eye_state = pd.DataFrame([item["eye_state"] for item in data])
    df_closure_and_state = df_eye_closure.join(df_eye_state, rsuffix="_eye_state", lsuffix="_eye_closure")

    # add meta data
    filename = Path(filepath).stem
    subject_id, session_id, session_type = filename.split("_")
    df_closure_and_state["subject_id"] = subject_id
    df_closure_and_state["session_id"] = session_id
    df_closure_and_state["session_type"] = session_type

    # add perclos features
    PERCLOS_THRESHOLD = .8
    df_closure_and_state = add_perclos_features_to_df(df_closure_and_state, closed_threshold=PERCLOS_THRESHOLD)
    df_closure_and_state = add_blink_features(df_closure_and_state)

    # add karolinska response
    df_closure_and_state = add_karolinska_file_to_feature_df(filepath=filepath_response, feature_df=df_closure_and_state)

    # assign dtypes
    # Encode the session letter as its offset from "a" ("a" -> 0, "b" -> 1, "s" -> 18).
    df_closure_and_state["session_type"] = df_closure_and_state["session_type"].apply(lambda x: (ord(x) - 97))

    # Go through float first so numeric strings and NaN convert cleanly to the
    # nullable integer dtype. "left_image_eye_state" was previously misspelled
    # on the assignment side, so the cast did not target the existing column.
    int8_columns = [
        "subject_id",
        "session_id",
        "session_type",
        "combined_eye_state",
        "left_image_eye_state",
        "right_image_eye_state",
        "karolinska_response_nearest_interpolation",
        "karolinska_response_linear_interpolation",
    ]
    for column in int8_columns:
        df_closure_and_state[column] = df_closure_and_state[column].astype("float").astype("Int8")

    int16_columns = [
        "perclos_combined_60s_interval",
        "max_blink_duration_60s_interval",
        "min_blink_duration_60s_interval",
        "blink_counts_60s_interval",
    ]
    for column in int16_columns:
        df_closure_and_state[column] = df_closure_and_state[column].astype("Int16")

    # "Float16" is not a dtype name on current NumPy/pandas (pandas only has
    # nullable Float32/Float64); NumPy's float16 keeps the compact storage and
    # still represents NaN.
    df_closure_and_state["mean_blink_duration_60s_interval"] = df_closure_and_state["mean_blink_duration_60s_interval"].astype("float16")

    # create multi-index
    multi_index = pd.MultiIndex.from_product([[filename], df_closure_and_state.index], names=["filename", "frame"])
    df_closure_and_state.index = multi_index

    return df_closure_and_state

In [179]:

#df = session_file_to_df(feature_files[0], session_files[0])

In [181]:
# Feature and label files are paired purely by sort order, so check the pairing
# before the expensive extraction: map() over both lists would silently stop at
# the shorter one, or combine a recording with another session's ratings.
assert len(feature_files) == len(session_files), (
    f"{len(feature_files)} feature files but {len(session_files)} label files"
)
for feature_file, session_file in zip(feature_files, session_files):
    assert Path(session_file).stem.startswith(Path(feature_file).stem + "_"), (
        f"Label file {session_file} does not belong to feature file {feature_file}"
    )

feature_df = pd.concat(
    [
        session_file_to_df(feature_file, session_file)
        for feature_file, session_file in zip(feature_files, session_files)
    ]
)

Extracting file potsdam_aeye_112020/001_1_a.json and response file: sleep_alc_labels/001_1_a_karolinska.csv.


100%|██████████| 167047/167047 [00:06<00:00, 26881.96it/s]


Extracting file potsdam_aeye_112020/001_2_s.json and response file: sleep_alc_labels/001_2_s_karolinska.csv.


100%|██████████| 135737/135737 [00:04<00:00, 28174.83it/s]


Extracting file potsdam_aeye_112020/001_3_b.json and response file: sleep_alc_labels/001_3_b_karolinska.csv.


100%|██████████| 148247/148247 [00:05<00:00, 28605.23it/s]


Extracting file potsdam_aeye_112020/002_1_b.json and response file: sleep_alc_labels/002_1_b_karolinska.csv.


100%|██████████| 173277/173277 [00:06<00:00, 27889.73it/s]


Extracting file potsdam_aeye_112020/002_2_a.json and response file: sleep_alc_labels/002_2_a_karolinska.csv.


100%|██████████| 210282/210282 [00:08<00:00, 26068.49it/s]


Extracting file potsdam_aeye_112020/002_3_s.json and response file: sleep_alc_labels/002_3_s_karolinska.csv.


100%|██████████| 154457/154457 [00:05<00:00, 28653.23it/s]


Extracting file potsdam_aeye_112020/003_1_b.json and response file: sleep_alc_labels/003_1_b_karolinska.csv.


100%|██████████| 163626/163626 [00:06<00:00, 24861.32it/s]


Extracting file potsdam_aeye_112020/003_2_s.json and response file: sleep_alc_labels/003_2_s_karolinska.csv.


100%|██████████| 167353/167353 [00:07<00:00, 23867.65it/s]


Extracting file potsdam_aeye_112020/003_3_a.json and response file: sleep_alc_labels/003_3_a_karolinska.csv.


100%|██████████| 183325/183325 [00:07<00:00, 24356.84it/s]


Extracting file potsdam_aeye_112020/004_1_s.json and response file: sleep_alc_labels/004_1_s_karolinska.csv.


100%|██████████| 167308/167308 [00:06<00:00, 24120.70it/s]


Extracting file potsdam_aeye_112020/004_2_a.json and response file: sleep_alc_labels/004_2_a_karolinska.csv.


100%|██████████| 168213/168213 [00:06<00:00, 25313.20it/s]


Extracting file potsdam_aeye_112020/004_3_b.json and response file: sleep_alc_labels/004_3_b_karolinska.csv.


100%|██████████| 147716/147716 [00:05<00:00, 25295.26it/s]


Extracting file potsdam_aeye_112020/005_1_s.json and response file: sleep_alc_labels/005_1_s_karolinska.csv.


100%|██████████| 158235/158235 [00:06<00:00, 25070.93it/s]


Extracting file potsdam_aeye_112020/005_2_b.json and response file: sleep_alc_labels/005_2_b_karolinska.csv.


100%|██████████| 145573/145573 [00:05<00:00, 24895.90it/s]


Extracting file potsdam_aeye_112020/005_3_a.json and response file: sleep_alc_labels/005_3_a_karolinska.csv.


100%|██████████| 188828/188828 [00:07<00:00, 26140.46it/s]


Extracting file potsdam_aeye_112020/008_1_b.json and response file: sleep_alc_labels/008_1_b_karolinska.csv.


100%|██████████| 166151/166151 [00:06<00:00, 24982.45it/s]


Extracting file potsdam_aeye_112020/008_2_a.json and response file: sleep_alc_labels/008_2_a_karolinska.csv.


100%|██████████| 179933/179933 [00:07<00:00, 25419.36it/s]


Extracting file potsdam_aeye_112020/008_3_s.json and response file: sleep_alc_labels/008_3_s_karolinska.csv.


100%|██████████| 156711/156711 [00:06<00:00, 25988.45it/s]


Extracting file potsdam_aeye_112020/009_1_b.json and response file: sleep_alc_labels/009_1_b_karolinska.csv.


100%|██████████| 159957/159957 [00:05<00:00, 27960.76it/s]


Extracting file potsdam_aeye_112020/011_1_s.json and response file: sleep_alc_labels/011_1_s_karolinska.csv.


100%|██████████| 148083/148083 [00:05<00:00, 26203.94it/s]


Extracting file potsdam_aeye_112020/011_2_b.json and response file: sleep_alc_labels/011_2_b_karolinska.csv.


100%|██████████| 143578/143578 [00:05<00:00, 25961.13it/s]


Extracting file potsdam_aeye_112020/011_3_a.json and response file: sleep_alc_labels/011_3_a_karolinska.csv.


100%|██████████| 160519/160519 [00:06<00:00, 25932.07it/s]


Extracting file potsdam_aeye_112020/014_1_b.json and response file: sleep_alc_labels/014_1_b_karolinska.csv.


100%|██████████| 145555/145555 [00:06<00:00, 24058.46it/s]


In [182]:
# Summarise the column dtypes and the "deep" memory usage of the combined frame.
feature_df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3781111 entries, ('001_1_a', 0) to ('014_1_b', 147354)
Data columns (total 17 columns):
 #   Column                                     Dtype  
---  ------                                     -----  
 0   combined_eye_closure                       float64
 1   left_image_eye_closure                     float64
 2   right_image_eye_closure                    float64
 3   combined_eye_state                         Int8   
 4   left_image_eye_state                       Int8   
 5   right_image_eye_state                      Int8   
 6   subject_id                                 Int8   
 7   session_id                                 Int8   
 8   session_type                               Int8   
 9   perclos_closed_combined                    bool   
 10  perclos_combined_60s_interval              Int16  
 11  max_blink_duration_60s_interval            Int16  
 12  min_blink_duration_60s_interval            Int16  
 13  mean_blink_durati

In [183]:
# Persist the combined per-frame feature table as a pickle in the current working directory.
feature_df.to_pickle("./all_session.pkl")