**Table of contents**<a id='toc0_'></a>    
- [Purpose of the notebook](#toc1_1_)    
- [General guidelines based on EDA](#toc2_)    
- [Removal based on data percentage threshold](#toc3_)    
- [Removal based on length of NaN sequences](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

## <a id='toc1_1_'></a>[Purpose of the notebook](#toc0_)

This notebook develops an algorithm for accepting or rejecting trials, blocks and participants.

In [2]:
import sys

# NOTE(review): the path below uses Windows-style backslashes and is relative to the
# current working directory, so it only resolves when the notebook is run from its own folder on Windows.
sys.path.insert(
    1, "..\\utilities\\"
)  # adds utilities folder to path so we can import modules from it, won't be needed after packaging

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
# Project-local modules; presumably located in the utilities folder added to sys.path above — verify.
import loading_utils as load
import data_utils

# Participant IDs iterated over by the (commented-out) data-preparation cells below.
participant_list = [200, 201, 202, 204, 205, 206, 207, 209, 210, 211, 212, 213]

In [None]:
# This cell builds the participant dataframes from the raw files in directory data_dir (here a Windows path to the retinawise mirror folder on a drive).
# It saves each participant dataframe into the directory defined in save_path (about 300-400 MB per participant), named 2xx_recording_data.csv.
# Uncomment and run it if you don't have these dataframes yet (the folder specified in save_path must already exist).
# data_dir = "D:/retinawise_mirror/raw/"
# save_path = './results/'
# for participant_id in participant_list:
#     data_df, protocol_timecourse_df, protocol_vars_df = load.load_participant_data(participant_no=participant_id,
#                                                                                    data_dir=data_dir,
#                                                                                    include_failed=False,
#                                                                                    save=True,
#                                                                                    save_path=save_path)

In [None]:
# This cell resamples the participant dataframes to 30 Hz and extracts only the -1:18 s window of the trials.
# It loads the participant dataframes from data_dir (e.g. the save_path of the cell above)
# and saves the results into directory save_path, named 2xx + data_suffix (e.g. 2xx_nonan_30_resampled_data.csv).
# Uncomment and run it if you don't have the resampled dataframes yet (the folder specified in save_path must already exist).
# data_dir = "./results/"
# save_path = './results/resampled/'
# data_suffix = "_nonan_30_resampled_data.csv"
# for participant_id in participant_list:
#     data_path = os.path.join(data_dir,str(participant_id)+'_recording_data.csv')
#     data_df = pd.read_csv(data_path)
#     resampled_df = data_utils.resample_by_trial(data_df,sample_freq=30)
#     save_filepath = os.path.join(save_path,str(participant_id)+data_suffix)
#     resampled_df.to_csv(save_filepath)
#     data_df = []
#     resampled_df = []
    

# <a id='toc2_'></a>[General guidelines based on EDA](#toc0_)

<b>Trial acceptance thresholds:</b>

at least 75% non-NaN samples in the 0–6 s period of interest (data resampled to 30 Hz),

at least 40% non-NaN samples in the baseline period (data resampled to 30 Hz),

no NaN sequence longer than x samples (x still to be determined from EDA).


<b>Block acceptance threshold:</b>

at least 3 trials in the flux condition,

at least 1 condition other than flux that also has at least 3 trials.


# <a id='toc3_'></a>[Removal based on data percentage threshold](#toc0_)

In [24]:
def _percent_present_per_trial(df, time_window):
    """Return the percentage of non-NaN 'Stim eye - Size Mm' samples per trial within a time window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'Trial no', 'Trial time Sec' and 'Stim eye - Size Mm'.
    time_window : sequence of two numbers
        [start, end] of the window in trial time (seconds); both ends are inclusive.

    Returns
    -------
    pandas.Series
        Percentage (0-100) of non-NaN samples, indexed by 'Trial no'. Trials with no
        rows inside the window do not appear in the result.
    """
    start, end = time_window[0], time_window[1]
    in_window = (df["Trial time Sec"] >= start) & (df["Trial time Sec"] <= end)
    samples = df.loc[in_window].groupby("Trial no")["Stim eye - Size Mm"]
    # count() ignores NaN while size() counts every row, so the ratio is the fraction of data present.
    return samples.count() / samples.size() * 100


def remove_trials_below_percentage(resampled_df, baseline_threshold=40, poi_threshold=75, baseline_time=[-1, 0], poi_time=[0, 6]):
    """Keep only trials with enough non-NaN pupil data in the baseline and the period of interest (POI).

    A trial is accepted when the percentage of non-NaN 'Stim eye - Size Mm' samples is
    at least `poi_threshold` within `poi_time` AND at least `baseline_threshold` within
    `baseline_time`. Both windows are inclusive at both ends, so with the default
    arguments a sample at exactly 0 s counts towards both windows.

    The two per-window results are matched by trial number, not by row position, so a
    trial that has no rows at all in one of the windows is rejected and cannot shift
    the pairing of the remaining trials.

    Parameters
    ----------
    resampled_df : pandas.DataFrame
        Frame with the columns 'Trial no', 'Trial time Sec' and 'Stim eye - Size Mm'.
        It is not modified.
    baseline_threshold : float, default 40
        Minimum percentage of non-NaN samples required in the baseline window.
    poi_threshold : float, default 75
        Minimum percentage of non-NaN samples required in the POI window.
    baseline_time : sequence of two numbers, default [-1, 0]
        [start, end] of the baseline window in seconds (never mutated here).
    poi_time : sequence of two numbers, default [0, 6]
        [start, end] of the POI window in seconds (never mutated here).

    Returns
    -------
    pandas.DataFrame
        New frame containing only the rows of accepted trials, with a fresh 0..n-1 index.
    """
    poi_percent = _percent_present_per_trial(resampled_df, poi_time)
    baseline_percent = _percent_present_per_trial(resampled_df, baseline_time)

    # Trial numbers passing each criterion; intersecting them aligns on the trial number itself.
    poi_ok = poi_percent[poi_percent >= poi_threshold].index
    baseline_ok = baseline_percent[baseline_percent >= baseline_threshold].index
    trials_accepted = poi_ok.intersection(baseline_ok)

    # Boolean indexing already yields a new frame, so the input is left untouched.
    accepted_rows = resampled_df["Trial no"].isin(trials_accepted)
    return resampled_df[accepted_rows].reset_index(drop=True)

In [25]:
# Location and naming of the 30 Hz resampled per-participant files (file name = 2xx + data_suffix).
data_dir = "./results/resampled/"
data_suffix = "_nonan_30_resampled_data.csv"

# Load the resampled data of participant 201.
data_path = os.path.join(data_dir, f"{201}{data_suffix}")
data_df = pd.read_csv(data_path)

# Drop trials without enough non-NaN data in the baseline or the period of interest.
thresholded_df = remove_trials_below_percentage(
    data_df,
    baseline_threshold=40,
    poi_threshold=75,
    baseline_time=[-1, 0],
    poi_time=[0, 6],
)

In [28]:
# Number of distinct trial numbers before and after thresholding
# (dropna=False keeps a NaN trial number counted as one value, like len(unique())).
no_trials_before_threshold = data_df['Trial no'].nunique(dropna=False)
no_trials_after_threshold = thresholded_df['Trial no'].nunique(dropna=False)
print(
    f'Number of trials before thresholding: {no_trials_before_threshold}',
    f'Number of trials after thresholding: {no_trials_after_threshold}',
    sep='\n',
)


Number of trials before thresholding: 525
Number of trials after thresholding: 492


In [None]:
# Display the thresholded dataframe for inspection.
thresholded_df

Unnamed: 0.1,Unnamed: 0,Trial time datetime,Stim eye - Size Mm,Trial time Sec,Trial no,Trial type,Block,Test,Recording id,Eye,Participant id,Trial phase
0,0,-1 days +23:59:59,8.41547,-1.000000,1.0,lms,0,b,1,R,201,pre-stim
1,1,-1 days +23:59:59.033333334,8.42239,-0.966667,1.0,lms,0,b,1,R,201,pre-stim
2,2,-1 days +23:59:59.066666668,8.43141,-0.933334,1.0,lms,0,b,1,R,201,pre-stim
3,3,-1 days +23:59:59.100000002,8.44198,-0.900000,1.0,lms,0,b,1,R,201,pre-stim
4,4,-1 days +23:59:59.133333336,8.45506,-0.866667,1.0,lms,0,b,1,R,201,pre-stim
...,...,...,...,...,...,...,...,...,...,...,...,...
280435,299245,0 days 00:00:17.833333710,6.01270,17.833333,525.0,flux,10,b,22,R,201,post-stim
280436,299246,0 days 00:00:17.866667044,6.03140,17.866667,525.0,flux,10,b,22,R,201,post-stim
280437,299247,0 days 00:00:17.900000378,6.05144,17.900000,525.0,flux,10,b,22,R,201,post-stim
280438,299248,0 days 00:00:17.933333712,6.07841,17.933333,525.0,flux,10,b,22,R,201,post-stim


# <a id='toc4_'></a>[Removal based on length of NaN sequences](#toc0_)

In [None]:
def find_consecutive_nans(trial):
    """Count the NaNs that follow each valid 'Stim eye - Size Mm' sample of a trial.

    Samples are grouped by the running count of non-NaN values, so every group holds
    one valid sample plus the NaN run directly after it; group 0 exists only when the
    trial starts with NaNs and holds that leading run.

    Returns a Series indexed by the running count, whose values are the number of
    NaNs in each group (0 when a valid sample is directly followed by another one).
    """
    pupil_size = trial['Stim eye - Size Mm']
    is_missing = pupil_size.isnull().astype(int)
    # The running count of valid samples stays constant across a NaN run, which makes it a run id.
    run_id = pupil_size.notnull().astype(int).cumsum()
    return is_missing.groupby(run_id).sum()

def mark_nan_sequences(trial):
    """Number the NaNs inside each NaN run of 'Stim eye - Size Mm', keeping the trial's index.

    Valid samples get 0 and the k-th NaN of a run gets k, so the last NaN of a run
    carries the run length. Because the original row index is preserved, the result
    can be lined up with the trial's other columns (e.g. to see in which phase a
    long run occurs).
    """
    pupil_size = trial['Stim eye - Size Mm']
    is_missing = pupil_size.isnull().astype(int)
    # The running count of valid samples stays constant across a NaN run, which makes it a run id.
    run_id = pupil_size.notnull().astype(int).cumsum()
    return is_missing.groupby(run_id).cumsum()
    