In [1]:
# RAs should qualitatively review the data and see what people are doing before we quantitatively remove outliers.

# input data should be main task data, n-back, procspeed, and other behavioral variables

# this notebook should be used to modify main_trial_level, before it's passed onto Subject_level_variable.ipynb

# We want to get rid of:
# 1. outliers that are out of range (e.g. probability is 900 instead of 90)
# 2. outliers that are 3 SD away in the trial level
# 3. outliers that are 3 SD away in the subject level (this will be done in 1.3 subject level data, not here)
# 4. Reaction time under 300 msec is invalid and not conscious.
# make sure that the raw data columns are kept unchanged.
# cleaned data are in new columns. add a column for each cleaning criterion
# a cell should be null if it contains an outlier.

In [2]:
import os
from os import path
from pathlib import Path
import numpy as np
import pandas as pd


In [3]:
def exclude_3sd(row, means=None, sds=None, n_sd=3):
    """
    Null out a trial whose signed error lies too far from its subject's mean.

    Intended for use with DataFrame.apply(..., axis=1).

    Parameters
    ----------
    row : pandas.Series or mapping
        One trial. Must provide 'SubjNum' and 'val_estdiff_valid'.
    means, sds : mapping indexed by SubjNum, optional
        Per-subject mean and standard deviation of 'val_estdiff_valid'.
        When omitted, the notebook-level globals 'subj_means' and 'subj_sd'
        are used, so they must be defined before this function is called.
    n_sd : number, default 3
        Width of the accepted band, in standard deviations.

    Returns
    -------
    The trial's 'val_estdiff_valid' when |value - mean| < n_sd * SD,
    otherwise np.nan.
    """
    # Fall back to the globals only at call time, so the function can be
    # defined before the per-subject statistics have been computed.
    if means is None:
        means = subj_means
    if sds is None:
        sds = subj_sd

    subj, val = row['SubjNum'], row['val_estdiff_valid']
    deviation = abs(val - means[subj])

    # Strict comparison: a value exactly n_sd SDs away is rejected, and so is
    # any value of a subject whose SD is 0. A NaN value or NaN SD also ends up
    # here as np.nan because comparisons involving NaN are False.
    if deviation < n_sd * sds[subj]:
        return val
    return np.nan
    
def out_of_range(row, col='absolute_error', low=0, high=1):
    """
    Null out a value that falls outside the closed interval [low, high].

    Intended for use with DataFrame.apply(..., axis=1).

    The main task absolute error has no data that is out of range, so this
    function is not used in this notebook; it is kept as an example.

    Parameters
    ----------
    row : pandas.Series or mapping
        One trial. Must provide the key named by `col`.
    col : str, default 'absolute_error'
        Which field of the row to check.
    low, high : number, default 0 and 1
        Inclusive bounds of the valid range.

    Returns
    -------
    The value itself when low <= value <= high, otherwise np.nan
    (a NaN input is also returned as np.nan).
    """
    val = row[col]
    if low <= val <= high:
        return val
    return np.nan
    
def RT_(row, col='ConfidenceRT', min_rt=0.3):
    """
    Null out a reaction time that is too fast to count as a valid response.

    Intended for use with DataFrame.apply(..., axis=1). Implements cleaning
    criterion 4 of this notebook: reaction times under 300 msec are invalid.

    NOTE(review): the previous body was a copy of out_of_range() and checked
    'absolute_error' against [0, 1]; it never looked at a reaction time.
    The defaults below assume that 'ConfidenceRT' is the column to clean and
    that it is recorded in seconds (hence 0.3) -- TODO confirm, and pass
    `col` / `min_rt` explicitly if either assumption is wrong.

    Parameters
    ----------
    row : pandas.Series or mapping
        One trial. Must provide the key named by `col`.
    col : str, default 'ConfidenceRT'
        Which reaction-time field of the row to check.
    min_rt : number, default 0.3
        Smallest reaction time accepted, in the same unit as `col`.

    Returns
    -------
    The reaction time when it is >= min_rt, otherwise np.nan
    (a NaN input is also returned as np.nan).
    """
    val = row[col]
    if val >= min_rt:
        return val
    return np.nan

In [4]:
# Trial-level data of the main task, stored as a CSV file under
# ../derivatives/trialwise/.
trial_csv_parts = ('..', 'derivatives', 'trialwise', 'main_trial_level.csv')
location = path.join(*trial_csv_parts)

# One row per trial; the raw columns loaded here are left unchanged below.
df = pd.read_csv(location)
df.head()

Unnamed: 0,SubjNum,AgeGroup,ExperimenterName,RunNum,Date,Time,TrialNum,TrialNumbydomdist,Domain,Magnitude,...,ConfidenceST,ConfidenceRT,StockNumber,BondNumber,GenderJudgment,TotalPayout,TrueProbGood,EstWithinRange?,val_estdiff_valid,absolute_error
0,100,1,kf,1,10_12,11:31:01.963000,1,1,LOSS,low,...,2141471.0,3.022637,16,9,1,-6,0.3,0,0.2000001,0.2000001
1,100,1,kf,1,10_12,11:31:01.963000,2,2,LOSS,low,...,2141525.0,3.695852,16,9,1,-12,0.155172,0,0.1448277,0.1448277
2,100,1,kf,1,10_12,11:31:01.963000,3,3,LOSS,low,...,2141546.0,3.121775,16,9,1,-18,0.3,1,1.038193e-07,1.038193e-07
3,100,1,kf,1,10_12,11:31:01.963000,4,4,LOSS,low,...,2141574.0,3.406241,16,9,1,-24,0.5,0,-0.1,0.1
4,100,1,kf,1,10_12,11:31:01.963000,5,5,LOSS,low,...,2141602.0,4.553061,16,9,1,-26,0.7,0,-0.1000001,0.1000001


In [5]:
# Standard deviation of the absolute error, pooled over every trial in df.
df.loc[:, 'absolute_error'].std()

0.17717746852107116

In [6]:
# Mean of the absolute error, pooled over every trial in df.
df.loc[:, 'absolute_error'].mean()

0.1929211104725841

In [7]:
# Per-subject mean of the signed estimation error, indexed by SubjNum.
# exclude_3sd() reads this as a global.
# The column is selected *before* aggregating so that only it is averaged:
# averaging the whole frame also hits the string columns (ExperimenterName,
# Domain, ...), which raises a TypeError on pandas >= 2.0.
subj_means = df.groupby('SubjNum')['val_estdiff_valid'].mean()
subj_means.head()

SubjNum
100    0.046724
101   -0.014427
102    0.014467
103    0.036173
104    0.070082
Name: val_estdiff_valid, dtype: float64

In [8]:
# Per-subject standard deviation of the signed estimation error, indexed by
# SubjNum. exclude_3sd() reads this as a global.
# The column is selected *before* aggregating so that only it is aggregated:
# calling .std() on the whole frame also hits the string columns
# (ExperimenterName, Domain, ...), which raises a TypeError on pandas >= 2.0.
subj_sd = df.groupby('SubjNum')['val_estdiff_valid'].std()

In [9]:
#out of range
#axis=1 means that the function is applied to each row (row by row). axis=0 would apply it to each column (column by column)
# Trial-level cleaning: 3 SD rule.
# exclude_3sd() is called once per trial and returns either the signed error
# or NaN; the result goes into new columns so the raw columns stay untouched.
signed_cleaned = df.apply(exclude_3sd, axis='columns')
df['val_error_3sd_removed'] = signed_cleaned
# Magnitude of the cleaned signed error (NaN stays NaN).
df['abs_error_3sd_removed'] = signed_cleaned.abs()

In [10]:
# Save the cleaned trial-level table next to the raw one; this file is the
# input of the subject-level notebook.
derivs_dir = path.join('..','derivatives')
output_path = path.join(derivs_dir,'trialwise','main_trial_level_cleaned.csv')
# index=False: df was read without an index column, so its index is only a
# row counter. Writing it would add one more unnamed column on top of the
# 'Unnamed: 0' column the input already carries (presumably from an earlier
# save that did the same thing).
df.to_csv(output_path, index=False)