In [1]:
# RAs should qualitatively review the data and see what people are doing before we quantitatively remove outliers.

# input data should be main task data, n-back, procspeed, and other behavioral variables

# this notebook should be used to modify main_trial_level, before it's passed onto Subject_level_variable.ipynb

# We want to get rid of:
# 1. outliers that are out of range (e.g. probability is 900 instead of 90)
# 2. outliers that are 3 SD away in the trial level
# 3. outliers that are 3 SD away in the subject level (this will be done in 1.3 subject level data, not here)
# 4. Reaction times under 300 msec are invalid (too fast to be conscious responses).
# make sure that the raw data columns are kept unchanged.
# cleaned data go in new columns. add one column for each cleaning criterion
# a cell should be null if it contains an outlier.

In [2]:
import os
from os import path
from pathlib import Path
import numpy as np
import pandas as pd

In [3]:
def exclude_3sd(row):
    """
    Return a trial's 'val_estdiff_valid' value, or NaN if it is a within-subject outlier.

    Intended for use with DataFrame.apply(..., axis=1).

    'subj_means' and 'subj_sd' are not function args, therefore undefined locally.
    They are read as globals and must be defined (indexed by SubjNum) before
    this function is called.

    Parameters
    ----------
    row : mapping with 'SubjNum' and 'val_estdiff_valid' entries
        One trial, e.g. a DataFrame row.

    Returns
    -------
    The original value when it lies within 3 SD of that subject's mean,
    otherwise np.nan. A NaN value, mean or SD makes the comparison False,
    so the result is NaN in those cases as well.
    """
    SubjNum,val = row['SubjNum'],row['val_estdiff_valid']
    mean,SD = subj_means[SubjNum],subj_sd[SubjNum]
    diff = abs(val - mean)
    # '<=' rather than '<': when a subject's SD is 0 (all values identical),
    # diff is 0 as well, and a strict comparison would discard every trial
    # even though none of them is an outlier.
    if diff <= 3*SD:
        return val
    return np.nan
    
def out_of_range (row, col='absolute_error', low=0, high=1):
    """
    Return row[col] when it lies in the closed interval [low, high], else NaN.

    Intended for use with DataFrame.apply(..., axis=1); extra keyword
    arguments given to apply() are forwarded, e.g.
    df.apply(out_of_range, axis=1, col='some_column', low=0, high=100).

    main task absolute error has no data that is out of range, so this
    function is not used here but serves as an example.

    Parameters
    ----------
    row : mapping that contains `col`
        One trial, e.g. a DataFrame row.
    col : str
        Name of the entry to check (default 'absolute_error').
    low, high : numbers
        Inclusive bounds of the valid range (defaults 0 and 1).

    Returns
    -------
    The original value if low <= value <= high, otherwise np.nan
    (a NaN input fails the comparison and therefore also yields NaN).
    """
    val = row[col]
    if low <= val <= high:
        return val
    return np.nan
    
def RT_ (row):
    """
    Return row['absolute_error'] when it is within [0, 1], otherwise NaN.

    Intended for use with DataFrame.apply(..., axis=1).

    NOTE(review): despite its name, this function does not read any
    reaction-time value; its body checks the same column and bounds as
    out_of_range. Presumably it was meant to implement a reaction-time
    cutoff -- confirm the intended column and threshold before using it.
    """
    # 'SubjNum' is looked up but not otherwise used.
    _subj, err = row['SubjNum'], row['absolute_error']
    if not (0 <= err <= 1):
        return np.nan
    return err

In [4]:
# Trial-level data is read from a CSV file that sits next to this notebook.
location = 'task_active_all_subjects.csv'
df = pd.read_csv(location)
# Preview the first rows to confirm the file loaded as expected.
df.head()

Unnamed: 0,experimentname,SubjNum,session,attribbadlosepercent,attribbadwinpercent,attribgainbondamount,attribgainloseamount,attribgainwinamount,attribgoodlosepercent,attribgoodwinpercent,...,firstassetchoseninblock,incorrchoicematchingfrstinblck,firstchoicestock,pctfirstchoicestock,education,household_income,hhinc_bin,college,lowinc,age_decade
0,Task Active Involvement ed. 4.9,1002,1,0.7,0.3,6,2,10,0.3,0.7,...,1.0,0,1.0,0.7,14,"$10,000-$19,999",2,0,1,4
1,Task Active Involvement ed. 4.9,1002,1,0.7,0.3,6,2,10,0.3,0.7,...,1.0,1,,0.7,14,"$10,000-$19,999",2,0,1,4
2,Task Active Involvement ed. 4.9,1002,1,0.7,0.3,6,2,10,0.3,0.7,...,1.0,0,,0.7,14,"$10,000-$19,999",2,0,1,4
3,Task Active Involvement ed. 4.9,1002,1,0.7,0.3,6,2,10,0.3,0.7,...,1.0,0,,0.7,14,"$10,000-$19,999",2,0,1,4
4,Task Active Involvement ed. 4.9,1002,1,0.7,0.3,6,2,10,0.3,0.7,...,1.0,0,,0.7,14,"$10,000-$19,999",2,0,1,4


In [5]:
# Standard deviation of 'absolute_error' pooled over every trial of every subject.
df.loc[:, 'absolute_error'].std()

23.91146146532946

In [6]:
# Mean of 'absolute_error' pooled over every trial of every subject.
df.loc[:, 'absolute_error'].mean()

28.614886047241292

In [7]:
# Per-subject mean of 'val_estdiff_valid', indexed by SubjNum (read as a global by exclude_3sd).
# The column is selected BEFORE aggregating: averaging the whole frame would also
# try to average the text columns (e.g. experimentname, household_income), which
# raises a TypeError on pandas >= 2.0 and wastes work on older versions.
subj_means = df.groupby('SubjNum')['val_estdiff_valid'].mean()
subj_means.head()

SubjNum
1002   -42.796611
1004   -35.789474
1007   -27.433334
1008    -4.900000
1009     2.033333
Name: val_estdiff_valid, dtype: float64

In [8]:
# Per-subject standard deviation of 'val_estdiff_valid', indexed by SubjNum
# (read as a global by exclude_3sd).
# The column is selected BEFORE aggregating so the text columns in the frame are
# never passed to std(), which raises on recent pandas versions.
subj_sd = df.groupby('SubjNum')['val_estdiff_valid'].std()

In [9]:
# Out-of-range filtering is not applied in this cell.
# 3 SD away in the trial level:
# axis='columns' (same as axis=1) hands exclude_3sd one row at a time;
# axis=0 would hand it one column at a time instead.
# Raw columns stay untouched -- the cleaned values go into new columns.
df['val_error_3sd_removed'] = df.apply(exclude_3sd, axis='columns')
# Absolute value of the cleaned signed error; NaN entries stay NaN.
df['abs_error_3sd_removed'] = df['val_error_3sd_removed'].abs()

In [10]:
# Write the trial-level frame, including the new cleaned columns, to CSV.
# NOTE(review): the index is written as an unnamed leading column, and the input
# already carries an 'Unnamed: 0' column, so the file ends up with two index-like
# columns -- confirm whether downstream notebooks rely on this before changing it.
df.to_csv("main_trial_level_cleaned_7.8.2019.csv", index=True)