In [1]:
import os
import pandas as pd
import numpy as np
from convert_eprime import convert as ep

In [2]:
# Project-relative data locations: raw inputs and derived outputs both live
# one directory above the notebook.
source_dir, derivs_dir = (
    os.path.join('..', leaf) for leaf in ('sourcedata', 'derivatives')
)

# Convert all N-back data to csv

#### Define a function to clean the N-back data

Stack the blocks vertically instead of horizontally, label the trial rows properly, and tag each trial as a HIT, MISS, FA, or CR. We also output a new, cleaned-up CSV data file in the sourcedata folder.

In [3]:
def nstack_score_label(fpath,outpath):
    """Reshape one subject's wide N-back workbook to long form, score it, and save it.

    The workbook at ``fpath`` is read with ``pd.read_excel``. Every column name
    must look like ``'<block>.<measure>'`` (split on '.'); the measures
    ``Rsp``, ``CRsp`` and ``RT`` are required. Blocks are stacked vertically so
    each output row is one trial, indexed by ``sub``, ``block`` and ``trial``.

    Parameters
    ----------
    fpath : str
        Path to the Excel file. The subject label is taken from the file name,
        which must start with ``'<prefix>-<id>_'`` (e.g. ``sub-400_...``).
    outpath : str
        Destination path of the cleaned CSV.

    Notes
    -----
    Each trial is flagged (0/1) in four new columns from ``Rsp`` and ``CRsp``
    (presumably the given and the correct response -- confirm against the
    task export):

    ====  =====  ======
    Rsp   CRsp   column
    ====  =====  ======
    0     0      CR
    0     1      MISS
    1     0      FA
    1     1      HIT
    ====  =====  ======

    An ``RT`` of 0 is replaced with NaN. Nothing is returned; the result is
    written to ``outpath`` and a confirmation line is printed.
    """
    df = pd.read_excel(fpath)

    # Hierarchicalize the column index: 'B1.Rsp' -> ('B1', 'Rsp').
    # Built from however many columns the sheet has, rather than assuming six.
    split_names = (str(col).split('.') for col in df.columns)
    df.columns = pd.MultiIndex.from_tuples(
        [(parts[0], parts[1]) for parts in split_names]
    )

    # Stack blocks, Reset trial row index, and Rename columns to be descriptive
    df = df.stack(0).reset_index().rename(
        columns={'level_0':'trial','level_1':'block'}
    ).sort_values(['block','trial'])
    df['sub'] = os.path.basename(fpath).split('_')[0].split('-')[1]
    df['block'] = df['block'].str[1]   # second character of the block label
    df['trial'] = df['trial'] + 1      # 1-based trial numbering
    df = df.set_index(['sub','block','trial'])

    # Determine Hits, Misses, CRs, FAs
    responded = df['Rsp'] == 1
    withheld = df['Rsp'] == 0
    target = df['CRsp'] == 1
    foil = df['CRsp'] == 0
    df['CR']   = (withheld & foil).astype(int)
    df['MISS'] = (withheld & target).astype(int)
    df['FA']   = (responded & foil).astype(int)
    df['HIT']  = (responded & target).astype(int)

    # Convert RT 0 to RT NaN (np.nan: the np.NaN alias was removed in NumPy 2.0)
    df['RT'] = df['RT'].replace(0,np.nan)

    # Output to new CSV datafile
    df.to_csv(outpath)
    print('Output file successfully created- ',outpath)

#### Read all the subject data

Executing N-back data cleaning & Setting up for subject-level analysis.

In [4]:
# Convert each 400-series subject's raw N-back workbook to a cleaned CSV next
# to it, skipping subjects whose CSV already exists.
for s in sorted(os.listdir(source_dir)):
    if not s.startswith('sub-4'):
        continue
    sub_dir = os.path.join(source_dir,s)
    if not os.path.isdir(sub_dir):
        # A stray file named like a subject would make os.listdir raise.
        continue
    for f in sorted(os.listdir(sub_dir)):
        # Expected name: <sub>_task-nback_<suffix>.xlsx. The length check keeps
        # unrelated workbooks without underscores from raising IndexError.
        name_parts = f.split('_')
        is_nback_xlsx = (
            f.endswith('.xlsx')
            and len(name_parts) >= 2
            and name_parts[-2] == 'task-nback'
        )
        if not is_nback_xlsx:
            continue
        fpath = os.path.join(sub_dir,f)
        opath = os.path.join(sub_dir,f.split('.')[0]+'.csv')
        if os.path.isfile(opath):
            print(opath,'exists')
        else:
            nstack_score_label(fpath,opath)

..\sourcedata\sub-400\sub-400_task-nback_beh.csv exists
..\sourcedata\sub-401\sub-401_task-nback_beh.csv exists
..\sourcedata\sub-402\sub-402_task-nback_beh.csv exists
..\sourcedata\sub-403\sub-403_task-nback_beh.csv exists
..\sourcedata\sub-404\sub-404_task-nback_beh.csv exists
..\sourcedata\sub-405\sub-405_task-nback_beh.csv exists
..\sourcedata\sub-406\sub-406_task-nback_beh.csv exists
..\sourcedata\sub-407\sub-407_task-nback_beh.csv exists
..\sourcedata\sub-408\sub-408_task-nback_beh.csv exists
..\sourcedata\sub-409\sub-409_task-nback_beh.csv exists
..\sourcedata\sub-410\sub-410_task-nback_beh.csv exists
..\sourcedata\sub-411\sub-411_task-nback_beh.csv exists
..\sourcedata\sub-412\sub-412_task-nback_beh.csv exists
..\sourcedata\sub-413\sub-413_task-nback_beh.csv exists
..\sourcedata\sub-414\sub-414_task-nback_beh.csv exists
..\sourcedata\sub-415\sub-415_task-nback_beh.csv exists
..\sourcedata\sub-416\sub-416_task-nback_beh.csv exists
..\sourcedata\sub-417\sub-417_task-nback_beh.csv

# Combine subjects (OA and YA) and output nback_trial_level

## Read in and concatenate subject datasheets

In [5]:
# Collect every 400-series subject's cleaned N-back CSV into a list of frames.
cleaned_dataframes_list = []

for s in sorted(os.listdir(source_dir)):
    if not s.startswith('sub-4'):
        continue
    sub_dir = os.path.join(source_dir,s)
    if not os.path.isdir(sub_dir):
        # A stray file named like a subject would make os.listdir raise.
        continue
    for f in sorted(os.listdir(sub_dir)):
        # Expected name: <sub>_task-nback_<suffix>.csv. The length check keeps
        # unrelated CSVs without underscores from raising IndexError.
        name_parts = f.split('_')
        if f.endswith('.csv') and len(name_parts) >= 2 and name_parts[-2] == 'task-nback':
            cleaned_dataframes_list.append(pd.read_csv(os.path.join(sub_dir,f)))

In [6]:
# Stack the per-subject frames into one trial-level table and force RT to a
# numeric dtype (anything unparseable becomes NaN).
oa_nback_trial_level = pd.concat(cleaned_dataframes_list).assign(
    RT=lambda frame: pd.to_numeric(frame['RT'], errors='coerce')
)

###### Output N-back trial-level data

In [7]:
# Save the combined OA trial-level table without the row index.
oa_nback_trial_level.to_csv(
    path_or_buf=os.path.join(derivs_dir, 'trialwise', 'oa_nback_trial_level.csv'),
    index=False,
)

##### Input YA N-back trial-level data

In [8]:
# Load the previously exported YA trial-level table and force RT to a numeric
# dtype (anything unparseable becomes NaN).
ya_nback_trial_level = pd.read_csv(
    os.path.join(derivs_dir, 'trialwise', 'ya_nback_trial_level_1.23.19.csv')
).assign(RT=lambda frame: pd.to_numeric(frame['RT'], errors='coerce'))


###### Merge

In [9]:
# Outer-merge the OA and YA tables; with no key given, pandas joins on every
# column the two frames share.
nback_trial_level = pd.merge(
    oa_nback_trial_level,
    ya_nback_trial_level,
    how='outer',
)

In [10]:
# Per-subject summary statistics (count, mean, std, quartiles) for HIT and FA.
# Selecting the two columns before describe() avoids computing statistics for
# every other column only to discard them.
nback_trial_level.groupby('sub')[['HIT','FA']].describe()

Unnamed: 0_level_0,HIT,HIT,HIT,HIT,HIT,HIT,HIT,HIT,FA,FA,FA,FA,FA,FA,FA,FA
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
sub,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
145,21.0,0.142857,0.358569,0.0,0.0,0.0,0.0,1.0,21.0,0.190476,0.402374,0.0,0.0,0.0,0.00,1.0
146,27.0,0.148148,0.362014,0.0,0.0,0.0,0.0,1.0,27.0,0.222222,0.423659,0.0,0.0,0.0,0.00,1.0
147,27.0,0.037037,0.192450,0.0,0.0,0.0,0.0,1.0,27.0,0.000000,0.000000,0.0,0.0,0.0,0.00,0.0
148,27.0,0.111111,0.320256,0.0,0.0,0.0,0.0,1.0,27.0,0.074074,0.266880,0.0,0.0,0.0,0.00,1.0
149,27.0,0.259259,0.446576,0.0,0.0,0.0,0.5,1.0,27.0,0.074074,0.266880,0.0,0.0,0.0,0.00,1.0
150,27.0,0.037037,0.192450,0.0,0.0,0.0,0.0,1.0,27.0,0.222222,0.423659,0.0,0.0,0.0,0.00,1.0
152,27.0,0.111111,0.320256,0.0,0.0,0.0,0.0,1.0,27.0,0.000000,0.000000,0.0,0.0,0.0,0.00,0.0
153,27.0,0.185185,0.395847,0.0,0.0,0.0,0.0,1.0,27.0,0.185185,0.395847,0.0,0.0,0.0,0.00,1.0
154,26.0,0.230769,0.429669,0.0,0.0,0.0,0.0,1.0,26.0,0.038462,0.196116,0.0,0.0,0.0,0.00,1.0
155,26.0,0.230769,0.429669,0.0,0.0,0.0,0.0,1.0,26.0,0.115385,0.325813,0.0,0.0,0.0,0.00,1.0


## Get rid of RT outliers at the trial level:

In [11]:
from outliers import group_exclude

In [12]:
# Show group_exclude's docstring as a reminder of the arguments it expects.
print(group_exclude.__doc__) # what to pass into the group_exclude function

 Takes a DataFrame, column name to group on, and a value column name.
    
    Performs outlier exclusion based on quantile thresholds and returns a Series.


In [13]:
# Add an outlier-screened copy of RT, computed within each subject
# (grouping column 'sub', value column 'RT').
# NOTE(review): the column name says "3sd", but group_exclude's docstring
# describes quantile thresholds -- confirm which rule is actually applied.
nback_trial_level['RT_3sd_removed'] = group_exclude(nback_trial_level, 'sub', 'RT')

In [14]:
# Save the merged trial-level table (including RT_3sd_removed) without the row index.
nback_trial_level.to_csv(
    path_or_buf=os.path.join(derivs_dir, 'trialwise', 'nback_trial_level.csv'),
    index=False,
)

### Group, expand, trim N-back data
Group by subjects, get the sum of all columns, the count of the trial column, and the mean of the RT column.

Establish Hit % `number of Hits / number of targets` and FA % `number of FAs / number of foils`. 

Corrected Recognition `HIT% - FA%`. 

In [15]:
# Collapse trials to one row per subject, computing only the aggregates the
# scores below need. Aggregating named columns (instead of sum()/count()/mean()
# over the whole frame) avoids wasted work and the TypeError that mean() raises
# on non-numeric columns in pandas >= 2.0.
# NOTE(review): the mean uses the raw 'RT' column, not the trial-level
# 'RT_3sd_removed' column computed earlier -- confirm this is intended.
nback_df = nback_trial_level.groupby('sub', as_index=False).agg(
    HIT=('HIT', 'sum'),
    FA=('FA', 'sum'),
    CRsp=('CRsp', 'sum'),       # summed CRsp, used as the number of targets
    trial=('trial', 'count'),   # number of trials
    RT=('RT', 'mean'),          # mean RT; NaN trials are skipped
)
nback_df['HIT%'] = nback_df['HIT'] / nback_df['CRsp']
nback_df['FA%'] = nback_df['FA'] / (nback_df['trial'] - nback_df['CRsp'])  # foils = trials - targets
nback_df['CoR'] = nback_df['HIT%'] - nback_df['FA%']
# .copy() makes this an independent frame so later column assignments do not
# trigger SettingWithCopyWarning.
nback_df = nback_df[['sub', 'RT','HIT%','FA%','CoR']].copy()

nback_df.loc[nback_df['sub']==432]

Unnamed: 0,sub,RT,HIT%,FA%,CoR
170,432,591.166667,0.75,0.0,0.75


## Get rid of subject level outliers: 
Apply this to both RT and CoR.
Group subjects by age group: 100s together, 400s together, 2000s together.

In [16]:
# Make a column called study to distinguish 100-level, 2000-level, and
# 400-level participants: the label is the first character of the subject ID.
nback_df['study'] = nback_df['sub'].astype(str).str[0]
# Subjects 500 and 501 are grouped with the 400-level study. The label is the
# string '4' (not the integer 4) so it matches the other labels and these
# subjects land in the same group.
nback_df.loc[(nback_df['sub'] > 499) & (nback_df['sub'] < 502), 'study'] = '4'

In [17]:
# Screen subject-level outliers within each study group, once per measure;
# each result goes into a '<measure>_3sd_removed' column.
for measure in ('RT', 'CoR'):
    nback_df[measure + '_3sd_removed'] = group_exclude(nback_df, 'study', measure)

nback_df.head(100)

Unnamed: 0,sub,RT,HIT%,FA%,CoR,study,RT_3sd_removed,CoR_3sd_removed
0,145,1619.714286,0.750000,0.153846,0.596154,1,1619.714286,0.596154
1,146,1401.740741,1.000000,0.230769,0.769231,1,1401.740741,0.769231
2,147,557.777778,0.333333,0.000000,0.333333,1,,0.333333
3,148,710.333333,1.000000,0.074074,0.925926,1,710.333333,0.925926
4,149,819.962963,0.700000,0.100000,0.600000,1,819.962963,0.600000
5,150,1281.148148,0.142857,0.260870,-0.118012,1,1281.148148,-0.118012
6,152,796.259259,0.500000,0.000000,0.500000,1,796.259259,0.500000
7,153,871.074074,0.833333,0.208333,0.625000,1,871.074074,0.625000
8,154,1545.500000,0.857143,0.043478,0.813665,1,1545.500000,0.813665
9,155,1307.076923,1.000000,0.125000,0.875000,1,1307.076923,0.875000


###### Output N-back subject-level data

In [18]:
# Write the subject-level summary table to the subjectwise derivatives folder.
# NOTE(review): unlike the trial-level exports, index=False is not passed here,
# so the row index is written as an extra unnamed first column -- confirm intended.
nback_df.to_csv(os.path.join(derivs_dir,'subjectwise','nback_subject_level.csv'))