In [3]:
import itertools
import math
import os
import random
import re
import statistics as stats
from collections import defaultdict
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib import cm

In [137]:
# Folder layout for the analysis. Every path keeps its trailing slash because the
# cells below build file names by plain string concatenation.
# NOTE(review): absolute, machine-specific root -- adjust before running elsewhere.
project_dir = '/Users/tonyb/Documents/Networking_Pilot/'
raw_dir = project_dir + 'all_data/'
explore_dir = project_dir + 'explore_n33/'
main_DVs_analyses = explore_dir + 'main_DVs_analyses/'
separate_conditions = explore_dir + 'separate_conditions/'

# Create the two output folders when they are missing; existing folders are left alone.
for out_dir in (main_DVs_analyses, separate_conditions):
    if os.path.exists(out_dir):
        continue
    os.makedirs(out_dir)

In [6]:
# Lookup from a '<task>_condition' column name to the short label used when naming
# derived variables (e.g. 'SE_FLANKER', 'DC_GNG_SS').
# 'cue_condition' and 'cued_condition' are two spellings that share the label 'CUE'.
# Alternative key spellings left disabled in the original:
#   'directed_condition' -> 'DF', 'predictive_condition' -> 'PREDICT'
abbrev = dict([
    ('directed_forgetting_condition', 'DF'),
    ('flanker_condition', 'FLANKER'),
    ('go_nogo_condition', 'GNG'),
    ('n_back_condition', 'NBACK'),
    ('delay_condition', 'DELAY'),
    ('predictable_condition', 'PREDICT'),
    ('shape_matching_condition', 'SHAPE'),
    ('stop_signal_condition', 'SS'),
    ('cue-task_condition', 'TASK'),
    ('cue_condition', 'CUE'),
    ('cued_condition', 'CUE'),
])

In [7]:
# Identifiers of the 33 participants in the exploratory sample (one token per participant).
# NOTE(review): not referenced by any other cell shown in this notebook.
explore_id = """
A3QAHF4UUBM7ZO AIQT0DPRTXYYD  A3V1ZYXBJYCIIE
A2WYCY1FMQOD5F A2S4YDJ9UGAXFQ A3G55RJTW3BSGM
A1OSRAPSRT934Z AVJUIF9QHQRY8  A1VCAMP3XM62R4
A3IW9415ZOO0EX A55CXM7QR7R0N  A2XUADP5L61HQ5
AQL960O0LTRI8  A1DS5O8MSI3ZH0 A2YTO4EY3MNYAJ
AY7WPVKHVNBLG  A2OFN0A5CPLH57 A2JI5RNPPXE8QE
A1TS2SKXPX7ZED A1YC558J4E5KZ  A2DWPP1KKAY0HG
AD7CUW86FWEKT  A2SIKW18T2DYYW AQJWO4YPR3LUQ
A1L1SQ488YCCFJ A3ISDIYTS02E8C A3QJJR5Y3XE92N
A1R0689JPSQ3OF A37EV8RZ82WT8E A3GEHH49HNJM57
ASTR3EPUOKEXV  AVMIXXCHPD291  A2581F7TDPAMBQ
""".split()

### One table with all SE, DC, and DE variables (using RT or Acc; 3 steps: taskDict, individual table, one concat table)

In [77]:
### Build taskDict: for every concat file, each condition column with its first two (sorted) levels.
### Resulting shape: {'taskname': [{'DV0': [level0, level1]}, {'DV1': [level0, level1]}, ...]}
taskDict = {}
for concat_file in glob(explore_dir + 'concat/*'):
    # task name = file name up to the first '.'
    filey = os.path.basename(concat_file).split('.')[0]
    # the SS-with-GNG file spells it 'go_no_go'; rewrite to 'go_nogo' as in the other GNG task names
    if 'no_go' in filey:
        filey = filey.replace('no_go', 'nogo')
    concat_df = pd.read_csv(concat_file)
    exp_id = concat_df.exp_id.unique()[0]  # original note: all rows of a file share one exp_id

    # flanker: with more than two levels, collapse them (H and F variants, per the original note)
    # into plain 'incongruent' / 'congruent'
    if 'flanker' in exp_id and len(concat_df['flanker_condition'].unique()) > 2:
        concat_df = concat_df.replace(
            {'flanker_condition': [r'(^.*inc.*$)', r'(^.*_con.*$)']},
            {'flanker_condition': ['incongruent', 'congruent']},
            regex=True,
        )

    # shape matching: with more than two levels, relabel the distractor codes as 'DISTRACTOR'
    # and the no-distractor codes as 'CONTROL'; codes not listed here (e.g. SSS, DSD) are left untouched
    if 'shape' in exp_id and len(concat_df['shape_matching_condition'].unique()) > 2:
        relabelling = (
            (['SDD', 'DDS', 'DDD', 'D!=T'], 'DISTRACTOR'),
            (['SNN', 'DNN', 'NoD'], 'CONTROL'),
        )
        for old_codes, new_code in relabelling:
            concat_df = concat_df.replace(
                {'shape_matching_condition': old_codes},
                {'shape_matching_condition': [new_code] * len(old_codes)},
            )

    # every '*condition*' column except n_back_condition, in alphabetical order
    conditions = sorted(
        col for col in concat_df.columns
        if 'condition' in col and 'n_back_condition' not in col
    )
    # keep only the two lowest-sorting levels of each condition, e.g. [1, 2] or ['congruent', 'incongruent']
    val_list = [
        {abbrev[cond]: np.sort(concat_df[cond].unique())[0:2]}
        for cond in conditions
    ]
    taskDict[filey] = val_list

len(taskDict) #should equal the # of tasks (i.e., 36)


36

In [129]:
### Individual tables (per task) for single effects (SE), dual-contexts (DC), dual-effects (DE), using RTs or Accuracy Rate
def _condition_columns(df, dv, level):
    """Return the columns of `df` that belong to level `level` of condition `dv`.

    Column labels used elsewhere in this notebook embed conditions as '<DV>:<level>'
    (e.g. 'TASK:stay_&_CUE:stay'), so the match is anchored on that pair. A bare-level
    regex is ambiguous: 'congruent' also matches 'incongruent', 'go' matches 'nogo',
    and 'stay' matches any other DV that has a 'stay' level.
    If the anchored pattern matches nothing, the original bare-level regex is used, so
    labels in another format behave exactly as before.
    """
    anchored = r'%s:%s(?![A-Za-z0-9])' % (re.escape(dv), re.escape(str(level)))
    hits = df.filter(regex=anchored)
    if hits.shape[1] == 0:
        hits = df.filter(regex='%s' % level)
    return hits


def _effect(treatment, baseline, label):
    """One-column frame `label` = first column of `treatment` minus first column of `baseline`.

    `to_frame(label)` is used instead of `pd.DataFrame(data=..., columns=[label])`, which
    yields an all-NaN column whenever the difference Series still carries a name.
    """
    return (treatment.iloc[:, 0] - baseline.iloc[:, 0]).to_frame(label)


def _dual_task_effects(first_base, first_treat, dv0, dv1, levels1):
    """Build the DC / DE table for a task with two (groups of) conditions.

    first_base / first_treat: the QA columns already restricted to the baseline / treatment
    level of the first DV. levels1: [baseline, treatment] levels of the second DV.
    DC = effect of one DV while the other is at baseline; DE = effect of one DV while
    the other is at treatment.
    """
    base0_base1 = _condition_columns(first_base, dv1, levels1[0])
    base0_treat1 = _condition_columns(first_base, dv1, levels1[1])
    treat0_base1 = _condition_columns(first_treat, dv1, levels1[0])
    treat0_treat1 = _condition_columns(first_treat, dv1, levels1[1])
    return pd.concat([
        _effect(treat0_base1, base0_base1, '_'.join(['DC', dv0, dv1])),
        _effect(base0_treat1, base0_base1, '_'.join(['DC', dv1, dv0])),
        _effect(treat0_treat1, base0_treat1, '_'.join(['DE', dv0, dv1])),
        _effect(treat0_treat1, treat0_base1, '_'.join(['DE', dv1, dv0])),
    ], axis=1)


for i in ['RT','Acc']:
    new_path = main_DVs_analyses + 'mainDVs%s/' %i #path for output
    if not os.path.exists(new_path):
        os.makedirs(new_path)

for qa_file in glob(explore_dir + 'QA/*'):
    taskName = os.path.basename(qa_file).split('.')[0][:-3] #the last three characters are '_qa'
    if 'no_go' in taskName: taskName = taskName.replace('no_go','nogo')
    qaDf = pd.read_csv(qa_file, index_col=0) #index_col is workerID
    #using RTs
    if 'stop' in taskName: #in stop-signal tasks: m(rt)...SS:go is baseline; ssrt...SS:stop is treatment
        qaDf = qaDf.filter(regex=r'm\(rt\)|ssrt')
    else:
        qaDf = qaDf.filter(regex=r'm\(')
    #using Accuracy
#     qaDf = qaDf.filter(regex='acc')

    # taskDict lists each DV as {label: [baseline level, treatment level]} (baseline sorts first)
    dv_specs = [next(iter(spec.items())) for spec in taskDict[taskName]]
    # Reset for every task: a task that is skipped below must not inherit (and re-write under
    # its own name) the table computed for the previous file.
    mainDvDf = None

    if len(dv_specs) == 1:
        dv0, levels0 = dv_specs[0]
        treat0 = _condition_columns(qaDf, dv0, levels0[1])
        if 'stop' in taskName: #stop_signal_single is exception: ssrt, defined as treatment condition, already is SE.
            mainDvDf = treat0
            if mainDvDf.shape[1] == 1:
                # later cells look this variable up as 'SE_<DV>' (e.g. concat_mainDVs['SE_SS'])
                mainDvDf = mainDvDf.set_axis(['_'.join(['SE', dv0])], axis=1)
        else: #for all other tasks, SE is still treatment minus baseline
            base0 = _condition_columns(qaDf, dv0, levels0[0])
            mainDvDf = _effect(treat0, base0, '_'.join(['SE', dv0]))

    elif len(dv_specs) == 2:
        if taskName == 'stop_signal_with_go_nogo':
            print(taskName+' has many rank<0/NaN values; no further processing for this task.')
        else:
            (dv0, levels0), (dv1, levels1) = dv_specs
            mainDvDf = _dual_task_effects(
                _condition_columns(qaDf, dv0, levels0[0]),
                _condition_columns(qaDf, dv0, levels0[1]),
                dv0, dv1, levels1)

    elif len(dv_specs) == 3: #only cases are cuedTS tasks: 2 DVs from cuedTS (TASK & CUE); 1 from the other task
        #TASK and CUE are considered as though one DV (labelled 'CUE'); the remaining DV is the second one
        dv0 = 'CUE'
        dv1, levels1 = [(dv, levels) for dv, levels in dv_specs
                        if ('TASK' not in dv) and ('CUE' not in dv)][-1]
        mainDvDf = _dual_task_effects(
            qaDf.filter(regex='TASK:stay_&_CUE:stay'),
            qaDf.filter(regex='TASK:switch_&_CUE:switch'),
            dv0, dv1, levels1)

    elif len(dv_specs) == 4:
    # n_back_with_cued_task_switching is the only case (from cuedTS: 'TASK','CUE'; from n-back task: 'NBACK','DELAY').
    # If, when generating QA files (in concat_descriptive_network_attack.ipynb),
        # conditions = [...] is defined to exclude n_back_condition, then there's none.
    # If conditions includes n_back_condition, DELAY and NBACK should be considered as though one DV (like with TASK and CUE above)
        print('4 DVs: %s' % taskName)

    else:
        print('%s: %d condition columns, not handled; skipped.' % (taskName, len(dv_specs)))

    if mainDvDf is None:
        continue  # nothing was computed for this task, so nothing is written

    mainDvDf.to_csv(main_DVs_analyses + 'mainDVsRT/%s_mainDVsRT.csv' %taskName)
#     mainDvDf.to_csv(main_DVs_analyses + 'mainDVsAcc/%s_mainDVsAcc.csv' %taskName)
    

In [None]:
### one table with all SE, DC, DE data, using RTs or Accuracy
### switch input and output directories accordingly
# Cluster labels, in the order their column groups appear in the final table.
cluster_labels = ['CUE', 'DF', 'FLANKER', 'GNG', 'NBACK', 'PREDICT', 'SHAPE', 'SS']

# Read every per-task table (index = first CSV column). Files are taken in sorted order so
# the column order of the result is reproducible; glob() alone returns an arbitrary order.
main_DVs_files = sorted(glob(main_DVs_analyses + 'mainDVsRT/*'))
# main_DVs_files = sorted(glob(main_DVs_analyses + 'mainDVsAcc/*'))
per_task_frames = [pd.read_csv(main_DVs_file, index_col=0) for main_DVs_file in main_DVs_files]
concat_mainDVs = pd.concat(per_task_frames, axis=1) if per_task_frames else pd.DataFrame()
# DELAY variables are filed under the NBACK cluster
concat_mainDVs.columns = concat_mainDVs.columns.str.replace('DELAY','NBACK')

#remove duplicated cols by column name (the first occurrence is kept)
concat_mainDVs = concat_mainDVs.loc[:,~concat_mainDVs.columns.duplicated()]

#reorder concat df to be clustered by tasks then write
single_cols = sorted([c for c in concat_mainDVs.columns if 'SE' in c], key=str.casefold)

# Per cluster: the names of its DC_/DE_ columns, and a frame with SE_<label> first followed
# by those columns in their current order. A missing SE_<label> column raises KeyError.
cluster_cols = {
    label: [c for c in concat_mainDVs.columns if ('DC_%s' % label in c) or ('DE_%s' % label in c)]
    for label in cluster_labels
}
cluster_frames = {
    label: pd.concat([concat_mainDVs['SE_%s' % label],
                      concat_mainDVs.loc[:, concat_mainDVs.columns.isin(cluster_cols[label])]], axis=1)
    for label in cluster_labels
}

# Names defined by the earlier version of this cell, kept so other cells can still use them.
(cued_cols, DF_cols, flanker_cols, GNG_cols,
 nback_cols, predict_cols, shape_cols, stop_cols) = (cluster_cols[label] for label in cluster_labels)
(cued_cols_df, DF_cols_df, flanker_cols_df, GNG_cols_df,
 nback_cols_df, predict_cols_df, shape_cols_df, stop_cols_df) = (cluster_frames[label] for label in cluster_labels)

concat_mainDVs = pd.concat([cluster_frames[label] for label in cluster_labels], axis=1)

#write concat df without the mean row
# The row is removed by its label rather than by position, so a participant is never dropped
# by mistake if 'mean' is not the last row. concat_mainDVs itself keeps the row for later cells.
concat_out_dir = main_DVs_analyses + 'concat/'
os.makedirs(concat_out_dir, exist_ok=True)  # to_csv does not create missing folders
concat_mainDVs.drop(index='mean', errors='ignore').to_csv(concat_out_dir + 'concat_mainDVsRTs.csv', index=False)
# concat_mainDVs.drop(index='mean', errors='ignore').to_csv(concat_out_dir + 'concat_mainDVsAcc.csv', index=False)

In [None]:
### 8x8 mean DCs and DEs summary (using accuracy or RT as main DV, depending on which concat_mainDVs produced above)
tasks = ['CUE','PREDICT','DF','NBACK','FLANKER','SHAPE','GNG','SS']


def _pair_mean(column):
    """Return the 'mean'-row entry of concat_mainDVs for `column`, or NaN if the column is absent.

    Some task pairs are never computed (the per-task cell skips 'stop_signal_with_go_nogo'),
    so their DC_/DE_ columns do not exist; those cells are left empty instead of raising KeyError.
    """
    if column in concat_mainDVs.columns:
        return concat_mainDVs.loc['mean', column]
    return np.nan


meanDCsDf = pd.DataFrame(index=tasks, columns=tasks)
meanDEsDf = pd.DataFrame(index=tasks, columns=tasks)
#the diagonal holds the SE values; every other cell holds the mean DC (or DE) of the row task in the context of the column task
for row in tasks:
    for col in tasks:
        if row==col:
            single_effect = concat_mainDVs.loc['mean','SE_%s' %col]
            meanDCsDf.loc[row,col] = single_effect
            meanDEsDf.loc[row,col] = single_effect
        else:
            meanDCsDf.loc[row,col] = _pair_mean('DC_%s_%s' %(row,col))
            meanDEsDf.loc[row,col] = _pair_mean('DE_%s_%s' %(row,col))
output = pd.concat([meanDCsDf,meanDEsDf],axis=0, keys=['DC','DE'])
# NOTE(review): hard-coded Desktop path, and the file name says RTs regardless of the DV in use.
output.to_csv('/Users/tonyb/Desktop/meanDCsDEs_RTs_byConstructs.csv')

#include the following 2 for loops if standardizing by SE values
#each cell is divided by the SE on the diagonal of its own row (so the diagonal becomes 1)
meanDCsDf_std = pd.DataFrame(index=tasks, columns=tasks)
meanDEsDf_std = pd.DataFrame(index=tasks, columns=tasks)
for row in tasks:
    for col in tasks:
        meanDCsDf_std.loc[row,col] = round(float(meanDCsDf.loc[row,col])/meanDCsDf.loc[row,row],3)
        meanDEsDf_std.loc[row,col] = round(float(meanDEsDf.loc[row,col])/meanDEsDf.loc[row,row],3)
output = pd.concat([meanDCsDf_std,meanDEsDf_std],axis=0, keys=['DC','DE'])
output.to_csv('/Users/tonyb/Desktop/meanDCsDEs_RTs_byConstructs_stdBySEs.csv')


### Heatmap among all SE, DC, and DE results

In [None]:
### correlations among all SE, DC, and DE data (from 33 explore subjs)
### switch the output file name (RTs vs. Acc) to match the DV used to build concat_mainDVs
# concat_mainDVs still carries the 'mean' summary row (the 8x8 summary cell reads it);
# it is not a participant, so it is removed before correlating.
concat_mainDVs_corr = concat_mainDVs.drop(index='mean', errors='ignore').corr().round(3)
os.makedirs(main_DVs_analyses+'concat/', exist_ok=True)  # to_csv does not create missing folders
concat_mainDVs_corr.to_csv(main_DVs_analyses+'concat/concat_mainDVsRTs_corr_index=False.csv', index=False)
# concat_mainDVs_corr.to_csv(main_DVs_analyses+'concat/concat_mainDVsAcc_corr_index=False.csv', index=False)

In [None]:
### heatmap: 1's on diagonal replaced by means of corresponding raw data
# One correlation file feeds both the colours and the annotations. The colour data used to be
# read from raw_dir + 'main_DVs_analyses/...', a different folder from the one the correlation
# cell writes to, so the two layers of the figure could come from different analyses.
# NOTE(review): this reads the Acc file and titles the plot 'Accuracy' while the diagonal means come
# from the in-memory concat_mainDVs -- make sure all three refer to the same DV.
# corr_file = main_DVs_analyses+'concat/concat_mainDVsRTs_corr_index=False.csv'
corr_file = main_DVs_analyses+'concat/concat_mainDVsAcc_corr_index=False.csv'

#data for drawing heatmap (the file was written without an index, so rows follow column order)
data = pd.read_csv(corr_file)
column_labels = data.columns
row_labels = data.columns

#annotation text: same numbers, with each diagonal cell replaced by that variable's mean
annotData = data.copy()
annotData.index = annotData.columns
for name in annotData.columns:
    annotData.loc[name, name] = concat_mainDVs[name].mean()

#set up background (fig) and main map (ax)
fig, ax = plt.subplots(figsize=(170,170), facecolor='beige')
ax.set_title('All DV variants Corr Matrix, DV=Accuracy',size=80)
heatmap = ax.pcolor(data, cmap=cm.seismic, vmin=-1, vmax=1)

# Put the major ticks at the middle of each cell
ax.set_xticks(np.arange(0.5,data.shape[1],1), minor=False)
ax.set_yticks(np.arange(0.5,data.shape[0],1), minor=False)
ax.tick_params(labelright=True, top=True, labeltop=True, bottom=False)

# write the annotation value in the centre of every cell
for y in range(data.shape[0]):
    for x in range(data.shape[1]):
        ax.text(x + 0.5, y + 0.5, '%.3f' % annotData.iloc[y, x],
                horizontalalignment='center',
                verticalalignment='center',
                size=11)

# Want a more natural, table-like display
ax.axis('square')
ax.invert_yaxis()
# ax.xaxis.tick_top()

ax.set_xticklabels(row_labels, minor=False, rotation=80, size=15)
ax.set_yticklabels(column_labels, minor=False, size=15)

# color bar
cb = plt.colorbar(heatmap,shrink=0.5, aspect=30, fraction=.12, pad=.02)
cb.ax.tick_params(labelsize=60)
plt.show()

### Task-cluster by Task-cluster correlation matrix

#### Each cell has the average correlation of all SE, DC, and DE results of a task cluster with those of another task cluster

In [None]:
tasks = ['CUE','PREDICT','DF','NBACK','FLANKER','SHAPE','GNG','SS']
# Folder for the task-by-task matrix; the same 'concat/' folder the other summary files go to.
main_DVs_concat_dir = main_DVs_analyses + 'concat/'
os.makedirs(main_DVs_concat_dir, exist_ok=True)


def _cluster_members(labels, task):
    """Return the labels that belong to `task`'s cluster: 'SE_<task>', 'DC_<task>_*', 'DE_<task>_*'.

    Selecting by name replaces the earlier 'first position + 15' window: the correlation
    matrix has no label equal to the bare task name, and a cluster can hold fewer than
    15 variables when a task pair was not computed.
    """
    prefixes = ('DC_%s_' % task, 'DE_%s_' % task)
    return [name for name in labels if name == 'SE_%s' % task or name.startswith(prefixes)]


taskByTask_meanCorr_matrix = pd.DataFrame(index=tasks, columns=tasks)
for i in tasks:
    row_members = _cluster_members(concat_mainDVs_corr.index, i)
    for j in tasks:
        col_members = _cluster_members(concat_mainDVs_corr.columns, j)
        # copy, so blanking the diagonal below never writes into concat_mainDVs_corr itself
        taskByTask_corrDf = concat_mainDVs_corr.loc[row_members, col_members].copy()
        #exclude 1's on the diagonal when a cluster is correlated with itself
        if i==j:
            taskByTask_corrDf = taskByTask_corrDf.mask(np.eye(*taskByTask_corrDf.shape, dtype=bool))
        #average of the block: mean of the column means (NaN cells are ignored)
        taskByTask_corrDf_mean = taskByTask_corrDf.mean().mean()
        taskByTask_meanCorr_matrix.loc[i,j] = round(taskByTask_corrDf_mean,3)
taskByTask_meanCorr_matrix.to_csv(main_DVs_concat_dir + 'taskByTask_Accuracy_meanCorr_matrix.csv', index=False)

#### Plot (heatmap)

In [None]:
# main_DVs_concat_dir was never defined in this notebook; it is the 'concat/' folder that the
# other summary files are written to.
main_DVs_concat_dir = main_DVs_analyses + 'concat/'
# The matrix was saved without its index, so rows are in the same order as the columns.
df = pd.read_csv(main_DVs_concat_dir+'taskByTask_Accuracy_meanCorr_matrix.csv')

fig = plt.figure(figsize=(8,8),facecolor='beige')
plt.title('Mean Correlation Matrix, DV=Accuracy',size=14,pad=15)
plt.pcolor(df,cmap=cm.seismic,vmin=-1,vmax=1)
# ticks at the centre of each cell, labelled with the task names on both axes
plt.yticks(np.arange(0.5, len(df.columns), 1), df.columns, size=11)
plt.xticks(np.arange(0.5, len(df.columns), 1), df.columns, rotation=70, size=11)
plt.axis('square')
# print each mean correlation inside its cell
for y in range(df.shape[0]):
    for x in range(df.shape[1]):
        plt.text(x + 0.5, y + 0.5, '%.3f' % df.iloc[y, x],
                 horizontalalignment='center',
                 verticalalignment='center',
                 size=11)
cb = plt.colorbar(shrink=0.8, aspect=12.5, fraction=.12, pad=.03)
cb.ax.tick_params(labelsize=11)
plt.show()

### Separate condition data without any subtraction (as with DC or DE)

#### Separate conditions (using RTs or Accuracy), without the further subtractions used when calculating DC or DE results

In [8]:
separate_conditions_dict = defaultdict(pd.DataFrame)
concat_separate_conditions = pd.DataFrame()
slowing_vs_effectChange = pd.DataFrame()
choice = 1 #1: RTs, with SSRT; 2: RTs, with m(stopfail_rt), which is rt when a subj responded in a stop trial; 3: Accuracy


def _load_condition_table(qa_file, choice, drop_patterns, rt_prefilter=False):
    """Read one QA file and return (taskName, table of the condition columns selected by `choice`).

    qa_file:       path of a '<task>_qa.<ext>' file; the task name is the file name without '_qa'.
    choice:        1 -> 'm(rt)' columns (plus 'ssrt' for stop tasks); 2 -> all 'm(' columns; 3 -> 'acc' columns.
    drop_patterns: case-insensitive regexes; columns matching any of them are removed.
    rt_prefilter:  keep only 'm(' columns before applying `choice` (what the dual-task loop did).

    The returned columns are a two-level MultiIndex (task_name, DV_condition) where the task
    name appears only above the first column and the rest are a single space (purely aesthetic).
    """
    taskName = os.path.basename(qa_file).split('.')[0][:-3] #the last three characters are '_qa'
    if 'no_go' in taskName: taskName = taskName.replace('no_go','nogo')
    qaDf = pd.read_csv(qa_file, index_col=0)
    if rt_prefilter:
        qaDf = qaDf.filter(regex=r'm\(')
    if choice == 1:
        if 'stop' in taskName:
            qaDf = qaDf.filter(regex=r'm\(rt\)|ssrt')
        else:
            qaDf = qaDf.filter(regex=r'm\(rt\)')
    elif choice == 2:
        qaDf = qaDf.filter(regex=r'm\(')
    elif choice == 3:
        qaDf = qaDf.filter(regex='acc')

    if 'stop' in taskName: #somehow only stop-signal qa files have all columns for the stop condition before go
        qaDf = qaDf[sorted(qaDf.columns)]

    for pattern in drop_patterns:
        qaDf = qaDf.loc[:,~qaDf.columns.str.contains(pattern, case=False)]
    # strip '.0' from labels (e.g. 'DELAY:1.0' -> 'DELAY:1'); regex=True is spelled out because
    # pandas >= 2.0 treats the pattern as a literal string by default, which would match nothing
    qaDf.columns = qaDf.columns.str.replace(r'\.0', '', regex=True)

    #the immediately following line is the correct way of multiindexing columns
#     qaDf.columns = pd.MultiIndex.from_product([[taskName],list(qaDf.columns)], names=['task_name','DV_condition'])
    #the following lines are to write with only one occurrence per level-0 column index (i.e., aesthetic)
    multiIndex = [taskName] + [' '] * (len(qaDf.columns) - 1)
    qaDf.columns = pd.MultiIndex.from_arrays([multiIndex, qaDf.columns], names=['task_name','DV_condition'])
    return taskName, qaDf


# NOTE(review): single tasks are read from explore_dir + 'QA/' but dual tasks from
# raw_dir + 'QA_add0.5error_forACC=1/' -- confirm that mixing the two sources is intended.

### adding single tasks data first to its corresponding key in separate_conditions_dict
# first line is regular, second is for use later in EZ-Diff model
for qa_file in sorted(glob(explore_dir + 'QA/*single*')):
# for qa_file in sorted(glob(explore_dir + 'QA_add0.5error_forACC=1/*single*')):
    taskName, qaDf = _load_condition_table(qa_file, choice, ['task:stay.*cue:switch','delay:3'])
    task = taskName.split('_single_')[0]
    separate_conditions_dict[task] = pd.concat([separate_conditions_dict[task], qaDf], axis=1)

# then adding dual tasks data to its corresponding key in separate_conditions_dict
# for qa_file in sorted(glob(raw_dir + 'QA/*with*')):
for qa_file in sorted(glob(raw_dir + 'QA_add0.5error_forACC=1/*with*')):
    # The 'm(' pre-filter is kept for the RT choices (unchanged results) but skipped for
    # choice 3: it removed every 'acc' column, leaving an empty table that then crashed.
    taskName, qaDf = _load_condition_table(
        qa_file, choice,
        ['task:stay.*cue:switch','delay:3',r'm\(nogofail_rt\)_GNG:nogo_&_SS:stop'],
        rt_prefilter=(choice != 3))
    # a dual task contributes its columns to the cluster of each of its two component tasks
    for i in taskName.split('_with_'):
        separate_conditions_dict[i] = pd.concat([separate_conditions_dict[i], qaDf], axis=1)

# writing each cluster of 8 tasks (1 single, 7 dual) data to a file; and concat them to a dataframe to write
for key,val in separate_conditions_dict.items():
    # std over every row except the last one (presumably the 'mean' row -- TODO confirm); this adds
    # the 'std' row to the frame stored in separate_conditions_dict as well
    val.loc['std'] = round(val.iloc[:-1,:].std(),2)
    #moving 'mean' and 'std' rows to top of dataframe
    new_index = list(val.index[-2:]) + list(val.index[0:-2])
    val = val.loc[new_index,:]
    #The following (up to val = pd.concat) is to average across all dual tasks
    val.columns = val.columns.droplevel(0)
    # the first two columns come from the single task; for each, average the dual-task columns
    # (everything after the first two) whose label contains the same condition suffix
    for i in val.columns[:2]:
        index = val.columns.get_loc(i)
        new_col = '%s_dual' %i
        # NOTE(review): labels without 'm(' (e.g. ssrt or acc columns) get no '_dual' column, so the
        # "last two columns" picked below are then not dual averages -- confirm for stop tasks / choice 3
        if 'm(' in i:
            if 'fail' in i:
                # skip the 15-character measure prefix (the length of 'm(stopfail_rt)_')
                val[new_col] = round(val.filter(regex='%s' %(val.columns[index][15:])).iloc[:,2:].mean(axis=1),2)
            else:
                # skip the 6-character measure prefix (the length of 'm(rt)_')
                val[new_col] = round(val.filter(regex='%s' %(val.columns[index][6:])).iloc[:,2:].mean(axis=1),2)
    # keep: single baseline, single treatment, dual baseline, dual treatment
    val = pd.concat([val.iloc[:,:2],val.iloc[:,-2:]],axis=1)
    #calculate slowing (baseline dual minus baseline single) and effect-change (task-cost dual minus task-cost single)
    val2 = pd.DataFrame()
    val2['%s_slowing' %key] = val.iloc[:,2] - val.iloc[:,0]
    val2['%s_effect' %key] = (val.iloc[:,3] - val.iloc[:,2]) - (val.iloc[:,1] - val.iloc[:,0])
    #write
    #val.to_csv(separate_conditions+'RTs/%s_RTs.csv' %key,index=True)
    concat_separate_conditions = pd.concat([concat_separate_conditions, val], axis=1)
    slowing_vs_effectChange = pd.concat([slowing_vs_effectChange,val2], axis=1)

# to_csv does not create missing folders
for sub_dir in ('RTs/', 'accuracy/', 'condensed/'):
    os.makedirs(separate_conditions + sub_dir, exist_ok=True)

if choice < 3:
    # first line is regular, second is for use later in EZ-Diff model
    concat_separate_conditions.to_csv(separate_conditions+'RTs/concat_separate_conditions_RTs.csv',index=True)
    #concat_separate_conditions.to_csv(separate_conditions+'RTs/concat_separate_conditions_RTs_add0.5error_forACC=1.csv',index=True)
    slowing_vs_effectChange.to_csv(separate_conditions+'condensed/slowing_vs_effectChange_RTs.csv', index=True)
else:
    concat_separate_conditions.to_csv(separate_conditions+'accuracy/concat_separate_conditions_Accuracy.csv',index=True)
    #concat_separate_conditions.to_csv(separate_conditions+'accuracy/concat_separate_conditions_Accuracy_add0.5error_forACC=1.csv',index=True)
    slowing_vs_effectChange.to_csv(separate_conditions+'condensed/slowing_vs_effectChange_Accuracy.csv', index=True)

#### Heatmap for the correlation between slowing and effect-change (RTs or Accuracy, depending on the slowing_vs_effectChange df generated above)

In [None]:
# Participant rows only: the first two rows are skipped (means and std's, per the original note).
raw_data = slowing_vs_effectChange.iloc[2:, :]

# Colours: pairwise correlations among all slowing / effect-change columns, rounded to 4 decimals.
data = round(raw_data.corr(), 4)
column_labels = data.columns
row_labels = data.columns

# Cell text: the same correlations, except that each diagonal cell shows that column's mean.
annotData = data.copy()
annotData.index = annotData.columns
for name in annotData.columns:
    annotData.loc[name, name] = round(raw_data[name].mean(), 4)

# Figure (background) and axes (the map itself).
fig, ax = plt.subplots(figsize=(20, 20), facecolor='beige')
ax.set_title('Slowing vs. Effect-change, DV=RTs', size=20)
heatmap = ax.pcolor(data, cmap=cm.seismic, vmin=-1, vmax=1)

# Major ticks sit at the centre of each cell; labels are also shown on the top and right edges.
n_rows, n_cols = data.shape
ax.set_xticks(np.arange(0.5, n_cols, 1), minor=False)
ax.set_yticks(np.arange(0.5, n_rows, 1), minor=False)
ax.tick_params(labelright=True, top=True, labeltop=True, bottom=False)

# Write the annotation value in the middle of every cell.
for y, x in itertools.product(range(n_rows), range(n_cols)):
    ax.text(x + 0.5, y + 0.5, '%.3f' % annotData.iloc[y, x],
            horizontalalignment='center',
            verticalalignment='center',
            size=11)

# Square cells with the first row at the top, like a table.
ax.axis('square')
ax.invert_yaxis()
# ax.xaxis.tick_top()

ax.set_xticklabels(row_labels, minor=False, rotation=80, size=15)
ax.set_yticklabels(column_labels, minor=False, size=15)

# Colour bar.
cb = plt.colorbar(heatmap, shrink=0.5, aspect=30, fraction=.12, pad=.25)
cb.ax.tick_params(labelsize=15)
plt.show()

### Check for identical columns in separate-conditions data frames

In [None]:
# Every cluster name is printed; a '<column> == <column>' line follows only for pairs of
# columns whose values agree in every row (NaN paired with NaN counts as agreement).
for cluster, cluster_df in separate_conditions_dict.items():
    print(cluster)
    test = cluster_df
    n_rows = len(test)
    # every unordered pair of column positions (i < j)
    for i, j in itertools.combinations(range(len(test.columns)), 2):
        count = sum(
            1 for row in range(n_rows)
            if test.iloc[row, i] == test.iloc[row, j]
            or np.isnan(test.iloc[row, i]) and np.isnan(test.iloc[row, j])
        )
        if count == n_rows:
            print('%s == %s' % (test.columns[i], test.columns[j]))