# HDDM analysis of EMBARC PRT data 

In [78]:
# Few changes here . . .

import datetime, mmap,os,re # convention is to import things without "as x" first, on one line if possible
import numpy as np # always useful
import pandas as pd
import seaborn as sns # uses matplotlib but more intuitive and streamline for what we do
import matplotlib.pyplot as plt
# below makes graphs open in the nb instead of in a separate window
%matplotlib inline 

In [86]:
uname = !whoami
uname = uname[0]
#path2analysis = '/Users/' + uname + '/Work/Expts/PRT_DDM/Analysis/EMBARC_HDDM/' 
path2analysis = '/Users/' + uname + '/Work/Expts/EMBARC/' # also where git repo lives


In [87]:
# This is a more compact way to do the job handled in the next few cells. Tuples are often useful and tuple unpacking 
# (e.g., state, abrev = val) is frequently helpful, esp. with pandas . . . 

for val in [('Massachusetts','MG'),('Michigan','UM'),('New_York','CU'),('Texas','TX')]:
    state, abrev = val
    base_dir = '/Users/' + uname + '/Work/Expts/EMBARC/Data/PRT/' + state + '/'
    temp_dir = base_dir + abrev + 'Temp/'
    old_files = base_dir + 'embarc_CU_' + abrev + '0*/done/sigdet_output*out'
    flist = !ls {old_files}
    
    !mkdir {temp_dir}
    
    for old_fname in flist:
        if abrev != 'CU':
            sub = old_fname.split('_')[2]
        else:
            sub = old_fname.split('_')[3]
        new_file = sub + '_out.txt'
        new_fname = temp_dir + new_file
        
        !cp {old_fname} {new_fname} # Done this way you don't get error messages when there's no old file

mkdir: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/: File exists
mkdir: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Michigan/UMTemp/: File exists
mkdir: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/New_York/CUTemp/: File exists
usage: cp [-R [-H | -L | -P]] [-fi | -n] [-apvX] source_file target_file
       cp [-R [-H | -L | -P]] [-fi | -n] [-apvX] source_file ... target_directory
mkdir: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/: File exists


In [161]:
# I love the code you wrote to parse the files using regex, awesome!! A couple things to keep in mind that would
# make it even better: (a) always include a docstring for your functions and (b) explicit is better than implicit
# (see here: https://www.python.org/dev/peps/pep-0020/) So when you're choosing variable names, try to avoid things 
# like 'temp1' in favor of more descriptive (but still succinct) names (e.g., "old_fname"). Not always possible:)

def SigDetParse(fname):
    '''Parse one PRT output file into a DataFrame with one row per trial.

    The file is read once. Header values (e.g. "Subject ID: 0020") are pulled
    out with regexes and copied onto every row; the tab-separated trial table is
    taken from the lines that follow the first line containing 'reward_due'.

    Parameters
    ----------
    fname : str
        Path to the output file.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: the ten trial fields (trial, length, time, key_press,
        correct, did_reward, reward_due, rich_due, lean_due, outlier) followed by
        rich_key, lean_key, subject, date, bias, rich_stim, lean_stim. All parsed
        values are kept as strings; a header value that is not found is NaN.

    Notes
    -----
    * Subject IDs that are not 4 characters long are reported with a
      'Check sub ID' message and left-padded with zeros to 4 characters
      (IDs longer than 4 characters are kept unchanged).
    * Lines in the trial section with fewer than ten tab-separated fields
      (e.g. trailing blank lines) are skipped.
    * If a header label occurs more than once, the last occurrence wins.
    '''
    trial_cols = ['trial', 'length', 'time', 'key_press', 'correct',
                  'did_reward', 'reward_due', 'rich_due', 'lean_due', 'outlier']
    header_fields = [('rich_key', 'Rich key:'), ('lean_key', 'Lean key:'),
                     ('subject', 'Subject ID:'), ('date', 'Date:'),
                     ('bias', 'Bias:'), ('rich_stim', 'Rich stimulus:'),
                     ('lean_stim', 'Lean stimulus:')]

    # Compile every header regex once, up front. Raw strings keep \s, \w and \d
    # from being read as (invalid) string escapes.
    matchers = []
    for var_name, label in header_fields:
        if var_name == 'date':
            value_re = r'(\d+/\d+/\d+)'
        else:
            value_re = r'(\w+)'
        matchers.append((var_name, re.compile(label + r'\s+' + value_re)))

    # Need some default values b/c some subjects have missing data
    header_vals = dict((var_name, np.nan) for var_name, _ in header_fields)
    trials = []
    in_trials = False

    with open(fname) as f:
        for line in f:
            # Header labels are looked for on every line of the file
            for var_name, var_def in matchers:
                var_match = var_def.search(line)
                if not var_match:
                    continue
                result = var_match.group(1)
                if var_name == 'subject' and len(result) != 4:
                    print('Check sub ID: ' + fname)
                    result = result.zfill(4)  # '1' -> '0001', '20' -> '0020', '020' -> '0020'
                header_vals[var_name] = result

            if in_trials:
                fields = line.rstrip('\n').split('\t')
                if len(fields) >= len(trial_cols):
                    trials.append(dict(zip(trial_cols, fields)))
            elif 'reward_due' in line:
                # This is the table's header row; trial rows start on the next line
                in_trials = True

    df = pd.DataFrame(trials, columns=trial_cols)
    for var_name, _ in header_fields:
        df[var_name] = header_vals[var_name]

    return df

In [166]:
# Now you can just iterate over the files in each dir and use the code above to read each file into a ginorm df
today = datetime.datetime.today().strftime("%m_%d_%Y")

dfs = []
base_dir = '/Users/' + uname + '/Work/Expts/EMBARC/Data/PRT/'
for val in [('Massachusetts','MG'),('Michigan','UM'),('New_York','CU'),('Texas','TX')]:
    state, abrev = val
    state_dir = base_dir + state + '/' + abrev + 'Temp'
    flist = !ls {state_dir}
    
    for fname in flist:
        curr_path = state_dir + '/' + fname 
        statinfo = os.stat(curr_path) # checking for empty files b/c I found one . . .
        if statinfo.st_size == 0:
            print ('Empty file: '+ curr_path)
        else:
            df = SigDetParse(curr_path)
            df['site'] = abrev
            df['ProjectSpecificID'] = df['site'] + df['subject']
            dfs.append(df)
out = pd.concat(dfs)
out = out[['ProjectSpecificID','site','subject','date','bias','rich_stim','lean_stim','rich_key','lean_key',
           'trial','length','time','key_press','correct','did_reward','reward_due','rich_due','lean_due','outlier']]
out.to_csv(path2analysis + 'embarc_PRT_one_sess_' + today + ' .csv')

Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0020MGBP1R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0025MGBP2R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0027MGBP1R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0039MGBP2R1_out.txt
Empty file: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0270MGBP1R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Michigan/UMTemp/UM0001UMBP1R1_out.txt
Empty file: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/TX0038MGBP3R1_out.txt
Empty file: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/TX0198TXBP2R1_out.txt


In [167]:
# I will work on a way to combine this with the above kernel, but for now this makes a separate file 
# that only contains data from the first session (since some people did multiple sessions)
today = datetime.datetime.today().strftime("%m_%d_%Y")

dfs = []
base_dir = '/Users/' + uname + '/Work/Expts/EMBARC/Data/PRT/'
for val in [('Massachusetts','MG'),('Michigan','UM'),('New_York','CU'),('Texas','TX')]:
    state, abrev = val
    state_dir = base_dir + state + '/' + abrev + 'Temp'
    state_one_sess = base_dir + state + '/' + abrev + 'Temp/*P1*'
    flist = !ls {state_dir}
    flist_one_sess = !ls {state_one_sess}
    
    for fname in flist_one_sess:
        curr_path = fname   
        statinfo = os.stat(curr_path) # checking for empty files b/c I found one . . .
        if statinfo.st_size == 0:
            print ('Empty file: '+ curr_path)
        else:
            df = SigDetParse(curr_path)
            df['site'] = abrev
            df['ProjectSpecificID'] = df['site'] + df['subject']
            dfs.append(df)
out = pd.concat(dfs)
out = out[['ProjectSpecificID','site','subject','date','bias','rich_stim','lean_stim','rich_key','lean_key',
           'trial','length','time','key_press','correct','did_reward','reward_due','rich_due','lean_due','outlier']]
out.to_csv(path2analysis + 'embarc_PRT_one_sess_' + today + '.csv')

Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0020MGBP1R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0027MGBP1R1_out.txt
Empty file: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0270MGBP1R1_out.txt
Check sub ID: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Michigan/UMTemp/UM0001UMBP1R1_out.txt


In [168]:
# Preview the first rows of the combined frame
out.head()

Unnamed: 0,ProjectSpecificID,site,subject,date,bias,rich_stim,lean_stim,rich_key,lean_key,trial,length,time,key_press,correct,did_reward,reward_due,rich_due,lean_due,outlier
0,MG0001,MG,1,12/14/2011,short,short,long,c,m,1,short,683,c,1,0,0,0,0,0
1,MG0001,MG,1,12/14/2011,short,short,long,c,m,2,long,1307,c,0,0,1,0,1,1
2,MG0001,MG,1,12/14/2011,short,short,long,c,m,3,short,577,c,1,1,1,0,1,0
3,MG0001,MG,1,12/14/2011,short,short,long,c,m,4,long,677,m,1,1,1,0,0,0
4,MG0001,MG,1,12/14/2011,short,short,long,c,m,5,short,724,c,1,0,0,0,0,0


In [169]:
# Preview the last rows of the combined frame
out.tail()

Unnamed: 0,ProjectSpecificID,site,subject,date,bias,rich_stim,lean_stim,rich_key,lean_key,trial,length,time,key_press,correct,did_reward,reward_due,rich_due,lean_due,outlier
195,TX0204,TX,204,10/12/2015,short,short,long,c,m,196,short,686,m,0,0,1,1,0,0
196,TX0204,TX,204,10/12/2015,short,short,long,c,m,197,long,365,m,1,1,1,1,0,0
197,TX0204,TX,204,10/12/2015,short,short,long,c,m,198,long,846,m,1,0,0,1,0,0
198,TX0204,TX,204,10/12/2015,short,short,long,c,m,199,short,414,c,1,1,1,0,0,0
199,TX0204,TX,204,10/12/2015,short,short,long,c,m,200,short,1083,c,1,0,0,0,0,0


In [170]:
# Total number of rows (one row per trial) across all parsed files
len(out)

79770

In [171]:
# Sanity check: number of distinct site+subject IDs in the combined frame. Looks right . . .
out.ProjectSpecificID.nunique()

389

In [172]:
# List the analysis folder to confirm that the CSV has been written out
%ls {path2analysis}

Add_Group.ipynb                      embarc_PRT_11_15_2016 .csv
[34mData[m[m/                                embarc_PRT_one_sess11_15_2016 .csv
Old_PRT_Prettier.ipynb               embarc_PRT_one_sess11_15_2016.csv
PRT_DDM-Copy1.ipynb                  embarc_PRT_one_sess_11_15_2016 .csv
PRT_DDM.ipynb                        embarc_PRT_one_sess_11_15_2016.csv
PRT_Prettier.ipynb                   groups.csv
README.md                            why
eh.csv                               why.csv


In [173]:
# Check that the zero-padding in SigDetParse fixed the odd-length subject IDs;
# every value listed should be a 4-character string. Looks like it did.
out.subject.unique()

array(['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008',
       '0018', '0020', '0021', '0025', '0027', '0028', '0030', '0032',
       '0039', '0040', '0051', '0060', '0064', '0066', '0069', '0070',
       '0074', '0076', '0081', '0086', '0101', '0104', '0106', '0112',
       '0116', '0117', '0120', '0125', '0126', '0135', '0137', '0138',
       '0142', '0152', '0155', '0157', '0158', '0161', '0164', '0168',
       '0172', '0180', '0182', '0185', '0187', '0198', '0202', '0206',
       '0207', '0209', '0213', '0214', '0218', '0220', '0222', '0228',
       '0230', '0231', '0238', '0239', '0242', '0243', '0246', '0248',
       '0251', '0252', '0253', '0256', '0257', '0259', '0261', '0269',
       '0270', '0009', '0011', '0012', '0014', '0015', '0016', '0017',
       '0023', '0024', '0029', '0031', '0033', '0034', '0035', '0036',
       '0037', '0038', '0042', '0046', '0047', '0048', '0049', '0050',
       '0052', '0056', '0058', '0065', '0073', '0077', '0078', '0079',
      

In [174]:
# Combine the group data (groups.csv) with the first-session task data. The result is
# really big, so it is probably best not to work with too much of it at once.
# Read the first-session file stamped with `today`, not a hard-coded date, and anchor
# every path on path2analysis (where the CSV is written) rather than the working directory.
task_csv = path2analysis + 'embarc_PRT_one_sess_' + today + '.csv'
df_task = pd.read_csv(task_csv)
df_group = pd.read_csv(path2analysis + 'groups.csv',encoding="latin-1")

# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): the task CSV was saved with its index, which comes back as an
# 'Unnamed: 0' column; if groups.csv has one too it would become a join key. Confirm.
# NOTE(review): reading the CSV back lets pandas infer dtypes, so zero-padded subject
# IDs may come back as integers. Confirm this matches the ID columns in groups.csv.
df_all = df_task.merge(df_group, how='outer')

df_all.to_csv(path2analysis + 'embarc_combined_' + today+'.csv')

  interactivity=interactivity, compiler=compiler, result=result)
