# HDDM analysis of EMBARC PRT data 

In [5]:
# Few changes here . . .

import datetime, mmap,os,re # convention is to import things without "as x" first, on one line if possible
import numpy as np # always useful
import pandas as pd
import seaborn as sns # uses matplotlib but more intuitive and streamline for what we do
import matplotlib.pyplot as plt
# below makes graphs open in the nb instead of in a separate window
%matplotlib inline 

In [2]:
uname = !whoami
uname = uname[0]
path2analysis = '/Users/' + uname + '/Work/Expts/PRT_DDM/Analysis/EMBARC_HDDM/' # also where git repo lives

In [3]:
# This is a more compact way to do the job handled in the next few cells. Tuples are often useful and tuple unpacking 
# (e.g., state, abrev = val) is frequently helpful, esp. with pandas . . . 

for val in [('Massachusetts','MG'),('Michigan','UM'),('New_York','CU'),('Texas','TX')]:
    state, abrev = val
    base_dir = '/Users/' + uname + '/Work/Expts/EMBARC/Data/PRT/' + state + '/'
    temp_dir = base_dir + abrev + 'Temp/'
    old_files = base_dir + 'embarc_CU_' + abrev + '0*/done/sigdet_output*out'
    flist = !ls {old_files}
    
    !mkdir {temp_dir}
    
    for old_fname in flist:
        if abrev != 'CU':
            sub = old_fname.split('_')[2]
        else:
            sub = old_fname.split('_')[3]
        new_file = sub + '_out.txt'
        new_fname = temp_dir + new_file
        
        !cp {old_fname} {new_fname} # Done this way you don't get error messages when there's no old file

mkdir: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/: File exists
mkdir: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Michigan/UMTemp/: File exists
mkdir: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/New_York/CUTemp/: File exists
usage: cp [-R [-H | -L | -P]] [-fi | -n] [-apvX] source_file target_file
       cp [-R [-H | -L | -P]] [-fi | -n] [-apvX] source_file ... target_directory
mkdir: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/: File exists


In [4]:
# I love the code you wrote to parse the files using regex, awesome!! A couple things to keep in mind that would
# make it even better: (a) always include a docstring for your functions and (b) explicit is better than implicit
# (see here: https://www.python.org/dev/peps/pep-0020/) So when you're choosing variable names, try to avoid things 
# like 'temp1' in favor of more descriptive (but still succinct) names (e.g., "old_fname"). Not always possible:)

def SigDetParse(fname):
    '''Open the file, use regex to pull out key vars, output a clean df.'''
    
    df = []
    d = {}
    ct = 0
    
    # Begin reading in data from the line including 'reward_due', as that's unlikely to occur anywhere earlier
    with open(fname) as f:
        for line in f:
            if 'reward_due' in line:
                for line in f:
                    d[ct] = {'trial':line.split('\t')[0],
                             'length':line.split('\t')[1],
                             'time':line.split('\t')[2], # tempting to change to RT but keep same for backwards compat
                             'key_press':line.split('\t')[3],
                             'correct':line.split('\t')[4],
                             'did_reward':line.split('\t')[5],
                             'reward_due':line.split('\t')[6],
                             'rich_due':line.split('\t')[7],
                             'lean_due':line.split('\t')[8],
                             'outlier':line.split('\t')[9].strip('\n')}
                    ct = ct + 1
    df = pd.DataFrame.from_dict(d,orient='index')
    

    for val in [('rich_key','Rich key:'),('lean_key','Lean key:'), ('subject','Subject ID:'), ('date','Date:'), 
                ('bias','Bias:'), ('rich_stim','Rich stimulus:'), ('lean_stim','Lean stimulus:')]:
        
        var_name, pattern = val
        df[var_name] = np.nan # Need some default values b/c some subjects have missing data        

        if var_name != 'date':
            var_def = re.compile(pattern + '[\s]+([\w]+)')
        else:
            var_def = re.compile(pattern + '[\s]+([\d]+/[\d]+/[\d]+)')
        
        with open(fname) as f:
            for line in f:
                var_match = var_def.search(line)
                if var_match:
                    result = var_match.group(1)
                    df[var_name] = result

    return df

In [6]:
# Now you can just iterate over the files in each dir and use the code above to read each file into a ginorm df
today = datetime.datetime.today().strftime("%m_%d_%Y")

dfs = []
base_dir = '/Users/' + uname + '/Work/Expts/EMBARC/Data/PRT/'
for val in [('Massachusetts','MG'),('Michigan','UM'),('New_York','CU'),('Texas','TX')]:
    state, abrev = val
    state_dir = base_dir + state + '/' + abrev + 'Temp'
    flist = !ls {state_dir}
    
    for fname in flist:
        curr_path = state_dir + '/' + fname
        statinfo = os.stat(curr_path) # checking for empty files b/c I found one . . .
        if statinfo.st_size == 0:
            print 'Empty file: ' + curr_path
        else:
            df = SigDetParse(curr_path)
            df['site'] = abrev
            df['ProjectSpecificID'] = df['site'] + '_' + df['subject']
            dfs.append(df)
out = pd.concat(dfs)
out = out[['ProjectSpecificID','site','subject','date','bias','rich_stim','lean_stim','rich_key','lean_key',
           'trial','length','time','key_press','correct','did_reward','reward_due','rich_due','lean_due','outlier']]
out.to_csv(path2analysis + 'embarc_PRT_' + today + '.csv')

Empty file: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp/MG0270MGBP1R1_out.txt
Empty file: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/TX0038MGBP3R1_out.txt
Empty file: /Users/danieldillon/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp/TX0198TXBP2R1_out.txt


In [7]:
out.head()

Unnamed: 0,ProjectSpecificID,site,subject,date,bias,rich_stim,lean_stim,rich_key,lean_key,trial,length,time,key_press,correct,did_reward,reward_due,rich_due,lean_due,outlier
0,MG_0001,MG,1,12/14/2011,short,short,long,c,m,1,short,683,c,1,0,0,0,0,0
1,MG_0001,MG,1,12/14/2011,short,short,long,c,m,2,long,1307,c,0,0,1,0,1,1
2,MG_0001,MG,1,12/14/2011,short,short,long,c,m,3,short,577,c,1,1,1,0,1,0
3,MG_0001,MG,1,12/14/2011,short,short,long,c,m,4,long,677,m,1,1,1,0,0,0
4,MG_0001,MG,1,12/14/2011,short,short,long,c,m,5,short,724,c,1,0,0,0,0,0


In [8]:
out.tail()

Unnamed: 0,ProjectSpecificID,site,subject,date,bias,rich_stim,lean_stim,rich_key,lean_key,trial,length,time,key_press,correct,did_reward,reward_due,rich_due,lean_due,outlier
195,TX_0204,TX,204,10/12/2015,short,short,long,c,m,196,short,686,m,0,0,1,1,0,0
196,TX_0204,TX,204,10/12/2015,short,short,long,c,m,197,long,365,m,1,1,1,1,0,0
197,TX_0204,TX,204,10/12/2015,short,short,long,c,m,198,long,846,m,1,0,0,1,0,0
198,TX_0204,TX,204,10/12/2015,short,short,long,c,m,199,short,414,c,1,1,1,0,0,0
199,TX_0204,TX,204,10/12/2015,short,short,long,c,m,200,short,1083,c,1,0,0,0,0,0


In [9]:
len(out)

148440

In [13]:
# Looks right . . . 
out.ProjectSpecificID.nunique()

396

In [15]:
# Note that the CSV has been written out
%ls {path2analysis}

PRT_Prettier.ipynb         README.md                  embarc_PRT_11_15_2016.csv


In [11]:
# Looking for whacky subject numbers . . . please double-check on the odd ones when you have time . . . 
out.subject.unique()

array(['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008',
       '0018', '020', '0020', '0021', '0025', '025', '0026', '027', '0028',
       '0030', '0032', '0039', '039', '0040', '0051', '0060', '0064',
       '0066', '0069', '0070', '0074', '0076', '0081', '0086', '0101',
       '0104', '0106', '0112', '0116', '0117', '0120', '0125', '0126',
       '0135', '0137', '0138', '0142', '0152', '0155', '0157', '0158',
       '0161', '0164', '0168', '0172', '0180', '0182', '0185', '0187',
       '0198', '0202', '0206', '0207', '0209', '0213', '0214', '0218',
       '0220', '0222', '0228', '0230', '0231', '0238', '0239', '0242',
       '0243', '0246', '0248', '0251', '0252', '0253', '0256', '0257',
       '0259', '0261', '0269', '0270', '1', '0009', '0011', '0012', '0014',
       '0015', '0016', '0017', '0023', '0024', '0027', '0029', '0031',
       '0033', '0034', '0035', '0036', '0037', '0038', '0042', '0046',
       '0047', '0048', '0049', '0050', '0052', '0056', '0058', '0065

In [12]:
# This is what I was talking about doing, iterating over all the rows in out, looking for subject == NaN, then 
# printing the row . . . actually not that helpful for the goal of identifying the file with subject == NaN:(
# Can you do better?

for i,row_i in out.iterrows():
    if pd.isnull(row_i.subject):
        print row_i

ProjectSpecificID      NaN
site                    CU
subject                NaN
date                   NaN
bias                 short
rich_stim            short
lean_stim             long
rich_key                 m
lean_key                 c
trial                    1
length               short
time                   693
key_press                c
correct                  0
did_reward               0
reward_due               0
rich_due                 0
lean_due                 0
outlier                  0
Name: 0, dtype: object
ProjectSpecificID      NaN
site                    CU
subject                NaN
date                   NaN
bias                 short
rich_stim            short
lean_stim             long
rich_key                 m
lean_key                 c
trial                    2
length                long
time                  3362
key_press                c
correct                  1
did_reward               1
reward_due               1
rich_due                 0
lean_

Older code below . . . 

In [299]:
# Names of the temporary directories (sl for slash)
mgtemp = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGTemp'
mgtempsl = mgtemp + '/'
umtemp = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Michigan/UMTemp'
umtempsl = umtemp + '/'
cutemp = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/New_York/CUTemp'
cutempsl = cutemp + '/'
txtemp = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Texas/TXTemp'
txtempsl = txtemp + '/'

# Makes all of the temporary directories
!mkdir {mgtemp}
!mkdir {umtemp}
!mkdir {cutemp}
!mkdir {txtemp}

In [300]:
mg = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Massachusetts/embarc_CU_MG0*'
um = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Michigan/embarc_CU_UM0*'
cu = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/New_York/embarc_CU_CU0*'
tx = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Texas/embarc_CU_TX0*'

# the data files in the folder
mgfolders = !ls -d {mg}
umfolders = !ls -d {um}
cufolders = !ls -d {cu}
txfolders = !ls -d {tx}

In [301]:
# takes in the list of folders in the state, the temp file, and what to split the name by
def grabdata(statefolders, temp, split_by):
    for folder in statefolders:
        sub = folder.split('_')[split_by]
        new_name = sub + '_out.txt'
        new_dir = temp + new_name
        !cp {folder}/done/sigdet_output*out {new_dir}

In [302]:
# grabs the data for all state folders
# output lists people who don't have the data folder
grabdata(mgfolders, mgtempsl, 2)
grabdata(umfolders, umtempsl, 2)
grabdata(cufolders, cutempsl, 3)
grabdata(txfolders, txtempsl, 2)


cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Massachusetts/embarc_CU_MG0026MGBP1R1_flankerprt_20121208/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Michigan/embarc_CU_UM0012UMBP1R1_flankerprt_20120628/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/Michigan/embarc_CU_UM0094UMBP1R1_flankerprt_20141113/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/New_York/embarc_CU_CU0012CUBP1R1_flankerprt_20120515/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/New_York/embarc_CU_CU0012CUBP1R1_flankerprt_20130308/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/New_York/embarc_CU_CU0017UMBP1R1_flankerprt_20120608/done/sigdet_output*out: No such file or directory
cp: /Users/mlm2/Work/Expts/EMBARC/Data/PRT/New_York/embarc_CU_CU0018CUBP1R1_flankerprt_20120515/done/sigdet_output*out:

In [303]:
# for easier referencing 
cusplit = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/New_York/CUSplit/'
mgsplit = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGSplit/'
umsplit = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Michigan/UMSplit/'
txsplit = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Texas/TXSplit/'

cufinal = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/New_York/CUFinal/'
mgfinal = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Massachusetts/MGFinal/'
umfinal = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Michigan/UMFinal/'
txfinal = '/Users/'+ uname +'/Work/Expts/EMBARC/Data/PRT/Texas/TXFinal/'

In [304]:
# Makes the directories where the split data files will go
!mkdir {cusplit}
!mkdir {mgsplit}
!mkdir {umsplit}
!mkdir {txsplit}

# Makes the directories where the final data files will go
!mkdir {cufinal}
!mkdir {mgfinal}
!mkdir {umfinal}
!mkdir {txfinal}

In [305]:
# finds the given key in this file
def findmatch(key, split_by, tempsl, filename):
    rx=r''+key+'(.*)'
    f = open(tempsl + filename, 'r', encoding='windows-1252')
    for eachline in f:
        string=re.match(rx, eachline)
        if string:
            return (string.group().split(' ')[split_by])
    else:
        return ''


In [306]:
def main(temp, tempsl, split, final, loc):
    for filename in os.listdir(temp):
        
        #searching for matches in the original file
        richkey = findmatch('Rich', 2, tempsl, filename)
        leankey = findmatch('Lean key: ', 2, tempsl, filename)
        subid = findmatch('Subject ID: ', 2, tempsl, filename)
        date = findmatch('Date: ', 1, tempsl, filename)
        bias = findmatch('Bias: ', 1, tempsl, filename)
        richstim = findmatch('Rich stimulus: ', 2,  tempsl, filename)
        leanstim = findmatch('Lean stimulus: ', 2, tempsl, filename)
        newfile = open(tempsl + filename, 'r', encoding='windows-1252').read().split('\n\n\n')
        sub = filename.split('_')[0]
        names = [ 'X' + sub +'_info.txt', sub +'.txt']
        
        #splits the the file into two, saves the data one in split folder
        for num, file in enumerate(newfile):
            open(split + names[1],'w').write(file)
        
        #open as df, add the info from the regular expression portion to the df, save as csv
        try:
            df = pd.read_csv(split + sub +'.txt', '\t')
            df["ID"]= loc + subid
            df['Site']= loc
            df['Date']= date
            df['Bias']= bias 
            df['RichStim']= richstim
            df['LeanStim']= leanstim
            df['LeanKey']= leankey
            df['RichKey']= richkey
            df.to_csv(final + sub + '.csv')
        except:
            pass
        

In [307]:
# pass all state files to the main function
main(umtemp, umtempsl, umsplit, umfinal, 'UM')
main(mgtemp, mgtempsl, mgsplit, mgfinal, 'MG')
main(cutemp, cutempsl, cusplit, cufinal, 'CU')
main(txtemp, txtempsl, txsplit, txfinal, 'TX')