In [1]:
# before running this notebook, please make sure all the new tsl files have preprocessed (adding a stim column into the datasheet)
import os
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline
sns.set_style('whitegrid')

In [10]:
language_1 = [1,2,2,2,1,1,2,1,1,2,1,2,1,1,2,2,1,1,2,1,2,2,1,2,2,2,1,2,1,2,1,1]
language_2 = [1,1,2,1,1,1,2,2,2,2,1,1,1,2,2,1,2,2,1,1,2,1,2,1,2,1,2,1,1,2,2,2]

kids_cols = [
    'rt', 'responses', 'trial_type', 'trial_index', 
    'time_elapsed', 'internal_node_id', 'stimulus', 
    'key_press', 'cond', 'targ', 'value'

]

"""
these require someone to check column headers for the following adult participants
# TSL 1, 16, 26, 29, 36, 40, 41
# VSL 3, 4, 7, 16, 30, 34, 36
"""

base_dir ='/om/user/zqi/projects/slbeh/cogscigame/data_structured/retest'
sub_dirs = 'vsl tsl'.split(' ')

def add_cols(data, directory):
    columns_adults = [
        'rt','responses','trial_type',
        'trial_index','time_elapsed',
        'internal_node_id','stimulus',
        'key_press','cond','targ','value'
    ]
    if directory == 'tsl_adults' or directory == 'vsl_adults':
        data.columns = columns_adults
    return data

data_store = {
    #'tsl_adults': [],
    #'vsl_adults': [],
    'tsl_kids': [],
    'vsl_kids': []
}

for key in data_store.keys():
    if (key == 'tsl_adults' or key == 'vsl_adults'):
        for data in os.listdir(os.path.join(base_dir, key)):
            data_store[key].append(add_cols(
                data=pd.read_csv(os.path.join(base_dir, key, data), header=None),
                directory=key
            ))
    elif (key == 'tsl_kids' or key == 'vsl_kids'):
        for data in os.listdir(os.path.join(base_dir, key)):
            df = pd.read_csv(os.path.join(base_dir, key, data))
            #if int(df.ix[0, 1][7:11]) in [3130, 3161, 3224, 3236, 3330, 3331, 3332]:
            #    df = pd.read_csv(os.path.join(base_dir, key, data), header=None)
            #    df.columns = kids_cols
            data_store[key].append(df)
            

In [11]:
def vsl_rt(data):
    """
    first item does not count, c is indexed accordingly
    returns response times as a list of tuples, (n_trial, n_response_time)
    input is single subject data
    """
    c = []
    data.index = [x for x in range(data.shape[0])]
    for row in range(309):
        try:
            if (data.loc[row, 'targ'] in data.loc[row, 'stimulus']) and \
            (int(data.loc[row, 'rt']) != -1) and (int(data.loc[row, 'rt']) < 1000):
                c.append((row, int(data.loc[row, 'rt'])))
        except TypeError:
            pass
    return c

def tsl_rt(data):
    something = []
    data.index = [x for x in range(data.shape[0])]
    for row in range(602): #total number of events
        try:
            if data.loc[row, 'targ'] in data.loc[row, 'stimulus']:
                if int(data.loc[(row - 2), 'rt']) != -1000:
                    something.append((
                                      row - 2,
                                      int(data.loc[(row - 2), 'rt'])
                                     ))
                elif int(data.loc[(row - 1), 'rt']) != -1000:
                    something.append((
                                      row - 1,
                                      int(data.loc[(row - 1), 'rt'])
                                     ))
                elif int(data.loc[(row), 'rt']) != -1000:
                    something.append((
                                      row,
                                      int(data.loc[row, 'rt'])
                                     ))
                elif int(data.loc[(row + 1), 'rt']) != -1000:
                    something.append((
                                      row + 1,
                                      int(data.loc[(row + 1), 'rt'])
                                     ))
                elif int(data.loc[(row + 2), 'rt']) != -1000:
                    something.append((
                                      row + 2,
                                      int(data.loc[(row + 2), 'rt'])
                                     ))
        except TypeError:
            pass
    return something

def acc_vsl(data):
    language = data.cond[100]
    fc_idx = 309 # where the forced-choice task starts in the list of events
    trials = []
    res = data.loc[fc_idx:, ['key_press','stimulus']]
    res.index = [x for x in range(res.shape[0])]
    for i in range(res.shape[0]):
        if (int(res.loc[i, 'key_press']) != -1) and (pd.isnull(res.loc[i, 'stimulus'])):
            trials.append(int(res.loc[i, 'key_press']))
    converted = []
    for val in trials:
        if int(val) == 49: #response button number
            converted.append(1)
        elif int(val) == 50:
                converted.append(2)
    if language == 'lang1':
        pat = language_1
    elif language == 'lang2':
        pat = language_2   
    return (np.array(converted) == np.array(pat)).mean() 

def acc_tsl(data_frame):
    language = data_frame.cond[100]
    # 37 -> left_arrow, 39 -> right_arrow
    fc_idx = 607
    responses = data_frame.loc[fc_idx:, ['key_press','stimulus']]
    responses.index= [x for x in range(responses.shape[0])]
    idxs = []
    converted = []
    c = 0
    i = 6
    while c < 224:
        idxs.append(i)
        i += 7
        c += 7
    responses = responses.loc[idxs, 'key_press']
    for val in responses:
        if int(val) == 37:
            converted.append(1)
        elif int(val) == 39:
            converted.append(2)
    if language == 'lang1':
        pat = language_1
    elif language == 'lang2':
        pat = language_2
    return (np.array(converted) == np.array(pat)).mean()

In [12]:
def phase1_tbt_vsl(data, par_type):
    """
    return single participant response time
    target trial response time
    """
    if par_type == 'kid':
        vsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        vsl_id = data['responses'][0][7:12]
    keep = ['rt', 'id', 'cond', 'targ', 'trial_index', 'trial_mask']
    data.index = [x for x in range(data.shape[0])]
    indexes = []
    for idx in range(309):
        try:
            if data.loc[idx, 'targ'] in data.loc[idx, 'stimulus']:
                if int(data.loc[idx, 'rt']) < 1000 and int(data.loc[idx, 'rt']) != -1:
                    indexes.append(idx)
        except TypeError:
            pass
    new = data.loc[indexes]
    new.index = [x for x in range(new.shape[0])]
    new['id'] = [vsl_id for x in range(new.shape[0])]
    new['trial_mask'] = [x for x in range(new.shape[0])]
    return new[keep]

def phase1_tbt_tsl(data, par_type):
    """
    return single participant response time
    target trial response time
    """
    indexes = []
    if par_type == 'kid':
        tsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        tsl_id = data['responses'][0][7:12]
    keep = ['rt', 'id', 'cond', 'targ', 'trial_index', 'trial_mask']
    data.index = [x for x in range(data.shape[0])]
    for row in range(602):
        try:
            if data.loc[row, 'targ'] in data.loc[row, 'stimulus']:
                if int(data.loc[(row - 2), 'rt']) != -1000:
                    indexes.append(row - 2)
                elif int(data.loc[(row - 1), 'rt']) != -1000:
                    indexes.append(row - 1)
                elif int(data.loc[(row), 'rt']) != -1000:
                    indexes.append(row)
                elif int(data.loc[(row + 1), 'rt']) != -1000:
                    indexes.append(row + 1)
                elif int(data.loc[(row + 2), 'rt']) != -1000:
                    indexes.append(row + 2)
        except TypeError:
            pass
    new = data.loc[indexes]
    new.index = [x for x in range(new.shape[0])]
    new['id'] = [tsl_id for x in range(new.shape[0])]
    new['trial_mask'] = [x for x in range(new.shape[0])]
    return new[keep]

In [13]:
vsl_trial_by_trial = []
for par in data_store['vsl_kids']:
    vsl_trial_by_trial.append(phase1_tbt_vsl(par, 'kid'))
pd.concat(vsl_trial_by_trial, axis=0).to_csv('retest_vsl_tbt_famphase.csv', index=None)

#vsl_trial_by_trial = []
#for par in data_store['vsl_adults']:
#    vsl_trial_by_trial.append(phase1_tbt_vsl(par, 'adult'))
#pd.concat(vsl_trial_by_trial, axis=0).to_csv('adults_vsl_tbt_famphase.csv', index=None)

In [26]:
#data_store['tsl_kids'][0].head()
for x in data_store['tsl_kids']:
    if not 'stimulus' in x.columns:
        print x.responses[0]

{"Q0":"3302"}
{"Q0":"3084"}
{"Q0":"3129"}
{"Q0":"3306"}
{"Q0":"3317"}
{"Q0":"3326"}
{"Q0":"3234"}
{"Q0":"3240"}


In [14]:
tsl_trial_by_trial = []
for par in data_store['tsl_kids']:
    print par
    tsl_trial_by_trial.append(phase1_tbt_tsl(par, 'kid'))
pd.concat(tsl_trial_by_trial, axis=0).to_csv('retest_tsl_tbt_famphase.csv', index=None)

                rt      responses     trial_type  trial_index  time_elapsed  \
0    410620.000000  {"Q0":"3267"}    survey-text            0        410637   
1      5177.000000            NaN           text            1        416826   
2              NaN            NaN  call-function            2        417831   
3              NaN            NaN  call-function            3        417833   
4      5426.349206            NaN   single-audio            4        423363   
5      3597.777778            NaN   single-audio            5        428062   
6      1275.782313            NaN   single-audio            6        430438   
7      5037.414966            NaN   single-audio            7        436581   
8      3986.712018            NaN   single-audio            8        441669   
9      9373.741497            NaN   single-audio            9        452148   
10     6099.727891            NaN   single-audio           10        459355   
11     3829.977324            NaN   single-audio    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


               rt      responses     trial_type  trial_index  time_elapsed  \
0    22881.000000  {"Q0":"3236"}    survey-text            0         22906   
1     8706.000000            NaN           text            1         32616   
2             NaN            NaN  call-function            2         33619   
3             NaN            NaN  call-function            3         33621   
4     1980.000000            NaN   single-audio            4         35705   
5     8145.333333            NaN   single-audio            5         44954   
6     4806.666667            NaN   single-audio            6         50866   
7     5478.666667            NaN   single-audio            7         57446   
8     5532.000000            NaN   single-audio            8         64084   
9     6310.666667            NaN   single-audio            9         71499   
10    6993.333333            NaN   single-audio           10         79588   
11     732.000000            NaN   single-audio           11    

In [10]:
#tsl_trial_by_trial = []
#for par in data_store['tsl_adults']:
#    tsl_trial_by_trial.append(phase1_tbt_tsl(par, 'adult'))
#pd.concat(tsl_trial_by_trial, axis=0).to_csv('adults_tsl_tbt_famphase.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyError: 'the label [stimulus] is not in the [index]'

In [7]:
tsl_trial_by_trial

[           rt    id   cond targ  trial_index  trial_mask
 0  -88.390023  3267  lang1   2C           87           0
 1  -12.925170  3267  lang1   2C          104           1
 2  230.884354  3267  lang1   2C          135           2,
             rt    id   cond targ  trial_index  trial_mask
 0   -92.000000  3182  lang2   3A           33           0
 1   361.333333  3182  lang2   3A           47           1
 2   278.666667  3182  lang2   3A           63           2
 3   150.666667  3182  lang2   3A           72           3
 4   329.333333  3182  lang2   3A           83           4
 5   169.333333  3182  lang2   3A           90           5
 6   340.000000  3182  lang2   3A          107           6
 7   220.000000  3182  lang2   3A          114           7
 8   -30.666667  3182  lang2   3A          132           8
 9    20.000000  3182  lang2   3A          150           9
 10  -20.000000  3182  lang2   3A          165          10
 11  100.000000  3182  lang2   3A          186          11


In [15]:
#  trial-by-trial test button pressing data (all trials): to use multilevel modeling to test learning.

def phase2_tbt_vsl(data, par_type):
    if par_type == 'kid':
        vsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        vsl_id = data['responses'][0][7:12]
    language = data.cond[100]
    fc_idx = 309
    trials, indexes = [], []
    keep = ['rt', 'cond', 'targ', 'id', 'par_answer', 'expected_answer', 'trial_index', 'trial_mask']
    res = data.loc[fc_idx:, ['key_press','stimulus','rt','cond','targ', 'trial_index']]
    res.index = [x for x in range(res.shape[0])]
    for i in range(res.shape[0]):
        if (int(res.loc[i, 'key_press']) != -1) and (pd.isnull(res.loc[i, 'stimulus'])):
            trials.append(int(res.loc[i, 'key_press']))
            indexes.append(i)
    converted = []
    for val in trials:
        if int(val) == 49:
            converted.append(1)
        elif int(val) == 50:
                converted.append(2)
    if language == 'lang1':
        pat = language_1
    elif language == 'lang2':
        pat = language_2   
    out = res.loc[indexes]
    out['id'] = [vsl_id for i in range(out.shape[0])]
    out['par_answer'] = converted
    out['expected_answer'] = pat
    out['trial_mask'] = [x for x in range(out.shape[0])]
    return out[keep]


def phase2_tbt_tsl(data, par_type):
    if par_type == 'kid':
        tsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        tsl_id = data['responses'][0][7:12]
    language = data.cond[100]
    # 37 -> left_arrow, 39 -> right_arrow
    fc_idx = 607
    keep = ['rt', 'cond', 'targ', 'id', 'par_answer', 'expected_answer', 'trial_index', 'trial_mask']
    responses = data.loc[fc_idx:, ['key_press','stimulus','rt','cond','targ', 'trial_index']]
    responses.index= [x for x in range(responses.shape[0])]
    idxs = []
    converted = []
    c = 0
    i = 6
    while c < 224:
        idxs.append(i)
        i += 7
        c += 7
    res = responses.loc[idxs, 'key_press']
    for val in res:
        if int(val) == 37:
            converted.append(1)
        elif int(val) == 39:
            converted.append(2)
    if language == 'lang1':
        pat = language_1
    elif language == 'lang2':
        pat = language_2
    out = responses.loc[idxs]
    out['id'] = [tsl_id for i in range(out.shape[0])]
    out['par_answer'] = converted
    out['expected_answer'] = pat
    out['trial_mask'] = [x for x in range(out.shape[0])]
    return out[keep]

In [16]:
vsl_trial_by_trial_test = []
for par in data_store['vsl_kids']:
    vsl_trial_by_trial_test.append(phase2_tbt_vsl(par, 'kid'))
pd.concat(vsl_trial_by_trial_test, axis=0).to_csv('retest_vsl_tbt_testphase.csv', index=None)

#vsl_trial_by_trial_test = []
#for par in data_store['vsl_adults']:
#    vsl_trial_by_trial_test.append(phase2_tbt_vsl(par, 'adult'))
#pd.concat(vsl_trial_by_trial_test, axis=0).to_csv('adults_vsl_tbt_testphase.csv', index=None)

tsl_trial_by_trial_test = []
for par in data_store['tsl_kids']:
    tsl_trial_by_trial_test.append(phase2_tbt_tsl(par, 'kid'))
pd.concat(tsl_trial_by_trial_test, axis=0).to_csv('retest_tsl_tbt_testphase.csv', index=None)

#tsl_trial_by_trial_test = []
#for par in data_store['tsl_adults']:
#    tsl_trial_by_trial_test.append(phase2_tbt_tsl(par, 'adult'))
#pd.concat(tsl_trial_by_trial_test, axis=0).to_csv('adults_tsl_tbt_testphase.csv', index=None)

In [17]:
def phase1_tbt_vsl_fp(data, par_type):
    if par_type == 'kid':
        vsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        vsl_id = data['responses'][0][7:12]
    keep = ['rt', 'id', 'cond', 'targ', 'stimulus', 'key_press', 'trial_index', 'trial_mask']
    data.index = [x for x in range(data.shape[0])]
    indexes = []
    for idx in range(309):
        try:
            if not pd.isnull(data.loc[idx, 'key_press']) and not pd.isnull(data.loc[idx, 'stimulus']) \
            and int(data.loc[idx, 'rt']) < 1200  and int(data.loc[idx, 'rt']) != -1:
                    indexes.append(idx)
        except TypeError:
            pass
    new = data.loc[indexes]
    new.index = [x for x in range(new.shape[0])]
    new['id'] = [vsl_id for x in range(new.shape[0])]
    new['trial_mask'] = [x for x in range(new.shape[0])]
    return new[keep]

def phase1_tbt_tsl_fp(data, par_type):
    indexes = []
    if par_type == 'kid':
        tsl_id = data['responses'][0][7:11]
    elif par_type == 'adult':
        tsl_id = data['responses'][0][7:12]
    keep = ['rt', 'id', 'cond', 'targ', 'stimulus', 'key_press', 'trial_index','trial_mask']
    data.index = [x for x in range(data.shape[0])]
    for row in range(602):
        try:
            if not pd.isnull(data.loc[row, 'key_press']) and int(data.loc[row, 'rt']) != -1000 \
            and not pd.isnull(data.loc[row, 'stimulus']):
                if int(data.loc[(row - 2), 'rt']) != -1000:
                    indexes.append(row - 2)
                elif int(data.loc[(row - 1), 'rt']) != -1000:
                    indexes.append(row - 1)
                elif int(data.loc[(row), 'rt']) != -1000:
                    indexes.append(row)
                elif int(data.loc[(row + 1), 'rt']) != -1000:
                    indexes.append(row + 1)
                elif int(data.loc[(row + 2), 'rt']) != -1000:
                    indexes.append(row + 2)
        except TypeError:
            pass
    new = data.loc[indexes]
    new.index = [x for x in range(new.shape[0])]
    new['id'] = [tsl_id for x in range(new.shape[0])]
    new['trial_mask'] = [x for x in range(new.shape[0])]
    return new[keep]

In [18]:
vsl_trial_by_trial_fp = []
for par in data_store['vsl_kids']:
    vsl_trial_by_trial_fp.append(phase1_tbt_vsl_fp(par, 'kid'))
pd.concat(vsl_trial_by_trial_fp, axis=0).to_csv('retest_vsl_tbt_famphase_falsePos.csv', index=None)

#vsl_trial_by_trial_fp = []
#for par in data_store['vsl_adults']:
#    vsl_trial_by_trial_fp.append(phase1_tbt_vsl_fp(par, 'adult'))
#pd.concat(vsl_trial_by_trial_fp, axis=0).to_csv('adults_vsl_tbt_famphase_falsePos.csv', index=None)

tsl_trial_by_trial_fp = []
for par in data_store['tsl_kids']:
    tsl_trial_by_trial_fp.append(phase1_tbt_tsl_fp(par, 'kid'))
pd.concat(tsl_trial_by_trial_fp, axis=0).to_csv('retest_tsl_tbt_famphase_falsePos.csv', index=None)

#tsl_trial_by_trial_fp = []
#for par in data_store['tsl_adults']:
#    tsl_trial_by_trial_fp.append(phase1_tbt_tsl_fp(par, 'adult'))
#pd.concat(tsl_trial_by_trial_fp, axis=0).to_csv('adults_tsl_tbt_famphase_falsePos.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
