## Initialization
- Import the package
- Import the data

In [2]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [8]:
path_to_data = '/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp3_20CR11/results/20CR11_5f7d2e104bfca86e4f58ef9a-data.csv'

In [9]:
## Read the data
df = pd.read_csv(path_to_data)
# df_clean = df
## Take a look of the dataset
# df.head()
# df.tail()
# df.loc[558:663] # see specific rows of data

In [10]:
## Clean up task and subject ID (first five characters)
df['task'] = df['trial_template'].apply(lambda x: x.split("_")[0])
df['sub_id'] = df['participant_id'].apply(lambda x: x.split()[0][0:5])

In [11]:
## Add the accuracy and PPS column to the dataset: transform True and Shorter to 1, False and longer to 0
Correct = []
Shorter = []
for i in np.arange(0,len(df)):
    if df['response_correct'][i] == True:
        Correct.append(1)
    else: 
        Correct.append(0)
    if df['response_value'][i] == "Shorter":
        Shorter.append(1)
    else: 
        Shorter.append(0)
df['Correct'] = Correct
df['Shorter'] = Shorter

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [12]:
## Parameters
threshold = .55
catch_threshold = .8
RT_threshold = 10000

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Practice accuracy in conditional branching block
- Easiest comparison length (shortest and longest) in the main trial 
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)

**Reject trials**
- Task-relevant trials
- Extreme RT

### Reject subjects 

In [13]:
## How many subjects pass the practice > 0.75 on the easiest ontime trials 
df_clean = df[df['branch_failpass'] == 'pass'].reset_index(drop = True)
print('How many subjects passed the practice trials:', len(df_clean['sub_id'].unique())
      ,'out of',len(df['sub_id'].unique()))

KeyError: 'branch_failpass'

In [24]:
## How many subjects pass the main task easiest ontime trial > 0.55 
# main_task_easiest = ['single_300ms_delay_2_81','single_300ms_delay_2_86','single_300ms_delay_4_81','single_300ms_delay_4_86'] # EXP4
df_clean = df # EXP3
main_task_easiest = ['300ms_delay_2_81','300ms_delay_2_86','300ms_delay_4_81','300ms_delay_4_86'] # EXP3
df_main_task_easiest = df_clean[(df_clean['stimuli_presented'].isin(main_task_easiest)) & (df_clean['task'] == 'maintaskTrials')].reset_index(drop = True)
pass_subs = (df_main_task_easiest.groupby(['sub_id']).mean()['Correct'] > threshold).reset_index(name = 'main_task_easiest_pass')
df_clean = pd.merge(df_clean,pass_subs,on = 'sub_id')
df_clean = df_clean[df_clean['main_task_easiest_pass'] == True]
print('How many subjects passed the practice trials:', len(df_clean['sub_id'].unique())
      ,'out of',len(df['sub_id'].unique()))

How many subjects passed the practice trials: 34 out of 63


In [None]:
## How many subjects miss the catch trial 
catch_trials = ['Catch_beat','Catch_bag']
df_catch = df[(df['trial_template'].isin(catch_trials))].reset_index(drop = True) # reset index from 1
catch_acc = df_catch.groupby('sub_id')['response_correct'].sum()/df_catch.groupby('participant_id')['response_correct'].count()
catch_acc[catch_acc < catch_threshold]

In [25]:
## How many subjects had a bad environmental noise and device
df_noise = df[(df['response_name'] == 'survey_noise')].reset_index(drop = True)
noise = df_noise.groupby('sub_id')['response_value'].sum()
df_device = df[(df['response_name'] == 'survey_headphone')].reset_index(drop = True)
df_device[['sub_id','response_value']]

Unnamed: 0,sub_id,response_value
0,885d1,Built in Laptop/Desktop Speaker (Please use he...
1,34b38,Wired Headphones
2,2d961,Wired Earbuds
3,0528e,Wired Headphones
4,c186a,Wired Headphones
...,...,...
58,8ffb7,Wired Earbuds
59,57736,Wireless Earbuds
60,6b384,Wireless Headphones
61,74872,Wired Earbuds


### Task relevant trials

In [27]:
## subj id
subID = df['sub_id'].unique()

In [None]:
## All condition
task = ["maintaskTrials"] 
# task = ["pretestTrials"] 
conds = ['single_300ms_delay_2_51','single_300ms_delay_2_52','single_300ms_delay_2_53','single_300ms_delay_2_54','single_300ms_delay_2_55','single_300ms_delay_2_56',
        'single_300ms_delay_2_81','single_300ms_delay_2_82','single_300ms_delay_2_83','single_300ms_delay_2_84','single_300ms_delay_2_85','single_300ms_delay_2_86',
        'single_300ms_delay_2_91','single_300ms_delay_2_92','single_300ms_delay_2_93','single_300ms_delay_2_94','single_300ms_delay_2_95','single_300ms_delay_2_96',
        'single_300ms_delay_4_51','single_300ms_delay_4_52','single_300ms_delay_4_53','single_300ms_delay_4_54','single_300ms_delay_4_55','single_300ms_delay_4_56',
        'single_300ms_delay_4_81','single_300ms_delay_4_82','single_300ms_delay_4_83','single_300ms_delay_4_84','single_300ms_delay_4_85','single_300ms_delay_4_86',
        'single_300ms_delay_4_91','single_300ms_delay_4_92','single_300ms_delay_4_93','single_300ms_delay_4_94','single_300ms_delay_4_95','single_300ms_delay_4_96']

In [31]:
## All condition
task = ["maintaskTrials"] 
# task = ["pretestTrials"] 
conds = ['300ms_delay_2_51','300ms_delay_2_52','300ms_delay_2_53','300ms_delay_2_54','300ms_delay_2_55','300ms_delay_2_56',
        '300ms_delay_2_81','300ms_delay_2_82','300ms_delay_2_83','300ms_delay_2_84','300ms_delay_2_85','300ms_delay_2_86',
        '300ms_delay_2_91','300ms_delay_2_92','300ms_delay_2_93','300ms_delay_2_94','300ms_delay_2_95','300ms_delay_2_96']

In [32]:
## Tasks & Conditions pretest
pretest_conds = ['short_tone_1','long_tone_6']
subID = df_clean['sub_id'].unique()

In [36]:
df_clean.iloc[55:100]

Unnamed: 0,expt_id,group_id,network_error_repeat,participant_id,participation_duration,response_correct,response_mode,response_name,response_rt,response_target,...,session_id,stimuli_presented,trial_duration,trial_num,trial_template,task,sub_id,Correct,Shorter,main_task_easiest_pass
294,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,False,keypress,testresponse_s_J,331,['Shorter'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_92,4513,56,maintaskTrials_2_s_reversekey,maintaskTrials,34b38,0,0,True
295,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,False,keypress,testresponse_l_F,595,['Longer'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_96,4890,57,maintaskTrials_2_l_reversekey,maintaskTrials,34b38,0,1,True
296,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,True,keypress,testresponse_s_J,1336,['Shorter'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_91,5576,58,maintaskTrials_2_s_reversekey,maintaskTrials,34b38,1,1,True
297,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,False,keypress,testresponse_l_F,992,['Longer'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_94,5260,59,maintaskTrials_2_l_reversekey,maintaskTrials,34b38,0,1,True
298,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,False,keypress,testresponse_l_F,2144,['Longer'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_95,6394,60,maintaskTrials_2_l_reversekey,maintaskTrials,34b38,0,1,True
299,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,True,keypress,testresponse_l_F,1604,['Longer'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_85,5783,61,maintaskTrials_2_l_reversekey,maintaskTrials,34b38,1,0,True
300,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,False,keypress,testresponse_l_F,3055,['Longer'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_55,7076,62,maintaskTrials_2_l_reversekey,maintaskTrials,34b38,0,1,True
301,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,True,keypress,testresponse_s_J,543,['Shorter'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_53,4613,63,maintaskTrials_2_s_reversekey,maintaskTrials,34b38,1,1,True
302,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,True,keypress,testresponse_s_J,55,['Shorter'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_81,4174,64,maintaskTrials_2_s_reversekey,maintaskTrials,34b38,1,1,True
303,5f7d2e104bfca86e4f58ef9a,Group_JF,0,34b38b84b781d28751b6846057394c59330d43f01ffed5...,1666,True,keypress,testresponse_s_J,1173,['Shorter'],...,5fbc5590be36f3ba328bd883,300ms_delay_2_52,5138,65,maintaskTrials_2_s_reversekey,maintaskTrials,34b38,1,1,True


In [38]:
df_clean[(df_clean['stimuli_presented'].isin(conds)) & (df_clean['task'] == task)]

ValueError: Lengths must match to compare

In [37]:
## select the trials 
df_clean = df_clean[(df_clean['stimuli_presented'].isin(conds)) & (df_clean['task'] == task)].reset_index(drop = True)

ValueError: Lengths must match to compare

In [None]:
## Long format for each condition
# B/T: presenting order & response key
# W/I: Onset time, comparison length, delay length
df_clean['onset'] = df_clean['stimuli_presented'].apply(lambda x: x.split("_")[-1][0])
df_clean['delay'] = df_clean['stimuli_presented'].apply(lambda x: x.split("_")[-2])
df_clean['comparison'] = df_clean['stimuli_presented'].apply(lambda x: x.split("_")[-1][1])
df_clean['order'] = df_clean['group_id'].apply(lambda x: x.split("_")[-1])
df_clean['key'] = df_clean['group_id'].apply(lambda x: x.split("_")[-2])

### Extreme reaction time

In [None]:
df_clean_longRT = df_clean[df_clean['response_rt'] > RT_threshold]

## Analyze noise and overall accuracy/catch accuracy

In [None]:
noise = noise.astype(int) # turn str to int to do correlaton
acc = df_clean.groupby('sub_id')['response_correct'].sum()/\
df_clean.groupby('sub_id')['response_correct'].count()

In [None]:
corr = np.corrcoef(noise,acc)[0,1]
print(corr, '\n')
corr = np.corrcoef(noise,catch_acc)[0,1]
print(corr, '\n')

In [None]:
# plot the data...
plt.scatter(acc,noise,color='r')
plt.xlabel('Accuracy')
plt.ylabel('Noise')
plt.show()

In [None]:
# plot the data...
plt.scatter(catch_acc,noise,color='r')
plt.hist(catch_acc,noise)
plt.xlabel('Accuracy')
plt.ylabel('Noise')
plt.show()

## Save the onset and comparison length to the csv file for R analysis

In [None]:
## save df to csv
df_clean.to_csv(r'/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp4_20CR12/results_shortlongdelay_2021/v2_20CR12_clean_n67_cleaned.csv', header=True)

## Analyze trials of the main task

In [None]:
## Number of subjects, trials & conditions for each subjects
n_subj = len(df_clean['sub_id'].unique())
n_trial = len(df_clean)//15
n_conds = len(df_clean['stimuli_presented'].unique())
print('Participant_number:', n_subj,'Trial number:', n_trial,'Condition number:', n_conds, sep='\n')

In [None]:
## Group_by
overall_acc = df_clean.groupby(['sub_id']).mean()['Correct'].reset_index(drop = True)
overall_pps = df_clean.groupby(['sub_id']).mean()['Shorter'].reset_index(drop = True)
all_conds_acc = df_clean.groupby(['sub_id','onset','delay','comparison','order','key']).mean()['Correct'].reset_index(drop = True)
all_conds_pps = df_clean.groupby(['sub_id','onset','delay','comparison','order','key']).mean()['Shorter'].reset_index(drop = True)

## Visualization

In [None]:
## Visualize acc across delay and onset
df_clean.groupby(['delay','onset']).mean()['Correct'].plot.bar(rot = 0)

In [None]:
## Visualize proportion short across delay and onset 
df_clean.groupby(['delay','onset']).mean()['Shorter'].plot.bar(rot = 0)

In [None]:
## Early, ontime, late long delay
early = [conds.index('single_300ms_delay_4_51'),conds.index('single_300ms_delay_4_52'),conds.index('single_300ms_delay_4_53'),conds.index('single_300ms_delay_4_54'),conds.index('single_300ms_delay_4_55'),conds.index('single_300ms_delay_4_56')]
ontime = [conds.index('single_300ms_delay_4_81'),conds.index('single_300ms_delay_4_82'),conds.index('single_300ms_delay_4_83'),conds.index('single_300ms_delay_4_84'),conds.index('single_300ms_delay_4_85'),conds.index('single_300ms_delay_4_86')]
late = [conds.index('single_300ms_delay_4_91'),conds.index('single_300ms_delay_4_92'),conds.index('single_300ms_delay_4_93'),conds.index('single_300ms_delay_4_94'),conds.index('single_300ms_delay_4_95'),conds.index('single_300ms_delay_4_96')]
Acc_early = Acc_final_sample.iloc[:, early].mean(axis = 1)
Acc_ontime = Acc_final_sample.iloc[:, ontime].mean(axis = 1)
Acc_late = Acc_final_sample.iloc[:, late].mean(axis = 1)
print("Accuracy")
print("Early:", np.mean(Acc_early))
print("Ontime:", np.mean(Acc_ontime))
print("Late:",np.mean(Acc_late))

In [None]:
## Visualization: bar plot
ACC_plot = pd.DataFrame({'Conditions': conds, 'acc': np.mean(Acc_final_sample, axis = 0)})
ax = ACC_plot.plot.bar(rot = 0)

### Proportion short

In [None]:
resp_short = []
resp_short = [True if d == 'Shorter' else False for d in df_clean['response_value']]

In [None]:
df_clean['resp_short'] = resp_short

In [None]:
PPS = []
PPS_cond = []
for sub in subID:
    tmpdf = df_clean[df_clean['participant_id'] == sub]
    for cond in conds:
        tmpPPS = np.mean(tmpdf[tmpdf['stimuli_presented'] == cond]['resp_short'])
        PPS_cond.append(tmpPPS)
    PPS.append(PPS_cond)
    PPS_cond = []

In [None]:
df_PPS = pd.DataFrame(data = PPS, index = subID, columns = conds)

In [None]:
PPS_final_sample = df_PPS[Acc_easiest >= threshold]
PPS_final_sample.head()
PPS_final_sample.describe()

In [None]:
## Early, ontime, late short
early = [conds.index('single_300ms_delay_2_51'),conds.index('single_300ms_delay_2_52'),conds.index('single_300ms_delay_2_53'),conds.index('single_300ms_delay_2_54'),conds.index('single_300ms_delay_2_55'),conds.index('single_300ms_delay_2_56')]
ontime = [conds.index('single_300ms_delay_2_81'),conds.index('single_300ms_delay_2_82'),conds.index('single_300ms_delay_2_83'),conds.index('single_300ms_delay_2_84'),conds.index('single_300ms_delay_2_85'),conds.index('single_300ms_delay_2_86')]
late = [conds.index('single_300ms_delay_2_91'),conds.index('single_300ms_delay_2_92'),conds.index('single_300ms_delay_2_93'),conds.index('single_300ms_delay_2_94'),conds.index('single_300ms_delay_2_95'),conds.index('single_300ms_delay_2_96')]
PPS_early = PPS_final_sample.iloc[:, early].mean(axis = 1)
PPS_ontime = PPS_final_sample.iloc[:, ontime].mean(axis = 1)
PPS_late = PPS_final_sample.iloc[:, late].mean(axis = 1)
print("Proportion short")
print("Early:", np.mean(PPS_early))
print("Ontime:", np.mean(PPS_ontime))
print("Late:",np.mean(PPS_late))

In [None]:
## Early, ontime, late long
early = [conds.index('single_300ms_delay_4_51'),conds.index('single_300ms_delay_4_52'),conds.index('single_300ms_delay_4_53'),conds.index('single_300ms_delay_4_54'),conds.index('single_300ms_delay_4_55'),conds.index('single_300ms_delay_4_56')]
ontime = [conds.index('single_300ms_delay_4_81'),conds.index('single_300ms_delay_4_82'),conds.index('single_300ms_delay_4_83'),conds.index('single_300ms_delay_4_84'),conds.index('single_300ms_delay_4_85'),conds.index('single_300ms_delay_4_86')]
late = [conds.index('single_300ms_delay_4_91'),conds.index('single_300ms_delay_4_92'),conds.index('single_300ms_delay_4_93'),conds.index('single_300ms_delay_4_94'),conds.index('single_300ms_delay_4_95'),conds.index('single_300ms_delay_4_96')]
PPS_early = PPS_final_sample.iloc[:, early].mean(axis = 1)
PPS_ontime = PPS_final_sample.iloc[:, ontime].mean(axis = 1)
PPS_late = PPS_final_sample.iloc[:, late].mean(axis = 1)
print("Proportion short")
print("Early:", np.mean(PPS_early))
print("Ontime:", np.mean(PPS_ontime))
print("Late:",np.mean(PPS_late))

In [None]:
## Visualization: bar plot
PSS_plot = pd.DataFrame({'Conditions': conds, 'PSS': np.mean(PPS_final_sample, axis = 0)})
ax = PSS_plot.plot.bar(rot = 0)

### Statistic tests

In [None]:
print(stats.ttest_rel(PPS_early, PPS_late))
print(stats.ttest_rel(PPS_early, PPS_ontime))
print(stats.ttest_rel(PPS_late, PPS_ontime))