In [8]:
## Import packages
import csv
import glob
import os
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [9]:
## Load data
# The data folder defaults to the original location but can be overridden with the
# EXP8_DATA_DIR environment variable, so the notebook can run on another machine.
_default_data_dir = "/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp8/results/old/"
# os.path.join(..., "") guarantees a trailing separator; the save cell builds its
# output path with path_to_data + "exp8_clean.csv".
path_to_data = os.path.join(os.environ.get("EXP8_DATA_DIR", _default_data_dir), "")
os.chdir(path_to_data)
df = pd.read_csv("combined_csv.csv")

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [10]:
## Define parameters
# Experiment tag used to parse trial_template and to filter group_id
which_exp = 'EXP8c'
# Accuracy threshold for the easiest trials of the main task
threshold = 0.55
# Minimum catch-trial accuracy; subjects below it are rejected
catch_threshold = 0.9
# Extreme RT threshold (presumably milliseconds — TODO confirm the unit)
RT_threshold = 10_000

In [11]:
## Tasks & Conditions: derive the task label, a short subject ID and the experiment tag
def _task_from_template(template):
    # Templates that start with the experiment tag (e.g. "EXP8c_practiceTrials_s_rv")
    # carry the task in the second token; otherwise the first token is the task.
    tokens = template.split("_")
    return tokens[1] if tokens[0] == which_exp else tokens[0]

df['task'] = df['trial_template'].apply(_task_from_template)
# Short subject ID: first five characters of the first whitespace-separated token
df['sub_id'] = df['participant_id'].apply(lambda pid: pid.split()[0][0:5])
## for exp8abc: the experiment tag is the part of group_id before the first underscore
df['exp'] = df['group_id'].apply(lambda gid: gid.split("_")[0])
is_this_exp = df['exp'] == which_exp
df = df[is_this_exp].reset_index(drop = True)

In [12]:
## Add the accuracy and PPS columns to the dataset: True -> 1 and "Lap"/"Shorter" -> 1, anything else -> 0
# Vectorised: the previous per-row loop looked values up with df[col][i], which only
# works while df carries a 0..n-1 index.
# Missing or non-True values in response_correct count as 0, as before.
Correct = df['response_correct'].eq(True).fillna(False).astype(int).tolist() # only applied for the prescreen
Shorter = df['response_value'].isin(["Lap", "Shorter"]).astype(int).tolist()
df['Correct'] = Correct
df['Shorter'] = Shorter

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)
- Failing the practice on the last 12 practice trials: participants who passed did 24 trials, participants who failed did 36 trials

**Reject trials**
- Task-relevant trials
- Extreme RT

In [13]:
## Catch-trial accuracy per subject, and the subjects that fall below catch_threshold
catch_trials = ['Catch_trials'] ## for exp8abc

is_catch_row = df['trial_template'].isin(catch_trials)
df_catch = df[is_catch_row].reset_index(drop = True) # fresh 0-based index
catch_by_sub = df_catch.groupby('sub_id')['response_correct']
# proportion correct = number of correct responses / number of non-missing responses
catch_acc = catch_by_sub.sum() / catch_by_sub.count()
fail_catch = catch_acc[catch_acc < catch_threshold].reset_index()
print(len(fail_catch))
print(fail_catch)

11
   sub_id  response_correct
0   05944          0.461538
1   08d1c          0.807692
2   2c1a9          0.461538
3   37e15          0.538462
4   55175          0.846154
5   6ce60          0.884615
6   85f09          0.730769
7   c47ae          0.346154
8   d0b0c          0.461538
9   d9664          0.730769
10  ea15b          0.384615


In [14]:
## Start df_clean from df, dropping every subject listed in fail_catch
passed_catch = ~df['sub_id'].isin(fail_catch['sub_id'])
df_clean = df[passed_catch].reset_index(drop = True)

In [15]:
## Subjects whose survey_noise answer equals '4' (bad environmental noise)
is_noise_row = df['response_name'] == 'survey_noise'
df_noise = df[is_noise_row].reset_index(drop = True)
# NOTE(review): response_value is compared with the string '4', so sum() presumably
# concatenates the answers per subject; a subject with more than one survey_noise
# row would then never equal '4' — confirm there is one row per subject.
noise = df_noise.groupby('sub_id')['response_value'].sum()
too_noisy = noise[noise == '4'].reset_index()
print(len(too_noisy))
print(too_noisy)

1
  sub_id response_value
0  4e6de              4


In [16]:
## Drop from df_clean every subject listed in too_noisy
not_too_noisy = ~df_clean['sub_id'].isin(too_noisy['sub_id'])
df_clean = df_clean[not_too_noisy].reset_index(drop = True)

In [17]:
# may remove subjects who completed the task using low-quality audio device
df_device = df[(df['response_name'] == 'survey_headphone1')].reset_index()
# Count the survey_headphone1 rows answered 'Yes'. The previous
# len(frame == 'Yes') returned the number of rows in the comparison frame
# (i.e. every row of df_device), not the number of matches.
# NOTE(review): whether 'Yes' marks an acceptable or a poor device is not visible here — confirm.
int((df_device['response_value'] == 'Yes').sum())

100

In [18]:
# if which_exp != 'EXP8a':
#     ## remove subjects who failed the practice 
#     fail_prac = df[df['branch_failpass'] == 'fail']['sub_id'].unique()
#     len(fail_prac)
#     df_clean = df_clean[(~df_clean['sub_id'].isin(fail_prac))].reset_index()

In [19]:
## Size of the cleaned data set: subjects, trials per subject and conditions
# dropna = False counts a missing value as its own entry, like len(series.unique())
n_subj = df_clean['participant_id'].nunique(dropna = False)
n_trial = len(df_clean) // n_subj # rows per subject, rounded down
n_conds = df_clean['stimuli_presented'].nunique(dropna = False)
for label, count in (('Participant_number:', n_subj),
                     ('Trial number:', n_trial),
                     ('Condition number:', n_conds)):
    print(label, count, sep='\n')

Participant_number:
88
Trial number:
426
Condition number:
46


### Task relevant trials
#### Select the trials to analyze
- Pretest trials (n = 60): single tone 1 or 8
- Practice trials (n = 24 or 36): 6 context tones + 1 ontime single tone 1 or 8 with correct/incorrect feedback
- Main task (n = 576): 6 context tones + 1 single tone (early, ontime, late; 1 to 8 steps) without feedback

### Select the pretest trials 

In [78]:
# Pretest trials only; reset_index() keeps the previous row labels in an 'index' column
is_pretest = df_clean['task'] == "PretestTrials"
df_pre = df_clean[is_pretest].reset_index()
# NOTE(review): this per-subject, per-stimulus accuracy is neither stored nor displayed
df_pre.groupby(['sub_id','stimuli_presented'])['Correct'].mean()
# Mean accuracy per subject across all pretest trials
pre_acc = df_pre.groupby(['sub_id'])['Correct'].mean()
conds = df_pre['stimuli_presented'].unique()
# Proportion of "shorter" responses for each pretest stimulus
shorter_by_stim = df_pre.groupby('stimuli_presented')['Shorter'].mean()
print(shorter_by_stim)

stimuli_presented
long_tone_8     0.350000
short_tone_1    0.619697
Name: Shorter, dtype: float64


In [84]:
# Display the last 'Shorter' response per (stimulus, subject) for the practice trials.
# NOTE(review): sub_prac_acc_last is only assigned in the practice first/last cell
# further down, so this cell raises NameError on a fresh top-to-bottom run.
sub_prac_acc_last

stimuli_presented                       sub_id
single_300ms_ontime_delay_2_tone_Lab_1  07d11     1
                                        07f97     1
                                        085e4     1
                                        0b3ef     1
                                        0e2a5     1
                                                 ..
single_300ms_ontime_delay_2_tone_Lab_8  f767a     0
                                        fa58e     1
                                        fb147     0
                                        fb46c     0
                                        fdc6c     0
Name: Shorter, Length: 176, dtype: int64

In [86]:
# Average of the two first-trial pretest values printed by the sub_pre_acc_first cell:
# short_tone_1 mean and 1 - long_tone_8 mean (numbers pasted from that output).
(0.45454545454545453+0.5113636363636364)/2

0.4829545454545454

In [87]:
# Average of the two last-trial pretest values printed by the sub_pre_acc_last cell:
# short_tone_1 mean and 1 - long_tone_8 mean (numbers pasted from that output).
(0.6363636363636364+0.6477272727272727)/2

0.6420454545454546

In [85]:
# First and last 'Shorter' response of every subject for each pretest stimulus
# (first/last in df_pre row order — presumably trial order, TODO confirm)
pre_by_stim_sub = df_pre.groupby(['stimuli_presented','sub_id'])['Shorter']
sub_pre_acc_first = pre_by_stim_sub.first()
sub_pre_acc_last = pre_by_stim_sub.last()

# short_tone_1: proportion of subjects answering "shorter" (first trial, then last trial)
for per_sub in (sub_pre_acc_first, sub_pre_acc_last):
    print(per_sub['short_tone_1'].mean())

# long_tone_8: proportion of subjects NOT answering "shorter" (first trial, then last trial)
for per_sub in (sub_pre_acc_first, sub_pre_acc_last):
    print(1 - per_sub['long_tone_8'].mean())

0.45454545454545453
0.6363636363636364
0.5113636363636364
0.6477272727272727


### Select the practice trials

In [68]:
# Practice trials only; reset_index() keeps the previous row labels in an 'index' column
is_practice = df_clean['task'] == "practiceTrials"
df_prac = df_clean[is_practice].reset_index()
conds = df_prac['stimuli_presented'].unique()
sub = df_prac['sub_id'].unique()
# Proportion of "shorter" responses for each practice stimulus
prac_shorter_by_stim = df_prac.groupby('stimuli_presented')['Shorter'].mean()
print(prac_shorter_by_stim)

stimuli_presented
single_300ms_ontime_delay_2_tone_Lab_1    0.767007
single_300ms_ontime_delay_2_tone_Lab_8    0.236395
Name: Shorter, dtype: float64


In [74]:
# First and last 'Shorter' response of every subject for each practice stimulus
# (first/last in df_prac row order — presumably trial order, TODO confirm)
prac_by_stim_sub = df_prac.groupby(['stimuli_presented','sub_id'])['Shorter']
sub_prac_acc_first = prac_by_stim_sub.first()
sub_prac_acc_last = prac_by_stim_sub.last()

stim_lab_1 = 'single_300ms_ontime_delay_2_tone_Lab_1'
stim_lab_8 = 'single_300ms_ontime_delay_2_tone_Lab_8'

# Lab_1: proportion of subjects answering "shorter" (first trial, then last trial)
for per_sub in (sub_prac_acc_first, sub_prac_acc_last):
    print(per_sub[stim_lab_1].mean())

# Lab_8: proportion of subjects NOT answering "shorter" (first trial, then last trial)
for per_sub in (sub_prac_acc_first, sub_prac_acc_last):
    print(1 - per_sub[stim_lab_8].mean())

0.75
0.8522727272727273
0.7159090909090908
0.8636363636363636


In [76]:
# Average of the two last-trial practice values printed by the sub_prac_acc_last cell:
# Lab_1 mean and 1 - Lab_8 mean (numbers pasted from that output).
(0.8522727272727273+ 0.8636363636363636)/2

0.8579545454545454

In [43]:
# Mean of sub_prac_acc for the Lab_1 practice stimulus.
# NOTE(review): sub_prac_acc is not assigned anywhere in the visible notebook (only
# sub_prac_acc_first / sub_prac_acc_last are) — presumably left over from a removed
# cell; confirm its definition before re-running.
sub_prac_acc['single_300ms_ontime_delay_2_tone_Lab_1'].mean()

0.7894570707070708

In [46]:
# Standard deviation of sub_prac_acc for the Lab_8 practice stimulus.
# NOTE(review): sub_prac_acc is not assigned anywhere in the visible notebook —
# presumably left over from a removed cell; confirm its definition before re-running.
sub_prac_acc['single_300ms_ontime_delay_2_tone_Lab_8'].std()

0.18541724691180264

In [47]:
# Standard deviation of sub_prac_acc for the Lab_1 practice stimulus.
# NOTE(review): sub_prac_acc is not assigned anywhere in the visible notebook —
# presumably left over from a removed cell; confirm its definition before re-running.
sub_prac_acc['single_300ms_ontime_delay_2_tone_Lab_1'].std()

0.17322769108667888

In [25]:
# Distinct practice-trial subject IDs, in order of first appearance
sub = pd.unique(df_prac['sub_id'])

In [28]:
# Display the array of distinct practice-trial subject IDs held in sub
sub

array(['4c685', 'e1f71', '8167c', 'cbdda', '3a8b4', '83580', '499df',
       '97d8a', '779fa', 'dae9a', 'f6891', 'd88d8', '1c0b6', '29762',
       'fdc6c', '74003', '1695b', 'd8ba9', 'ae396', '9ca85', '70337',
       'c8716', '93732', 'b902b', '0e2a5', '75a23', '07d11', 'fa58e',
       '94aef', 'cbea0', 'bb761', '5f8f2', '8560a', '40074', '49aec',
       'aaf24', '558e8', '9e531', 'd759b', '62ec7', '58ba4', '0f82e',
       'd816b', 'b85ff', 'd88eb', '489ca', '9c787', '0ed6d', '69689',
       '085e4', 'fb147', '7effe', '192b3', 'd900d', 'f767a', 'c3d0d',
       '18b6b', 'a0bba', '8bacf', '4acd3', '201da', '721ad', '892e0',
       '323a1', '07f97', '51570', 'c9bed', '8bed7', '37262', 'f216f',
       '1bc48', '81a9f', '31330', 'fb46c', 'cc81e', 'ecc20', '9ec33',
       'ae71d', '13729', '79539', '0b3ef', '615be', '12e7b', '13aa8',
       'a131b', 'ee423', '9077a', '5f27d'], dtype=object)

In [29]:
# Preview the first ten practice-trial rows
df_prac.head(10)

Unnamed: 0,index,branch_failpass,expt_id,group_id,network_error_repeat,participant_id,participation_duration,response_correct,response_mode,response_name,...,session_start_time,stimuli_presented,trial_duration,trial_num,trial_template,task,sub_id,exp,Correct,Shorter
0,71,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_s_J,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_1,6985,72,EXP8c_practiceTrials_s_rv,practiceTrials,4c685,EXP8c,1,1
1,72,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_l_F,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_8,6276,73,EXP8c_practiceTrials_l_rv,practiceTrials,4c685,EXP8c,1,0
2,73,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_s_J,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_1,6266,74,EXP8c_practiceTrials_s_rv,practiceTrials,4c685,EXP8c,1,1
3,74,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_l_F,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_8,6126,75,EXP8c_practiceTrials_l_rv,practiceTrials,4c685,EXP8c,1,0
4,75,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_s_J,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_1,6027,76,EXP8c_practiceTrials_s_rv,practiceTrials,4c685,EXP8c,1,1
5,76,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_l_F,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_8,6093,77,EXP8c_practiceTrials_l_rv,practiceTrials,4c685,EXP8c,1,0
6,77,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_l_F,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_8,6192,78,EXP8c_practiceTrials_l_rv,practiceTrials,4c685,EXP8c,1,0
7,78,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_s_J,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_1,6082,79,EXP8c_practiceTrials_s_rv,practiceTrials,4c685,EXP8c,1,1
8,79,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_s_J,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_1,6042,80,EXP8c_practiceTrials_s_rv,practiceTrials,4c685,EXP8c,1,1
9,80,pass,6345b7f49677d36bd07c5a10,EXP8c_JF,0,4c685da32c0562c7f3203118347ebe68afd88aa9613a0c...,2422.600319,True,keypress,EXP8c_practiceRsp_l_F,...,2022-11-11 03:22:51-05:00,single_300ms_ontime_delay_2_tone_Lab_8,6005,81,EXP8c_practiceTrials_l_rv,practiceTrials,4c685,EXP8c,1,0


In [None]:
# Keep a snapshot of the subject-level cleaned data (all tasks) just in case.
# .copy() makes it an independent frame; a plain assignment would only be a second
# name for the same object, so any in-place change to df_clean would show up here too.
df_clean_all = df_clean.copy()

### Select the main trials

In [None]:
# From here on df_clean holds the main-task trials only
is_main_task = df_clean['task'] == "maintaskTrials"
df_clean = df_clean[is_main_task].reset_index(drop = True)
conds = df_clean['stimuli_presented'].unique()
print(conds) # check the conditions

In [None]:
## Create new cols for onset and length
# Renumber the rows 0..n-1, as the original per-row loop did before indexing by position.
df_clean = df_clean.reset_index(drop = True)

# stimuli_presented is an underscore-separated name: the third token becomes Onset and
# the last token becomes Length (both stay strings). For example the practice stimulus
# "single_300ms_ontime_delay_2_tone_Lab_1" would give Onset "ontime" and Length "1".
# Vectorised with the .str accessor instead of looping over df_clean[col][i];
# a missing stimulus name now yields a missing value instead of raising.
stimuli_parts = df_clean['stimuli_presented'].str.split('_')
onset = stimuli_parts.str[2].tolist()
length = stimuli_parts.str[-1].tolist()
df_clean['Onset'] = onset
df_clean['Length'] = length

### Accuracy
#### Not for EXP8c

In [None]:
# df_clean.groupby(['Onset'])['Correct'].mean()

In [None]:
# accuracy_all = df_clean.groupby(['stimuli_presented','Onset'])['Correct'].mean()

In [None]:
# ## consider outliers based on the accuracy of the easiest trials 
# outliers = Acc_easiest < threshold
# outliers = outliers[outliers == True]
# df_clean_nooutlier = df_clean[~df_clean['participant_id'].isin(outliers.index)]

In [None]:
# ## Visualization: bar plot
# ACC_plot = pd.DataFrame({'Conditions': conds, 'accuracy': accuracy_all})
# ax = ACC_plot.plot.bar(rot = 0)

### Proportion short

In [None]:
# Proportion of "shorter" responses for each onset condition
shorter_by_onset = df_clean.groupby(['Onset'])['Shorter']
print("Proportion Short")
print(shorter_by_onset.mean())

In [None]:
# pps: proportion of "shorter" responses per onset condition
shorter_by_onset = df_clean.groupby(['Onset'])['Shorter']
pps = shorter_by_onset.mean()

In [None]:
# pps_all: proportion of "shorter" responses per (stimulus, onset) pair
shorter_by_stim_onset = df_clean.groupby(['stimuli_presented','Onset'])['Shorter']
pps_all = shorter_by_stim_onset.mean()

In [None]:
## Visualization: bar plot
# Take the condition labels from pps_all's own index so every label sits on the row it
# describes. conds lists the stimuli in order of first appearance, whereas the groupby
# result is sorted by key, so pairing the two positionally mislabelled the rows.
# NOTE(review): the column key 'PSS' looks like a typo for 'PPS'; left unchanged in
# case other code reads PPS_plot['PSS'].
condition_labels = pps_all.index.get_level_values('stimuli_presented').to_numpy()
PPS_plot = pd.DataFrame({'Conditions': condition_labels, 'PSS': pps_all})
ax = PPS_plot.plot.bar(rot = 0)
pps_all.plot(style='k--')

In [None]:
# One line per onset condition, taken as consecutive runs of 8 rows of pps_all.
# (start row, line style) in the order: early, late, ontime
onset_runs = [(0, 'o-'), (8, 'ko-'), (16, 'ro-')]
for start, line_style in onset_runs:
    ax = pps_all.iloc[start:start + 8].plot(style = line_style)
ax.set_xticklabels(list(range(9)))

### Statistic tests

In [None]:
# Proportion of "shorter" responses per subject and onset condition
PPS = df_clean.groupby(['sub_id','Onset'])['Shorter'].mean()
# Every third row starting at offset 0 / 1 / 2 is taken as early / late / ontime.
# NOTE(review): this assumes each subject has exactly three Onset rows in that sorted
# order; a subject missing a condition would shift every later row — confirm.
PPS_early, PPS_late, PPS_ontime = (PPS.iloc[offset::3] for offset in range(3))

In [None]:
# Paired t-tests between onset conditions: early vs late, early vs ontime, late vs ontime
onset_pairs = (
    (PPS_early, PPS_late),
    (PPS_early, PPS_ontime),
    (PPS_late, PPS_ontime),
)
for first_cond, second_cond in onset_pairs:
    print(stats.ttest_rel(first_cond, second_cond))

## Save to the csv file for R analysis

In [None]:
## save df to csv
# os.path.join inserts the path separator when path_to_data has no trailing slash;
# plain string concatenation would silently produce a wrong file name in that case.
df_clean.to_csv(os.path.join(path_to_data, "exp8_clean.csv"), header=True)
print('Cleaned data saved!')