## Initialization
- Import the package
- Import the data
    - prescreen
    - discrimination
    - main task 
    - catch trials

In [2]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [3]:
## Read the data
path_to_data = "/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp7_21CR04_VOT/FF2021/results/results.csv"
df = pd.read_csv(path_to_data)
# df_clean = df
## Take a look of the dataset
# df.head()
# df.tail()
# df.loc[558:663] # see specific rows of data

In [4]:
## Clean up task and subject ID (first five characters)
df['task'] = df['trial_template'].apply(lambda x: x.split("_")[0])
df['sub_id'] = df['participant_id'].apply(lambda x: x.split()[0][0:5])

In [5]:
## Add the accuracy and PPS column to the dataset: transform True and Shorter to 1, False and longer to 0
Correct = [] # only applied for the prescreen
Shorter = []
for i in np.arange(0,len(df)):
    if df['response_value'][i] == df['stimuli_presented'][i]:
        Correct.append(1)
    else: 
        Correct.append(0)
    if df['response_value'][i] == "Ba":
        Shorter.append(1)
    else: 
        Shorter.append(0)
df['Correct'] = Correct
df['Shorter'] = Shorter

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [6]:
## Parameters
threshold = .55
catch_threshold = .8
RT_threshold = 10000

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)

**Reject trials**
- Task-relevant trials
- Extreme RT

### Reject subjects 

In [7]:
## How many subjects miss the catch trial 
catch_trials = ['Catch_cat','Catch_bird']
df_catch = df[(df['trial_template'].isin(catch_trials))].reset_index(drop = True) # reset index from 1
catch_acc = df_catch.groupby('sub_id')['response_correct'].sum()/df_catch.groupby('sub_id')['response_correct'].count()
catch_acc[catch_acc < catch_threshold]

Series([], Name: response_correct, dtype: float64)

In [8]:
len(df_catch['sub_id'].unique())

17

In [9]:
## How many subjects had a bad environmental noise and device
df_noise = df[(df['response_name'] == 'survey_noise')].reset_index(drop = True)
noise = df_noise.groupby('sub_id')['response_value'].sum()
df_device = df[(df['response_name'] == 'survey_headphone1')].reset_index(drop = True)
df_device[['sub_id','response_value']]
df_device = df[(df['response_name'] == 'survey_headphone2')].reset_index(drop = True)
df_device[['sub_id','response_value']]

Unnamed: 0,sub_id,response_value
0,56f87,Wireless Earbuds
1,3d5b1,Wireless Earbuds
2,cf62e,Wired Earbuds
3,fca8a,Wired Headphones
4,44e72,Wireless Headphones
5,a5bba,Wireless Earbuds
6,d607c,Wired Headphones
7,1206e,Wireless Earbuds
8,49f86,Wired Headphones
9,354ba,Wireless Headphones


### Task relevant trials

In [10]:
## subj id
subID = df['sub_id'].unique()

In [11]:
## Discrimination task 
conds = ['pa1','pa2','pa3','pa4','pa5','pa6','pa7','pa8']
## select the discrimination trials 
df_clean = df[(df['stimuli_presented'].isin(conds)) & (df['task'] == "DiscriminationTrials")].reset_index(drop = True)
df_clean.groupby('stimuli_presented')['Shorter'].mean()

stimuli_presented
pa1    0.045098
pa2    0.056863
pa3    0.060784
pa4    0.070588
pa5    0.258824
pa6    0.507843
pa7    0.684314
pa8    0.794118
Name: Shorter, dtype: float64

In [12]:
## Main task conditions
conds = ['early_pa1','early_pa2','early_pa3','early_pa4','early_pa5','early_pa6','early_pa7','early_pa8',
         'ontime_pa1','ontime_pa2','ontime_pa3','ontime_pa4','ontime_pa5','ontime_pa6','ontime_pa7','ontime_pa8',
         'late_pa1','late_pa2','late_pa3','late_pa4','late_pa5','late_pa6','late_pa7','late_pa8']
## select the main trials 
df_clean = df[(df['stimuli_presented'].isin(conds)) & (df['task'] == "maintaskTrials")].reset_index(drop = True)
df_clean.groupby('stimuli_presented')['Shorter'].mean()

stimuli_presented
early_pa1     0.039216
early_pa2     0.024510
early_pa3     0.053922
early_pa4     0.137255
early_pa5     0.259804
early_pa6     0.573529
early_pa7     0.691176
early_pa8     0.789216
late_pa1      0.058824
late_pa2      0.044118
late_pa3      0.058824
late_pa4      0.107843
late_pa5      0.269608
late_pa6      0.524510
late_pa7      0.730392
late_pa8      0.803922
ontime_pa1    0.053922
ontime_pa2    0.039216
ontime_pa3    0.034314
ontime_pa4    0.083333
ontime_pa5    0.269608
ontime_pa6    0.534314
ontime_pa7    0.759804
ontime_pa8    0.848039
Name: Shorter, dtype: float64

In [13]:
## Long format for each condition
# B/T: presenting order & response key
# W/I: Onset time, comparison length, delay length
df_clean['onset'] = df_clean['stimuli_presented'].apply(lambda x: x.split("_")[-2])
df_clean['comparison'] = df_clean['stimuli_presented'].apply(lambda x: x.split("_")[-1][-1])
df_clean['key'] = df_clean['group_id'].apply(lambda x: x.split("_")[-1])

In [14]:
## Sanity check for trial number
df_clean.groupby('stimuli_presented').describe()

Unnamed: 0_level_0,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,participation_duration,participation_duration,...,Correct,Correct,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
stimuli_presented,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
early_pa1,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.039216,0.194585,0.0,0.0,0.0,0.0,1.0
early_pa2,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.02451,0.155006,0.0,0.0,0.0,0.0,1.0
early_pa3,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.053922,0.226418,0.0,0.0,0.0,0.0,1.0
early_pa4,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.137255,0.344963,0.0,0.0,0.0,0.0,1.0
early_pa5,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.259804,0.439606,0.0,0.0,0.0,1.0,1.0
early_pa6,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.573529,0.495781,0.0,0.0,1.0,1.0,1.0
early_pa7,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.691176,0.463145,0.0,0.0,1.0,1.0,1.0
early_pa8,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.789216,0.408869,0.0,1.0,1.0,1.0,1.0
late_pa1,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.058824,0.235873,0.0,0.0,0.0,0.0,1.0
late_pa2,204.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,204.0,1926.453571,...,0.0,0.0,204.0,0.044118,0.205862,0.0,0.0,0.0,0.0,1.0


### Extreme reaction time

In [15]:
df_clean_longRT = df_clean[df_clean['response_rt'] > RT_threshold]

## Save the onset and comparison length to the csv file for R analysis

In [None]:
## remove outliers 
outlier = df_clean['sub_id'].unique()[10]
df_clean = df_clean[(~df_clean['sub_id'].isin([outlier]))].reset_index(drop = True)

In [20]:
## Number of subjects, trials & conditions for each subjects
n_subj = len(df_clean['sub_id'].unique())
n_trial = len(df_clean)//15
n_conds = len(df_clean['stimuli_presented'].unique())
print('Participant_number:', n_subj,'Trial number:', n_trial,'Condition number:', n_conds, sep='\n')

Participant_number:
16
Trial number:
307
Condition number:
24


In [28]:
## Group_by
overall_acc = df_clean.groupby(['sub_id']).mean()['Correct'].reset_index(drop = True)
overall_pps = df_clean.groupby(['sub_id']).mean()['Shorter'].reset_index(drop = True)
print(overall_pps)
## Outliers: define as the ones who are three stds away from the mean
overall_pps - overall_pps.mean() > 3*overall_pps.std()

0     0.451389
1     0.229167
2     0.319444
3     0.347222
4     0.319444
5     0.440972
6     0.385417
7     0.253472
8     0.354167
9     0.138889
10    0.267361
11    0.298611
12    0.378472
13    0.364583
14    0.388889
15    0.357639
Name: Shorter, dtype: float64

In [None]:
## save df to csv
df_clean.to_csv(r'/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp4_20CR12/results_shortlongdelay_2021/v2_20CR12_clean_n67_cleaned.csv', header=True)

## Analyze trials of the main task

### Proportion Lap

In [23]:
df_clean.head()

Unnamed: 0,expt_id,group_id,network_error_repeat,participant_id,participation_duration,response_correct,response_mode,response_name,response_rt,response_target,...,trial_duration,trial_num,trial_template,task,sub_id,Correct,Shorter,onset,comparison,key
0,608073d7f5aa0f29e467168b,Group_FJ,0,56f876e6eea5aae750f1c8745f8ad6ce1d6433030fbd1b...,2298.363743,False,keypress,testRsp,372,[],...,3672,249,maintaskTrials,maintaskTrials,56f87,0,1,late,1,FJ
1,608073d7f5aa0f29e467168b,Group_FJ,0,56f876e6eea5aae750f1c8745f8ad6ce1d6433030fbd1b...,2298.363743,False,keypress,testRsp,345,[],...,3464,250,maintaskTrials,maintaskTrials,56f87,0,0,late,2,FJ
2,608073d7f5aa0f29e467168b,Group_FJ,0,56f876e6eea5aae750f1c8745f8ad6ce1d6433030fbd1b...,2298.363743,False,keypress,testRsp,199,[],...,3307,251,maintaskTrials,maintaskTrials,56f87,0,0,early,2,FJ
3,608073d7f5aa0f29e467168b,Group_FJ,0,56f876e6eea5aae750f1c8745f8ad6ce1d6433030fbd1b...,2298.363743,False,keypress,testRsp,316,[],...,3750,252,maintaskTrials,maintaskTrials,56f87,0,1,late,6,FJ
4,608073d7f5aa0f29e467168b,Group_FJ,0,56f876e6eea5aae750f1c8745f8ad6ce1d6433030fbd1b...,2298.363743,False,keypress,testRsp,285,[],...,3498,253,maintaskTrials,maintaskTrials,56f87,0,1,ontime,8,FJ


In [24]:
df_clean.groupby('onset')['Shorter'].mean()

onset
early     0.329427
late      0.330729
ontime    0.332682
Name: Shorter, dtype: float64

### Statistic tests

In [None]:
print(stats.ttest_rel(PPS_early, PPS_late))
print(stats.ttest_rel(PPS_early, PPS_ontime))
print(stats.ttest_rel(PPS_late, PPS_ontime))