## Initialization
- Import the packages
- Import the data
    - prescreen
    - discrimination
    - main task 
    - catch trials

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [2]:
## Read the raw session export (CSV) into a DataFrame
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
path_to_data = "/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp6_21CR03_Vowel_length/FF2021/results/session-6215a55807fa7c7666d39d6e-data.csv"
df = pd.read_csv(path_to_data)
## Optional exploration snippets (left commented out)
# df.groupby('stimuli_presented').describe()[30:63]
# df_clean = df
## Take a look at the dataset
# df.head()
# df.tail()
# df.loc[558:663] # see specific rows of data

In [3]:
## Derive the task name and a short subject ID for every row
# task: the part of 'trial_template' before the first underscore
df['task'] = [template.split("_")[0] for template in df['trial_template']]
# sub_id: first five characters of the first whitespace-separated token of 'participant_id'
df['sub_id'] = [participant.split()[0][0:5] for participant in df['participant_id']]

In [4]:
## Add 0/1 indicator columns to the dataset
# Correct: 1 when the response equals the presented stimulus, otherwise 0
#          (only meaningful for the prescreen trials)
# Shorter: 1 when the response is "Lap", otherwise 0
# The comparisons are element-wise on whole columns, so they do not depend on
# the frame having a default 0..n-1 index and avoid a per-row Python loop.
df['Correct'] = (df['response_value'] == df['stimuli_presented']).astype(int)
df['Shorter'] = (df['response_value'] == "Lap").astype(int)

In [5]:
## Total number of participating subjects (distinct short IDs)
df['sub_id'].nunique(dropna=False)

80

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [6]:
## Parameters
# NOTE(review): `threshold` is not referenced by any other cell shown in this notebook.
threshold = 0.55
# Subjects whose catch-trial accuracy is below this value are rejected.
catch_threshold = 0.7
# Cut-off applied to 'response_rt'; NOTE(review): presumably milliseconds - confirm.
RT_threshold = 10000
# Subjects whose prescreen accuracy is below this value are rejected.
pretest_threshold = 0.6

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)

**Reject trials**
- Task-relevant trials
- Extreme RT

### Reject subjects 

In [7]:
## Catch-trial accuracy per subject, and the subjects below catch_threshold
catch_trials = ['Catch_cat','Catch_bird']
is_catch_trial = df['trial_template'].isin(catch_trials)
df_catch = df[is_catch_trial].reset_index(drop = True) # fresh 0-based index
catch_by_subject = df_catch.groupby('sub_id')['response_correct']
# proportion correct = number correct / number of non-missing catch responses
catch_acc = catch_by_subject.sum() / catch_by_subject.count()
below_catch_threshold = catch_acc < catch_threshold
fail_catch = catch_acc[below_catch_threshold].reset_index()
fail_catch

Unnamed: 0,sub_id,response_correct
0,2d344,0.653846
1,57a9f,0.576923
2,60bf7,0.653846
3,64819,0.538462
4,8d890,0.692308
5,91922,0.653846
6,9a506,0.423077
7,c24a4,0.576923
8,cf849,0.5
9,fc319,0.653846


In [8]:
## Start df_clean from the full data, keeping only subjects who passed the catch trials
passed_catch = ~df['sub_id'].isin(fail_catch['sub_id'])
df_clean = df[passed_catch].reset_index(drop = True)

In [9]:
## Which subjects reported a noise rating of '4' in the noise survey
df_noise = df[(df['response_name'] == 'survey_noise')].reset_index(drop = True)
# Compare each rating to '4' directly. Summing the string ratings per subject
# would concatenate them (e.g. '4' + '4' -> '44'), so a subject with more than
# one survey row could never match '4'.
rated_too_noisy = df_noise['response_value'] == '4'
too_noisy = (
    df_noise.loc[rated_too_noisy, ['sub_id', 'response_value']]
    .drop_duplicates(subset='sub_id')   # one row per subject
    .sort_values('sub_id')
    .reset_index(drop = True)
)
too_noisy

Unnamed: 0,sub_id,response_value
0,2686e,4


In [10]:
## Drop the subjects listed in too_noisy from df_clean
in_quiet_environment = ~df_clean['sub_id'].isin(too_noisy['sub_id'])
df_clean = df_clean[in_quiet_environment].reset_index(drop = True)

In [11]:
## Number of 'Yes' answers to the first headphone survey question
df_device = df[(df['response_name'] == 'survey_headphone1')].reset_index(drop = True)
# Sum the boolean mask to count the matching answers; taking len() of the
# comparison result would only return the number of survey rows.
int((df_device['response_value'] == 'Yes').sum())

80

In [12]:
## Answers to the second headphone survey question, one row per response
is_headphone2 = df['response_name'] == 'survey_headphone2'
df_device = df[is_headphone2].reset_index(drop = True)
df_device.loc[:, ['sub_id','response_value']]

Unnamed: 0,sub_id,response_value
0,503c8,Wireless Headphones
1,16b53,Wired Earbuds
2,68dac,Built in Laptop/Desktop Speaker (Please use he...
3,421ca,External Ambient Speaker (Please use headphone...
4,c24a4,Wireless Earbuds
...,...,...
75,57a9f,Wired Headphones
76,7c569,Wired Headphones
77,60bf7,Wired Headphones
78,cd75e,Wired Earbuds


### Task relevant trials

In [13]:
## Prescreen: words presented in the prescreen task
conds = [
    'Lack', 'Lag', 'Cap', 'Cab', 'Beat', 'Bead',
    'Fat', 'Fad', 'Back', 'Bag', 'Lap', 'Lab',
]
## select the prescreen trials
is_prescreen = df_clean['stimuli_presented'].isin(conds) & (df_clean['task'] == "PrescreenTrials")
df_pre = df_clean[is_prescreen].reset_index()
# Accuracy per subject and word; this is not the cell's last expression, so it is not displayed.
df_pre.groupby(['sub_id','stimuli_presented'])['Correct'].mean()
## per-subject prescreen accuracy and the subjects below pretest_threshold
pre_acc = df_pre.groupby(['sub_id'])['Correct'].mean()
below_pretest_threshold = pre_acc < pretest_threshold
fail_pre = pre_acc[below_pretest_threshold].reset_index()

In [14]:
## Drop the subjects who failed the prescreen from df_clean
passed_prescreen = ~df_clean['sub_id'].isin(fail_pre['sub_id'])
df_clean = df_clean[passed_prescreen].reset_index(drop = True)

In [15]:
## Number of subjects left after the subject-level rejections
df_clean['sub_id'].nunique(dropna=False)

64

In [16]:
## Discrimination task: stimuli Lab1 ... Lab8
conds = ['Lab' + str(step) for step in range(1, 9)]
## select the discrimination trials
is_discrimination = df_clean['stimuli_presented'].isin(conds) & (df_clean['task'] == "DiscriminationTrials")
df_disc = df_clean[is_discrimination].reset_index(drop = True)
## mean of the 0/1 Shorter indicator for each stimulus
df_disc.groupby('stimuli_presented')['Shorter'].mean()

stimuli_presented
Lab1    0.709896
Lab2    0.631771
Lab3    0.496354
Lab4    0.358333
Lab5    0.216146
Lab6    0.183854
Lab7    0.158333
Lab8    0.128646
Name: Shorter, dtype: float64

In [17]:
## Main task conditions: every onset (early, ontime, late) crossed with Lab1 ... Lab8
conds = [
    onset + '_Lab' + str(step)
    for onset in ('early', 'ontime', 'late')
    for step in range(1, 9)
]
## select the main trials; from here on df_clean holds main-task rows only
is_main_task = df_clean['stimuli_presented'].isin(conds) & (df_clean['task'] == "maintaskTrials")
df_clean = df_clean[is_main_task].reset_index(drop = True)
print(df_clean['sub_id'].nunique(dropna=False))
## mean of the 0/1 Shorter indicator for each condition
df_clean.groupby('stimuli_presented')['Shorter'].mean()

64


stimuli_presented
early_Lab1     0.774740
early_Lab2     0.674479
early_Lab3     0.550781
early_Lab4     0.292969
early_Lab5     0.126302
early_Lab6     0.111979
early_Lab7     0.092448
early_Lab8     0.069010
late_Lab1      0.779948
late_Lab2      0.680990
late_Lab3      0.510417
late_Lab4      0.277344
late_Lab5      0.108073
late_Lab6      0.079427
late_Lab7      0.066406
late_Lab8      0.058594
ontime_Lab1    0.800781
ontime_Lab2    0.662760
ontime_Lab3    0.526042
ontime_Lab4    0.321615
ontime_Lab5    0.122396
ontime_Lab6    0.114583
ontime_Lab7    0.065104
ontime_Lab8    0.062500
Name: Shorter, dtype: float64

In [18]:
## Long format for each condition
# B/T: presenting order & response key
# W/I: Onset time, comparison length, delay length
# Onset: the token before the last underscore of the stimulus name (e.g. 'early')
df_clean['Onset'] = [stimulus.split("_")[-2] for stimulus in df_clean['stimuli_presented']]
# comparison: the fourth character of the last token (the digit in 'Lab1'), kept as a string
df_clean['comparison'] = [stimulus.split("_")[-1][3] for stimulus in df_clean['stimuli_presented']]
# key: the last underscore-separated token of 'group_id'
df_clean['key'] = [group.split("_")[-1] for group in df_clean['group_id']]

In [19]:
## Sanity check for trial number: the 'count' entries give the rows per condition
trials_by_condition = df_clean.groupby('stimuli_presented')
trials_by_condition.describe()

Unnamed: 0_level_0,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,participation_duration,participation_duration,...,Correct,Correct,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
stimuli_presented,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
early_Lab1,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.77474,0.418026,0.0,1.0,1.0,1.0,1.0
early_Lab2,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.674479,0.468874,0.0,0.0,1.0,1.0,1.0
early_Lab3,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.550781,0.497739,0.0,0.0,1.0,1.0,1.0
early_Lab4,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.292969,0.455421,0.0,0.0,0.0,1.0,1.0
early_Lab5,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.126302,0.332406,0.0,0.0,0.0,0.0,1.0
early_Lab6,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.111979,0.315546,0.0,0.0,0.0,0.0,1.0
early_Lab7,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.092448,0.289846,0.0,0.0,0.0,0.0,1.0
early_Lab8,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.06901,0.253637,0.0,0.0,0.0,0.0,1.0
late_Lab1,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.779948,0.414551,0.0,1.0,1.0,1.0,1.0
late_Lab2,768.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,768.0,2728.165776,...,0.0,0.0,768.0,0.68099,0.466397,0.0,0.0,1.0,1.0,1.0


### Extreme reaction time

In [None]:
## Trials whose response time exceeds RT_threshold
is_long_rt = df_clean['response_rt'] > RT_threshold
df_clean_longRT = df_clean[is_long_rt]

In [None]:
## Number of subjects with at least one trial above RT_threshold
df_clean_longRT['sub_id'].nunique(dropna=False)

In [None]:
## Show the threshold in use and the first long-RT trials
# A bare expression is only displayed when it is the last line of a cell,
# so the threshold is printed explicitly.
print('RT_threshold:', RT_threshold)
df_clean_longRT.head()

In [None]:
## Drop the long-RT trials from df_clean
# `<=` is the exact complement of the `> RT_threshold` rule used to flag long
# RTs, so a trial exactly at the threshold is kept rather than silently lost.
# Rows with a missing response_rt fail the comparison and are dropped.
df_clean = df_clean[df_clean['response_rt'] <= RT_threshold]

## Group by condition for comparison

In [None]:
## Number of subjects, conditions, and trials per subject per condition
n_subj = len(df_clean['sub_id'].unique())
n_conds = len(df_clean['stimuli_presented'].unique())
# Use the observed number of conditions instead of a hard-coded 24, and
# avoid dividing by zero when no trials are left after cleaning.
n_cells = n_conds * n_subj
n_trial = len(df_clean) // n_cells if n_cells else 0
print('Participant_number:', n_subj,'Trial number:', n_trial,'Condition number:', n_conds, sep='\n')

In [None]:
## Mean of Shorter per onset condition, pooled over all subjects and trials
df_clean.groupby('Onset', as_index=False)['Shorter'].mean()

In [None]:
## Print the mean of Shorter for each onset condition
pps = df_clean.groupby(['Onset'])['Shorter'].mean()
print('Proportion short')
# Look each value up by its onset label: integer keys on a string-indexed
# Series rely on a deprecated positional fallback and on the alphabetical
# order of the group labels.
print('Early',pps['early'])
print('Ontime',pps['ontime'])
print('Late',pps['late'])

In [None]:
## Mean of Shorter for each condition (stimuli_presented), pooled over subjects
pps_all = df_clean.groupby('stimuli_presented')['Shorter'].mean()
print(pps_all)
## Outliers: define as the ones who are three stds away from the mean
# NOTE(review): `overall_pps` is not defined in the cells shown here.
#outliers = overall_pps - overall_pps.mean() > 3*overall_pps.std()
#sum(outliers)

In [None]:
## Visualization: bar plot of the mean of Shorter per condition, with a dashed line overlay
# Take the condition labels from pps_all itself so each label sits next to its
# own value: pps_all is sorted alphabetically (early, late, ontime), which is
# not the order of the hand-written `conds` list, and `conds` is reassigned in
# several cells.
PPS_plot = pd.DataFrame({'Conditions': list(pps_all.index), 'PPS': pps_all})
ax = PPS_plot.plot.bar(rot = 0)
# Draw the overlay on the same axes explicitly instead of relying on the current axes.
pps_all.plot(style='k--', ax=ax)

In [None]:
## Mean of Shorter as a function of the comparison step, one curve per onset
fig, ax = plt.subplots()
all_steps = set()
for onset, line_style in (('early', 'o-'), ('late', 'ko-'), ('ontime', 'ro-')):
    # Select the curve by its onset prefix rather than by position, so a
    # missing condition cannot shift the other curves.
    curve = pps_all[pps_all.index.str.startswith(onset + '_')]
    # x position = the number after 'Lab' in the condition name (e.g. 'early_Lab3' -> 3)
    steps = [int(label.split('_')[-1][3:]) for label in curve.index]
    all_steps.update(steps)
    ax.plot(steps, curve.to_numpy(), line_style, label=onset)
# Fix the tick positions explicitly so every tick label matches a plotted step.
ax.set_xticks(sorted(all_steps))
ax.set_xlabel('Comparison (Lab step)')
ax.set_ylabel('Mean of Shorter')
ax.legend(title='Onset')

### Statistical tests

In [None]:
## Per-subject mean of Shorter for each onset condition
PPS = df_clean.groupby(['sub_id','Onset'])['Shorter'].mean()
# Pivot to one row per subject and one column per onset, then pick the columns
# by label. Stride slicing (every third value) would mis-pair subjects as soon
# as one subject has no trials left in some onset condition.
# Subjects missing any onset are dropped so the three Series stay paired.
PPS_by_onset = PPS.unstack('Onset').dropna()
PPS_early = PPS_by_onset['early']
PPS_ontime = PPS_by_onset['ontime']
PPS_late = PPS_by_onset['late']

In [None]:
## Average of the per-subject early-onset values
PPS_early.mean()

In [None]:
## Average of the per-subject ontime-onset values
PPS_ontime.mean()

In [None]:
## Average of the per-subject late-onset values
PPS_late.mean()

In [None]:
## Paired t-tests between onset conditions: early vs late, early vs ontime, late vs ontime
onset_pairs = (
    (PPS_early, PPS_late),
    (PPS_early, PPS_ontime),
    (PPS_late, PPS_ontime),
)
for first, second in onset_pairs:
    print(stats.ttest_rel(first, second))

## Save the onset and comparison length to the csv file for R analysis

In [20]:
## Write the cleaned main-task data (with the Onset / comparison / key columns) to CSV
# NOTE(review): absolute, machine-specific path, and the file name hard-codes n64 - confirm both before re-running.
path_to_clean_csv = r'/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp6_21CR03_Vowel_length/FF2021/results/EXP6_clean_n64.csv'
df_clean.to_csv(path_to_clean_csv, header=True)