## Initialization
- Import the packages
- Import the data
    - prescreen
    - discrimination
    - main task 
    - catch trials

In [1]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [2]:
## Load the raw results table that every later cell filters
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
path_to_data = "/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp6_21CR03_Vowel_length/FF2021/results/results.csv"
df = pd.read_csv(filepath_or_buffer=path_to_data)
## Inspection snippets, left disabled:
# df.head()
# df.tail()
# df.loc[558:663]  # view a specific range of rows

In [3]:
## Derive the task label and a short subject ID from the raw identifier columns
# task: the text before the first underscore of trial_template
df['task'] = df['trial_template'].map(lambda template: template.split("_", 1)[0])
# sub_id: first five characters of the first whitespace-delimited token of participant_id
df['sub_id'] = df['participant_id'].map(lambda pid: pid.split()[0][:5])

In [4]:
## Add two binary response codes to every row of df
# Correct: 1 when response_value equals stimuli_presented, else 0
#          (only meaningful for the prescreen task).
# Shorter: 1 when response_value is "Lap", else 0.
# The comparisons are vectorized instead of looping over rows with df[col][i]:
# that lookup is label-based, so the loop only worked while df kept its default
# 0..n-1 index, and it was slow on large tables.
# Missing values compare unequal and are therefore coded 0, as in the loop version.
Correct = (df['response_value'] == df['stimuli_presented']).astype('int64')
Shorter = (df['response_value'] == "Lap").astype('int64')
df['Correct'] = Correct
df['Shorter'] = Shorter

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [5]:
## Analysis parameters
# Accuracy cutoff for the easiest trials of the main task.
threshold = 0.55
# Catch-trial accuracy cutoff; subjects scoring below it are listed for possible rejection.
catch_threshold = 0.8
# Reaction times above this value are treated as extreme (unit presumably ms -- TODO confirm).
RT_threshold = 10_000

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)

**Reject trials**
- Task-relevant trials
- Extreme RT

### Reject subjects 

In [6]:
## Catch-trial accuracy per subject; the last line lists subjects below catch_threshold
catch_trials = ['Catch_cat', 'Catch_bird']
is_catch = df['trial_template'].isin(catch_trials)
df_catch = df[is_catch].reset_index(drop=True)  # renumber the kept rows 0..n-1
# accuracy = number of correct responses / number of recorded (non-missing) responses
catch_by_subject = df_catch.groupby('sub_id')['response_correct']
catch_acc = catch_by_subject.sum() / catch_by_subject.count()
catch_acc[catch_acc < catch_threshold]

sub_id
22a8d    0.785714
a2efa    0.785714
Name: response_correct, dtype: float64

In [7]:
## Survey answers about environmental noise and the listening device
is_noise = df['response_name'] == 'survey_noise'
df_noise = df[is_noise].reset_index(drop=True)
# per-subject sum of the noise answers; computed here but not displayed by this cell
noise = df_noise.groupby('sub_id')['response_value'].sum()
# NOTE(review): the survey_headphone1 table is not the cell's last expression, so it is
# never rendered, and df_device is immediately rebound to the survey_headphone2 rows.
df_device = df[df['response_name'] == 'survey_headphone1'].reset_index(drop=True)
df_device[['sub_id', 'response_value']]
df_device = df[df['response_name'] == 'survey_headphone2'].reset_index(drop=True)
df_device.loc[:, ['sub_id', 'response_value']]

Unnamed: 0,sub_id,response_value
0,303f5,Wired Earbuds
1,b62fd,Wired Earbuds
2,89b38,Wired Headphones
3,7843f,Wired Earbuds
4,22a8d,Wireless Earbuds
5,821b4,Wireless Earbuds
6,a2efa,Wireless Earbuds
7,164b9,Wireless Earbuds
8,c3bcd,Wireless Headphones
9,32376,Wireless Earbuds


### Task-relevant trials

In [8]:
## Unique subject IDs, in order of first appearance in df
subID = pd.unique(df['sub_id'])

In [17]:
## Prescreen task: accuracy of every subject on each word
conds = ['Lack', 'Lag', 'Cap', 'Cab', 'Beat', 'Bead',
         'Fat', 'Fad', 'Back', 'Bag', 'Lap', 'Lab']
## keep only the prescreen trials that presented one of those words
is_listed_word = df['stimuli_presented'].isin(conds)
is_prescreen = df['task'] == "PrescreenTrials"
df_clean = df[is_listed_word & is_prescreen].reset_index(drop=True)
# mean of Correct for every (subject, word) pair
a = df_clean.groupby(['sub_id', 'stimuli_presented'])['Correct'].mean()
# grand mean across words (disabled):
# df_clean.groupby('stimuli_presented')['Correct'].mean().mean()

In [10]:
## Discrimination task: proportion of "Lap" responses for each stimulus
conds = [f'Lab{step}' for step in range(1, 9)]  # 'Lab1' ... 'Lab8'
## keep only the discrimination trials that presented one of those stimuli
is_listed_stim = df['stimuli_presented'].isin(conds)
is_discrimination = df['task'] == "DiscriminationTrials"
df_clean = df[is_listed_stim & is_discrimination].reset_index(drop=True)
df_clean.groupby('stimuli_presented')['Shorter'].mean()

stimuli_presented
Lab1    0.704444
Lab2    0.648889
Lab3    0.555556
Lab4    0.431111
Lab5    0.240000
Lab6    0.217778
Lab7    0.222222
Lab8    0.168889
Name: Shorter, dtype: float64

In [11]:
## Main task: proportion of "Lap" responses for each onset x stimulus condition
# 'early_Lab1' ... 'early_Lab8', then the same eight for 'ontime' and 'late'
conds = [f'{onset}_Lab{step}'
         for onset in ('early', 'ontime', 'late')
         for step in range(1, 9)]
## keep only the main-task trials that presented one of those conditions
is_listed_stim = df['stimuli_presented'].isin(conds)
is_maintask = df['task'] == "maintaskTrials"
df_clean = df[is_listed_stim & is_maintask].reset_index(drop=True)
df_clean.groupby('stimuli_presented')['Shorter'].mean()

stimuli_presented
early_Lab1     0.833333
early_Lab2     0.722222
early_Lab3     0.577778
early_Lab4     0.350000
early_Lab5     0.122222
early_Lab6     0.100000
early_Lab7     0.105556
early_Lab8     0.105556
late_Lab1      0.816667
late_Lab2      0.816667
late_Lab3      0.561111
late_Lab4      0.333333
late_Lab5      0.133333
late_Lab6      0.144444
late_Lab7      0.105556
ontime_Lab1    0.850000
ontime_Lab2    0.794444
ontime_Lab3    0.561111
ontime_Lab4    0.372222
ontime_Lab5    0.150000
ontime_Lab6    0.111111
ontime_Lab7    0.133333
ontime_Lab8    0.122222
Name: Shorter, dtype: float64

In [12]:
## Long format: split the condition labels into one column per factor
# B/T: presenting order & response key
# W/I: onset time, comparison length, delay length
# onset: second-to-last "_" field of stimuli_presented, e.g. 'early' in 'early_Lab1'
df_clean['onset'] = df_clean['stimuli_presented'].map(lambda stim: stim.split("_")[-2])
# comparison: fourth character of the last "_" field, e.g. '1' in 'early_Lab1' (kept as text)
df_clean['comparison'] = df_clean['stimuli_presented'].map(lambda stim: stim.rsplit("_", 1)[-1][3])
# key: last "_" field of group_id
df_clean['key'] = df_clean['group_id'].map(lambda group: group.rsplit("_", 1)[-1])

In [13]:
## Sanity check: the `count` columns give the number of trials recorded per condition
df_clean.groupby(by='stimuli_presented').describe()

Unnamed: 0_level_0,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,network_error_repeat,participation_duration,participation_duration,...,Correct,Correct,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter,Shorter
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
stimuli_presented,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
early_Lab1,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.833333,0.373718,0.0,1.0,1.0,1.0,1.0
early_Lab2,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.722222,0.449153,0.0,0.0,1.0,1.0,1.0
early_Lab3,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.577778,0.495291,0.0,0.0,1.0,1.0,1.0
early_Lab4,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.35,0.4783,0.0,0.0,0.0,1.0,1.0
early_Lab5,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.122222,0.328456,0.0,0.0,0.0,0.0,1.0
early_Lab6,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.1,0.300837,0.0,0.0,0.0,0.0,1.0
early_Lab7,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.105556,0.308125,0.0,0.0,0.0,0.0,1.0
early_Lab8,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.105556,0.308125,0.0,0.0,0.0,0.0,1.0
late_Lab1,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.816667,0.388019,0.0,1.0,1.0,1.0,1.0
late_Lab2,180.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,180.0,2546.379744,...,0.0,0.0,180.0,0.816667,0.388019,0.0,1.0,1.0,1.0,1.0


### Extreme reaction time

In [None]:
## Collect the trials whose reaction time exceeds RT_threshold
# NOTE(review): this only gathers the slow trials; df_clean itself is left unchanged here.
is_too_slow = df_clean['response_rt'] > RT_threshold
df_clean_longRT = df_clean.loc[is_too_slow]

## Save the onset and comparison length to a CSV file for R analysis

In [None]:
## Number of subjects, trials per subject, and conditions in df_clean
n_subj = df_clean['sub_id'].nunique(dropna=False)
# Divide by the actual subject count instead of a hard-coded 15, so the figure stays
# right when the sample size changes; an empty table reports 0 rather than raising
# ZeroDivisionError.
n_trial = len(df_clean) // n_subj if n_subj else 0
n_conds = df_clean['stimuli_presented'].nunique(dropna=False)
print('Participant_number:', n_subj,'Trial number:', n_trial,'Condition number:', n_conds, sep='\n')

In [None]:
## Mean accuracy (Correct) and proportion "Lap" (Shorter), per subject and per design cell
# The column is selected BEFORE aggregating. Calling .mean() on the whole grouped frame
# also tries to average the text columns, which raises TypeError in pandas >= 2.0
# (numeric_only now defaults to False); selecting first gives the same Series without that.
# reset_index(drop=True) discards the group labels, so the results are positional only.
cond_keys = ['sub_id','onset','delay','comparison','order','key']
# NOTE(review): 'delay' and 'order' are not created by the cells visible in this notebook;
# confirm they exist in df_clean, otherwise the two all_conds_* lines raise KeyError.
by_subject = df_clean.groupby('sub_id')
overall_acc = by_subject['Correct'].mean().reset_index(drop = True)
overall_pps = by_subject['Shorter'].mean().reset_index(drop = True)
by_design_cell = df_clean.groupby(cond_keys)
all_conds_acc = by_design_cell['Correct'].mean().reset_index(drop = True)
all_conds_pps = by_design_cell['Shorter'].mean().reset_index(drop = True)

In [None]:
## Write the cleaned trials (with the derived columns) to disk, header row included
# NOTE(review): the destination is an absolute path under a different project folder than
# the one the data was read from -- confirm it is the intended output location.
clean_csv_path = r'/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp4_20CR12/results_shortlongdelay_2021/v2_20CR12_clean_n67_cleaned.csv'
df_clean.to_csv(clean_csv_path, header=True)

## Analyze trials of the main task

### Proportion Lap

In [14]:
# Proportion of "Lap" responses (mean of Shorter) for each onset condition, pooled over subjects
df_clean['Shorter'].groupby(df_clean['onset']).mean()

onset
early     0.364583
late      0.377083
ontime    0.386806
Name: Shorter, dtype: float64

### Statistical tests

In [None]:
## Paired t-tests on each subject's proportion of "Lap" responses between onset conditions
# PPS_early / PPS_late / PPS_ontime are built here because no earlier cell creates them.
# One mean of Shorter per (subject, onset), reshaped to one column per onset, keeps the
# three samples aligned by subject, which a paired test requires.
# A subject with no trials in some onset condition yields NaN there, and ttest_rel then
# returns NaN (default nan_policy='propagate') instead of silently dropping the subject.
pps_by_onset = df_clean.groupby(['sub_id', 'onset'])['Shorter'].mean().unstack('onset')
PPS_early = pps_by_onset['early']
PPS_late = pps_by_onset['late']
PPS_ontime = pps_by_onset['ontime']
print(stats.ttest_rel(PPS_early, PPS_late))
print(stats.ttest_rel(PPS_early, PPS_ontime))
print(stats.ttest_rel(PPS_late, PPS_ontime))