In [2]:
import csv
import numpy as np
import pandas as pd
from collections import defaultdict
import string
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn import metrics # confusion matrix, MSE etc.

In [3]:
path_to_data = "/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp4_20CR12/4c/results/session-62159fb8dfe54372fbc126e5-data.csv"
path_to_data = "/Users/t.z.cheng/Google_Drive/Research/cross_domain_entrainment/exp8/results/session-635300ef7f3ee915ba688e53-data.csv"

In [4]:
## Read the data
df = pd.read_csv(path_to_data)
# df.groupby('stimuli_presented').describe()

In [5]:
## Tasks & Conditions: Clean up task and subject ID (first five characters)
df['task'] = df['trial_template'].apply(lambda x: x.split("_")[0])
df['sub_id'] = df['participant_id'].apply(lambda x: x.split()[0][0:5])

In [6]:
## Add the accuracy and PPS column to the dataset: transform True and Shorter to 1, False and longer to 0
Correct = [] # only applied for the prescreen
Shorter = []
for i in np.arange(0,len(df)):
    if df['response_correct'][i] == True:
        Correct.append(1)
    else: 
        Correct.append(0)
    if df['response_value'][i] == "Shorter":
        Shorter.append(1)
    else: 
        Shorter.append(0)
df['Correct'] = Correct
df['Shorter'] = Shorter

In [7]:
## Total participated subjects
len(df['sub_id'].unique())

96

In [None]:
## for exp8abc
df['exp'] = df['group_id'].apply(lambda x: x.split("_")[0])
df_exp8c = df[(df['exp'] == 'EXP8c')].reset_index(drop = True)
df = df_exp8c

In [15]:
df['exp'] = df['group_id'].apply(lambda x: x.split("_")[0])
len(df.groupby('exp')['sub_id'].unique()['EXP8a'])
len(df.groupby('exp')['sub_id'].unique()['EXP8b'])
len(df.groupby('exp')['sub_id'].unique()['EXP8c'])

32

In [None]:
len(df.groupby('exp')['sub_id'].unique()['EXP8a'])

## Set parameters
- Accuracy threshold for the easiest trials of the main task 
- Extreme RT threshold
- Catch trial accuracy

In [None]:
## Parameters
threshold = .55
catch_threshold = .7
RT_threshold = 10000
practice_threshold = 'pass'

## Data cleaning 
***Super important: df_clean is overwritten after each step of data cleaning***

**Reject subjects**
- Catch trial
- Environmental noise & audio device (may not need to use them as criteria)
- Fail the practice trial on the last 12 practice trial: pass people have done 24 trials, fail people have done 36 trials

**Reject trials**
- Task-relevant trials
- Extreme RT

In [None]:
## How many subjects miss the catch trial 
catch_trials = ['Catch_cat','Catch_bird']
catch_trials = ['Catch_trials'] ## for exp8abc

df_catch = df[(df['trial_template'].isin(catch_trials))].reset_index(drop = True) # reset index from 1
catch_acc = df_catch.groupby('sub_id')['response_correct'].sum()/df_catch.groupby('sub_id')['response_correct'].count()
fail_catch = catch_acc[catch_acc < catch_threshold].reset_index()
fail_catch

In [None]:
## remove subjects who failed the catch trial based on the threshold
df_clean = df[~df['sub_id'].isin(fail_catch['sub_id'])].reset_index(drop = True)

In [None]:
## How many subjects had a bad environmental noise and device
df_noise = df[(df['response_name'] == 'survey_noise')].reset_index(drop = True)
noise = df_noise.groupby('sub_id')['response_value'].sum()
too_noisy = noise[noise == '4'].reset_index()
too_noisy

In [None]:
## remove subjects who completed the task in noisy environment
df_clean = df_clean[~df_clean['sub_id'].isin(too_noisy['sub_id'])].reset_index(drop = True)

In [None]:
# may remove subjects who completed the task using low-quality audio device
df_device = df[(df['response_name'] == 'survey_headphone1')].reset_index()
len(df_device[['sub_id','response_value']] == 'Yes')

In [None]:
## remove subjects who failed the practice 
fail_prac = df[df['branch_failpass'] == 'fail']['sub_id'].unique()
df_clean = df_clean[(~df_clean['sub_id'].isin(fail_prac))].reset_index()

In [None]:
## After cleaning how many subjects left
len(df_clean['sub_id'].unique())

### Task relevant trials
#### Select the trials to analyze
- Pretest trails (n = 60): single tone 1 or 8
- Practice trials (n = 24 or 36): 6 context tones + 1 ontime single tone 1 or 8 with correct/incorrect feedback
- Main task (n = 576): 6 context tones + 1 single tone (early, ontime, late; 1 to 8 steps) without feedback

### Select the pretest trials 

In [None]:
df_pre = df_clean[(df_clean['task'] == "PretestTrials")].reset_index()
df_pre.groupby(['sub_id','stimuli_presented'])['Correct'].mean()
pre_acc = df_pre.groupby(['sub_id'])['Correct'].mean()
conds = df_pre['stimuli_presented'].unique()
print(conds) # check the conditions
df_pre.groupby('stimuli_presented')['Shorter'].mean()

### Select the practice trials

In [None]:
df_prac = df_clean[(df_clean['task'] == "practiceTrials")].reset_index()
conds = df_prac['stimuli_presented'].unique()
print(conds) # check the conditions
df_prac.groupby('stimuli_presented')['Shorter'].mean()

In [None]:
df_clean_all = df_clean # create a new variable just in case

## ??? under progress, need to write a code to save those *fail* who but pass on the last 12 trials??? 

In [None]:
# df_pra = df_pra[-12:-1]
# df_pra.groupby(['sub_id','stimuli_presented'])['Correct'].mean()
# pra_acc = df_pra.groupby(['sub_id'])['Correct'].mean()
# fail_pra = pra_acc[pra_acc != practice_threshold].reset_index()

### Select the main trials

In [None]:
df_clean = df_clean[(df_clean['task'] == "maintaskTrials")].reset_index(drop = True)
conds = df_clean['stimuli_presented'].unique()
print(conds) # check the conditions

In [None]:
## Create new cols for onset and length
onset = []
length = []
df_clean = df_clean.set_index(pd.Index(np.arange(0,len(df_clean)))) # change the index for the for loop

for i in np.arange(0,len(df_clean)):
    stimuli_presented = df_clean['stimuli_presented'][i].split('_')
    onset.append(stimuli_presented[2])
    length.append(stimuli_presented[-1])
df_clean['Onset'] = onset
df_clean['Length'] = length

In [None]:
## Number of subjects, trials & conditions for each subjects
n_subj = len(df_clean['participant_id'].unique())
n_trial = len(df_clean)//n_subj # how many trials per subject
n_conds = len(df_clean['stimuli_presented'].unique())
print('Participant_number:', n_subj,'Trial number:', n_trial,'Condition number:', n_conds, sep='\n')

### Accuracy

In [None]:
df_clean.groupby(['Onset'])['Correct'].mean()

In [None]:
accuracy_all = df_clean.groupby(['stimuli_presented','Onset'])['Correct'].mean()

In [None]:
## consider outliers based on the accuracy of the easiest trials 
outliers = Acc_easiest < threshold
outliers = outliers[outliers == True]
df_clean_nooutlier = df_clean[~df_clean['participant_id'].isin(outliers.index)]

In [None]:
## Visualization: bar plot
ACC_plot = pd.DataFrame({'Conditions': conds, 'accuracy': accuracy_all})
ax = ACC_plot.plot.bar(rot = 0)

### Proportion short

In [None]:
df_clean.groupby(['Onset'])['Shorter'].mean()

In [None]:
pps = df_clean.groupby(['Onset'])['Shorter'].mean()
print('Proportion short')
print('Early',pps[0]) 
print('Ontime',pps[2])
print('Late',pps[1])

In [None]:
pps_all = df_clean.groupby(['stimuli_presented','Onset'])['Shorter'].mean()

In [None]:
## Visualization: bar plot
PPS_plot = pd.DataFrame({'Conditions': conds, 'PSS': pps_all})
ax = PPS_plot.plot.bar(rot = 0)
pps_all.plot(style='k--')

In [None]:
pps_all[8:16]

In [None]:
pps_all[0:8].plot(style = 'o-') # early
pps_all[8:16].plot(style ='ko-') # late
ax = pps_all[16:24].plot(style ='ro-') # ontime
ax.set_xticklabels([0,1,2,3,4,5,6,7,8])

### Statistic tests

In [None]:
PPS = df_clean.groupby(['sub_id','Onset'])['Shorter'].mean()
PPS_early = PPS[::3]
PPS_ontime = PPS[1::3]
PPS_late = PPS[2::3]

In [None]:
print(stats.ttest_rel(PPS_early, PPS_late))
print(stats.ttest_rel(PPS_early, PPS_ontime))
print(stats.ttest_rel(PPS_late, PPS_ontime))

## Save the onset and comparison length to the csv file for R analysis

In [None]:
## save df to csv
df_clean.to_csv(r'/Users/t.z.cheng/Google_Drive/Research/Delaydoesmatter/real_exp/exp4_20CR12/4c/results/EXP4c_clean_n59.csv', header=True)