In [83]:
import pandas as pd
import scipy as sp
import numpy as np

In [58]:
# Relative path of the directory that receives every file this notebook exports
# (the accuracy CSVs and the completion text files).
EXPORT_DIRECTORY = '../preprocessing/data_review'

In [59]:
# Read the experimental data. Later cells filter this frame on 'PennElementName'
# to pull out the comprehension-question answers and the blank completions.
data = pd.read_csv('../data/experimental_data.csv')

# Get the accuracies by subject

In [60]:
# Build the answer key for the comprehension questions from the stimulus list.
stimuli = pd.read_csv('../stimuli/Stimuli(Sheet1).csv', sep=';')

# Only stimuli that come with a question have a correct answer to check against.
has_comprehension_q = stimuli[stimuli['Question'].notna()]

# Index the key by (Item, Condition) so it can be joined onto the responses.
# The Yes/No labels are mapped explicitly rather than through DataFrame.replace,
# whose silent object -> bool downcasting is deprecated (FutureWarning since
# pandas 2.2). Any label other than 'Yes'/'No' ends up as NaN.
q_answers = (
    has_comprehension_q[['Item', 'Condition', 'QuestionCorrect']]
    .set_index(['Item', 'Condition'])
    .assign(
        QuestionCorrect=lambda key: key['QuestionCorrect'].map({'Yes': True, 'No': False})
    )
)

# also get which items actually have comprehension questions
items_w_qs = {item for item, _condition in q_answers.index}
print(items_w_qs)

# number of items with comprehension questions
print(len(items_w_qs))

{6, 7, 8, 13, 16, 17, 19, 20, 22, 23, 25, 26, 27, 28, 29, 30, 35, 37, 39, 40, 41, 43, 44, 45, 46, 47, 48, 50, 51, 53, 54, 55, 58, 59, 61, 62, 63, 64, 66, 67, 68, 69, 71, 72, 73, 75, 77, 78}
48


In [61]:
# Get the responses of each participant to the comprehension questions.
is_comprehension_answer = data['PennElementName'] == 'ComprehensionDecide'
has_question = data['Item'].isin(items_w_qs)

# A single .loc selection returns an independent frame, so adding the decoded
# 'Value' column below cannot trigger SettingWithCopyWarning the way assigning
# into a chain of boolean filters does.
comp_resp = (
    data.loc[
        is_comprehension_answer & has_question,
        ["MD5 hash of participant's IP address", 'Item', 'Condition', 'Value'],
    ]
    .set_index(['Item', 'Condition'])
    # 'K' is decoded to True and 'D' to False, the same True/False coding the
    # answer key uses for Yes/No. Mapping explicitly avoids the deprecated
    # implicit downcasting of .replace; any other key press becomes NaN.
    .assign(Value=lambda resp: resp['Value'].map({'K': True, 'D': False}))
)

In [76]:
# Pair every response with its answer key. Merging on the (Item, Condition)
# index is an inner join, so only responses that have a key are kept.
check = pd.merge(comp_resp, q_answers, left_index=True, right_index=True)
check['SubjectCorrect'] = check['Value'] == check['QuestionCorrect']

# Per participant: share of True ("yes") responses and share of correct responses.
# The two columns are selected explicitly because GroupBy.mean's first
# positional parameter is `numeric_only`; .mean('QuestionCorrect') therefore
# never selected a column, it only passed a truthy flag. Casting to float keeps
# the averages well defined even if 'Value' arrives as an object column.
subject_check = (
    check.astype({'Value': float, 'SubjectCorrect': float})
    .groupby("MD5 hash of participant's IP address")[['Value', 'SubjectCorrect']]
    .mean()
    .rename(columns={'Value': 'PercentageYes'})
)

In [63]:
# Write the per-participant table (indexed by participant hash, with the
# 'PercentageYes' and 'SubjectCorrect' columns) to the export directory.
subject_check.to_csv(f'{EXPORT_DIRECTORY}/accuracy_by_subject.csv')

In [77]:
# Summarise accuracy across participants: mean first, then standard deviation.
subject_accuracy = subject_check['SubjectCorrect']
print(subject_accuracy.mean())
print(subject_accuracy.std())

0.8375
0.08828490893804462


# Get accuracies by condition

In [88]:
# Trial-level accuracy per condition: mean, standard deviation and number of
# correct answers. 'Condition' is an index level of `check`, so group on it directly.
accuracy_stats = {'SubjectCorrect': ['mean', 'std', 'sum']}
table = check.groupby(level='Condition').agg(accuracy_stats)
table

Unnamed: 0_level_0,SubjectCorrect,SubjectCorrect,SubjectCorrect
Unnamed: 0_level_1,mean,std,sum
Condition,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,0.85,0.357818,204
B,0.831325,0.375219,207
C,0.816239,0.388119,191
D,0.852321,0.355533,202


In [97]:
# The glmer model is fit in R rather than Python, so export the trial-level
# correctness without the raw response and answer-key columns.
export_data = check.drop(columns=['Value', 'QuestionCorrect'])
accuracy_data_path = f'{EXPORT_DIRECTORY}/accuracy_data.csv'
export_data.to_csv(accuracy_data_path)

# Get completions by subject

In [170]:
# Keep only the rows where a 'BlankAnswer' element logged its 'Final' parameter,
# reduced to the participant hash, the typed text and the item/condition labels.
is_blank_answer = data['PennElementName'] == 'BlankAnswer'
is_final_value = data['Parameter'] == 'Final'
completions = data.loc[
    is_blank_answer & is_final_value,
    ["MD5 hash of participant's IP address", 'Value', 'Item', 'Condition'],
]
completions

Unnamed: 0,MD5 hash of participant's IP address,Value,Item,Condition
7,b8b10c5c32c46a8f278dc868ef65fd9d,drive,24,D
18,b8b10c5c32c46a8f278dc868ef65fd9d,cut it,78,B
29,b8b10c5c32c46a8f278dc868ef65fd9d,leave,17,C
38,b8b10c5c32c46a8f278dc868ef65fd9d,pour,34,B
49,b8b10c5c32c46a8f278dc868ef65fd9d,scavenge,44,D
...,...,...,...,...
16353,9f8736428d68ef86c654e1f0f0b1058e,translate them,70,C
16364,9f8736428d68ef86c654e1f0f0b1058e,prepare,59,B
16373,9f8736428d68ef86c654e1f0f0b1058e,stop for a drink,5,D
16382,9f8736428d68ef86c654e1f0f0b1058e,get on the bus,42,C


In [171]:
# Collect each participant's completions and write them out, one section per participant.
ips = set(completions["MD5 hash of participant's IP address"])

# A blank answer is read back from the CSV as NaN (a float), which would make
# '\n'.join raise TypeError; write such answers as empty lines instead.
completion_text = completions['Value'].fillna('').astype(str)

# One pass over the frame instead of one boolean filter per participant.
# sort=False keeps participants in order of first appearance, so the file is
# identical from run to run (iterating the `ips` set is not order-stable).
completions_by_subject = {
    ip: subject_completions.tolist()
    for ip, subject_completions in completion_text.groupby(
        completions["MD5 hash of participant's IP address"], sort=False
    )
}

# Explicit UTF-8: the answers are free text typed by participants, and the
# platform default encoding may be unable to represent every character.
with open(f'{EXPORT_DIRECTORY}/completions_by_subject.txt', 'w', encoding='utf-8') as f:
    for ip, subject_completions in completions_by_subject.items():
        f.write(f'---{ip}---\n')
        f.write('\n'.join(subject_completions))
        f.write('\n\n')

# Get completions by item

In [172]:
# Condition labels and item numbers (1 through N_ITEMS) iterated by the by-item export.
CONDITIONS = ('A', 'B', 'C', 'D')
N_ITEMS = 80
ITEMS = tuple(range(1, N_ITEMS + 1))

In [173]:
# Write one section per (item, condition) pair, in ITEMS x CONDITIONS order.
# A pair with no completions still gets its header line.

# A blank answer is read back from the CSV as NaN (a float), which would make
# '\n'.join raise TypeError; write such answers as empty lines instead.
completion_text = completions['Value'].fillna('').astype(str)

# Explicit UTF-8: the answers are free text typed by participants, and the
# platform default encoding may be unable to represent every character.
with open(f'{EXPORT_DIRECTORY}/completions_by_item.txt', 'w', encoding='utf-8') as f:
    for item in ITEMS:
        # The item mask does not depend on the condition, so build it once per item.
        is_item = completions['Item'] == item
        for condition in CONDITIONS:
            is_condition = completions['Condition'] == condition
            item_completions = list(completion_text[is_condition & is_item])
            f.write(f'---{item}{condition}---\n')
            f.write('\n'.join(item_completions))
            f.write('\n\n')