In [2]:
import pandas as pd
import altair as alt
from scipy.stats import pearsonr
import numpy as np

In [62]:
# Only rows with these response IDs are kept for the correctness analysis.
SCORED_RESPONSE_IDS = ['voq1', 'voq2', 'voq3', 'voq4',
                       'voq5', 'voq6', 'voq7', 'voq8']
# Number of non-null `status` entries a participant must have in data_df;
# anyone with a different count is treated as not having finished the study.
EXPECTED_ROWS_PER_PARTICIPANT = 36

survey_df = pd.read_csv('data/Survey_data.csv')

data_df = pd.read_csv('data/final-study-raw-2.csv')
# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice (avoids SettingWithCopyWarning).
data_df = data_df[data_df['responseId'].isin(SCORED_RESPONSE_IDS)].copy()
# 1.0 when the given answer equals the expected answer, 0.0 otherwise.
data_df['answerIsCorrect'] = (data_df['correctAnswer'] == data_df['answer']).astype(float)


prolific_files = [f'prolific_export_{i}.csv' for i in range(1, 8)]
prolific_df = pd.concat([pd.read_csv('prolific-data/' + file) for file in prolific_files])
# Keep approved submissions only; copy for the same reason as above.
prolific_df = prolific_df[prolific_df['Status'] == 'APPROVED'].copy()
prolific_df['Age'] = prolific_df['Age'].astype(int)


# Remove participants who did not finish the study from all three frames.
counter_df = data_df.groupby('participantId').agg(
    {'status': 'count'}).reset_index()
is_incomplete = counter_df['status'] != EXPECTED_ROWS_PER_PARTICIPANT
bad_participant_ids = list(counter_df.loc[is_incomplete, 'participantId'].unique())
data_df = data_df[~data_df['participantId'].isin(bad_participant_ids)]
survey_df = survey_df[~survey_df['participantId'].isin(bad_participant_ids)]
prolific_df = prolific_df[~prolific_df['Participant id'].isin(bad_participant_ids)]

# Sanity check: is there an obvious correlation between self-reported data visualization experience and the number of correct answers?

In [64]:
def significance_stars(p_value):
    """Return one '*' for each threshold (.05, .01, .001) that p_value is at or below."""
    return ''.join('*' for threshold in (.05, .01, .001) if p_value <= threshold)


# Self-reported experience rating per participant.
# .copy() so the cast below assigns into an independent frame rather than a
# slice of survey_df (avoids SettingWithCopyWarning).
is_experience_q = survey_df['responsePrompt'] == 'How experienced are you with Data Visualization?'
participant_exp_df = survey_df.loc[is_experience_q, ['participantId', 'answer']].copy()
participant_exp_df['answer'] = participant_exp_df['answer'].astype(int)

# Number of correct answers per participant.
participant_correct_df = data_df.groupby('participantId').agg({'answerIsCorrect': 'sum'}).reset_index()
# merge data frames
temp_df = participant_correct_df.merge(participant_exp_df, on='participantId')

rho = temp_df.corr(numeric_only=True)
# corr() with a callable leaves 1.0 on the diagonal; subtracting the identity
# zeroes it, which is why the diagonal cells are rendered with '***'.
pval = temp_df.corr(method=lambda x, y: pearsonr(x, y)[1], numeric_only=True) - np.eye(*rho.shape)
# Column-wise Series.map instead of the deprecated DataFrame.applymap.
p = pval.apply(lambda column: column.map(significance_stars))
print(pval)
rho.round(2).astype(str) + p

                 answerIsCorrect    answer
answerIsCorrect         0.000000  0.207693
answer                  0.207693  0.000000


  p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x <= t]))


Unnamed: 0,answerIsCorrect,answer
answerIsCorrect,1.0***,0.14
answer,0.14,1.0***


In [49]:
# Point chart of temp_df: correct-answer total on x, experience rating on y.
# A random vertical offset (jitter) spreads out points that share a position.
alt.Chart(temp_df).transform_calculate(
    jitter='0.3*random()'
).mark_point().encode(
    x=alt.X('answerIsCorrect:Q'),
    y=alt.Y('answer:O'),
    yOffset=alt.YOffset('jitter:Q'),
    color=alt.Color('answer:N'),
)

# Preferences

In [50]:
# Survey prompts of interest, mapped to the short labels used in the
# charts and tables below.
rel_qs = {
    'Did you find UpSet plots easy to read?': 'UpSet Easy to Read?',
    'Did you find the text descriptions easy to read?': 'Text Easy to Read?',
    'Did you find the text description too short or too long?': 'Text Length?',
}

# .copy() so the column assignments below write to an independent frame rather
# than a filtered slice of survey_df (avoids SettingWithCopyWarning).
is_preference_q = survey_df['responsePrompt'].isin(rel_qs.keys())
pref_df = survey_df.loc[is_preference_q, ['participantId', 'responsePrompt', 'answer']].copy()
pref_df['answer'] = pref_df['answer'].astype(int)
pref_df['responsePrompt'] = pref_df['responsePrompt'].map(rel_qs)
pref_df

Unnamed: 0,participantId,responsePrompt,answer
3,574dc90512d86b000f833ab0,UpSet Easy to Read?,5
4,574dc90512d86b000f833ab0,Text Easy to Read?,4
5,574dc90512d86b000f833ab0,Text Length?,3
12,5755c957eb80c4000741a9ce,UpSet Easy to Read?,5
13,5755c957eb80c4000741a9ce,Text Easy to Read?,4
...,...,...,...
733,66cdd1ad42828e8da522ddb1,Text Easy to Read?,2
734,66cdd1ad42828e8da522ddb1,Text Length?,4
741,66ce0e269e0d2cd124df4887,UpSet Easy to Read?,3
742,66ce0e269e0d2cd124df4887,Text Easy to Read?,4


In [78]:
# Bars and their count labels share one base chart so both layers use the
# same positional encodings; the layered chart is then split into one row
# per question.
base = alt.Chart(pref_df).encode(
    x=alt.X('count()'),
    y=alt.Y('answer:O'),
)

boxes = base.mark_bar().encode(color=alt.Color('responsePrompt:N'))
text = base.mark_text().encode(text=alt.Text('count()'))

alt.layer(boxes, text).facet(row='responsePrompt:N')

In [79]:
# Mean and standard deviation of the answers for each preference question.
answer_stats = pref_df.groupby('responsePrompt')[['answer']].agg(['mean', 'std'])
answer_stats.reset_index()

Unnamed: 0_level_0,responsePrompt,answer,answer
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
0,Text Easy to Read?,3.542169,1.02768
1,Text Length?,3.313253,0.697167
2,UpSet Easy to Read?,3.722892,1.140317


# Demographic data

In [70]:
# Show which columns are available in the combined Prolific export frame.
prolific_df.columns

Index(['Submission id', 'Participant id', 'Status',
       'Custom study tncs accepted at', 'Started at', 'Completed at',
       'Reviewed at', 'Archived at', 'Time taken', 'Completion code',
       'Total approvals', 'Age', 'Sex', 'Ethnicity simplified',
       'Country of birth', 'Country of residence', 'Nationality', 'Language',
       'Student status', 'Employment status'],
      dtype='object')

In [65]:
# Number of rows in prolific_df for each value of the 'Sex' column.
prolific_df['Sex'].value_counts()

Sex
Male      52
Female    31
Name: count, dtype: int64

In [66]:
# Age summary (mean, then standard deviation) followed by a binned histogram.
ages = prolific_df['Age']
print(ages.mean(), ages.std())

alt.Chart(prolific_df).mark_bar().encode(
    x=alt.X('Age:Q', bin=True),
    y=alt.Y('count()'),
)

30.710843373493976 9.637996604499282


In [73]:
# Print the frequency table of each selected demographic column.
# Only 'Country of residence' is enabled; append other prolific_df column
# names (e.g. 'Age', 'Sex', 'Language') to report them as well.
cols_of_interest = ['Country of residence']

for col in cols_of_interest:
    counts = prolific_df[col].value_counts()
    print(counts)
    print('\n')

Country of residence
Canada            32
United Kingdom    30
United States     21
Name: count, dtype: int64


