In [140]:
import pandas as pd
import altair as alt
from scipy.stats import pearsonr, chi2_contingency
import numpy as np

In [155]:
# Load the post-study survey answers and the raw study responses.
survey_df = pd.read_csv('data/Survey_data.csv')
data_df = pd.read_csv('data/final-study-raw-2.csv')

# Keep only the rows whose responseId is voq1..voq8. The explicit .copy()
# makes data_df an independent frame, so adding a column below does not
# assign into a slice of the raw frame (SettingWithCopyWarning on pandas < 3).
voq_response_ids = [f'voq{i}' for i in range(1, 9)]
data_df = data_df[data_df['responseId'].isin(voq_response_ids)].copy()
# 1.0 when the given answer equals the expected answer, 0.0 otherwise.
data_df['answerIsCorrect'] = (
    data_df['correctAnswer'] == data_df['answer']).astype(float)


# Combine the seven Prolific exports and keep approved submissions only.
prolific_files = [f'prolific_export_{i}.csv' for i in range(1, 8)]
prolific_df = pd.concat(
    [pd.read_csv('prolific-data/' + file) for file in prolific_files])
prolific_df = prolific_df[prolific_df['Status'] == 'APPROVED'].copy()
prolific_df['Age'] = prolific_df['Age'].astype(int)


# Remove every participant who did not finish the study, i.e. whose number of
# recorded rows (non-null 'status' values) differs from the expected count.
# The same participants are dropped from all three tables.
EXPECTED_ROWS_PER_PARTICIPANT = 36
counter_df = data_df.groupby('participantId').agg(
    {'status': 'count'}).reset_index()
is_incomplete = counter_df['status'] != EXPECTED_ROWS_PER_PARTICIPANT
bad_participant_ids = list(
    counter_df[is_incomplete]['participantId'].unique())
data_df = data_df[~data_df['participantId'].isin(bad_participant_ids)]
survey_df = survey_df[~survey_df['participantId'].isin(bad_participant_ids)]
prolific_df = prolific_df[~prolific_df['Participant id'].isin(bad_participant_ids)]

# Qualitative coding of the free-text answers.
qual_codes = pd.read_csv('data/qual-coding.csv')

# Results of the LLM study (tab-separated).
llm_df = pd.read_csv('data/LLM-Study.tsv', sep='\t')

# Sanity check: is there an obvious correlation between self-reported visualization experience and the number of correct answers?

In [64]:
# Self-reported visualization experience, one row per participant.
# .copy() detaches the selection from survey_df so the dtype conversion below
# does not assign into a slice (SettingWithCopyWarning on pandas < 3).
experience_prompt = 'How experienced are you with Data Visualization?'
participant_exp_df = survey_df.loc[
    survey_df['responsePrompt'] == experience_prompt,
    ['participantId', 'answer']].copy()
participant_exp_df['answer'] = participant_exp_df['answer'].astype(int)

# Number of correct answers per participant.
participant_correct_df = data_df.groupby('participantId').agg({'answerIsCorrect': 'sum'}).reset_index()
# merge data frames
temp_df = participant_correct_df.merge(participant_exp_df, on='participantId')

# Pearson correlation coefficients between the numeric columns.
rho = temp_df.corr(numeric_only=True)
# Matching p-values. corr() with a callable always puts 1.0 on the diagonal,
# so subtracting the identity turns the diagonal into 0.0; as a consequence the
# diagonal cells of the final table always carry '***' and should be ignored.
pval = temp_df.corr(method=lambda x, y: pearsonr(x, y)[1], numeric_only=True) - np.eye(*rho.shape)
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# fall back to applymap only on older pandas versions that lack .map.
elementwise = pval.map if hasattr(pval, 'map') else pval.applymap
# One star per significance threshold (.05, .01, .001) the p-value passes.
p = elementwise(lambda x: ''.join(['*' for t in [.05, .01, .001] if x <= t]))
print(pval)
rho.round(2).astype(str) + p

                 answerIsCorrect    answer
answerIsCorrect         0.000000  0.207693
answer                  0.207693  0.000000


  p = pval.applymap(lambda x: ''.join(['*' for t in [.05, .01, .001] if x <= t]))


Unnamed: 0,answerIsCorrect,answer
answerIsCorrect,1.0***,0.14
answer,0.14,1.0***


In [49]:
# Point chart of correct-answer count (x) against self-reported experience
# level (y); a random vertical offset spreads overlapping points apart.
experience_scatter = (
    alt.Chart(temp_df)
    .transform_calculate(jitter='0.3*random()')
    .mark_point()
    .encode(
        x=alt.X('answerIsCorrect:Q'),
        y=alt.Y('answer:O'),
        yOffset=alt.YOffset('jitter:Q'),
        color=alt.Color('answer:N'),
    )
)
experience_scatter

# Preferences

In [50]:
# Survey prompts of interest, mapped to the short labels used in tables/charts.
rel_qs = {
    'Did you find UpSet plots easy to read?': 'UpSet Easy to Read?',
    'Did you find the text descriptions easy to read?': 'Text Easy to Read?',
    'Did you find the text description too short or too long?': 'Text Length?',
}

# Select the relevant rows/columns in one .loc call and derive the converted
# columns with .assign, which returns a new frame. This avoids assigning into
# a filtered slice of survey_df (SettingWithCopyWarning on pandas < 3).
pref_df = (
    survey_df.loc[
        survey_df['responsePrompt'].isin(list(rel_qs)),
        ['participantId', 'responsePrompt', 'answer'],
    ]
    .assign(
        answer=lambda df: df['answer'].astype(int),
        responsePrompt=lambda df: df['responsePrompt'].map(rel_qs),
    )
)
pref_df

Unnamed: 0,participantId,responsePrompt,answer
3,574dc90512d86b000f833ab0,UpSet Easy to Read?,5
4,574dc90512d86b000f833ab0,Text Easy to Read?,4
5,574dc90512d86b000f833ab0,Text Length?,3
12,5755c957eb80c4000741a9ce,UpSet Easy to Read?,5
13,5755c957eb80c4000741a9ce,Text Easy to Read?,4
...,...,...,...
733,66cdd1ad42828e8da522ddb1,Text Easy to Read?,2
734,66cdd1ad42828e8da522ddb1,Text Length?,4
741,66ce0e269e0d2cd124df4887,UpSet Easy to Read?,3
742,66ce0e269e0d2cd124df4887,Text Easy to Read?,4


In [78]:
# Shared encoding for both layers: one row per answer value, with the number
# of responses on the x axis.
pref_counts_base = alt.Chart(pref_df).encode(
    x='count()',
    y='answer:O',
)

# Bars coloured by question, plus a text layer showing the count itself.
boxes = pref_counts_base.mark_bar().encode(color='responsePrompt:N')
text = pref_counts_base.mark_text().encode(text='count()')

# One facet row per question.
alt.layer(boxes, text).facet(row='responsePrompt:N')

In [79]:
# Mean and standard deviation of the numeric answers for each question.
answers_by_prompt = pref_df.groupby('responsePrompt')[['answer']]
answers_by_prompt.agg(['mean', 'std']).reset_index()

Unnamed: 0_level_0,responsePrompt,answer,answer
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
0,Text Easy to Read?,3.542169,1.02768
1,Text Length?,3.313253,0.697167
2,UpSet Easy to Read?,3.722892,1.140317


# Demographic data

In [70]:
# Show the column names available in the combined Prolific export.
prolific_df.columns

Index(['Submission id', 'Participant id', 'Status',
       'Custom study tncs accepted at', 'Started at', 'Completed at',
       'Reviewed at', 'Archived at', 'Time taken', 'Completion code',
       'Total approvals', 'Age', 'Sex', 'Ethnicity simplified',
       'Country of birth', 'Country of residence', 'Nationality', 'Language',
       'Student status', 'Employment status'],
      dtype='object')

In [65]:
# Number of rows in prolific_df for each value of the 'Sex' column.
prolific_df['Sex'].value_counts()

Sex
Male      52
Female    31
Name: count, dtype: int64

In [66]:
# Print the mean and standard deviation of participant age, then show a
# binned histogram of the same column.
ages = prolific_df['Age']
print(ages.mean(), ages.std())
alt.Chart(prolific_df).mark_bar().encode(
    alt.X('Age:Q', bin=True),
    alt.Y('count()'),
)

30.710843373493976 9.637996604499282


In [73]:
# Demographic columns to tabulate. Any other column name listed in
# prolific_df.columns can be added to this list.
cols_of_interest = ['Country of residence']

# Print the value counts of each selected column, followed by blank lines.
for column in cols_of_interest:
    print(prolific_df[column].value_counts())
    print('\n')

Country of residence
Canada            32
United Kingdom    30
United States     21
Name: count, dtype: int64




# Qual code counting

In [125]:
# Flag each coded answer by whether its tag string mentions a high-level or a
# low-level data observation (case-insensitive; missing tags count as no match).
tags_lower = qual_codes['Tags'].fillna('').str.lower()
qual_codes['high-level'] = tags_lower.str.contains('high level data', regex=False)
qual_codes['low-level'] = tags_lower.str.contains('low level data', regex=False)

# Number of flagged answers per condition, excluding the post-study survey.
condition_totals = (
    qual_codes.groupby('Condition')[['high-level', 'low-level']]
    .sum()
    .reset_index()
)
is_study_condition = condition_totals['Condition'] != 'Post Study Survey'
pure_counts_pre_melt = condition_totals[is_study_condition].copy()

# Long format: one row per (Condition, level) pair with its count.
pure_counts = pure_counts_pre_melt.melt(
    id_vars=['Condition'],
    value_vars=['high-level', 'low-level'],
)
pure_counts

Unnamed: 0,Condition,variable,value
0,Both,high-level,20
1,Text,high-level,12
2,Vis,high-level,17
3,Both,low-level,21
4,Text,low-level,23
5,Vis,low-level,25


In [104]:
# Heatmap of the code counts: conditions on x (in a fixed order), level on y,
# with the count shown both as cell colour and as a text label.
condition_order = ['Vis', 'Text', 'Both']
base = alt.Chart(pure_counts).encode(
    x=alt.X('Condition:N', scale=alt.Scale(domain=condition_order)),
    y=alt.Y('variable:N'),
)
colors = base.mark_rect().encode(
    color=alt.Color('value:Q', scale=alt.Scale(scheme="blues")),
)
text = base.mark_text(baseline='middle').encode(text=alt.Text('value:Q'))
alt.layer(colors, text)

In [116]:
# 0/1 indicator columns for each study condition.
condition = qual_codes['Condition']
qual_codes['isBoth'] = (condition == 'Both').astype(int)
qual_codes['isVis'] = (condition == 'Vis').astype(int)
qual_codes['isText'] = (condition == 'Text').astype(int)

# 0/1 versions of the boolean high-level / low-level flags.
is_high = qual_codes['high-level']
is_low = qual_codes['low-level']
qual_codes['isHigh'] = is_high.astype(int)
qual_codes['isLow'] = is_low.astype(int)

# Single categorical label per answer; the first matching condition wins, so
# 'both' takes precedence over 'high' and 'low'.
qual_codes['insight-type'] = np.select(
    [is_high & is_low, is_high, is_low],
    ['both', 'high', 'low'],
    default='neither',
)
qual_codes

Unnamed: 0,participantId,trialId,Condition,responsePrompt,answer,Sentiments,Tags,Index,high-level,low-level,isBoth,isVis,isText,isHigh,isLow,insight-type
0,6695703cec19c758ec8dbd7c,Covid-Both-Q3,Both,What are your insights and take-aways from the...,Fatigue is the highest occurring symptom. The ...,,"High level data observation,Similar observatio...",302,True,False,1,0,0,1,0,high
1,5ba855d47c0ebe0001272f70,Organizations-Both-Q3,Both,What are your insights and take-aways from the...,UPU is the organization with most member count...,,High level data observation,38,True,False,1,0,0,1,0,high
2,62bb4c95ca36f792585cb4b5,Organizations-Both-Q3,Both,What are your insights and take-aways from the...,I like having both the description and the vis...,,"Insight about Text and Vis both,Easy to parse ...",152,False,False,1,0,0,0,0,neither
3,6668c49e07ce221e29f2bc5a,Covid-Both-Q3,Both,What are your insights and take-aways from the...,I find that if I am presented with both text d...,,"Helpful as cross reference,Insight about Text ...",243,False,False,1,0,0,0,0,neither
4,5ee1cf1e0df8e55c6b9e1f1a,Covid-Both-Q3,Both,What are your insights and take-aways from the...,Coolest data so far,,Comment about data content,93,False,False,1,0,0,0,0,neither
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
410,66bce91186a8da8151c30f61,Organizations-Vis-Q3,Vis,What are your insights and take-aways from thi...,1.That most of the member countries are part o...,,"Low level data observation,Insight about inter...",356,False,True,0,1,0,0,1,low
411,5dd71896670d1a6af9f20eee,Covid-Vis-Q3,Vis,What are your insights and take-aways from thi...,I appreciated the visual much more than the te...,Positive,"Insight about Vis,Preference on VIS (over text...",52,False,False,0,1,0,0,0,neither
412,628254ec789e78a3dcd22d99,Covid-Vis-Q3,Vis,What are your insights and take-aways from thi...,Sleep is a core component of having cohesive t...,,,148,False,False,0,1,0,0,0,neither
413,5c6c260dce4e560001f75731,Covid-Vis-Q3,Vis,What are your insights and take-aways from thi...,There are lots of combinations of covid sympto...,,"Comment about data content,More set combinatio...",48,False,False,0,1,0,0,0,neither


# LLM Study

In [170]:
# Columns kept as identifiers/metrics for every generated description.
llm_id_cols = [
    'Case descriptions',
    'Model',
    'Dataset',
    'Total facts',
    'Falsehoods',
    'Duplicate facts',
    'Number of words',
]
# Per-level fact counts; these are the columns that get melted into rows.
llm_level_cols = [
    'Lvl 1 facts',
    'Lvl 2 facts',
    'Lvl 3 facts',
    'Lvl 4 facts',
]
cols = llm_id_cols + llm_level_cols

# Keep only rows that have a 'Duplicate facts' value recorded.
has_duplicate_count = llm_df['Duplicate facts'].notna()
df_llm = llm_df.loc[has_duplicate_count, cols]
# melt off the level facts: one row per (description, fact level)
df_llm = df_llm.melt(id_vars=llm_id_cols, value_vars=llm_level_cols)
df_llm.head(5)


Unnamed: 0,Case descriptions,Model,Dataset,Total facts,Falsehoods,Duplicate facts,Number of words,variable,value
0,Abalation Base,Claude-3-5-sonnet-20240620,Movies,31,2,1.0,324,Lvl 1 facts,2
1,"Abalation, No Image",Claude-3-5-sonnet-20240620,Movies,28,0,4.0,393,Lvl 1 facts,1
2,"Ablation, No data",Claude-3-5-sonnet-20240620,Movies,8,13,1.0,332,Lvl 1 facts,0
3,"Ablation, No Example alt text",Claude-3-5-sonnet-20240620,Movies,25,1,3.0,321,Lvl 1 facts,1
4,"Ablation, No identity prompt",Claude-3-5-sonnet-20240620,Movies,20,3,3.0,298,Lvl 1 facts,5


In [182]:
# Fact counts per case description: bars coloured by fact level, plus a text
# layer labelled with the summed count for each case.
base = alt.Chart(df_llm).encode(y=alt.Y('Case descriptions:N'))
bars = base.mark_bar().encode(
    color=alt.Color('variable:N'),
    x=alt.X('value:Q'),
)
text = base.mark_text().encode(
    text=alt.Text('sum(value):Q'),
    x=alt.X('sum(value):Q'),
)
alt.layer(bars, text)

In [203]:
# Rows with a recorded 'Duplicate facts' value. The copy is taken AFTER the
# filter so the column assignments below work on an independent frame; copying
# first and filtering afterwards still left them assigning into a slice
# (SettingWithCopyWarning on pandas < 3).
llm_df_temp = llm_df.loc[llm_df['Duplicate facts'].notna(), cols].copy()
llm_df_temp['Total facts'] = llm_df_temp['Total facts'].astype(int)

# Group the case descriptions into studies: descriptions starting with 'Ab'
# are ablation runs (the two-letter prefix also matches the 'Abalation'
# spelling present in the data), descriptions containing 'Control' are
# controls, and everything else is labelled 'Best Effort'.
llm_df_temp['Study'] = llm_df_temp['Case descriptions'].apply(
    lambda x: 'Ablation' if x[0:2] == 'Ab' else ('Control' if 'Control' in x else 'Best Effort'))

# Mean fact count and mean word count per study. Named aggregation yields the
# flat 'Facts' / 'Words' column names directly.
llm_df_temp_cond = llm_df_temp.groupby('Study').agg(
    Facts=('Total facts', 'mean'),
    Words=('Number of words', 'mean'),
).reset_index()
llm_df_temp_cond

Unnamed: 0,Study,Facts,Words
0,Ablation,23.0,325.166667
1,Best Effort,21.333333,384.0
2,Control,38.333333,290.333333


In [199]:
# One tick per study group at its mean fact count, on a fixed 0-45 x axis.
facts_axis = alt.X('Facts:Q', scale=alt.Scale(domain=[0, 45]))
alt.Chart(llm_df_temp_cond).mark_tick().encode(
    x=facts_axis,
    y='Study:N',
)

Unnamed: 0,Timestamp (DDMMYY),Image,Accessible Processed Data,APD includes attributes?,Example alttext,Strong/Identity prompt,Pattern prompt,Case descriptions,Study,Generation,Model,Dataset,Lvl 1 facts,Lvl 2 facts,Lvl 3 facts,Lvl 4 facts,Total facts,Falsehoods,Duplicate facts,Number of words
0,04/09/2024,True,True,True,True,True,True,Pilot 1,Pilot,This UpSet plot visualizes the genre combinati...,Claude-3-5-sonnet-20240620,Movies,1,14,4,10,29,0,,430
1,04/09/24,True,True,True,True,True,True,Pilot 2,Pilot,This UpSet plot visualizes the intersection of...,gpt-4o,Movies,2,14,2,4,22,9,,484
2,04/09/2024,True,True,True,True,True,True,Pilot 3,Pilot,This UpSet plot reveals several interesting pa...,Claude-3-5-sonnet-20240620,Anon_movies,0,20,7,6,33,0,,439
3,04/09/2024,True,True,True,True,True,True,Pilot 4,Pilot,This UpSet plot visualizes the intersections o...,gpt-4o,Anon_movies,4,23,6,0,33,3,,419
4,05/09/2024,True,True,False,True,True,True,Abalation Base,Ablation,This UpSet plot visualizes the distribution of...,Claude-3-5-sonnet-20240620,Movies,2,18,10,1,31,2,1.0,324
5,05/09/2024,False,True,False,True,True,True,"Abalation, No Image",Ablation,Based on the provided JSON data for this UpSet...,Claude-3-5-sonnet-20240620,Movies,1,14,11,2,28,0,4.0,393
6,05/09/2024,True,False,False,True,True,True,"Ablation, No data",Ablation,Thank you for providing this UpSet plot for an...,Claude-3-5-sonnet-20240620,Movies,0,2,3,3,8,13,1.0,332
7,05/09/2024,True,True,False,False,True,True,"Ablation, No Example alt text",Ablation,"Based on my analysis of this UpSet plot, here'...",Claude-3-5-sonnet-20240620,Movies,1,15,7,2,25,1,3.0,321
8,05/09/2024,True,True,False,True,False,True,"Ablation, No identity prompt",Ablation,Here's an alt text description for this UpSet ...,Claude-3-5-sonnet-20240620,Movies,5,9,6,0,20,3,3.0,298
9,05/09/2024,True,True,False,True,True,False,"Ablation, no pattern prompt",Ablation,This UpSet plot visualizes the distribution of...,Claude-3-5-sonnet-20240620,Movies,2,15,7,2,26,0,0.0,283
