In [1]:
%cd github/soni4vis-spatial/backend-server

/jupyter/github/soni4vis-spatial/backend-server


In [19]:
from pathlib import Path

import pandas as pd
import pydash as _
import numpy as np

In [5]:
df_answer = pd.read_csv(Path('./answers.csv'))
df_answer

Unnamed: 0,_index,question_id,correct_answer_a,correct_answer_b
0,1,pitch-random-single-0-q1,6,
1,2,pitch-random-single-1-q1,4,
2,3,pitch-random-single-2-q1,4,
3,4,pitch-random-single-3-q1,8,
4,5,pitch-random-double-0-q1,4,-4
...,...,...,...,...
67,68,tempo-linear-double-1-q2,c,c
68,69,tempo-linear-double-2-q1,d,c
69,70,tempo-linear-double-2-q2,a,c
70,71,tempo-linear-double-3-q1,b,c


In [66]:
# write a function to check if the input is a numeric value
def is_numeric(value):
    try:
        float(value)
        return True
    except ValueError:
        return False


In [73]:
answers = []
for r in df_answer.to_dict(orient='records'):
    condition, trend_type, series_type, dataset_index, question_index = r['question_id'].split('-')
    # print(condition, trend_type, series_type, dataset_index, question_index)
    answers.append({
        'question_id': f"{r['question_id']}-a",
        'condition': condition,
        'trend_type': trend_type,
        'series_type': series_type,
        'dataset_index': dataset_index,
        'question_index': question_index,
        'question_sequence:': 'a',
        'answer_type': 'numeric' if is_numeric(r['correct_answer_a']) else 'categorical',
        'answer': r['correct_answer_a']
    })
    if not pd.isna(r['correct_answer_b']):
        answers.append({
            'question_id': f"{r['question_id']}-b",
            'condition': condition,
            'trend_type': trend_type,
            'series_type': series_type,
            'dataset_index': dataset_index,
            'question_index': question_index,
            'question_sequence:': 'b',
            'answer_type': 'numeric' if is_numeric(r['correct_answer_a']) else 'categorical',
            'answer': r['correct_answer_b']
        })

df_answers = pd.DataFrame(answers)
df_answers

Unnamed: 0,question_id,condition,trend_type,series_type,dataset_index,question_index,question_sequence:,answer_type,answer
0,pitch-random-single-0-q1-a,pitch,random,single,0,q1,a,numeric,6
1,pitch-random-single-1-q1-a,pitch,random,single,1,q1,a,numeric,4
2,pitch-random-single-2-q1-a,pitch,random,single,2,q1,a,numeric,4
3,pitch-random-single-3-q1-a,pitch,random,single,3,q1,a,numeric,8
4,pitch-random-double-0-q1-a,pitch,random,double,0,q1,a,numeric,4
...,...,...,...,...,...,...,...,...,...
103,tempo-linear-double-2-q2-b,tempo,linear,double,2,q2,b,categorical,c
104,tempo-linear-double-3-q1-a,tempo,linear,double,3,q1,a,categorical,b
105,tempo-linear-double-3-q1-b,tempo,linear,double,3,q1,b,categorical,c
106,tempo-linear-double-3-q2-a,tempo,linear,double,3,q2,a,categorical,c


In [71]:
df_answers[df_answers['question_id'] == 'tempo-random-single-0-q1-a']['answer'].values[0]

'-3'

In [75]:
def normalize_answer(df):
    results = []
    for r in df.to_dict(orient='records'):
        results.append({
            'question_id': f"{r['question_id']}-a",
            'response': r['answer'],
        })
        if not pd.isna(r['answer_2']):
            results.append({
                'question_id': f"{r['question_id']}-b",
                'response': r['answer_2'],
            })
    return pd.DataFrame(results)

def calculate_diff(r):
    question = df_answers[df_answers['question_id'] == r['question_id']]
    ans = question['answer'].values[0]
    ans_type = question['answer_type'].values[0]
    if ans_type == 'numeric':
        return abs(int(ans) - int(r['response']))
    else:
        return ans == r['response']

results_dfs = []
exp_n = 0
for f in Path('./exp/').iterdir():
    exp_n += 1
    exp_id = f.stem
    df_exp_result = pd.read_csv(f)
    df_exp_result = normalize_answer(df_exp_result)
    df_exp_result[f'diff_{exp_id}'] = df_exp_result.apply(calculate_diff, axis=1)
    df_exp_result = df_exp_result.rename(columns={'response': f"response_{exp_id}"})
    results_dfs.append(df_exp_result)

df_results = results_dfs[0]
for df in results_dfs[1:]:
    df_results = pd.merge(df_results, df, on='question_id')

df_results

Unnamed: 0,question_id,response_2,diff_2,response_1,diff_1
0,tempo-random-single-0-q1-a,-6,3,-5,2
1,tempo-random-single-1-q1-a,-2,1,-3,0
2,tempo-random-single-2-q1-a,-2,3,-1,4
3,tempo-random-single-3-q1-a,3,1,5,1
4,tempo-random-double-0-q1-a,-5,2,-5,2
...,...,...,...,...,...
103,spatial-linear-double-2-q2-b,c,True,c,True
104,spatial-linear-double-3-q1-a,b,True,a,False
105,spatial-linear-double-3-q1-b,c,False,a,False
106,spatial-linear-double-3-q2-a,c,False,b,False


In [79]:
df_diff = df_results.filter(regex='question_id|diff_')
df_diff

Unnamed: 0,question_id,diff_2,diff_1
0,tempo-random-single-0-q1-a,3,2
1,tempo-random-single-1-q1-a,1,0
2,tempo-random-single-2-q1-a,3,4
3,tempo-random-single-3-q1-a,1,1
4,tempo-random-double-0-q1-a,2,2
...,...,...,...
103,spatial-linear-double-2-q2-b,True,True
104,spatial-linear-double-3-q1-a,True,False
105,spatial-linear-double-3-q1-b,False,False
106,spatial-linear-double-3-q2-a,False,False


In [87]:
df_answer_diff = df_answers.merge(df_diff, on='question_id')
df_answer_diff_melted = df_answer_diff.melt(value_vars=df_answer_diff.columns[9:], id_vars=df_answer_diff.columns[:9])
df_answer_diff_melted

Unnamed: 0,question_id,condition,trend_type,series_type,dataset_index,question_index,question_sequence:,answer_type,answer,variable,value
0,pitch-random-single-0-q1-a,pitch,random,single,0,q1,a,numeric,6,diff_2,4
1,pitch-random-single-1-q1-a,pitch,random,single,1,q1,a,numeric,4,diff_2,1
2,pitch-random-single-2-q1-a,pitch,random,single,2,q1,a,numeric,4,diff_2,1
3,pitch-random-single-3-q1-a,pitch,random,single,3,q1,a,numeric,8,diff_2,2
4,pitch-random-double-0-q1-a,pitch,random,double,0,q1,a,numeric,4,diff_2,3
...,...,...,...,...,...,...,...,...,...,...,...
211,tempo-linear-double-2-q2-b,tempo,linear,double,2,q2,b,categorical,c,diff_1,True
212,tempo-linear-double-3-q1-a,tempo,linear,double,3,q1,a,categorical,b,diff_1,False
213,tempo-linear-double-3-q1-b,tempo,linear,double,3,q1,b,categorical,c,diff_1,False
214,tempo-linear-double-3-q2-a,tempo,linear,double,3,q2,a,categorical,c,diff_1,False


In [94]:
for qid in df_answer_diff_melted['question_id'].unique():
    df = df_answer_diff_melted[df_answer_diff_melted['question_id'] == qid]
    print(qid, df['value'].mean(), df['value'].std(), df['value'].min(), df['value'].max(), df['value'].max() - df['value'].min())

df_answer_diff_melted.groupby('question_id').agg(['mean', 'std', 'min', 'max']).to_csv('./data_analysis.csv')
# df_answer_diff_melted.groupby('question_id').agg(['mean', 'std', 'min', 'max'])

pitch-random-single-0-q1-a 2.5 2.1213203435596424 1 4 3
pitch-random-single-1-q1-a 1.0 0.0 1 1 0
pitch-random-single-2-q1-a 1.0 0.0 1 1 0
pitch-random-single-3-q1-a 3.0 1.4142135623730951 2 4 2
pitch-random-double-0-q1-a 3.0 0.0 3 3 0
pitch-random-double-0-q1-b 5.0 1.4142135623730951 4 6 2
pitch-random-double-1-q1-a 9.5 7.7781745930520225 4 15 11
pitch-random-double-1-q1-b 15.5 4.949747468305833 12 19 7
pitch-random-double-2-q1-a 7.0 1.4142135623730951 6 8 2
pitch-random-double-2-q1-b 4.5 3.5355339059327378 2 7 5
pitch-random-double-3-q1-a 3.5 2.1213203435596424 2 5 3
pitch-random-double-3-q1-b 3.5 2.1213203435596424 2 5 3
pitch-linear-single-0-q1-a 1.0 0.0 True True 0
pitch-linear-single-0-q2-a 0.0 0.0 False False 0
pitch-linear-single-1-q1-a 0.5 0.7071067811865476 False True 1
pitch-linear-single-1-q2-a 0.5 0.7071067811865476 False True 1
pitch-linear-single-2-q1-a 0.5 0.7071067811865476 False True 1
pitch-linear-single-2-q2-a 0.5 0.7071067811865476 False True 1
pitch-linear-single-3

  df_answer_diff_melted.groupby('question_id').agg(['mean', 'std', 'min', 'max']).to_csv('./data_analysis.csv')
