In [1]:
from pathlib import Path

import pandas as pd
import pydash as _
import numpy as np

In [2]:
df_answer = pd.read_csv(Path('./answers.csv'))
df_answer

Unnamed: 0,_index,question_id,correct_answer_a,correct_answer_b
0,1,pitch-random-single-0-q1,6,
1,2,pitch-random-single-1-q1,4,
2,3,pitch-random-single-2-q1,4,
3,4,pitch-random-single-3-q1,8,
4,5,pitch-random-double-0-q1,4,-4
...,...,...,...,...
67,68,tempo-linear-double-1-q2,c,c
68,69,tempo-linear-double-2-q1,d,c
69,70,tempo-linear-double-2-q2,a,c
70,71,tempo-linear-double-3-q1,b,c


In [3]:
# write a function to check if the input is a numeric value
def is_numeric(value):
    try:
        float(value)
        return True
    except ValueError:
        return False


In [4]:
answers = []
counter = 0

for r in df_answer.to_dict(orient='records'):
    condition, trend_type, series_type, dataset_index, question_index = r['question_id'].split('-')
    # print(condition, trend_type, series_type, dataset_index, question_index)
    answers.append({
        'question_id': "-".join(r['question_id'].split('-')[1:3])+"-a",
        "question_type":f"{r['question_id']}-a",
        "condition": condition,
        "_index":counter,
        'trend_type': trend_type,
        'series_type': series_type,
        'dataset_index': dataset_index,
        'question_index': question_index,
        'question_sequence:': 'a',
        'answer_type': 'numeric' if is_numeric(r['correct_answer_a']) else 'categorical',
        'answer': r['correct_answer_a']
    })
    if not pd.isna(r['correct_answer_b']):
        counter += 1
        answers.append({
            'question_id': "-".join(r['question_id'].split('-')[1:3])+"-b",
            "condition": condition,
            "question_type":f"{r['question_id']}-b",
            'trend_type': trend_type,
            "_index":counter,
            'series_type': series_type,
            'dataset_index': dataset_index,
            'question_index': question_index,
            'question_sequence:': 'b',
            'answer_type': 'numeric' if is_numeric(r['correct_answer_a']) else 'categorical',
            'answer': r['correct_answer_b']
        })
    counter += 1

df_answers = pd.DataFrame(answers)
df_answers

Unnamed: 0,question_id,question_type,condition,_index,trend_type,series_type,dataset_index,question_index,question_sequence:,answer_type,answer
0,random-single-a,pitch-random-single-0-q1-a,pitch,0,random,single,0,q1,a,numeric,6
1,random-single-a,pitch-random-single-1-q1-a,pitch,1,random,single,1,q1,a,numeric,4
2,random-single-a,pitch-random-single-2-q1-a,pitch,2,random,single,2,q1,a,numeric,4
3,random-single-a,pitch-random-single-3-q1-a,pitch,3,random,single,3,q1,a,numeric,8
4,random-double-a,pitch-random-double-0-q1-a,pitch,4,random,double,0,q1,a,numeric,4
...,...,...,...,...,...,...,...,...,...,...,...
103,linear-double-b,tempo-linear-double-2-q2-b,tempo,103,linear,double,2,q2,b,categorical,c
104,linear-double-a,tempo-linear-double-3-q1-a,tempo,104,linear,double,3,q1,a,categorical,b
105,linear-double-b,tempo-linear-double-3-q1-b,tempo,105,linear,double,3,q1,b,categorical,c
106,linear-double-a,tempo-linear-double-3-q2-a,tempo,106,linear,double,3,q2,a,categorical,c


In [5]:
# df_answers[df_answers['question_id'] == 'tempo-random-single-0-q1-a']['answer'].values[0]

df_answers.head()


Unnamed: 0,question_id,question_type,condition,_index,trend_type,series_type,dataset_index,question_index,question_sequence:,answer_type,answer
0,random-single-a,pitch-random-single-0-q1-a,pitch,0,random,single,0,q1,a,numeric,6
1,random-single-a,pitch-random-single-1-q1-a,pitch,1,random,single,1,q1,a,numeric,4
2,random-single-a,pitch-random-single-2-q1-a,pitch,2,random,single,2,q1,a,numeric,4
3,random-single-a,pitch-random-single-3-q1-a,pitch,3,random,single,3,q1,a,numeric,8
4,random-double-a,pitch-random-double-0-q1-a,pitch,4,random,double,0,q1,a,numeric,4


In [6]:
def normalize_answer(df):
    results = []
    temp_counter =0
    for r in df.to_dict(orient='records'):
        
        results.append({
            'question_type': f"{r['question_id']}-a",
            "_index": temp_counter,
            'response': r['answer'],
        })
        if not pd.isna(r['answer_2']):
            temp_counter += 1
            results.append({
                'question_type': f"{r['question_id']}-b",
                "_index": temp_counter,
                'response': r['answer_2'],
            })
            
        temp_counter += 1
    return pd.DataFrame(results)

def calculate_diff(r):
    question = df_answers[df_answers['_index'] == r['_index']]
    ans = question['answer'].values[0]
    ans_type = question['answer_type'].values[0]
    if ans_type == 'numeric':
        return abs(int(ans) - int(r['response']))
    else:
        return ans == r['response']

results_dfs = []
exp_n = 0
for f in Path('./exp/').iterdir():
    exp_n += 1
    exp_id = f.stem
    df_exp_result = pd.read_csv(f)
    df_exp_result = normalize_answer(df_exp_result)
    df_exp_result[f'diff_{exp_id}'] = df_exp_result.apply(calculate_diff, axis=1)
    df_exp_result = df_exp_result.rename(columns={'response': f"response_{exp_id}"})
    results_dfs.append(df_exp_result)

df_results = results_dfs[0]
for df in results_dfs[1:]:
    df_results = pd.merge(df_results, df, on='_index')

df_results

Unnamed: 0,question_type_x,_index,response_1,diff_1,question_type_y,response_2,diff_2
0,pitch-random-single-0-q1-a,0,5,1,tempo-random-single-0-q1-a,-6,12
1,pitch-random-single-1-q1-a,1,3,1,tempo-random-single-1-q1-a,-2,6
2,pitch-random-single-2-q1-a,2,3,1,tempo-random-single-2-q1-a,-2,6
3,pitch-random-single-3-q1-a,3,4,4,tempo-random-single-3-q1-a,3,5
4,pitch-random-double-0-q1-a,4,1,3,tempo-random-double-0-q1-a,-5,9
...,...,...,...,...,...,...,...
103,tempo-linear-double-2-q2-b,103,c,True,spatial-linear-double-2-q2-b,c,True
104,tempo-linear-double-3-q1-a,104,c,False,spatial-linear-double-3-q1-a,b,True
105,tempo-linear-double-3-q1-b,105,a,False,spatial-linear-double-3-q1-b,c,True
106,tempo-linear-double-3-q2-a,106,a,False,spatial-linear-double-3-q2-a,c,True


In [7]:
results_dfs

[                  question_type  _index response_1 diff_1
 0    pitch-random-single-0-q1-a       0          5      1
 1    pitch-random-single-1-q1-a       1          3      1
 2    pitch-random-single-2-q1-a       2          3      1
 3    pitch-random-single-3-q1-a       3          4      4
 4    pitch-random-double-0-q1-a       4          1      3
 ..                          ...     ...        ...    ...
 103  tempo-linear-double-2-q2-b     103          c   True
 104  tempo-linear-double-3-q1-a     104          c  False
 105  tempo-linear-double-3-q1-b     105          a  False
 106  tempo-linear-double-3-q2-a     106          a  False
 107  tempo-linear-double-3-q2-b     107          a   True
 
 [108 rows x 4 columns],
                     question_type  _index response_2 diff_2
 0      tempo-random-single-0-q1-a       0         -6     12
 1      tempo-random-single-1-q1-a       1         -2      6
 2      tempo-random-single-2-q1-a       2         -2      6
 3      tempo-random-

In [8]:
# merge the entire list of dataframes (results_dfs) into one dataframe with the question_type as the index
data_set_A = results_dfs[0]
data_set_A

Unnamed: 0,question_type,_index,response_1,diff_1
0,pitch-random-single-0-q1-a,0,5,1
1,pitch-random-single-1-q1-a,1,3,1
2,pitch-random-single-2-q1-a,2,3,1
3,pitch-random-single-3-q1-a,3,4,4
4,pitch-random-double-0-q1-a,4,1,3
...,...,...,...,...
103,tempo-linear-double-2-q2-b,103,c,True
104,tempo-linear-double-3-q1-a,104,c,False
105,tempo-linear-double-3-q1-b,105,a,False
106,tempo-linear-double-3-q2-a,106,a,False


In [9]:
data_set_B= results_dfs[1]
data_set_B


Unnamed: 0,question_type,_index,response_2,diff_2
0,tempo-random-single-0-q1-a,0,-6,12
1,tempo-random-single-1-q1-a,1,-2,6
2,tempo-random-single-2-q1-a,2,-2,6
3,tempo-random-single-3-q1-a,3,3,5
4,tempo-random-double-0-q1-a,4,-5,9
...,...,...,...,...
103,spatial-linear-double-2-q2-b,103,c,True
104,spatial-linear-double-3-q1-a,104,b,True
105,spatial-linear-double-3-q1-b,105,c,True
106,spatial-linear-double-3-q2-a,106,c,True


In [10]:
# merge data_set_A and data_set_B into one dataframe with the question_type as the index
df_results = pd.merge(data_set_A, data_set_B, on='question_type')

In [11]:
df_answers

Unnamed: 0,question_id,question_type,condition,_index,trend_type,series_type,dataset_index,question_index,question_sequence:,answer_type,answer
0,random-single-a,pitch-random-single-0-q1-a,pitch,0,random,single,0,q1,a,numeric,6
1,random-single-a,pitch-random-single-1-q1-a,pitch,1,random,single,1,q1,a,numeric,4
2,random-single-a,pitch-random-single-2-q1-a,pitch,2,random,single,2,q1,a,numeric,4
3,random-single-a,pitch-random-single-3-q1-a,pitch,3,random,single,3,q1,a,numeric,8
4,random-double-a,pitch-random-double-0-q1-a,pitch,4,random,double,0,q1,a,numeric,4
...,...,...,...,...,...,...,...,...,...,...,...
103,linear-double-b,tempo-linear-double-2-q2-b,tempo,103,linear,double,2,q2,b,categorical,c
104,linear-double-a,tempo-linear-double-3-q1-a,tempo,104,linear,double,3,q1,a,categorical,b
105,linear-double-b,tempo-linear-double-3-q1-b,tempo,105,linear,double,3,q1,b,categorical,c
106,linear-double-a,tempo-linear-double-3-q2-a,tempo,106,linear,double,3,q2,a,categorical,c


In [12]:
df_filtered_results = df_results.filter(regex='question|diff')

In [16]:
# try to merge with the big table of answers

df_merged = df_answers.merge(df_filtered_results, on='question_type')
df_merged

# melt the dataframe to make it easier to work with
df_melted = pd.melt(df_merged, id_vars=df_merged.columns[:11], value_vars=df_merged.columns[11:])


In [18]:
df_melted["value"]

0         1
1         1
2         1
3         4
4         3
       ... 
211    True
212    True
213    True
214    True
215    True
Name: value, Length: 216, dtype: object

In [14]:
def analyze_single_method(input_method_name,data_frame):
    # get all the data with the condition of input method name from the data frame
    df = data_frame[data_frame['condition'] == input_method_name]
    # get all the data with the series type of single from the data frame
    df_single = df[df['series_type'] == 'single']
    print ("single accurace: ", df_single["value"].mean())
    return df


In [27]:
for qid in df_melted['condition'].unique():

    df = df_melted[df_melted['condition'] == qid]
    df = df[df['answer_type'] == 'numeric']
    df = df[df['series_type'] == 'single']
    print(qid, df['value'].mean(), df['value'].std(), df['value'].min(), df['value'].max(), df['value'].max() - df['value'].min())

#df_melted.groupby('question_id').agg(['mean', 'std', 'min', 'max']).to_csv('./data_analysis.csv')
# df_answer_diff_melted.groupby('question_id').agg(['mean', 'std', 'min', 'max'])

pitch 5.25 5.05835512016892 1 19 18
spatial 2.0833333333333335 1.4116492564331997 0 5 5
tempo 5.166666666666667 4.177232979176388 0 15 15


In [28]:
for qid in df_melted['condition'].unique():

    df = df_melted[df_melted['condition'] == qid]
    df = df[df['answer_type'] == 'numeric']
    df = df[df['series_type'] == 'single']
    print(qid, df['value'].mean(), df['value'].std(), df['value'].min(), df['value'].max(), df['value'].max() - df['value'].min())


pitch 3.5 2.5071326821120348 1 7 6
spatial 1.75 1.3887301496588271 0 4 4
tempo 4.5 3.779644730092272 0 12 12


In [None]:
for qid in df_melted['condition'].unique():

    df = df_melted[df_melted['condition'] == qid]
    df = df[df['answer_type'] == 'numeric']
    df = df[df['series_type'] == 'single']
    print(qid, df['value'].mean(), df['value'].std(), df['value'].min(), df['value'].max(), df['value'].max() - df['value'].min())

In [29]:
for qid in df_melted['condition'].unique():

    df = df_melted[df_melted['condition'] == qid]
    df = df[df['answer_type'] == 'numeric']
    df = df[df['series_type'] == 'double']
    print(qid, df['value'].mean(), df['value'].std(), df['value'].min(), df['value'].max(), df['value'].max() - df['value'].min())

pitch 6.125 5.818075283115543 1 19 18
spatial 2.25 1.4375905768565218 0 5 5
tempo 5.5 4.442221666388715 0 15 15
