# Process responses

A Notebook to process the responses file and create a json file with one dataset and a pair of ranked specs.

In [2]:
import pandas as pd
import numpy as np
import json
import itertools
import scipy.stats as stats
import math

%matplotlib inline

In [3]:
before_normalization = pd.read_csv('../responses_data/responses.csv')

# multiple entropy by log_2(20) to remove the normalization that was performed
before_normalization['Q1_entropy'] = before_normalization['Q1_entropy'] * math.log2(20)
before_normalization['Q2_entropy'] = before_normalization['Q2_entropy'] * math.log2(20)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# pd.concat([before_normalization['Q1_entropy'], before_normalization['Q2_entropy']]).describe()
# pd.concat([before_normalization['Q1_entropy'], before_normalization['Q2_entropy']]).plot.hist()

In [5]:
# normalize the data by ensuring that entropy q1 <= entropy q2.

# to_swap = before_normalization[before_normalization['Q1_entropy'] > before_normalization['Q2_entropy']]
# no_swap = before_normalization[before_normalization['Q1_entropy'] <= before_normalization['Q2_entropy']]

# swapped = to_swap.rename(index=str, columns={
#     'Q1_entropy': 'Q2_entropy',
#     'Q2_entropy': 'Q1_entropy',
#     'Q1_channel': 'Q2_channel',
#     'Q2_channel': 'Q1_channel',
#     'Q1_entropy_class': 'Q2_entropy_class',
#     'Q2_entropy_class': 'Q1_entropy_class'
# })

# normalized = pd.concat([no_swap, swapped])

normalized = before_normalization

In [6]:
np.unique(normalized.taskGroup)

array(['compareAggregatedValue', 'compareValue', 'findExtremum',
       'readValue'], dtype=object)

In [7]:
# rename tasks into our taxonomy
normalized.taskGroup.replace({
    'findExtremum': 'extremum',
    'readValue': 'value',
    'compareValue': 'compare',
    'compareAggregatedValue': 'derived'
}, inplace=True)

In [22]:
np.unique(normalized.taskGroup)

array(['compare', 'derived', 'extremum', 'value'], dtype=object)

In [10]:
# limit to one task

# normalized = normalized[normalized['taskGroup'] == 'readValue']
len(normalized)

184320

In [11]:
# put entropy into quantiles (low, high)

normalized['Q1_entropy_group'] = pd.qcut(normalized['Q1_entropy'], 2, labels=["L", "H"])
normalized['Q2_entropy_group'] = pd.qcut(normalized['Q2_entropy'], 2, labels=["L", 'H'])

# entropy_cut = 0.3
# normalized.loc[normalized['Q1_entropy'] > entropy_cut, 'Q1_entropy_group'] = 'H'
# normalized.loc[normalized['Q1_entropy'] <= entropy_cut, 'Q1_entropy_group'] = 'L'
# normalized.loc[normalized['Q2_entropy'] > entropy_cut, 'Q2_entropy_group'] = 'H'
# normalized.loc[normalized['Q2_entropy'] <= entropy_cut, 'Q2_entropy_group'] = 'L'

In [12]:
gb = normalized.groupby([
    'taskGroup', 'cardinality', 'nPerCategory',
    'Q1_channel', 'Q2_channel', 'name_channel',
    'Q1_entropy_class', 'Q2_entropy_class',
#    'Q1_entropy_group', 'Q2_entropy_group'
])

In [13]:
row = normalized.iloc[0]
# row

In [24]:
def parse(group):
    cardinality = int(group['cardinality'].iloc[0])
    num_rows = cardinality * int(group['nPerCategory'].iloc[0])

    q1_entropy = group['Q1_entropy'].mean();
    q2_entropy = group['Q2_entropy'].mean();
    
    Q1_entropy_class = group['Q1_entropy_class'].iloc[0];
    Q2_entropy_class = group['Q2_entropy_class'].iloc[0];
    
    # we can identify the same dataset with the key
    data_key = f'{num_rows},{Q1_entropy_class},{Q2_entropy_class}'
    
    fields = [
        {'name': 'n', 'type': 'string', 'entropy': 1, 'cardinality': cardinality},
        {'name': 'q1', 'type': 'number', 'entropy': round(q1_entropy, 4), 'cardinality': num_rows, 'interesting': True},
        {'name': 'q2', 'type': 'number', 'entropy': round(q2_entropy), 'cardinality': num_rows}
    ]

    enc = {}

    enc[group['Q1_channel'].iloc[0]] = {'field': 'q1', 'type': 'quantitative'}
    enc[group['Q2_channel'].iloc[0]] = {'field': 'q2', 'type': 'quantitative'}
    enc[group['name_channel'].iloc[0]] = {'field': 'n', 'type': 'nominal'}

    spec = {
        'mark': 'point',
        'encoding': enc
    }

    return {
        'data_key': data_key,
        'fields': fields,
        'num_rows': num_rows,
        'task': group['taskGroup'].iloc[0],
        'spec': spec,
        'isCorrect': group['isCorrect'],
        'completionTime': group['completionTime'],
    }

In [25]:
specs = pd.DataFrame(columns=['data_key', 'fields', 'num_rows', 'task', 'spec',
                              'isCorrect', 'completionTime'])
for name, group in gb:
    parsed = parse(group)
    specs = specs.append(parsed, ignore_index=True)
    
print('Number of groups', len(specs))

Number of groups 1152


In [26]:
grouped_specs = specs.groupby(['data_key', 'num_rows', 'task'])

print(len(grouped_specs))

96


In [27]:
grouped_specs.num_rows.describe()
# specs[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,unique,top,freq
data_key,num_rows,task,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"30,H,H",30,compare,12,1,30,12
"30,H,H",30,derived,12,1,30,12
"30,H,H",30,extremum,12,1,30,12
"30,H,H",30,value,12,1,30,12
"30,H,L",30,compare,12,1,30,12
"30,H,L",30,derived,12,1,30,12
"30,H,L",30,extremum,12,1,30,12
"30,H,L",30,value,12,1,30,12
"30,L,H",30,compare,12,1,30,12
"30,L,H",30,derived,12,1,30,12


In [32]:
training = []

for name, group in grouped_specs:
    # for neg, pos in zip(range(len(group) - 1), range(1, len(group))):
    for neg, pos in itertools.combinations(range(len(group)), 2):
        negative = group.iloc[neg]
        positive = group.iloc[pos]
        
        # run Welch's t-test: https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.ttest_ind.html
        t, p = stats.ttest_ind(negative.isCorrect, positive.isCorrect, equal_var=False)
#         if p > .05:
#             t, p = stats.ttest_ind(first.completionTime, second.completionTime, equal_var=False)
        if p <= .01:
            training.append({
                'fields': group.iloc[0]['fields'],
                'num_rows': group.iloc[0]['num_rows'],
                'task': group.iloc[0]['task'],
                'negative': negative.spec,
                'positive': positive.spec,
                'p-value': round(p, 6)
            })
    
#     g = list(group.iterrows())
#     previous = g[0][1]
#     for idx, row in g[1:]:
#         spec = row.spec
#         training.append({
#             'fields': row['fields'],
#             'num_rows': row['num_rows'],
#             'task': row['task'],
#             'worse': previous.spec,
#             'better': spec
#         })
#         previous = row

In [33]:
len(training)

1863

In [34]:
training[0]

{'fields': [{'cardinality': 10, 'entropy': 1, 'name': 'n', 'type': 'string'},
  {'cardinality': 30,
   'entropy': 3.8428,
   'interesting': True,
   'name': 'q1',
   'type': 'number'},
  {'cardinality': 30, 'entropy': 4.0, 'name': 'q2', 'type': 'number'}],
 'negative': {'encoding': {'color': {'field': 'q1', 'type': 'quantitative'},
   'x': {'field': 'q2', 'type': 'quantitative'},
   'y': {'field': 'n', 'type': 'nominal'}},
  'mark': 'point'},
 'num_rows': 30,
 'p-value': 0.001413,
 'positive': {'encoding': {'color': {'field': 'q1', 'type': 'quantitative'},
   'x': {'field': 'n', 'type': 'nominal'},
   'y': {'field': 'q2', 'type': 'quantitative'}},
  'mark': 'point'},
 'task': 'compare'}

In [35]:
with open('../data/training/younghoon.json', 'w') as f:
    json.dump(training, f, indent=2)