In [1]:
import pandas as pd
import numpy as np
import json
import itertools
import scipy.stats as stats

%matplotlib inline

In [2]:
# Load the raw responses table; the following cells normalize and filter it.
# NOTE(review): the saved output under this cell looks like the tail of a
# warning raised by this call -- confirm whether explicit dtypes are needed.
responses_csv = '../responses_data/responses.csv'
before_normalization = pd.read_csv(responses_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Normalize the data so that, in every row, Q1 entropy <= Q2 entropy.

q1_entropy = before_normalization['Q1_entropy']
q2_entropy = before_normalization['Q2_entropy']

# Both comparisons are spelled out (rather than negating one mask) so rows
# where a comparison is False on both sides, e.g. NaN, land in neither half.
to_swap = before_normalization[q1_entropy > q2_entropy]
no_swap = before_normalization[q1_entropy <= q2_entropy]

# Exchanging the Q1_*/Q2_* column labels swaps the roles of the two fields;
# pd.concat below lines the columns up again by name.
q1_q2_label_swap = {
    'Q1_entropy': 'Q2_entropy',
    'Q2_entropy': 'Q1_entropy',
    'Q1_channel': 'Q2_channel',
    'Q2_channel': 'Q1_channel',
    'Q1_entropy_class': 'Q2_entropy_class',
    'Q2_entropy_class': 'Q1_entropy_class'
}

# index=str also maps the swapped rows' index labels through str().
swapped = to_swap.rename(index=str, columns=q1_q2_label_swap)

normalized = pd.concat([no_swap, swapped])

In [4]:
# Keep only the responses that belong to the 'readValue' task group.

is_read_value = normalized['taskGroup'] == 'readValue'
normalized = normalized[is_read_value]
len(normalized)

46080

In [5]:
# Bin each entropy column into two quantile-based groups: "L" (lower half)
# and "H" (upper half).

entropy_group_columns = {
    'Q1_entropy': 'Q1_entropy_group',
    'Q2_entropy': 'Q2_entropy_group',
}

for entropy_column, group_column in entropy_group_columns.items():
    normalized[group_column] = pd.qcut(normalized[entropy_column], 2, labels=["L", "H"])

In [6]:
# Columns that together identify one experimental condition: the task and
# dataset size, the channel each field is mapped to, and the entropy
# class/group of both quantitative fields.
condition_columns = [
    'taskGroup', 'cardinality', 'nPerCategory',
    'Q1_channel', 'Q2_channel', 'name_channel',
    'Q1_entropy_class', 'Q2_entropy_class',
    'Q1_entropy_group', 'Q2_entropy_group',
]

gb = normalized.groupby(condition_columns)

In [7]:
# Grab the first row of the filtered frame. Nothing in the cells shown here
# reads `row` afterwards -- presumably kept for ad-hoc inspection of the
# available columns (uncomment the line below to display it).
row = normalized.iloc[0]
# row

In [8]:
def parse(group):
    """Condense one group of response rows into a single spec record.

    Per-group constants (cardinality, channels, entropy classes, task) are
    read from the group's first row; entropies and scores are averaged over
    all rows of the group.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one group. Must provide the columns 'cardinality',
        'nPerCategory', 'Q1_entropy', 'Q2_entropy', 'Q1_entropy_class',
        'Q2_entropy_class', 'Q1_channel', 'Q2_channel', 'name_channel',
        'taskGroup', 'isCorrect' and 'completionTime'.

    Returns
    -------
    dict
        'data_key'       -- '<num_rows>,<Q1 class>,<Q2 class>'
        'fields'         -- descriptions of the fields n, q1 and q2
        'num_rows'       -- cardinality * nPerCategory
        'task'           -- the group's task group
        'spec'           -- {'mark': 'point', 'encoding': {...}}
        'score1'         -- mean of 'isCorrect'
        'score2'         -- mean of 'completionTime'
        'isCorrect'      -- the group's raw 'isCorrect' column
        'completionTime' -- the group's raw 'completionTime' column
    """
    def first(column):
        # Value of `column` in the first row of the group.
        return group[column].iloc[0]

    cardinality = int(first('cardinality'))
    num_rows = cardinality * int(first('nPerCategory'))

    # Mean entropies, rendered as strings with three significant digits.
    q1_entropy_text = f"{group['Q1_entropy'].mean():.3}"
    q2_entropy_text = f"{group['Q2_entropy'].mean():.3}"

    # we can identify the same dataset with the key
    data_key = f"{num_rows},{first('Q1_entropy_class')},{first('Q2_entropy_class')}"

    fields = [
        {'name': 'n', 'type': 'string', 'entropy': 1, 'cardinality': cardinality},
        {'name': 'q1', 'type': 'number', 'entropy': q1_entropy_text, 'cardinality': num_rows},
        {'name': 'q2', 'type': 'number', 'entropy': q2_entropy_text, 'cardinality': num_rows},
    ]

    # Column holding the channel name -> field definition bound to that
    # channel. Applied in order, so a later entry wins if two columns name
    # the same channel.
    channel_bindings = (
        ('Q1_channel', {'field': 'q1', 'type': 'quantitative'}),
        ('Q2_channel', {'field': 'q2', 'type': 'quantitative'}),
        ('name_channel', {'field': 'n', 'type': 'nominal'}),
    )
    encoding = {}
    for channel_column, field_def in channel_bindings:
        encoding[first(channel_column)] = field_def

    correctness = group['isCorrect']
    completion_time = group['completionTime']

    return {
        'data_key': data_key,
        'fields': fields,
        'num_rows': num_rows,
        'task': first('taskGroup'),
        'spec': {'mark': 'point', 'encoding': encoding},
        'score1': correctness.mean(),
        'score2': completion_time.mean(),
        'isCorrect': correctness,
        'completionTime': completion_time,
    }

In [9]:
# Build one summary record per group produced by the groupby above.
#
# The records are collected in a list and turned into a DataFrame once:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it inside a loop re-copies the whole frame on every iteration.
spec_columns = ['data_key', 'fields', 'num_rows', 'task', 'spec',
                'score1', 'score2', 'isCorrect', 'completionTime']

spec_records = []
for name, group in gb:
    spec_records.append(parse(group))

# Passing `columns` keeps the column order fixed and still yields an empty
# frame with these columns when there are no groups.
specs = pd.DataFrame(spec_records, columns=spec_columns)

print('Number of groups', len(specs))

Number of groups 409


In [10]:
# Order specs by ascending correct ratio (score1); ties are broken by
# ascending mean completion time (score2).
specs = specs.sort_values(by=['score1', 'score2'])

# Specs that share a data key, row count and task form one comparison group.
comparison_keys = ['data_key', 'num_rows', 'task']
grouped_specs = specs.groupby(comparison_keys)

print(len(grouped_specs))

18


In [11]:
# Summary statistics of the numeric score columns within each comparison group.
score_summary = grouped_specs.describe()
score_summary
# specs[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,score1,score1,score1,score1,score1,score1,score1,score1,score2,score2,score2,score2,score2,score2,score2,score2
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
data_key,num_rows,task,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
"30,H,H",30,readValue,17.0,0.918854,0.074764,0.7,0.9125,0.933333,0.972222,1.0,17.0,6436.287528,1601.594307,4311.89375,5402.014286,6342.275,6864.416667,10401.183333
"30,L,H",30,readValue,46.0,0.917737,0.053554,0.8,0.87625,0.935417,0.955357,1.0,46.0,7139.504369,3208.813679,3724.45,5308.772917,6447.236667,7952.721875,19482.05
"30,L,L",30,readValue,23.0,0.92111,0.054438,0.794444,0.9,0.93125,0.952778,1.0,23.0,5797.584148,1683.849011,3906.033333,4538.085,5220.207143,7165.025,8928.15
"300,H,H",300,readValue,16.0,0.869195,0.172797,0.25,0.867188,0.908333,0.940625,0.988889,16.0,7165.566695,1572.193577,5308.905556,5830.175,6907.97619,8008.7425,10010.9
"300,L,H",300,readValue,46.0,0.899005,0.099571,0.4,0.85,0.91875,0.966667,1.0,46.0,5844.236921,1691.49163,3327.75,4563.190313,5547.509375,6798.95625,9275.5
"300,L,L",300,readValue,14.0,0.853525,0.079559,0.678571,0.813056,0.885714,0.91625,0.93125,14.0,6044.319094,1710.462118,3618.35,4672.825,5885.3775,7099.95625,9358.45
"60,H,H",60,readValue,12.0,0.928236,0.030416,0.87,0.910714,0.936607,0.946032,0.966667,12.0,5928.123313,1461.853156,4335.375,5137.55625,5371.7125,6077.58006,8982.4
"60,L,H",60,readValue,24.0,0.920615,0.043729,0.788889,0.897222,0.934524,0.948377,0.975,24.0,7136.086898,2767.216428,4305.838889,5262.992857,6613.516667,7354.864583,13702.891667
"60,L,L",60,readValue,23.0,0.880497,0.150189,0.25,0.8875,0.9125,0.95,1.0,23.0,6440.802808,1843.347451,3769.8,5187.785,5960.83,7139.05,10438.625
"600,H,H",600,readValue,17.0,0.859307,0.153519,0.3,0.845,0.892857,0.94,0.96875,17.0,7931.205375,3238.169239,5063.575,6079.27,6371.966667,7298.235,14602.635714


In [12]:
# Significance threshold applied to the Welch's t-test p-value below.
ALPHA = .01

training = []

for name, group in grouped_specs:
    # A pair needs two specs; this also guards the first-row lookup below.
    if len(group) < 2:
        continue

    # Groups are keyed by data_key, num_rows and task, so the dataset
    # description is the same for every pair and is looked up once per group.
    first_spec = group.iloc[0]
    fields = first_spec['fields']
    # Values read back from a DataFrame may be numpy integers, which
    # json.dump cannot serialize; store a plain int.
    num_rows = int(first_spec['num_rows'])
    task = first_spec['task']

    # Compare every pair of specs in the group (not only adjacent ones).
    # `specs` was sorted ascending by score1 before grouping, so the row at
    # `neg` never has a higher mean correctness than the row at `pos`.
    for neg, pos in itertools.combinations(range(len(group)), 2):
        negative = group.iloc[neg]
        positive = group.iloc[pos]

        # run Welch's t-test on the per-response correctness of both specs
        t, p = stats.ttest_ind(negative.isCorrect, positive.isCorrect, equal_var=False)

        # A NaN p-value compares False here, so such a pair is skipped.
        if p <= ALPHA:
            training.append({
                'fields': fields,
                'num_rows': num_rows,
                'task': task,
                'negative': negative.spec,
                'positive': positive.spec,
                'p-value': float(p)
            })

In [13]:
# list(itertools.combinations(range(5), 2))
# list(zip(range(9), range(1,10)))

In [14]:
# Number of spec pairs collected in `training`.
len(training)

1253

In [15]:
# Show the first training pair as a sample of the record layout.
training[0]

{'fields': [{'cardinality': 10, 'entropy': 1, 'name': 'n', 'type': 'string'},
  {'cardinality': 30, 'entropy': '0.876', 'name': 'q1', 'type': 'number'},
  {'cardinality': 30, 'entropy': '0.877', 'name': 'q2', 'type': 'number'}],
 'negative': {'encoding': {'color': {'field': 'q1', 'type': 'quantitative'},
   'x': {'field': 'q2', 'type': 'quantitative'},
   'y': {'field': 'n', 'type': 'nominal'}},
  'mark': 'point'},
 'num_rows': 30,
 'p-value': 0.0013008527843402279,
 'positive': {'encoding': {'size': {'field': 'q1', 'type': 'quantitative'},
   'x': {'field': 'q2', 'type': 'quantitative'},
   'y': {'field': 'n', 'type': 'nominal'}},
  'mark': 'point'},
 'task': 'readValue'}

In [16]:
# Persist the collected training pairs as pretty-printed JSON.
training_json_path = '../data/training/q_q_n.json'

with open(training_json_path, 'w') as out_file:
    json.dump(training, out_file, indent=2)