In [1]:
from dziban.mkiv import Chart
from vega_datasets import data
from copy import deepcopy
import json
from tqdm import tqdm
import pandas as pd

In [2]:
movies = data('movies')
movies.head(1)

Unnamed: 0,Creative_Type,Director,Distributor,IMDB_Rating,IMDB_Votes,MPAA_Rating,Major_Genre,Production_Budget,Release_Date,Rotten_Tomatoes_Rating,Running_Time_min,Source,Title,US_DVD_Sales,US_Gross,Worldwide_Gross
0,,,Gramercy,6.1,1071.0,R,,8000000.0,Jun 12 1998,,,,The Land Girls,,146083.0,146083.0


In [3]:
base = Chart(movies)

In [4]:
# Quantitative columns of the movies dataset that charts may draw from.
q1, q2, q3 = 'IMDB_Rating', 'Rotten_Tomatoes_Rating', 'Worldwide_Gross'
# Nominal columns of the movies dataset that charts may draw from.
n1, n2, n3 = 'Major_Genre', 'MPAA_Rating', 'Creative_Type'

# Pool of field names keyed by field type ('q' = quantitative, 'n' = nominal).
fields = {'q': [q1, q2, q3], 'n': [n1, n2, n3]}

# Mark types used to build the "change mark" edits.
marks = [
    'point',
    'bar',
    'line',
    'area',
    'tick',
    'rect',
]
# Aggregate functions used to build the "aggregate" edits.
aggs = ['mean']

In [5]:
def get_props_from_transform(transform):
    """Translate a transform name into keyword properties for a chart field.

    'bin' maps to {'bin': True}, 'agg' maps to {'aggregate': 'mean'}, and any
    other value (e.g. 'raw') maps to an empty dict. A new dict is returned on
    every call, so callers may add keys to it.
    """
    if transform == 'bin':
        return {'bin': True}
    if transform == 'agg':
        return {'aggregate': 'mean'}
    return {}

In [6]:
def generate_queries(currdims, targetdims, curr_query, queries):
    """Recursively enumerate field queries of up to `targetdims` fields.

    Starting from `curr_query` (which already holds `currdims` entries), every
    one-field extension is appended to `queries` and then extended further
    until `targetdims` is reached. Quantitative fields are tried with the
    'raw', 'bin' and 'agg' transforms; nominal fields only with 'raw'.

    `queries` is filled in place (depth-first, 'q' before 'n'); nothing is
    returned. `curr_query` itself is never modified.
    """
    if not currdims < targetdims:
        return

    options = (
        ('q', ('raw', 'bin', 'agg')),
        ('n', ('raw',)),
    )
    for fieldtype, transforms in options:
        for transform in transforms:
            extended = deepcopy(curr_query)
            extended.append({'fieldtype': fieldtype, 'transform': transform})
            queries.append(extended)

            generate_queries(currdims + 1, targetdims, extended, queries)

In [7]:
def dedupe_queries(queries):
    """Drop queries that differ only in the order of their fields.

    Two queries are considered the same when they contain the same multiset
    of 'transform(fieldtype)' entries. The first occurrence of each distinct
    query is kept, and the original order is preserved.
    """
    def signature(query):
        # Count each 'transform(fieldtype)' label; sorted JSON makes the
        # resulting key independent of field order.
        counts = {}
        for field in query:
            label = '{0}({1})'.format(field['transform'], field['fieldtype'])
            counts[label] = counts.get(label, 0) + 1
        return json.dumps(counts, sort_keys=True)

    seen_signatures = set()
    kept = []
    for query in queries:
        key = signature(query)
        if key in seen_signatures:
            continue
        seen_signatures.add(key)
        kept.append(query)

    return kept
            

In [8]:
def query_to_dziban(prior, query, available_fields):
    """Build a chart for `query` on top of `prior`, drawing names from a pool.

    For each entry of `query` (a dict with 'fieldtype' and 'transform'), the
    first remaining name in `available_fields[fieldtype]` is consumed and
    added through `chart.field(...)` with the transform's properties plus an
    explicit 'type' ('quantitative' for 'q', 'nominal' otherwise).

    `available_fields` is mutated (names are popped from its lists), so pass
    a copy when the pool is shared.

    Returns a dict with the keys 'chart', 'query', 'available_fields' (the
    remaining pool) and 'used_fields' (consumed names grouped by field type).
    """
    used_fields = {}
    chart = prior

    for entry in query:
        fieldtype, transform = entry['fieldtype'], entry['transform']

        fieldname = available_fields[fieldtype].pop(0)
        used_fields.setdefault(fieldtype, []).append(fieldname)

        props = get_props_from_transform(transform)
        if fieldtype == 'q':
            props['type'] = 'quantitative'
        else:
            props['type'] = 'nominal'

        chart = chart.field(fieldname, **props)

    return {
        'chart': chart,
        'query': query,
        'available_fields': available_fields,
        'used_fields': used_fields
    }
        

In [9]:
# Enumerate every query of one or two fields, then keep a single
# representative of each order-insensitive combination.
enumerated_queries = []
generate_queries(0, 2, [], enumerated_queries)
priors = dedupe_queries(enumerated_queries)

In [10]:
# Each query gets its own deep copy of the field pool because
# query_to_dziban pops the names it uses out of the lists it is given.
charts = [query_to_dziban(base, query, deepcopy(fields)) for query in priors]

In [11]:
# One "change the mark" edit per candidate mark type.
mark_edits = [{'type': 'mark', 'mark': mark} for mark in marks]

In [12]:
# "Add a field" edits: a quantitative field may be added raw, binned or
# aggregated; a nominal field is only ever added raw.
add_field_edits = [
    {'type': 'add_field', 'fieldtype': fieldtype, 'transform': transform}
    for fieldtype, transforms in (('q', ['raw', 'bin', 'agg']), ('n', ['raw']))
    for transform in transforms
]

In [13]:
# The single "bin an existing quantitative field" edit.
bin_edits = [{'type': 'bin'}]

In [14]:
# One "aggregate an existing quantitative field" edit per aggregate function.
agg_edits = [{'type': 'agg', 'agg': agg} for agg in aggs]

In [15]:
edits = mark_edits + add_field_edits + bin_edits + agg_edits

In [16]:
def edit_dziban(dzi, edit):
    """Apply a single edit to a prior chart record.

    `dzi` is a record as produced by `query_to_dziban` (keys 'chart', 'query',
    'available_fields', 'used_fields'); `edit` is a dict whose 'type' is one of
    'mark', 'add_field', 'bin' or 'agg'. Any other type leaves the chart as is.

    The 'add_field' edit consumes a name from `dzi['available_fields']` and
    records it in `dzi['used_fields']`, i.e. `dzi` is mutated; callers should
    pass a copy. The 'bin' and 'agg' edits act on the first used quantitative
    field and return None when the prior has none.

    Returns a dict holding the prior chart and query, the edit, the edited
    chart on its own ('cold'), the edited chart anchored on the prior
    ('anchored'), and the field bookkeeping.
    """
    prior_chart = dzi['chart']
    kind = edit['type']
    cold = prior_chart

    if kind == 'mark':
        cold = prior_chart.mark(edit['mark'])
    elif kind == 'add_field':
        pool = dzi['available_fields']
        fieldtype = edit['fieldtype']
        new_field = pool[fieldtype].pop(0)

        dzi['used_fields'].setdefault(fieldtype, []).append(new_field)

        # NOTE(review): query_to_dziban also passes an explicit 'type' for
        # each field; here only the transform properties are passed. Confirm
        # this difference is intended.
        cold = prior_chart.field(new_field, **get_props_from_transform(edit['transform']))
    elif kind == 'bin' or kind == 'agg':
        used = dzi['used_fields']
        if 'q' not in used:
            return None

        # Both edits target the first quantitative field of the prior.
        target = used['q'][0]
        if kind == 'bin':
            cold = prior_chart.field(target, bin=True)
        else:
            cold = prior_chart.field(target, aggregate=edit['agg'])

    anchored = cold.anchor_on(dzi['chart'])

    return {
        'prior': dzi['chart'],
        'prior_query': dzi['query'],
        'edit': edit,
        'cold': cold,
        'anchored': anchored,
        'available_fields': dzi['available_fields'],
        'used_fields': dzi['used_fields']
    }

In [17]:
# Apply every edit to every prior chart, keeping the combinations that apply.
nexts = []
for chart in tqdm(charts):
    for edit in edits:
        # edit_dziban mutates the record's field bookkeeping, so each edit
        # works on its own deep copy of the prior.
        edited = edit_dziban(deepcopy(chart), edit)
        if edited is not None:
            nexts.append(edited)

100%|██████████| 14/14 [00:12<00:00,  1.18it/s]


In [18]:
with_differences = []

# Statistics copied from each chart's stats dict onto the record, in order.
stat_names = (
    'draco_rank',
    'graphscape_rank',
    'draco_score',
    'norm_draco_score',
    'graphscape_score',
    'norm_graphscape_score',
)

for n in tqdm(nexts):
    prior, cold, anchored = n['prior'], n['cold'], n['anchored']

    # An edit whose cold chart has no solution is flagged and skipped.
    if not cold.is_satisfiable():
        n['sat'] = False
        continue

    # A satisfiable cold chart with an unsatisfiable anchored chart is
    # unexpected: dump diagnostics and stop processing the remaining records.
    if not anchored.is_satisfiable():
        print(prior._get_vegalite())
        print(n['edit'])
        print('\n'.join(anchored._get_full_query()))
        print(cold._get_asp_complete())
        break

    n['sat'] = True

    cold_props = cold._get_draco_sol().props[cold._name]
    anchored_props = list(anchored._get_draco_sol().props[anchored._name])

    # Cold statistics are computed relative to the prior chart.
    cold_stats = cold._get_stats(prior)
    for stat in stat_names:
        n['cold_' + stat] = cold_stats[stat]

    anchored_stats = anchored._get_stats()
    for stat in stat_names:
        n['anchored_' + stat] = anchored_stats[stat]

    # 'same' records whether both solutions serialize to identical properties.
    n['same'] = json.dumps(cold_props) == json.dumps(anchored_props)
    

 10%|█         | 17/164 [00:50<07:45,  3.17s/it]

none


 12%|█▏        | 20/164 [01:00<07:52,  3.28s/it]

none


 48%|████▊     | 78/164 [04:13<04:32,  3.17s/it]

none


 62%|██████▏   | 102/164 [05:30<03:22,  3.26s/it]

none


 98%|█████████▊| 160/164 [08:27<00:13,  3.38s/it]

none


100%|██████████| 164/164 [08:43<00:00,  3.82s/it]


In [19]:
nexts[0]['cold']._get_draco_sol().props

{'v_v': ['view(v_v).',
  'encoding(v_v,e0).',
  'mark(v_v,point).',
  'channel(v_v,e0,x).',
  'field(v_v,e0,"IMDB_Rating").',
  'zero(v_v,e0).',
  'type(v_v,e0,quantitative).']}

In [20]:
def stringify_query(query):
    """Render a query as a compact label such as 'bin(q) x n'.

    Nominal fields are shown as 'n', raw quantitative fields as 'q', and
    transformed quantitative fields as 'transform(q)'. Fields of any other
    type contribute an empty label. Labels are joined with ' x '.
    """
    def label(field):
        fieldtype = field['fieldtype']
        transform = field['transform']

        if fieldtype == 'n':
            return 'n'
        if fieldtype == 'q':
            if transform == 'raw':
                return 'q'
            return '{0}({1})'.format(field['transform'], field['fieldtype'])
        return ''

    return ' x '.join(label(field) for field in query)

In [21]:
def stringify_edit(prior, edit):
    """Render an edit as a short label.

    Mark edits read the prior chart's current mark and produce
    'old <> new'; add-field edits produce '+ transform(fieldtype)'; bin and
    aggregate edits produce fixed labels. Unknown edit types yield None.
    `prior` is only consulted for mark edits.
    """
    etype = edit['type']

    if etype == 'mark':
        current_mark = prior._get_vegalite()['mark']
        return '{0} <> {1}'.format(current_mark, edit['mark'])
    if etype == 'add_field':
        return '+ {0}({1})'.format(edit['transform'], edit['fieldtype'])
    if etype == 'bin':
        return '<- bin(q)'
    if etype == 'agg':
        return '<- agg(q)'
    return None

In [22]:
def _difference(cold_value, anchored_value):
    """Return cold minus anchored, or None when either value is missing."""
    if cold_value is None or anchored_value is None:
        return None
    return cold_value - anchored_value


# Flatten every edited-chart record into one plain dict per row.
# NOTE: binding `data` to a list shadows the `data` object imported from
# vega_datasets in the first cell; the dataset-loading cell cannot be re-run
# after this one without re-running the imports. The name is kept because
# later cells read it.
data = []
for n in tqdm(nexts):
    prior_query = stringify_query(n['prior_query'])
    edit = stringify_edit(n['prior'], n['edit'])

    # Records without statistics only carry their identifying labels.
    if 'cold_draco_rank' not in n:
        data.append({
            'prior_query': prior_query,
            'edit': edit
        })
        continue

    cdr = n['cold_draco_rank']
    cgr = n['cold_graphscape_rank']
    adr = n['anchored_draco_rank']
    agr = n['anchored_graphscape_rank']

    # Rank differences are cold minus anchored; a missing rank on either
    # side leaves the difference undefined.
    delta_dr = _difference(cdr, adr)
    delta_gr = _difference(cgr, agr)

    # The combined figure needs both rank differences to be defined.
    delta_sum_rank = None
    if delta_dr is not None and delta_gr is not None:
        delta_sum_rank = delta_dr + delta_gr

    data.append({
        'same': n['same'],
        'prior_query': prior_query,
        'prior_dimensions': len(n['prior_query']),
        'edit_type': n['edit']['type'],
        'edit': edit,
        'cdr': cdr,
        'cgr': cgr,
        'adr': adr,
        'agr': agr,
        'delta_dr': delta_dr,
        'delta_gr': delta_gr,
        'anchored_gain_in_sum_rank': delta_sum_rank,
        'cds': n['cold_draco_score'],
        'cgs': n['cold_graphscape_score'],
        'ads': n['anchored_draco_score'],
        'ags': n['anchored_graphscape_score'],
        'norm_cds': n['cold_norm_draco_score'],
        'norm_cgs': n['cold_norm_graphscape_score'],
        'norm_ads': n['anchored_norm_draco_score'],
        'norm_ags': n['anchored_norm_graphscape_score']
    })

100%|██████████| 164/164 [00:23<00:00,  7.11it/s]


In [23]:
data

[{'same': True,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> point',
  'cdr': 0,
  'cgr': 0,
  'adr': 0,
  'agr': 0,
  'delta_dr': 0,
  'delta_gr': 0,
  'anchored_gain_in_sum_rank': 0,
  'cds': 22,
  'cgs': 1,
  'ads': 22,
  'ags': 1,
  'norm_cds': 0.0,
  'norm_cgs': -0.25925925925925924,
  'norm_ads': 0.0,
  'norm_ags': -0.25925925925925924},
 {'same': False,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> bar',
  'cdr': 0,
  'cgr': 0,
  'adr': 0,
  'agr': 0,
  'delta_dr': 0,
  'delta_gr': 0,
  'anchored_gain_in_sum_rank': 0,
  'cds': 19,
  'cgs': 65,
  'ads': 19,
  'ags': 65,
  'norm_cds': 0.0,
  'norm_cgs': 0.8214285714285714,
  'norm_ads': 0.0,
  'norm_ags': 0.8214285714285714},
 {'same': False,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> line',
  'cdr': 0,
  'cgr': None,
  'adr': 5,
  'agr': 9,
  'delta_dr': -5,
  'delta_gr': None,
  'anchored_gain_in_s

In [24]:
# Fixed column order for the results table.
columns = ['same', 'prior_query', 'prior_dimensions', 'edit', 'edit_type',
           'cdr', 'adr', 'cgr', 'agr',
           'delta_dr', 'delta_gr', 'anchored_gain_in_sum_rank',
           'cds', 'cgs', 'ads', 'ags', 'norm_cds', 'norm_cgs', 'norm_ads', 'norm_ags'
          ]

# Records that only carry 'prior_query' and 'edit' end up with missing values
# in every other column.
df = pd.DataFrame(data, columns=columns)

In [25]:
nexts[19]['anchored']._get_vegalite()

{'$schema': 'https://vega.github.io/schema/vega-lite/v3.json',
 'data': {'url': 'data/cars.json'},
 'mark': 'point',
 'encoding': {'y': {'type': 'quantitative',
   'field': 'IMDB_Rating',
   'scale': {'zero': True}},
  'x': {'type': 'quantitative',
   'field': 'Rotten_Tomatoes_Rating',
   'scale': {'zero': True}},
  'size': {'type': 'quantitative',
   'field': 'Worldwide_Gross',
   'bin': True,
   'scale': {'zero': True}}}}

In [26]:
nexts[19]['edit']

{'type': 'add_field', 'fieldtype': 'q', 'transform': 'bin'}

# Results

In [27]:
RESULTS = []
import math
import numpy as np

In [28]:
def filter(frame, key, value):
    """Return the rows of `frame` whose `key` column equals `value`.

    Note: this definition shadows the builtin `filter` for the rest of the
    notebook.
    """
    matches = frame[key] == value
    return frame[matches]

In [29]:
# Keep only the rows that have a 'prior_dimensions' value, i.e. the records
# for which statistics were collected. This rebinds `df` to the filtered frame.
df = df[df['prior_dimensions'].notnull()]
df

Unnamed: 0,same,prior_query,prior_dimensions,edit,edit_type,cdr,adr,cgr,agr,delta_dr,delta_gr,anchored_gain_in_sum_rank,cds,cgs,ads,ags,norm_cds,norm_cgs,norm_ads,norm_ags
0,True,q,1.0,tick <> point,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,1.0,22.0,1.0,0.0,-0.259259,0.000000,-0.259259
1,False,q,1.0,tick <> bar,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,65.0,19.0,65.0,0.0,0.821429,0.000000,0.821429
2,False,q,1.0,tick <> line,mark,0.0,5.0,,9.0,-5.0,,,39.0,583.0,42.0,523.0,0.0,25.761905,0.500000,22.904762
3,False,q,1.0,tick <> area,mark,0.0,1.0,,2.0,-1.0,,,39.0,584.0,40.0,524.0,0.0,27.200000,0.142857,24.200000
4,True,q,1.0,tick <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,-0.126316,0.000000,-0.126316
6,False,q,1.0,+ raw(q),add_field,0.0,14.0,,127.0,-14.0,,,18.0,916.0,22.0,459.0,0.0,4.215962,0.250000,2.070423
7,False,q,1.0,+ bin(q),add_field,0.0,1.0,,12.0,-1.0,,,21.0,915.0,22.0,459.0,0.0,4.933702,0.047619,2.414365
8,False,q,1.0,+ agg(q),add_field,0.0,1.0,,175.0,-1.0,,,26.0,583.0,27.0,523.0,0.0,4.180451,0.076923,3.729323
9,False,q,1.0,+ raw(n),add_field,0.0,0.0,39.0,39.0,0.0,0.0,0.0,30.0,459.0,30.0,459.0,0.0,1.772727,0.000000,1.772727
10,False,q,1.0,<- bin(q),bin,0.0,1.0,41.0,19.0,-1.0,22.0,21.0,19.0,582.0,20.0,522.0,0.0,5.630000,0.025641,5.030000


In [30]:
df.to_json('data.json', orient='records')
df

Unnamed: 0,same,prior_query,prior_dimensions,edit,edit_type,cdr,adr,cgr,agr,delta_dr,delta_gr,anchored_gain_in_sum_rank,cds,cgs,ads,ags,norm_cds,norm_cgs,norm_ads,norm_ags
0,True,q,1.0,tick <> point,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,1.0,22.0,1.0,0.0,-0.259259,0.000000,-0.259259
1,False,q,1.0,tick <> bar,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,65.0,19.0,65.0,0.0,0.821429,0.000000,0.821429
2,False,q,1.0,tick <> line,mark,0.0,5.0,,9.0,-5.0,,,39.0,583.0,42.0,523.0,0.0,25.761905,0.500000,22.904762
3,False,q,1.0,tick <> area,mark,0.0,1.0,,2.0,-1.0,,,39.0,584.0,40.0,524.0,0.0,27.200000,0.142857,24.200000
4,True,q,1.0,tick <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,-0.126316,0.000000,-0.126316
6,False,q,1.0,+ raw(q),add_field,0.0,14.0,,127.0,-14.0,,,18.0,916.0,22.0,459.0,0.0,4.215962,0.250000,2.070423
7,False,q,1.0,+ bin(q),add_field,0.0,1.0,,12.0,-1.0,,,21.0,915.0,22.0,459.0,0.0,4.933702,0.047619,2.414365
8,False,q,1.0,+ agg(q),add_field,0.0,1.0,,175.0,-1.0,,,26.0,583.0,27.0,523.0,0.0,4.180451,0.076923,3.729323
9,False,q,1.0,+ raw(n),add_field,0.0,0.0,39.0,39.0,0.0,0.0,0.0,30.0,459.0,30.0,459.0,0.0,1.772727,0.000000,1.772727
10,False,q,1.0,<- bin(q),bin,0.0,1.0,41.0,19.0,-1.0,22.0,21.0,19.0,582.0,20.0,522.0,0.0,5.630000,0.025641,5.030000


In [31]:
def add_label(records, key, value):
    """Set `key` to `value` on every record and return the same list.

    The records are modified in place; the return value is `records` itself,
    which lets calls be chained.
    """
    for record in records:
        record[key] = value
    return records

In [32]:
def summarize(dataframe):
    """Print and return mean/standard deviation of the rank-gain columns.

    The columns 'delta_dr', 'delta_gr' and 'anchored_gain_in_sum_rank' of
    `dataframe` are summarized. The result is a one-element list holding a
    dict with the keys 'dr_mean', 'dr_std', 'gr_mean', 'gr_std', 'tr_mean',
    'tr_std' (NaN replaced by None) plus 'include_zero_deltas': True.
    """
    def none_if_nan(val):
        return None if np.isnan(val) else val

    def measures(frame):
        draco = frame['delta_dr']
        dr_mean, dr_std = draco.mean(), draco.std()

        graphscape = frame['delta_gr']
        gr_mean, gr_std = graphscape.mean(), graphscape.std()

        total = frame['anchored_gain_in_sum_rank']
        tr_mean, tr_std = total.mean(), total.std()

        print('MEAN')
        print('Gain in Draco rank (stdev):                            {0} ({1})'.format(dr_mean, dr_std))
        print('Gain in Graphscape rank:                               {0} ({1})'.format(gr_mean, gr_std))
        print('Gain in Total rank (both above must be defined):       {0} ({1})'.format(tr_mean, tr_std))
        print()

        return [{
            'dr_mean': none_if_nan(dr_mean),
            'dr_std': none_if_nan(dr_std),
            'gr_mean': none_if_nan(gr_mean),
            'gr_std': none_if_nan(gr_std),
            'tr_mean': none_if_nan(tr_mean),
            'tr_std': none_if_nan(tr_std)
        }]

    summary = []

    print('%% ALL %%')
    summary.extend(add_label(measures(dataframe), 'include_zero_deltas', True))

    return summary



In [33]:
def summarize_group(f):
    """Summarize `f` as a whole and once per edit type.

    Prints a banner around each section and returns the concatenated
    `summarize` records, each labelled with an 'edit_type' of 'total',
    'mark', 'add_field', 'agg' or 'bin'.
    """
    # (banner, edit_type label, whether to restrict the frame to that type)
    sections = (
        ('%%%%% TOTAL %%%%%%', 'total', False),
        ('%%%%% MARK %%%%%%', 'mark', True),
        ('%%%%% ADD FIELD %%%%%%', 'add_field', True),
        ('%%%%% AGGREGATE %%%%%%', 'agg', True),
        ('%%%%% BIN %%%%%%', 'bin', True),
    )

    result = []
    for banner, label, restrict in sections:
        print(banner)
        subset = f[f['edit_type'] == label] if restrict else f
        result += add_label(summarize(subset), 'edit_type', label)
        print('%%%%%%%%%%%%%%%%%\n\n')

    return result


## Aggregate

In [34]:
RESULTS += add_label(summarize_group(df), 'dims', 'all')

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -6.946308724832215 (17.800616917224378)
Gain in Graphscape rank:                               12.225 (37.36612389839055)
Gain in Total rank (both above must be defined):       11.4875 (35.22333581698288)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -1.9473684210526316 (5.661318425578044)
Gain in Graphscape rank:                               10.057692307692308 (36.785234389357086)
Gain in Total rank (both above must be defined):       9.288461538461538 (34.088889880326136)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -15.660714285714286 (26.158451987638916)
Gain in Graphscape rank:                               27.076923076923077 (52.977135851959034)
Gain in Total rank (both above must be defined):       26.0 (51.39876781921269)

%%%%%%%%%%%%%%%%%


%%%%% AGGREGATE %

## 1D

In [35]:
oned = df[df['prior_dimensions'] == 1.0]

In [36]:
RESULTS += add_label(summarize_group(oned), 'dims', 1)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -7.825 (19.645430733836015)
Gain in Graphscape rank:                               0.9166666666666666 (4.490731195102493)
Gain in Total rank (both above must be defined):       0.875 (4.286607049870562)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -1.25 (4.1023100039393725)
Gain in Graphscape rank:                               0.0 (0.0)
Gain in Total rank (both above must be defined):       0.0 (0.0)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -17.9375 (28.285376080229163)
Gain in Graphscape rank:                               0.0 (0.0)
Gain in Total rank (both above must be defined):       0.0 (0.0)

%%%%%%%%%%%%%%%%%


%%%%% AGGREGATE %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            0.0 (0.0)
Gain in Graphscape rank:                    

## 2D

In [37]:
twod = df[df['prior_dimensions'] == 2.0]
RESULTS += add_label(summarize_group(twod), 'dims', 2)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -6.623853211009174 (17.15980930504943)
Gain in Graphscape rank:                               17.071428571428573 (43.78744408777772)
Gain in Total rank (both above must be defined):       16.035714285714285 (41.28172137254598)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -2.1964285714285716 (6.136091865073678)
Gain in Graphscape rank:                               14.527777777777779 (43.64564197594147)
Gain in Total rank (both above must be defined):       13.416666666666666 (40.45129346333864)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -14.75 (25.579188255198243)
Gain in Graphscape rank:                               39.111111111111114 (60.66597984959207)
Gain in Total rank (both above must be defined):       37.55555555555556 (58.945125140063766)

%%%%%%%%%%%%%%%%

In [38]:
len(df[df['adr'].isnull()])

0

In [39]:
identical = df[df['same'] == True]

In [40]:
len(identical)

2

In [41]:
# Rows where at least one rank delta is non-zero. This filters on the deltas,
# not on the 'same' column, and rows with an undefined (NaN) delta are kept
# because NaN != 0 evaluates to True.
without_identicals = df[(df['delta_dr'] != 0) | (df['delta_gr'] != 0)]

In [42]:
len(without_identicals)

82

In [43]:
len(df)

149

In [44]:
cold_below_50 = without_identicals[without_identicals['cgr'].isnull()]
len(cold_below_50)

68

In [45]:
# Rows whose anchored Draco rank ('adr') is missing.
# NOTE(review): the cold counterpart in the previous cell filters on 'cgr'
# (Graphscape rank); confirm whether 'agr' was intended here instead of 'adr'.
anchored_below_50 = without_identicals[without_identicals['adr'].isnull()]
len(anchored_below_50)

0

In [46]:
nexts[20]['anchored']._get_stats()

{'draco_score': 43,
 'norm_draco_score': 0.0,
 'draco_rank': 0,
 'draco_of': 200,
 'graphscape_score': 697,
 'norm_graphscape_score': 8.115384615384615,
 'graphscape_rank': None,
 'graphscape_of': 200}

In [47]:
summarize_group(without_identicals)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -12.621951219512194 (22.5041232128212)
Gain in Graphscape rank:                               75.23076923076923 (63.69347670177044)
Gain in Total rank (both above must be defined):       70.6923076923077 (60.280434381570025)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -4.933333333333334 (8.224996943017283)
Gain in Graphscape rank:                               87.16666666666667 (75.74012586909707)
Gain in Total rank (both above must be defined):       80.5 (70.52588177399839)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -18.659574468085108 (27.585658368816816)
Gain in Graphscape rank:                               88.0 (63.85922016435841)
Gain in Total rank (both above must be defined):       84.5 (63.058174199173685)

%%%%%%%%%%%%%%%%%


%%%%% AGGREGATE %%%%%%
%% AL

[{'dr_mean': -12.621951219512194,
  'dr_std': 22.5041232128212,
  'gr_mean': 75.23076923076923,
  'gr_std': 63.69347670177044,
  'tr_mean': 70.6923076923077,
  'tr_std': 60.280434381570025,
  'include_zero_deltas': True,
  'edit_type': 'total'},
 {'dr_mean': -4.933333333333334,
  'dr_std': 8.224996943017283,
  'gr_mean': 87.16666666666667,
  'gr_std': 75.74012586909707,
  'tr_mean': 80.5,
  'tr_std': 70.52588177399839,
  'include_zero_deltas': True,
  'edit_type': 'mark'},
 {'dr_mean': -18.659574468085108,
  'dr_std': 27.585658368816816,
  'gr_mean': 88.0,
  'gr_std': 63.85922016435841,
  'tr_mean': 84.5,
  'tr_std': 63.058174199173685,
  'include_zero_deltas': True,
  'edit_type': 'add_field'},
 {'dr_mean': -3.0,
  'dr_std': None,
  'gr_mean': 63.0,
  'gr_std': None,
  'tr_mean': 60.0,
  'tr_std': None,
  'include_zero_deltas': True,
  'edit_type': 'agg'},
 {'dr_mean': -1.75,
  'dr_std': 0.9574271077563381,
  'gr_mean': 20.0,
  'gr_std': 2.8284271247461903,
  'tr_mean': 19.0,
  'tr_st

In [48]:
filter(filter(df,'edit_type', 'mark'), 'prior_dimensions', 1)

Unnamed: 0,same,prior_query,prior_dimensions,edit,edit_type,cdr,adr,cgr,agr,delta_dr,delta_gr,anchored_gain_in_sum_rank,cds,cgs,ads,ags,norm_cds,norm_cgs,norm_ads,norm_ags
0,True,q,1.0,tick <> point,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,1.0,22.0,1.0,0.0,-0.259259,0.0,-0.259259
1,False,q,1.0,tick <> bar,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,65.0,19.0,65.0,0.0,0.821429,0.0,0.821429
2,False,q,1.0,tick <> line,mark,0.0,5.0,,9.0,-5.0,,,39.0,583.0,42.0,523.0,0.0,25.761905,0.5,22.904762
3,False,q,1.0,tick <> area,mark,0.0,1.0,,2.0,-1.0,,,39.0,584.0,40.0,524.0,0.0,27.2,0.142857,24.2
4,True,q,1.0,tick <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,-0.126316,0.0,-0.126316
60,False,bin(q),1.0,bar <> point,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,2.0,46.0,2.0,0.0,-0.916667,0.0,-0.916667
61,False,bin(q),1.0,bar <> bar,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,26.0,0.0,0.0,-1.625,0.0,-1.625
62,False,bin(q),1.0,bar <> line,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,4.0,46.0,4.0,0.0,-14.0,0.0,-14.0
63,False,bin(q),1.0,bar <> area,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,46.0,3.0,46.0,3.0,0.0,-21.5,0.0,-21.5
64,False,bin(q),1.0,bar <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,2.0,51.0,2.0,0.0,-3.0625,0.0,-3.0625
