In [1]:
from dziban.mkiv import Chart
from vega_datasets import data
from copy import deepcopy
import json
from tqdm import tqdm
import pandas as pd

In [2]:
movies = data('movies')
movies.head(1)

Unnamed: 0,Creative_Type,Director,Distributor,IMDB_Rating,IMDB_Votes,MPAA_Rating,Major_Genre,Production_Budget,Release_Date,Rotten_Tomatoes_Rating,Running_Time_min,Source,Title,US_DVD_Sales,US_Gross,Worldwide_Gross
0,,,Gramercy,6.1,1071.0,R,,8000000.0,Jun 12 1998,,,,The Land Girls,,146083.0,146083.0


In [3]:
base = Chart(movies)

In [4]:
q1 = 'IMDB_Rating'
q2 = 'Rotten_Tomatoes_Rating'
q3 = 'Worldwide_Gross'
n1 = 'Major_Genre'
n2 = 'MPAA_Rating'
n3 = 'Creative_Type'

fields = {
    'q': [q1, q2, q3],
    'n': [n1, n2, n3]
}

marks = ['point', 'bar', 'line', 'area', 'tick', 'rect']
aggs = ['mean']

In [5]:
def get_props_from_transform(transform):
    props = {}
    if (transform == 'bin'):
        props['bin'] = True
    elif (transform == 'agg'):
        props['aggregate'] = 'mean'
        
    return props

In [6]:
def generate_queries(currdims, targetdims, curr_query, queries):
    if (currdims < targetdims):
        for fieldtype in ['q', 'n']:
            if (fieldtype == 'q'):
                for transform in ['raw', 'bin', 'agg']:
                    next_query = deepcopy(curr_query)
                    next_query.append({ 'fieldtype': fieldtype, 'transform': transform })
                    queries.append(next_query)

                    generate_queries(currdims + 1, targetdims, next_query, queries)
            else:
                next_query = deepcopy(curr_query)
                next_query.append({ 'fieldtype': fieldtype, 'transform': 'raw' })
                queries.append(next_query)

                generate_queries(currdims + 1, targetdims, next_query, queries)

In [7]:
def dedupe_queries(queries):
    seen = set()
    
    unique = []
    for query in queries:
        reps = {}
        for field in query:
            string = '{0}({1})'.format(field['transform'], field['fieldtype'])
            if string not in reps:
                reps[string] = 0
            reps[string] += 1
            
        stringified = json.dumps(reps, sort_keys=True)
        
        if stringified not in seen:
            seen.add(stringified)
            unique.append(query)
            
    return unique
            

In [8]:
def query_to_dziban(prior, query, available_fields):
    chart = prior
    
    used_fields = {}
    
    for field in query:
        fieldtype = field['fieldtype']
        transform = field['transform']
        
        fieldname = available_fields[fieldtype].pop(0)
    
        if (fieldtype not in used_fields):
            used_fields[fieldtype] = []
        used_fields[fieldtype].append(fieldname)
        
        props = get_props_from_transform(transform)
        
        props['type'] = 'quantitative' if fieldtype == 'q' else 'nominal'
            
        chart = chart.field(fieldname, **props)
        
    return {
        'chart': chart,
        'query': query,
        'available_fields': available_fields,
        'used_fields': used_fields
    }
        

In [9]:
priors = []
generate_queries(0, 2, [], priors)
priors = dedupe_queries(priors)

In [10]:
charts = [query_to_dziban(base, query, deepcopy(fields)) for query in priors]

In [11]:
mark_edits = []
for mark in marks:
    mark_edits.append({
        'type': 'mark',
        'mark': mark
    })

In [12]:
add_field_edits = []
for fieldtype in ['q', 'n']:
    if (fieldtype == 'q'):
        for transform in ['raw', 'bin', 'agg']:
            add_field_edits.append({
                'type': 'add_field',
                'fieldtype': fieldtype,
                'transform': transform
            })
            
    else:
        add_field_edits.append({
            'type': 'add_field',
            'fieldtype': fieldtype,
            'transform': 'raw'
        })

In [13]:
bin_edits = [
    {
        'type': 'bin'
    }
]

In [14]:
agg_edits = []
for agg in aggs:
    agg_edits.append(
        {
            'type': 'agg',
            'agg': agg
        }
    )

In [15]:
edits = mark_edits + add_field_edits + bin_edits + agg_edits

In [16]:
def edit_dziban(dzi, edit):
    edited_chart = dzi['chart']
    etype = edit['type']
    
    if etype == 'mark':
        edited_chart = edited_chart.mark(edit['mark'])
    elif etype == 'add_field':
        available_fields = dzi['available_fields']
        field = available_fields[edit['fieldtype']].pop(0)
        
        if (edit['fieldtype'] not in dzi['used_fields']):
            dzi['used_fields'][edit['fieldtype']] = []
        dzi['used_fields'][edit['fieldtype']].append(field)
        
        props = get_props_from_transform(edit['transform'])
        
        edited_chart = edited_chart.field(field, **props)
    elif etype == 'bin':
        used_fields = dzi['used_fields']
        
        if ('q' not in used_fields):
            return None
        
        field_to_bin = used_fields['q'][0]
        
        edited_chart = edited_chart.field(field_to_bin, bin=True)
    elif etype == 'agg':
        used_fields = dzi['used_fields']
        
        if ('q' not in used_fields):
            return None
        
        field_to_agg = used_fields['q'][0]
        edited_chart = edited_chart.field(field_to_agg, aggregate=edit['agg'])
        
    cold = edited_chart
    anchored = edited_chart.anchor_on(dzi['chart'])
    
    return {
        'prior': dzi['chart'],
        'prior_query': dzi['query'],
        'edit': edit,
        'cold': cold,
        'anchored': anchored,
        'available_fields': dzi['available_fields'],
        'used_fields': dzi['used_fields']
    }

In [17]:
nexts = []
for i in tqdm(range(len(charts))):
    chart = charts[i]
    for edit in edits:
        prior = deepcopy(chart)
        edited = edit_dziban(prior, edit)
        if edited is not None:
            nexts.append(edited)

100%|██████████| 14/14 [00:13<00:00,  1.08it/s]


In [18]:
with_differences = []
for i in tqdm(range(len(nexts))):
    n = nexts[i]
    prior = n['prior']
    cold = n['cold']
    anchored = n['anchored']
    
    if not cold.is_satisfiable():
        n['sat'] = False
        continue
        
    if not anchored.is_satisfiable():
        print(prior._get_vegalite())
        print(n['edit'])
        print('\n'.join(anchored._get_full_query()))
        print(cold._get_asp_complete())
        break
        
    n['sat'] = True
        
#     cold_graphscape = set(cold - prior)
#     anchored_graphscape = set(anchored._get_graphscape_list())
    
#     left_diff = cold_graphscape - anchored_graphscape
#     right_diff = anchored_graphscape - cold_graphscape
    
#     n['left_diff'] = left_diff
#     n['right_diff'] = right_diff
    cold_props = cold._get_draco_sol().props[cold._name]
    cold_props.sort()
    anchored_props = list(anchored._get_draco_sol().props[anchored._name])
    anchored_props.sort()
    
    cold_stats = cold._get_stats(prior)
    
    n['cold_draco_rank'] = cold_stats['draco_rank']
    n['cold_graphscape_rank'] = cold_stats['graphscape_rank']
    n['cold_draco_score'] = cold_stats['draco_score']
    n['cold_norm_draco_score'] = cold_stats['norm_draco_score']
    n['cold_graphscape_score'] = cold_stats['graphscape_score']
    n['cold_norm_graphscape_score'] = cold_stats['norm_graphscape_score']

    anchored_stats = anchored._get_stats()
    n['anchored_draco_rank'] = anchored_stats['draco_rank']
    n['anchored_graphscape_rank'] = anchored_stats['graphscape_rank']
    n['anchored_draco_score'] = anchored_stats['draco_score']
    n['anchored_norm_draco_score'] = anchored_stats['norm_draco_score']
    n['anchored_graphscape_score'] = anchored_stats['graphscape_score']
    n['anchored_norm_graphscape_score'] = anchored_stats['norm_graphscape_score']

    if (json.dumps(cold_props) == json.dumps(anchored_props)):
        n['same'] = True
    else:
        n['same'] = False
    

100%|██████████| 164/164 [14:13<00:00,  7.15s/it]


In [19]:
nexts[60]['cold']._sol.props

{'v_v': ['aggregate(v_v,1,count).',
  'bin(v_v,e0,10).',
  'channel(v_v,1,x).',
  'channel(v_v,e0,y).',
  'encoding(v_v,1).',
  'encoding(v_v,e0).',
  'field(v_v,e0,"IMDB_Rating").',
  'mark(v_v,point).',
  'type(v_v,1,quantitative).',
  'type(v_v,e0,quantitative).',
  'view(v_v).',
  'zero(v_v,1).']}

In [20]:
nexts[60]['anchored']._sol.props

{'anchor0': ['base(anchor0).',
  'view(anchor0).',
  'encoding(anchor0,e0).',
  'encoding(anchor0,1).',
  'mark(anchor0,bar).',
  'zero(anchor0,1).',
  'aggregate(anchor0,1,count).',
  'bin(anchor0,e0,10).',
  'channel(anchor0,e0,x).',
  'channel(anchor0,1,y).',
  'field(anchor0,e0,"IMDB_Rating").',
  'type(anchor0,e0,quantitative).',
  'type(anchor0,1,quantitative).'],
 'v_v': ['view(v_v).',
  'encoding(v_v,e0).',
  'mark(v_v,point).',
  'encoding(v_v,1).',
  'channel(v_v,e0,x).',
  'field(v_v,e0,"IMDB_Rating").',
  'channel(v_v,1,y).',
  'aggregate(v_v,1,count).',
  'zero(v_v,1).',
  'bin(v_v,e0,10).',
  'type(v_v,e0,quantitative).',
  'type(v_v,1,quantitative).']}

In [21]:
nexts[60]['same']

False

In [22]:
def stringify_query(query):
    result = ''
    for i, field in enumerate(query):
        if (i > 0):
            result += ' x '
    
        fieldtype = field['fieldtype']
        transform = field['transform']
        
        if fieldtype == 'n':
            result += 'n'
        elif fieldtype == 'q':
            if transform == 'raw':
                result += 'q'
            else:  
                result += '{0}({1})'.format(field['transform'], field['fieldtype'])
        
    return result

In [23]:
def stringify_edit(prior, edit):
    etype = edit['type']
    
    if etype == 'mark':
        return '{0} <> {1}'.format(prior._get_vegalite()['mark'], edit['mark'])
    elif etype == 'add_field':
        return '+ {0}({1})'.format(edit['transform'], edit['fieldtype'])
    elif etype == 'bin':
        return '<- bin(q)'
    elif etype == 'agg':
        return '<- agg(q)'

In [24]:
data = []
for i in tqdm(range(len(nexts))):
    n = nexts[i]
    prior_query = stringify_query(n['prior_query'])
    edit = stringify_edit(n['prior'], n['edit'])
    
#     left_diff = None
#     if ('left_diff' in n):
#         left_diff = json.dumps([x[0][5:] for x in n['left_diff']])
#         right_diff = json.dumps([x[0][5:] for x in n['right_diff']])

    if 'cold_draco_rank' not in n:
        data.append({
            'prior_query': prior_query,
            'edit': edit
        })
    else:
        cdr = n['cold_draco_rank']
        cgr = n['cold_graphscape_rank']
        adr = n['anchored_draco_rank']
        agr = n['anchored_graphscape_rank']
        
        cds = n['cold_draco_score']
        cgs = n['cold_graphscape_score']
        ads = n['anchored_draco_score']
        ags = n['anchored_graphscape_score']
        
        norm_cds = n['cold_norm_draco_score']
        norm_cgs = n['cold_norm_graphscape_score']
        norm_ads = n['anchored_norm_draco_score']
        norm_ags = n['anchored_norm_graphscape_score']
        
        delta_dr = None
        delta_dr_assume = None
        if (cdr is not None and adr is not None):
            delta_dr = cdr - adr
        
#         if (cdr['rank'] is not None or adr['rank'] is not None):
#             cdr_assume = cdr['rank'] if cdr['rank'] is not None else (cdr['of'] + 1)
#             adr_assume = adr['rank'] if adr['rank'] is not None else (adr['of'] + 1)
            
#             delta_dr_assume = cdr_assume - adr_assume
            
            
        delta_gr = None
        delta_gr_assume = None
        if (cgr is not None and agr is not None):
            delta_gr = cgr - agr
        
#         if (cgr['rank'] is not None or agr['rank'] is not None):
#             cgr_assume = cgr['rank'] if cgr['rank'] is not None else (cgr['of'] + 1)
#             agr_assume = agr['rank'] if agr['rank'] is not None else (agr['of'] + 1)
            
#             delta_gr_assume = cgr_assume - agr_assume
        
        delta_sum_rank = None
        if (delta_dr is not None and delta_gr is not None):
            delta_sum_rank = delta_dr + delta_gr
            
#         delta_sum_rank_assume = None
#         if (delta_dr_assume is not None and delta_gr_assume is not None):
#             delta_sum_rank_assume = delta_dr_assume + delta_gr_assume
        
            
        delta_ds = None
        if (cds is not None and ads is not None):
            delta_ds = cds - ads
    
        delta_gs = None
        if (cgs is not None and ags is not None):
            delta_gs = cgs - ags
        
        data.append({
            'same': n['same'],
            'prior_query': prior_query,
            'prior_dimensions': len(n['prior_query']),
            'edit_type': n['edit']['type'],
            'edit': edit,
#             'left_diff': left_diff,
#             'right_diff': right_diff,
            'cdr': cdr,
            'cgr': cgr,
            'adr': adr,
            'agr': agr,
            'delta_dr': delta_dr,
            'delta_gr': delta_gr,
            'anchored_gain_in_sum_rank': delta_sum_rank,
#             'delta_dr_assume': delta_dr_assume,
#             'delta_gr_assume': delta_gr_assume,
#             'anchored_gain_in_sum_rank_assume': delta_sum_rank_assume,
            'cds': cds,
            'cgs': cgs,
            'ads': ads,
            'ags': ags,
            'norm_cds': norm_cds,
            'norm_cgs': norm_cgs,
            'norm_ads': norm_ads,
            'norm_ags': norm_ags
        })

100%|██████████| 164/164 [00:22<00:00,  7.33it/s]


In [25]:
data

[{'same': True,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> point',
  'cdr': 0,
  'cgr': 0,
  'adr': 0,
  'agr': 0,
  'delta_dr': 0,
  'delta_gr': 0,
  'anchored_gain_in_sum_rank': 0,
  'cds': 22,
  'cgs': 1,
  'ads': 22,
  'ags': 1,
  'norm_cds': 0.0,
  'norm_cgs': -0.25925925925925924,
  'norm_ads': 0.0,
  'norm_ags': -0.26582278481012656},
 {'same': True,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> bar',
  'cdr': 0,
  'cgr': None,
  'adr': 0,
  'agr': 0,
  'delta_dr': 0,
  'delta_gr': None,
  'anchored_gain_in_sum_rank': None,
  'cds': 19,
  'cgs': 582,
  'ads': 19,
  'ags': 65,
  'norm_cds': 0.0,
  'norm_cgs': 10.053571428571429,
  'norm_ads': 0.0,
  'norm_ags': 0.8363636363636363},
 {'same': False,
  'prior_query': 'q',
  'prior_dimensions': 1,
  'edit_type': 'mark',
  'edit': 'tick <> line',
  'cdr': 0,
  'cgr': None,
  'adr': 8,
  'agr': 9,
  'delta_dr': -8,
  'delta_gr': None,
  'anchored_

In [61]:
columns = ['same', 'prior_query', 'prior_dimensions', 'edit', 'edit_type',
           'cdr', 'adr', 'cgr', 'agr',
           'delta_dr', 'delta_gr', 'anchored_gain_in_sum_rank',
           'cds', 'cgs', 'ads', 'ags', 'norm_cds', 'norm_cgs', 'norm_ads', 'norm_ags'
          ]

df = pd.DataFrame(data, columns=columns)

KeyError: (nan, 'occurred at index 5')

In [53]:
nexts[19]['anchored']._get_vegalite()

{'$schema': 'https://vega.github.io/schema/vega-lite/v3.json',
 'data': {'url': 'data/cars.json'},
 'mark': 'point',
 'encoding': {'x': {'type': 'quantitative',
   'field': 'IMDB_Rating',
   'scale': {'zero': True}},
  'y': {'type': 'quantitative',
   'field': 'Rotten_Tomatoes_Rating',
   'scale': {'zero': True}},
  'size': {'type': 'quantitative',
   'field': 'Worldwide_Gross',
   'bin': True,
   'scale': {'zero': True}}}}

In [54]:
nexts[19]['edit']

{'type': 'add_field', 'fieldtype': 'q', 'transform': 'bin'}

# Results

In [55]:
RESULTS = []
import math
import numpy as np

In [56]:
def filter(frame, key, value):
    return frame[frame[key] == value]

In [63]:
df = df[df['prior_dimensions'].notnull()]
df

edit_map = {
    'bin': 'transform',
    'agg': 'transform',
    'mark': 'mark',
    'add_field': 'add_field',
}

df['edit'] = df.apply(lambda x: edit_map[x['edit_type']], axis=1)

In [64]:
df.to_json('data.json', orient='records')
df

Unnamed: 0,same,prior_query,prior_dimensions,edit,edit_type,cdr,adr,cgr,agr,delta_dr,delta_gr,anchored_gain_in_sum_rank,cds,cgs,ads,ags,norm_cds,norm_cgs,norm_ads,norm_ags
0,True,q,1.0,mark,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,1.0,22.0,1.0,0.0,-0.259259,0.000000,-0.265823
1,True,q,1.0,mark,mark,0.0,0.0,,0.0,0.0,,,19.0,582.0,19.0,65.0,0.0,10.053571,0.000000,0.836364
2,False,q,1.0,mark,mark,0.0,8.0,,9.0,-8.0,,,38.0,1039.0,42.0,523.0,0.0,47.476190,0.571429,22.904762
3,False,q,1.0,mark,mark,0.0,3.0,,2.0,-3.0,,,38.0,1040.0,40.0,524.0,0.0,50.000000,0.250000,24.200000
4,True,q,1.0,mark,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,-0.153846,0.000000,-0.153846
6,False,q,1.0,add_field,add_field,0.0,10.0,138.0,127.0,-10.0,11.0,1.0,18.0,460.0,22.0,459.0,0.0,2.046296,0.181818,2.041667
7,False,q,1.0,add_field,add_field,0.0,0.0,729.0,12.0,0.0,717.0,717.0,22.0,915.0,22.0,459.0,0.0,4.232227,0.000000,2.071090
8,False,q,1.0,add_field,add_field,0.0,1.0,420.0,175.0,-1.0,245.0,244.0,26.0,583.0,27.0,523.0,0.0,3.688742,0.050000,3.313333
9,True,q,1.0,add_field,add_field,0.0,0.0,39.0,39.0,0.0,0.0,0.0,30.0,459.0,30.0,459.0,0.0,1.772727,0.000000,1.772727
10,False,q,1.0,transform,bin,0.0,1.0,41.0,19.0,-1.0,22.0,21.0,19.0,582.0,20.0,522.0,0.0,5.744898,0.029412,5.132653


In [59]:
def add_label(records, key, value):
    for r in records:
        r[key] = value
        
    return records

In [73]:
def summarize(dataframe):
    result = []
    
    def measures(frame):
        result = []
        
        count = len(frame)
    
        
        cdr_mean = frame['cdr'].mean()
        cdr_median = frame['cdr'].median()
        
        adr_mean = frame['adr'].mean()
        adr_median = frame['adr'].median()
        
        cgr_mean = frame['cgr'].mean()
        cgr_median = frame['cgr'].median()
        
        agr_mean = frame['agr'].mean()
        agr_median = frame['agr'].median()
        
        
        print('Count:                                                 {0}      '.format(count))
        print('Mean Cold Draco rank:                                  {0} ({1})'.format(cdr_mean))
        print('Mean Anchored Draco rank:                              {0} ({1})'.format(adr_mean))
        print('Mean Cold GraphScape rank:                             {0} ({1})'.format(cgr_mean))
        print('Mean Anchored Graphscape rank:                         {0} ({1})'.format(agr_mean))
        print()
        
        def h(val):
            return val if not np.isnan(val) else None
            
#         result.append({
#             'count': count,
#             'cdr_mean': h(dr_mean),
#             'dr_std': h(dr_std),
#             'gr_mean': h(gr_mean),
#             'gr_std': h(gr_std),
#             'tr_mean': h(tr_mean),
#             'tr_std': h(tr_std)
#         })
              
        return result

    print('%% ALL %%')
    result += add_label(measures(dataframe), 'include_zero_deltas', True)
    
#     print('%% EXCLUDING ZERO GAIN %%')
#     frame = dataframe[(dataframe['delta_dr'] != 0) & (dataframe['delta_gr'] != 0)]
#     result += add_label(measures(frame), 'include_zero_deltas', False)
    
    return result



In [74]:
def summarize_group(f):
    result = []
    
    print('%%%%% TOTAL %%%%%%')
    result += add_label(summarize(f), 'edit', 'total')
    print('%%%%%%%%%%%%%%%%%\n\n')
    
    print('%%%%% MARK %%%%%%')
    mark = f[f['edit'] == 'mark']
    result += add_label(summarize(mark), 'edit', 'mark')
    print('%%%%%%%%%%%%%%%%%\n\n')
    
    print('%%%%% ADD FIELD %%%%%%')
    add_field = f[f['edit'] == 'add_field']
    result += add_label(summarize(add_field), 'edit', 'add_field')
    print('%%%%%%%%%%%%%%%%%\n\n')
    
    print('%%%%% TRANSFORM %%%%%%')
    add_field = f[f['edit'] == 'transform']
    result += add_label(summarize(add_field), 'edit', 'agg')
    print('%%%%%%%%%%%%%%%%%\n\n')
    
#     print('%%%%% BIN %%%%%%')
#     add_field = f[f['edit'] == 'bin']
#     result += add_label(summarize(add_field), 'edit', 'bin')
#     print('%%%%%%%%%%%%%%%%%\n\n')
    
    return result


## Aggregate

In [72]:
RESULTS += add_label(summarize_group(df), 'dims', 'all')

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Count:                                                 149      
Gain in Draco rank (stdev):                            -8.722972972972974 (24.390480173094062)
Gain in Graphscape rank:                               95.38317757009345 (209.3648366098656)
Gain in Total rank (both above must be defined):       89.60747663551402 (202.34272984451667)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Count:                                                 76      
Gain in Draco rank (stdev):                            -1.9066666666666667 (5.20699460272361)
Gain in Graphscape rank:                               24.826923076923077 (116.89759573398473)
Gain in Total rank (both above must be defined):       24.384615384615383 (116.47033685241853)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Count:                                                 56      
Gain in Draco rank (stdev):                            -20.339285714285715 (36.49222920304804)

## 1D

In [37]:
oned = df[df['prior_dimensions'] == 1.0]

In [38]:
RESULTS += add_label(summarize_group(oned), 'dims', 1)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -9.05 (23.652343083367732)
Gain in Graphscape rank:                               126.875 (257.50374253465685)
Gain in Total rank (both above must be defined):       117.40625 (240.9083428133961)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -1.55 (4.297796252916015)
Gain in Graphscape rank:                               0.0 (0.0)
Gain in Total rank (both above must be defined):       0.0 (0.0)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -20.625 (34.53283461673349)
Gain in Graphscape rank:                               269.2 (325.4834206881468)
Gain in Total rank (both above must be defined):       249.06666666666666 (305.8107413479937)

%%%%%%%%%%%%%%%%%


%%%%% AGGREGATE %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            0.0 (0.0)
Gain in G

## 2D

In [39]:
twod = df[df['prior_dimensions'] == 2.0]
RESULTS += add_label(summarize_group(twod), 'dims', 2)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -8.601851851851851 (24.765418991953396)
Gain in Graphscape rank:                               81.94666666666667 (185.4700306040096)
Gain in Total rank (both above must be defined):       77.74666666666667 (184.005441556826)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -2.036363636363636 (5.530935345061068)
Gain in Graphscape rank:                               33.1025641025641 (134.3829553085617)
Gain in Total rank (both above must be defined):       32.51282051282051 (133.92120067115505)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -20.225 (37.673691766032846)
Gain in Graphscape rank:                               206.30434782608697 (246.46619439638604)
Gain in Total rank (both above must be defined):       193.95652173913044 (249.2183194530212)

%%%%%%%%%%%%%%%%%



In [40]:
len(df[df['adr'].isnull()])

1

In [41]:
identical = df[df['same'] == True]

In [42]:
len(identical)

79

In [43]:
without_identicals = df[df['same'] == False]

In [44]:
len(without_identicals)

70

In [45]:
len(df)

149

In [46]:
cold_below_50 = without_identicals[without_identicals['cgr'].isnull()]
len(cold_below_50)

37

In [47]:
anchored_below_50 = without_identicals[without_identicals['adr'].isnull()]
len(anchored_below_50)

1

In [48]:
have_both_ranks = without_identicals[without_identicals['cgr'].notnull() & without_identicals['adr'].notnull()]
len(have_both_ranks)

33

In [49]:
nexts[20]['anchored']._get_stats()

{'draco_score': 41,
 'norm_draco_score': 0.0,
 'draco_rank': 0,
 'draco_of': 1000,
 'graphscape_score': 697,
 'norm_graphscape_score': 6.475247524752476,
 'graphscape_rank': None,
 'graphscape_of': 1000}

In [50]:
summarize_group(without_identicals)

%%%%% TOTAL %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -18.71014492753623 (33.11215409042575)
Gain in Graphscape rank:                               282.09090909090907 (274.6220543351767)
Gain in Total rank (both above must be defined):       263.3636363636364 (269.818668064987)

%%%%%%%%%%%%%%%%%


%%%%% MARK %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -5.5 (7.726577508832744)
Gain in Graphscape rank:                               184.42857142857142 (286.02788741825924)
Gain in Total rank (both above must be defined):       181.14285714285714 (286.6655592448156)

%%%%%%%%%%%%%%%%%


%%%%% ADD FIELD %%%%%%
%% ALL %%
MEAN
Gain in Draco rank (stdev):                            -29.973684210526315 (41.02371551564281)
Gain in Graphscape rank:                               375.5238095238095 (259.19194027739735)
Gain in Total rank (both above must be defined):       347.6190476190476 (260.12717585644066)

%%%%%%%%%%%%%%%%%




[{'dr_mean': -18.71014492753623,
  'dr_std': 33.11215409042575,
  'gr_mean': 282.09090909090907,
  'gr_std': 274.6220543351767,
  'tr_mean': 263.3636363636364,
  'tr_std': 269.818668064987,
  'include_zero_deltas': True,
  'edit_type': 'total'},
 {'dr_mean': -5.5,
  'dr_std': 7.726577508832744,
  'gr_mean': 184.42857142857142,
  'gr_std': 286.02788741825924,
  'tr_mean': 181.14285714285714,
  'tr_std': 286.6655592448156,
  'include_zero_deltas': True,
  'edit_type': 'mark'},
 {'dr_mean': -29.973684210526315,
  'dr_std': 41.02371551564281,
  'gr_mean': 375.5238095238095,
  'gr_std': 259.19194027739735,
  'tr_mean': 347.6190476190476,
  'tr_std': 260.12717585644066,
  'include_zero_deltas': True,
  'edit_type': 'add_field'},
 {'dr_mean': -2.0,
  'dr_std': None,
  'gr_mean': 63.0,
  'gr_std': None,
  'tr_mean': 61.0,
  'tr_std': None,
  'include_zero_deltas': True,
  'edit_type': 'agg'},
 {'dr_mean': -1.75,
  'dr_std': 1.5,
  'gr_mean': 17.25,
  'gr_std': 8.539125638299666,
  'tr_mean': 1

In [51]:
filter(filter(df,'edit_type', 'mark'), 'prior_dimensions', 1)

Unnamed: 0,same,prior_query,prior_dimensions,edit,edit_type,cdr,adr,cgr,agr,delta_dr,delta_gr,anchored_gain_in_sum_rank,cds,cgs,ads,ags,norm_cds,norm_cgs,norm_ads,norm_ags
0,True,q,1.0,tick <> point,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,22.0,1.0,22.0,1.0,0.0,-0.259259,0.0,-0.265823
1,True,q,1.0,tick <> bar,mark,0.0,0.0,,0.0,0.0,,,19.0,582.0,19.0,65.0,0.0,10.053571,0.0,0.836364
2,False,q,1.0,tick <> line,mark,0.0,8.0,,9.0,-8.0,,,38.0,1039.0,42.0,523.0,0.0,47.47619,0.571429,22.904762
3,False,q,1.0,tick <> area,mark,0.0,3.0,,2.0,-3.0,,,38.0,1040.0,40.0,524.0,0.0,50.0,0.25,24.2
4,True,q,1.0,tick <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,12.0,0.0,0.0,-0.153846,0.0,-0.153846
60,False,bin(q),1.0,bar <> point,mark,0.0,1.0,,0.0,-1.0,,,45.0,1386.0,46.0,2.0,0.0,29.130435,0.045455,-0.956522
61,True,bin(q),1.0,bar <> bar,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,26.0,0.0,0.0,-1.625,0.0,-1.625
62,False,bin(q),1.0,bar <> line,mark,0.0,1.0,,0.0,-1.0,,,45.0,1388.0,46.0,4.0,0.0,335.5,0.333333,-14.0
63,True,bin(q),1.0,bar <> area,mark,0.0,0.0,2.0,2.0,0.0,0.0,0.0,45.0,1387.0,45.0,1387.0,0.0,671.0,0.0,671.0
64,True,bin(q),1.0,bar <> tick,mark,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51.0,2.0,51.0,2.0,0.0,-3.0625,0.0,-3.0625
