# Find Bugs

Use this notebook to find wrong predictions. We can then visualize the corresponding pairs of specs later.

In [2]:
from draco.learn import data_util
from draco.learn import linear
from sklearn import linear_model
import random
import json
import numpy as np

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [4]:
# Load the train/dev examples and build the paired inputs X and labels y from them.
train_dev, _ = data_util.load_data()
X, y = linear.prepare_paired_data(train_dev)

# NOTE(review): if paired_train_test_split shuffles, it needs a fixed seed as well
# for the scores below to be repeatable - confirm against data_util.
X_train, X_dev, y_train, y_dev = data_util.paired_train_test_split(X, y)

# The 'sag' solver shuffles the data while fitting; pin random_state so the fitted
# weights (and therefore the list of mispredicted pairs) do not change between runs.
clf = linear_model.LogisticRegression(solver='sag', random_state=0)
clf.fit(X_train, y_train)

print("Train score: ", clf.score(X_train, y_train))
print("Dev score: ", clf.score(X_dev, y_dev))

Train score:  1.0
Dev score:  0.9973045822102425


In [5]:
# Collect the negative pairs (label 0) that the classifier gets wrong.
negative_pairs = X[y == 0]
predicted = clf.predict(negative_pairs)

# Boolean mask over the rows of `negative_pairs`: True where the prediction is above 0.5.
# These are positions within the pairs, not indexes into the original data.
bug_idx = predicted > 0.5

# Show the positions of the flagged pairs.
np.flatnonzero(bug_idx)

array([481])

In [12]:
# Sanity check: the positions flagged here for the positive pairs (label 1) should match
# the ones flagged for the negative pairs, unless the weight for one feature is 0.
positive_pairs = X[y == 1]
predicted = clf.predict(positive_pairs)

# Boolean mask over the rows of `positive_pairs`: True where the prediction is below 0.5.
bug_idx = predicted < 0.5

# Show the positions of the flagged pairs.
np.flatnonzero(bug_idx)

array([481])

In [13]:
# get the indexes in the original data
# bug_idx is a boolean mask over the pairs and is applied to train_dev.index purely by
# position, so it must have exactly one entry per row of train_dev. Fail with a clear
# message if that alignment does not hold.
assert len(bug_idx) == len(train_dev), "bug_idx does not align with the rows of train_dev"
bugs = train_dev.index[bug_idx]
bugs

Int64Index([1], dtype='int64')

In [14]:
# Load the negative/positive example data. Later cells look entries up by the
# indexes in `bugs` and read `.negative` / `.positive` from each entry.
pos_neg_data = data_util.load_neg_pos_data()

In [17]:
# TODO: generate better data

# Placeholder rows that get attached to every spec so it has some data to render.
# Draw from a dedicated, seeded generator: the rows (and the JSON written from them) are
# then identical on every run, and the global `random` state is left untouched.
rng = random.Random(0)

vals = [{
    'q1': rng.normalvariate(2, 2),
    'q2': rng.normalvariate(2, 2),
    'n': rng.randint(0, 7)
} for _ in range(10)]

In [18]:
# For every mispredicted example, record its negative and positive spec side by side,
# each with the shared sample rows attached under 'data'.
bug_specs = []

for i in bugs:
    example = pos_neg_data[i]

    # Build fresh dicts instead of assigning into example.negative / example.positive:
    # the entries of pos_neg_data stay exactly as loaded, so re-running this cell (or
    # reusing pos_neg_data elsewhere) never sees leftover 'data' keys.
    bug_specs.append({
        'true_negative': dict(example.negative, data={'values': vals}),
        'true_positive': dict(example.positive, data={'values': vals})
    })

In [19]:
# Persist the collected spec pairs as indented JSON.
serialized_specs = json.dumps(bug_specs, indent=2)
with open('../data/bugs/bugs.json', 'w') as out_file:
    out_file.write(serialized_specs)