# Imports

In [None]:
import argparse
import time
import os

import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader

from mscn.util import *
from mscn.data import get_train_datasets, load_data, make_dataset
from mscn.model import SetConv

: 

# Functions

In [None]:
def unnormalize_torch(vals, min_val, max_val):
    """Undo the min/max normalization of ``vals`` and exponentiate the result.

    Args:
        vals: tensor of normalized values.
        min_val: offset added back after rescaling.
        max_val: together with ``min_val`` defines the scale ``max_val - min_val``.

    Returns:
        ``torch.exp(vals * (max_val - min_val) + min_val)``, same shape as ``vals``.
    """
    span = max_val - min_val
    rescaled = (vals * span) + min_val
    return torch.exp(rescaled)


def qerror_loss(preds, targets, min_val, max_val):
    """Mean q-error between predictions and targets, computed on the unnormalized scale.

    Both inputs are first passed through ``unnormalize_torch``. For every sample
    the q-error is ``pred / target`` when ``pred > target`` and ``target / pred``
    otherwise; the mean over all samples is returned.

    Args:
        preds: tensor of normalized predictions, one per sample.
        targets: tensor of normalized targets with the same number of elements
            as ``preds``.
        min_val: normalization lower bound forwarded to ``unnormalize_torch``.
        max_val: normalization upper bound forwarded to ``unnormalize_torch``.

    Returns:
        A 0-dimensional tensor holding the mean q-error; it stays attached to
        the autograd graph of ``preds``.
    """
    # Flatten both sides so samples are paired one-to-one along a single axis.
    # Without this, inputs of different rank (e.g. a column of predictions
    # against a flat vector of targets -- presumably the case for this model,
    # verify against the dataset) would broadcast against each other instead
    # of being compared row by row.
    preds = unnormalize_torch(preds, min_val, max_val).reshape(-1)
    targets = unnormalize_torch(targets, min_val, max_val).reshape(-1)

    # Element-wise form of the per-sample branch. This avoids one host
    # round-trip per sample and also works when the inputs are 1-D.
    qerror = torch.where(preds > targets, preds / targets, targets / preds)
    return torch.mean(qerror)


def predict(model, data_loader, cuda):
    """Run ``model`` over every batch of ``data_loader`` and collect its outputs.

    Args:
        model: module called as ``model(samples, predicates, joins,
            sample_masks, predicate_masks, join_masks)``. It is switched to
            eval mode and left in that mode.
        data_loader: iterable yielding 7-tuples ``(samples, predicates, joins,
            targets, sample_masks, predicate_masks, join_masks)``. The targets
            are not used for prediction.
        cuda: when truthy, the model inputs of each batch are moved with
            ``.cuda()`` before the forward pass.

    Returns:
        A tuple ``(preds, t_total)``. ``preds`` is a list with one entry per
        output row (``outputs.data[i]``), in iteration order. ``t_total`` is
        the accumulated wall-clock time, in seconds, spent inside the model
        calls.
    """
    preds = []
    t_total = 0.

    model.eval()
    # Inference only: with autograd disabled no graph is recorded for the
    # forward passes, which saves memory and time.
    with torch.no_grad():
        for data_batch in data_loader:
            samples, predicates, joins, _targets, sample_masks, predicate_masks, join_masks = data_batch

            if cuda:
                samples, predicates, joins = samples.cuda(), predicates.cuda(), joins.cuda()
                sample_masks, predicate_masks, join_masks = (
                    sample_masks.cuda(), predicate_masks.cuda(), join_masks.cuda())

            # Tensors are passed to the model directly; wrapping them in
            # torch.autograd.Variable is a deprecated no-op.
            t = time.time()
            outputs = model(samples, predicates, joins, sample_masks, predicate_masks, join_masks)
            t_total += time.time() - t

            preds.extend(outputs.data[i] for i in range(outputs.data.shape[0]))

    return preds, t_total


def print_qerror(preds_unnorm, labels_unnorm):
    """Print summary statistics of the q-error between predictions and labels.

    Args:
        preds_unnorm: sequence whose items are themselves indexable; element
            ``[0]`` of each item is taken as the predicted value.
        labels_unnorm: sequence of scalar labels; each entry is converted with
            ``float()`` before use.

    Prints the median, the 90th/95th/99th percentiles, the maximum and the
    mean of the per-sample q-errors. Returns nothing.
    """
    ratios = []
    for idx, pred_row in enumerate(preds_unnorm):
        # Each prediction is wrapped in an array-like, so unpack its scalar;
        # the label is already a scalar.
        estimate = pred_row[0]
        truth = float(labels_unnorm[idx])
        if estimate > truth:
            ratios.append(estimate / truth)
        else:
            ratios.append(truth / float(estimate))

    # (format string, statistic) pairs, evaluated and printed one at a time.
    summaries = (
        ("Median: {}", np.median),
        ("90th percentile: {}", lambda q: np.percentile(q, 90)),
        ("95th percentile: {}", lambda q: np.percentile(q, 95)),
        ("99th percentile: {}", lambda q: np.percentile(q, 99)),
        ("Max: {}", np.max),
        ("Mean: {}", np.mean),
    )
    for template, statistic in summaries:
        print(template.format(statistic(ratios)))

: 

In [5]:
# Run configuration. These module-level values stand in for the command-line
# arguments (testset, queries, epochs, batch, hid, cuda) that a
# train_and_predict(...) call used to receive; there is no `args` object in
# this notebook.
workload_name = 'job-light'  # used to build the "workloads/<name>" input path and the results file name
num_queries = 100000  # passed to get_train_datasets()
num_epochs = 10  # number of passes of the training loop
batch_size = 1024  # batch size of every DataLoader built below
hid_units = 256  # passed to SetConv as its last argument
cuda = False  # when True, predict() moves each batch to the GPU with .cuda()

In [6]:
# Load training and validation data
num_materialized_samples = 1000
(dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test,
 max_num_joins, max_num_predicates, train_data, test_data) = get_train_datasets(
    num_queries, num_materialized_samples)
table2vec, column2vec, op2vec, join2vec = dicts

# Train model
# Input widths of the three feature sets, derived from the encoding
# dictionaries returned above.
sample_feats = len(table2vec) + num_materialized_samples
predicate_feats = len(column2vec) + len(op2vec) + 1
join_feats = len(join2vec)

model = SetConv(sample_feats, predicate_feats, join_feats, hid_units)

# Honor the `cuda` flag during training as well: predict() is later called
# with the same flag and moves its batches to the GPU, so the model has to be
# on that device too. The move happens before the optimizer is constructed so
# the optimizer holds the parameters that are actually trained.
if cuda:
    model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_data_loader = DataLoader(train_data, batch_size=batch_size)
test_data_loader = DataLoader(test_data, batch_size=batch_size)

model.train()

for epoch in range(num_epochs):
    loss_total = 0.

    for batch_idx, data_batch in enumerate(train_data_loader):
        samples, predicates, joins, targets, sample_masks, predicate_masks, join_masks = data_batch

        if cuda:
            samples, predicates, joins, targets = samples.cuda(), predicates.cuda(), joins.cuda(), targets.cuda()
            sample_masks, predicate_masks, join_masks = sample_masks.cuda(), predicate_masks.cuda(), join_masks.cuda()

        # Tensors are fed to the model directly; wrapping them in
        # torch.autograd.Variable is a deprecated no-op.
        optimizer.zero_grad()
        outputs = model(samples, predicates, joins, sample_masks, predicate_masks, join_masks)
        loss = qerror_loss(outputs, targets.float(), min_val, max_val)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()

    # Average of the per-batch mean q-errors for this epoch.
    print("Epoch {}, loss: {}".format(epoch, loss_total / len(train_data_loader)))

# Get final training and validation set predictions
# predict() switches the model to eval mode and returns the per-row outputs
# together with the seconds spent inside the model calls; dividing by the
# number of labels and multiplying by 1000 reports milliseconds per sample.
preds_train, t_total = predict(model, train_data_loader, cuda)
print("Prediction time per training sample: {}".format(t_total / len(labels_train) * 1000))

# Here the `*_test` names refer to the validation split returned by
# get_train_datasets(); t_total is overwritten with the validation timing.
preds_test, t_total = predict(model, test_data_loader, cuda)
print("Prediction time per validation sample: {}".format(t_total / len(labels_test) * 1000))

# Unnormalize
# Predictions and labels are both mapped back with the same min_val/max_val
# that were used for training (unnormalize_labels is not defined in this
# notebook -- presumably provided by `from mscn.util import *`).
preds_train_unnorm = unnormalize_labels(preds_train, min_val, max_val)
labels_train_unnorm = unnormalize_labels(labels_train, min_val, max_val)

preds_test_unnorm = unnormalize_labels(preds_test, min_val, max_val)
labels_test_unnorm = unnormalize_labels(labels_test, min_val, max_val)

# Print metrics
print("\nQ-Error training set:")
print_qerror(preds_train_unnorm, labels_train_unnorm)

print("\nQ-Error validation set:")
print_qerror(preds_test_unnorm, labels_test_unnorm)
print("")

# Load test data
# NOTE: from here on several module-level names are rebound. `joins`,
# `predicates` and `samples` (previously the last training batch) now hold the
# loaded workload, and `labels_test`, `max_num_predicates`, `max_num_joins`,
# `test_data`, `test_data_loader`, `preds_test` and `preds_test_unnorm` are
# replaced by their test-workload counterparts further down.
file_name = "workloads/" + workload_name
joins, predicates, tables, samples, label = load_data(file_name, num_materialized_samples)

# Get feature encoding and proper normalization
# The encoding dictionaries and min_val/max_val obtained for the training data
# are reused, so the workload is encoded and normalized the same way.
samples_test = encode_samples(tables, samples, table2vec)
predicates_test, joins_test = encode_data(predicates, joins, column_min_max_vals, column2vec, op2vec, join2vec)
labels_test, _, _ = normalize_labels(label, min_val, max_val)

print("Number of test samples: {}".format(len(labels_test)))

# Set sizes are recomputed from this workload and passed to make_dataset().
max_num_predicates = max([len(p) for p in predicates_test])
max_num_joins = max([len(j) for j in joins_test])

# Get test set predictions
test_data = make_dataset(samples_test, predicates_test, joins_test, labels_test, max_num_joins, max_num_predicates)
test_data_loader = DataLoader(test_data, batch_size=batch_size)

preds_test, t_total = predict(model, test_data_loader, cuda)
print("Prediction time per test sample: {}".format(t_total / len(labels_test) * 1000))

# Unnormalize
preds_test_unnorm = unnormalize_labels(preds_test, min_val, max_val)

# Print metrics
# The raw `label` values from load_data() are passed as-is; print_qerror()
# converts each one with float().
print("\nQ-Error " + workload_name + ":")
print_qerror(preds_test_unnorm, label)

# Write predictions
# One "<prediction>,<label>" line per test query.
file_name = "results/predictions_" + workload_name + ".csv"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
with open(file_name, "w") as f:
    for i in range(len(preds_test_unnorm)):
        # Each prediction is wrapped in an array-like (print_qerror unpacks it
        # the same way), so write its scalar entry rather than the array repr,
        # which would put "[...]" into the CSV. str() on the label keeps the
        # write working whether labels are strings or numbers.
        f.write(str(preds_test_unnorm[i][0]) + "," + str(label[i]) + "\n")


Loaded queries
Loaded bitmaps
min log(label): 0.0
max log(label): 19.94772801931604
Number of training samples: 90000
Number of validation samples: 10000
Created TensorDataset for training data
Created TensorDataset for validation data
Epoch 0, loss: 87.94611702182077
Epoch 1, loss: 9.152028847824443
Epoch 2, loss: 6.51227083531293
Epoch 3, loss: 5.602087746966969
Epoch 4, loss: 5.2040769837119365
Epoch 5, loss: 4.928066570650447
Epoch 6, loss: 4.732810521667654
Epoch 7, loss: 4.569513911550695
Epoch 8, loss: 4.452799834988334
Epoch 9, loss: 4.348687020215121
Prediction time per training sample: 0.011015070809258355
Prediction time per validation sample: 0.0111663818359375

Q-Error training set:
Median: 1.837569303617152
90th percentile: 6.628471198711917
95th percentile: 12.214382239382244
99th percentile: 45.0
Max: 1408.3290322580644
Mean: 4.180855109543047

Q-Error validation set:
Median: 1.878830593089245
90th percentile: 7.400454086129686
95th percentile: 14.823587260519819
99th p