In [52]:
import pandas as pd
import numpy as np
import csv
from mscn.util import *


# from mscn.data import load_data

# from mscn.data import get_train_datasets, load_data, make_dataset

In [34]:
import os
os.chdir('../')

In [35]:
import pandas as pd
import numpy as np
import csv
from mscn.util import *

In [36]:
from torch.utils.data import dataset
from torch.utils.data import DataLoader

In [37]:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Functions

# Reading dataset

In [38]:
def load_data(file_name):
    joins = []
    predicates = []
    tables = []
    label = []

    num_queries = 0

    # Load queries
    # SQ: changed file open mode from 'rU' to 'r+'
    # SQ: the following function loads training data from train.csv file. 
    # train.csv file has both the input features and the actual cardinality of training queries
    # the following code block reads 1 training sample at a time, tokenize each training sample by # 
    # here '#' separates different kinds of information packed into each training sample: 
    # These are TABLES, JOINS, PREDICATES, and CARDINALITY. Since this generates a list of tokens and the 
    # list index is 0-based, the position 3 in the list is the cardinality. 
    # tables#joins#predicates#db2#actual#template
    with open(file_name, 'r+') as f:
        data_raw = list(list(rec) for rec in csv.reader(f, delimiter='#'))

        num_queries = len(data_raw)

        for i, row in enumerate(data_raw):
            if i == 0:
                continue
            tables.append(row[0].split(','))
            joins.append(row[1].split(','))
            predicates.append(row[2].split(','))
            # SQ: checks the value of cardinality
            # SQ: changed the row index to 4 as in our tpcds dataset the value of the label is in the column index 4
            # if int(row[3]) < 1:
            # SQ: changed the type checking below from int to float
            if float(row[4]) < 1.0:
                print("Queries must have non-zero cardinalities")
                exit(1)
            # SQ: changed the row index to 4, which has the value of the label in our tpcds dataset
            # label.append(row[3])
            label.append(row[4])
    print("Loaded queries")

    # Split predicates
    predicates = [list(chunks(d, 3)) for d in predicates]
    print(type(tables))
    return tables, joins, predicates, label, num_queries





In [43]:
def load_and_encode_train_data(dataset_name):

    # SQ: renamed the train.csv (which had the training dataset for job) to train_job.csv
    # SQ: added tpcds train dataset to the data folder and changing the file name below for tpcds
    file_name_queries = "data/train_{}.csv".format(dataset_name)

    # SQ: changed the following code to read the columns min and max for the tpcds dataset
    file_name_column_min_max_vals = "data/{}_column_min_max_vals.csv".format(dataset_name)

    tables, joins, predicates, label, num_queries = load_data(file_name_queries)

    # Get table name dict
    table_names = get_all_table_names(tables)
    table2vec, idx2table = get_set_encoding(table_names)

    # Get join name dict
    join_set = get_all_joins(joins)
    join2vec, idx2join = get_set_encoding(join_set)

    # Get column name dict
    column_names = get_all_column_names(predicates)
    column2vec, idx2column = get_set_encoding(column_names)
    
    # Get operator name dict
    operators = get_all_operators(predicates)
    op2vec, idx2op = get_set_encoding(operators)


    # Get min and max values for each column
    # SQ: changed file open model from rU to r+
    with open(file_name_column_min_max_vals, 'r+') as f:
        data_raw = list(list(rec) for rec in csv.reader(f, delimiter=','))
        column_min_max_vals = {}
        for i, row in enumerate(data_raw):
            # first row is the header, so skipping it
            if i == 0:
                continue
            # SQ: the following code is checking the data types of the column's min and max values
            # If these values are categorical (str), we're hashing them to generate a numeric value
            if type(row[1]) is str and type(row[2]) is str:
                hash_value_1 = hash(row[1])
                hash_value_2 = hash(row[2])
                row[1] = (hash_value_1 % 1000)
                row[2] = (hash_value_2 % 1000)
            column_min_max_vals[row[0]] = [float(row[1]), float(row[2])]

    # Get feature encoding and proper normalization
    predicates_enc, joins_enc = encode_data(predicates, joins, column_min_max_vals, column2vec, op2vec, join2vec)
    label_norm, min_val, max_val = normalize_labels(label)

    # Split in training and validation samples
    # SQ: multiply the number of queries by 0.9 and round that number. This gives an index around 0.9 split
    num_train = int(num_queries * 0.9)
    num_test = num_queries - num_train

    predicates_train = predicates_enc[:num_train]
    joins_train = joins_enc[:num_train]
    labels_train = label_norm[:num_train]

    predicates_test = predicates_enc[num_train:num_train + num_test]
    joins_test = joins_enc[num_train:num_train + num_test]
    labels_test = label_norm[num_train:num_train + num_test]

    print("Number of training samples: {}".format(len(labels_train)))
    print("Number of validation samples: {}".format(len(labels_test)))

    max_num_joins = max(max([len(j) for j in joins_train]), max([len(j) for j in joins_test]))
    max_num_predicates = max(max([len(p) for p in predicates_train]), max([len(p) for p in predicates_test]))

    dicts = [table2vec, column2vec, op2vec, join2vec]
    train_data = [predicates_train, joins_train]
    test_data = [predicates_test, joins_test]
    return dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test, max_num_joins, max_num_predicates, train_data, test_data


In [44]:
dataset_name = 'tpcds'
dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test, max_num_joins, max_num_predicates, train_data, test_data = load_and_encode_train_data(dataset_name)


FileNotFoundError: [Errno 2] No such file or directory: '../data/train_tpcds.csv'

In [41]:
def make_dataset(predicates, joins, labels, max_num_joins, max_num_predicates):
    """Add zero-padding and wrap as tensor dataset."""

    predicate_masks = []
    predicate_tensors = []
    for predicate in predicates:
        predicate_tensor = np.vstack(predicate)
        num_pad = max_num_predicates - predicate_tensor.shape[0]
        predicate_mask = np.ones_like(predicate_tensor).mean(1, keepdims=True)
        predicate_tensor = np.pad(predicate_tensor, ((0, num_pad), (0, 0)), 'constant')
        predicate_mask = np.pad(predicate_mask, ((0, num_pad), (0, 0)), 'constant')
        predicate_tensors.append(np.expand_dims(predicate_tensor, 0))
        predicate_masks.append(np.expand_dims(predicate_mask, 0))
    predicate_tensors = np.vstack(predicate_tensors)
    predicate_tensors = torch.FloatTensor(predicate_tensors)
    predicate_masks = np.vstack(predicate_masks)
    predicate_masks = torch.FloatTensor(predicate_masks)

    join_masks = []
    join_tensors = []
    for join in joins:
        join_tensor = np.vstack(join)
        num_pad = max_num_joins - join_tensor.shape[0]
        join_mask = np.ones_like(join_tensor).mean(1, keepdims=True)
        join_tensor = np.pad(join_tensor, ((0, num_pad), (0, 0)), 'constant')
        join_mask = np.pad(join_mask, ((0, num_pad), (0, 0)), 'constant')
        join_tensors.append(np.expand_dims(join_tensor, 0))
        join_masks.append(np.expand_dims(join_mask, 0))
    join_tensors = np.vstack(join_tensors)
    join_tensors = torch.FloatTensor(join_tensors)
    join_masks = np.vstack(join_masks)
    join_masks = torch.FloatTensor(join_masks)

    target_tensor = torch.FloatTensor(labels)

    return dataset.TensorDataset(predicate_tensors, join_tensors, target_tensor, predicate_masks, join_masks)


In [39]:

class SetConv(nn.Module):
    def __init__(self, sample_feats, predicate_feats, join_feats, hid_units):
        super(SetConv, self).__init__()
        self.sample_mlp1 = nn.Linear(sample_feats, hid_units)
        self.sample_mlp2 = nn.Linear(hid_units, hid_units)
        self.predicate_mlp1 = nn.Linear(predicate_feats, hid_units)
        self.predicate_mlp2 = nn.Linear(hid_units, hid_units)
        self.join_mlp1 = nn.Linear(join_feats, hid_units)
        self.join_mlp2 = nn.Linear(hid_units, hid_units)
        self.out_mlp1 = nn.Linear(hid_units * 3, hid_units)
        self.out_mlp2 = nn.Linear(hid_units, 1)
    def forward(self, samples, predicates, joins, sample_mask, predicate_mask, join_mask):
        # samples has shape [batch_size x num_joins+1 x sample_feats]
        # predicates has shape [batch_size x num_predicates x predicate_feats]
        # joins has shape [batch_size x num_joins x join_feats]

        hid_sample = F.relu(self.sample_mlp1(samples))
        hid_sample = F.relu(self.sample_mlp2(hid_sample))
        hid_sample = hid_sample * sample_mask  # Mask
        hid_sample = torch.sum(hid_sample, dim=1, keepdim=False)
        sample_norm = sample_mask.sum(1, keepdim=False)
        hid_sample = hid_sample / sample_norm  # Calculate average only over non-masked parts

        hid_predicate = F.relu(self.predicate_mlp1(predicates))
        hid_predicate = F.relu(self.predicate_mlp2(hid_predicate))
        hid_predicate = hid_predicate * predicate_mask
        hid_predicate = torch.sum(hid_predicate, dim=1, keepdim=False)
        predicate_norm = predicate_mask.sum(1, keepdim=False)
        hid_predicate = hid_predicate / predicate_norm

        hid_join = F.relu(self.join_mlp1(joins))
        hid_join = F.relu(self.join_mlp2(hid_join))
        hid_join = hid_join * join_mask
        hid_join = torch.sum(hid_join, dim=1, keepdim=False)
        join_norm = join_mask.sum(1, keepdim=False)
        hid_join = hid_join / join_norm

        hid = torch.cat((hid_sample, hid_predicate, hid_join), 1)
        hid = F.relu(self.out_mlp1(hid))
        out = torch.sigmoid(self.out_mlp2(hid))
        return out

# GET_TRAIN_DATA steps

In [25]:
dataset_name = 'tpcds'
dicts, column_min_max_vals, min_val, max_val, labels_train, labels_test, max_num_joins, max_num_predicates, train_data, test_data = load_and_encode_train_data(dataset_name)

table2vec, column2vec, op2vec, join2vec = dicts

train_dataset = make_dataset(*train_data, labels=labels_train, max_num_joins=max_num_joins, max_num_predicates=max_num_predicates)

test_dataset = make_dataset(*test_data, labels=labels_test, max_num_joins=max_num_joins, max_num_predicates=max_num_predicates)
print("Created TensorDataset for validation data")


Loaded queries
min log(label): 9.550306497851652
max log(label): 11.810560343519638
Number of training samples: 1591
Number of validation samples: 176
Created TensorDataset for validation data


In [26]:
type(train_dataset)

torch.utils.data.dataset.TensorDataset

In [27]:
# write code to explore a TensorDataset
# Generated by WCA for GP
# Print the length of the dataset
print(len(train_dataset))

# Print the first element of the dataset
print(train_dataset[0])


1591
(tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
         0.0000, 0.6487],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000,
         0.0000, 1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000

In [28]:
num_epochs = 10
batch_size = 1024
hid_units = 256
cuda = False

In [29]:
print(test_dataset[0])

(tensor([[ 0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          7.3248e-01],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  1.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
         -1.8750e-01],
        [ 1.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,
          0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+

In [30]:
print(train_dataset[0])

(tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 1.0000,
         0.0000, 0.6487],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 1.0000,
         0.0000, 1.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0

In [31]:
# Train model
sample_feats = len(table2vec)
predicate_feats = len(column2vec) + len(op2vec) + 1
join_feats = len(join2vec)

model = SetConv(sample_feats, predicate_feats, join_feats, hid_units)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


train_data_loader = DataLoader(train_data, batch_size=batch_size)
test_data_loader = DataLoader(test_data, batch_size=batch_size)


In [32]:
model.train()

for epoch in range(num_epochs):
    loss_total = 0.

    for batch_idx, data_batch in enumerate(train_data_loader):
        predicates, joins, targets, predicate_masks, join_masks = data_batch

        predicates, joins, targets = Variable(predicates), Variable(joins), Variable(targets)
        
        predicate_masks, join_masks = Variable(predicate_masks), Variable(join_masks)

        optimizer.zero_grad()
        outputs = model(predicates, joins, predicate_masks, join_masks)
        loss = qerror_loss(outputs, targets.float(), min_val, max_val)
        loss_total += loss.item()
        loss.backward()
        optimizer.step()

    print("Epoch {}, loss: {}".format(epoch, loss_total / len(train_data_loader)))

RuntimeError: each element in list of batch should be of equal size