In [1]:
import numpy as np
import os
import torch
import torch.nn as nn
import time
import pandas as pd
from scipy.stats import pearsonr

In [2]:
from model.util import Normalizer
from model.database_util import get_hist_file, get_job_table_sample, collator
from model.model import QueryFormer
from model.database_util import Encoding
from model.dataset import PlanTreeDataset
from model.trainer import eval_workload, train

In [3]:
data_path = './data/imdb/'

In [4]:
class Args:
    # bs = 1024
    # SQ: smaller batch size
    bs = 128
    lr = 0.001
    # epochs = 200
    epochs = 100
    clip_size = 50
    embed_size = 64
    pred_hid = 128
    ffn_dim = 128
    head_size = 12
    n_layers = 8
    dropout = 0.1
    sch_decay = 0.6
    device = 'cuda:0'
    newpath = './results/full/cost/'
    to_predict = 'cost'
args = Args()

import os
if not os.path.exists(args.newpath):
    os.makedirs(args.newpath)

In [5]:
hist_file = get_hist_file(data_path + 'histogram_string.csv')
cost_norm = Normalizer(-3.61192, 12.290855)
card_norm = Normalizer(1,100)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  hist_file['freq'][i] = freq_np
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, 

In [22]:
encoding_ckpt = torch.load('checkpoints/encoding.pt')
type(encoding_ckpt)

dict

In [23]:
encoding_ckpt.keys()

dict_keys(['encoding'])

In [24]:
encoding = encoding_ckpt['encoding']
type(encoding)

model.database_util.Encoding

## Exploring Encoding object's instance variables (attributes)

In [26]:
print(encoding.column_min_max_vals)
# column_min_max_vals is a dictionary. It has the min and max value for each numeric column in the dataset. 

{'t.id': [1.0, 2528312.0], 't.kind_id': [1.0, 7.0], 't.production_year': [1880.0, 2019.0], 'mc.id': [1.0, 2609129.0], 'mc.company_id': [1.0, 234997.0], 'mc.movie_id': [2.0, 2525745.0], 'mc.company_type_id': [1.0, 2.0], 'ci.id': [1.0, 36244344.0], 'ci.movie_id': [1.0, 2525975.0], 'ci.person_id': [1.0, 4061926.0], 'ci.role_id': [1.0, 11.0], 'mi.id': [1.0, 14835720.0], 'mi.movie_id': [1.0, 2526430.0], 'mi.info_type_id': [1.0, 110.0], 'mi_idx.id': [1.0, 1380035.0], 'mi_idx.movie_id': [2.0, 2525793.0], 'mi_idx.info_type_id': [99.0, 113.0], 'mk.id': [1.0, 4523930.0], 'mk.movie_id': [2.0, 2525971.0], 'mk.keyword_id': [1.0, 134170.0]}


In [27]:
print(encoding.col2idx)
# the label encoding of each unique column in the dataset

{'t.id': 0, 't.kind_id': 1, 't.production_year': 2, 'mc.id': 3, 'mc.company_id': 4, 'mc.movie_id': 5, 'mc.company_type_id': 6, 'ci.id': 7, 'ci.movie_id': 8, 'ci.person_id': 9, 'ci.role_id': 10, 'mi.id': 11, 'mi.movie_id': 12, 'mi.info_type_id': 13, 'mi_idx.id': 14, 'mi_idx.movie_id': 15, 'mi_idx.info_type_id': 16, 'mk.id': 17, 'mk.movie_id': 18, 'mk.keyword_id': 19, 'NA': 20}


In [28]:
print(encoding.op2idx)

{'>': 0, '=': 1, '<': 2, 'NA': 3}


In [29]:
print(encoding.idx2col)

{0: 't.id', 1: 't.kind_id', 2: 't.production_year', 3: 'mc.id', 4: 'mc.company_id', 5: 'mc.movie_id', 6: 'mc.company_type_id', 7: 'ci.id', 8: 'ci.movie_id', 9: 'ci.person_id', 10: 'ci.role_id', 11: 'mi.id', 12: 'mi.movie_id', 13: 'mi.info_type_id', 14: 'mi_idx.id', 15: 'mi_idx.movie_id', 16: 'mi_idx.info_type_id', 17: 'mk.id', 18: 'mk.movie_id', 19: 'mk.keyword_id', 20: 'NA'}


In [None]:
checkpoint = torch.load('checkpoints/cost_model.pt', map_location='cpu')

In [7]:
from model.util import seed_everything
seed_everything()

In [8]:
model = QueryFormer(emb_size = args.embed_size ,ffn_dim = args.ffn_dim, head_size = args.head_size, \
                 dropout = args.dropout, n_layers = args.n_layers, \
                 use_sample = True, use_hist = True, \
                 pred_hid = args.pred_hid
                )

In [9]:
_ = model.to(args.device)

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [10]:
to_predict = 'cost'

In [11]:
imdb_path = './data/imdb/'
dfs = []  # list to hold DataFrames
# SQ: added
for i in range(2):
#for i in range(18):
    file = imdb_path + 'plan_and_cost/train_plan_part{}.csv'.format(i)
    df = pd.read_csv(file)
    dfs.append(df)

full_train_df = pd.concat(dfs)

val_dfs = []  # list to hold DataFrames
for i in range(18,20):
    file = imdb_path + 'plan_and_cost/train_plan_part{}.csv'.format(i)
    df = pd.read_csv(file)
    val_dfs.append(df)

val_df = pd.concat(val_dfs)

In [12]:
table_sample = get_job_table_sample(imdb_path+'train')

Loaded queries with len  100000
Loaded bitmaps


# Step 1: Identifying the training dataset and its component

In [13]:
train_ds = PlanTreeDataset(full_train_df, None, encoding, hist_file, card_norm, cost_norm, to_predict, table_sample)

# Step 2: Exploring full_train_df

In [14]:
type(full_train_df)

pandas.core.frame.DataFrame

In [15]:
full_train_df.shape

(10000, 2)

In [16]:
full_train_df.head(2)

Unnamed: 0,id,json
0,0,"{""Plan"": {""Node Type"": ""Gather"", ""Parallel Awa..."
1,1,"{""Plan"": {""Node Type"": ""Seq Scan"", ""Parallel A..."


In [17]:
full_train_df.dtypes

id       int64
json    object
dtype: object

In [18]:
# printing a sample json string in full
json_sample = full_train_df['json'].iloc[0]
print(json_sample)

{"Plan": {"Node Type": "Gather", "Parallel Aware": false, "Startup Cost": 23540.58, "Total Cost": 154548.95, "Plan Rows": 567655, "Plan Width": 119, "Actual Startup Time": 386.847, "Actual Total Time": 646.972, "Actual Rows": 283812, "Actual Loops": 1, "Workers Planned": 2, "Workers Launched": 2, "Single Copy": false, "Plans": [{"Node Type": "Hash Join", "Parent Relationship": "Outer", "Parallel Aware": true, "Join Type": "Inner", "Startup Cost": 22540.58, "Total Cost": 96783.45, "Plan Rows": 236523, "Plan Width": 119, "Actual Startup Time": 369.985, "Actual Total Time": 518.487, "Actual Rows": 94604, "Actual Loops": 3, "Inner Unique": false, "Hash Cond": "(t.id = mi_idx.movie_id)", "Workers": [], "Plans": [{"Node Type": "Seq Scan", "Parent Relationship": "Outer", "Parallel Aware": true, "Relation Name": "title", "Alias": "t", "Startup Cost": 0.0, "Total Cost": 49166.46, "Plan Rows": 649574, "Plan Width": 94, "Actual Startup Time": 0.366, "Actual Total Time": 147.047, "Actual Rows": 51

In [19]:
# pretty print the json object
import json
# the following code parses the json string into a dictionary
json_parsed = json.loads(json_sample)
json_pretty = json.dumps(json_parsed, indent=4)
print(json_pretty)

with open('output.json', 'w') as f:
    f.write(json_pretty)

{
    "Plan": {
        "Node Type": "Gather",
        "Parallel Aware": false,
        "Startup Cost": 23540.58,
        "Total Cost": 154548.95,
        "Plan Rows": 567655,
        "Plan Width": 119,
        "Actual Startup Time": 386.847,
        "Actual Total Time": 646.972,
        "Actual Rows": 283812,
        "Actual Loops": 1,
        "Workers Planned": 2,
        "Workers Launched": 2,
        "Single Copy": false,
        "Plans": [
            {
                "Node Type": "Hash Join",
                "Parent Relationship": "Outer",
                "Parallel Aware": true,
                "Join Type": "Inner",
                "Startup Cost": 22540.58,
                "Total Cost": 96783.45,
                "Plan Rows": 236523,
                "Plan Width": 119,
                "Actual Startup Time": 369.985,
                "Actual Total Time": 518.487,
                "Actual Rows": 94604,
                "Actual Loops": 3,
                "Inner Unique": false,
         

# Step 3: Exploring encoding object

In [20]:
type(encoding)

model.database_util.Encoding

In [None]:
val_ds = PlanTreeDataset(val_df, None, encoding, hist_file, card_norm, cost_norm, to_predict, table_sample)

In [17]:
crit = nn.MSELoss()
model, best_path = train(model, train_ds, val_ds, crit, cost_norm, args)

OutOfMemoryError: CUDA out of memory. Tried to allocate 46.00 MiB. GPU 0 has a total capacity of 3.94 GiB of which 3.06 MiB is free. Including non-PyTorch memory, this process has 3.91 GiB memory in use. Of the allocated memory 3.70 GiB is allocated by PyTorch, and 146.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
methods = {
    'get_sample' : get_job_table_sample,
    'encoding': encoding,
    'cost_norm': cost_norm,
    'hist_file': hist_file,
    'model': model,
    'device': args.device,
    'bs': 512,
}

In [None]:
_ = eval_workload('job-light', methods)

In [None]:
_ = eval_workload('synthetic', methods)