In [1]:
import pandas as pd
import numpy as np
import copy
import sys
sys.path.append("../")
from parser.utils import load_json, dfs_cardinality, estimate_scan_in_mb
from models.feature.single_xgboost_feature import find_top_k_operators, featurize_one_plan, get_top_k_table_by_size
from utils.load_brad_trace import load_trace, create_concurrency_dataset, load_trace_all_version
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from models.single.stage import SingleStage
from models.concurrency.complex_models import ConcurrentRNN
np.set_printoptions(suppress=True)

In [2]:
parsed_queries_path = "/Users/ziniuw/Desktop/research/Data/AWS_trace/mixed_aurora/aurora_mixed_parsed_queries.json"
plans = load_json(parsed_queries_path, namespace=False)

In [71]:
folder_name = "mixed_aurora"
directory = f"/Users/ziniuw/Desktop/research/Data/AWS_trace/{folder_name}/"
all_raw_trace, all_trace = load_trace_all_version(directory, 8, concat=True)
all_concurrency_df = []
for trace in all_trace:
    concurrency_df = create_concurrency_dataset(trace, engine=None, pre_exec_interval=200)
    all_concurrency_df.append(concurrency_df)
concurrency_df = pd.concat(all_concurrency_df, ignore_index=True)

In [72]:
all_trace[0].head(5)

Unnamed: 0,timestamp,time_since_execution_s,time_of_day,query_idx,run_time_s,engine,g_offset_since_start,g_offset_since_start_s,g_issue_gap_s
0,2024-02-15 19:31:48.824876+00:00,9.645261,00:16,29,110.210543,,0 days 00:00:00,0.0,0.0
823,2024-02-15 19:31:41.425990+00:00,1.873206,00:03,143,3.95836,,0 days 00:00:00,0.0,0.0
2337,2024-02-15 19:31:43.079819+00:00,2.760294,00:04,135,3.47903,,0 days 00:00:01.653829,0.887088,0.887088
2338,2024-02-15 19:31:48.642502+00:00,8.322977,00:13,75,63.228388,,0 days 00:00:07.216512,6.449771,5.562683
1575,2024-02-15 19:31:48.316405+00:00,8.388999,00:13,36,0.853417,,0 days 00:00:06.890415,6.515793,6.515793


In [4]:
np.random.seed(0)
train_idx = np.random.choice(len(concurrency_df), size=int(0.8 * len(concurrency_df)), replace=False)
test_idx = [i for i in range(len(concurrency_df)) if i not in train_idx]
train_trace_df = copy.deepcopy(concurrency_df.iloc[train_idx])
eval_trace_df = concurrency_df.iloc[test_idx]
eval_trace_df = copy.deepcopy(eval_trace_df[eval_trace_df['num_concurrent_queries'] > 0])
print(len(train_trace_df), len(eval_trace_df))

43967 10907


In [5]:
ss = SingleStage(use_table_features=True, true_card=True)
df = ss.featurize_data(train_trace_df, parsed_queries_path)
ss.train(df)

Top 20 operators contains 0.9650782102582758 total operators


In [77]:
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.nn.functional import l1_loss, mse_loss
from tqdm import tqdm
from models.concurrency.seq_to_seq import RNN, LSTM, TransformerModel
from models.feature.complex_rnn_features import (
    collate_fn_padding,
    collate_fn_padding_transformer,
    QueryFeatureSeparatedDataset,
    featurize_queries_complex,
)


def q_loss_func(input, target, min_val=0.001, small_val=5.0, penalty_negative=1e5, lambda_small=0.1):
    """
    :param min_val: the minimal runtime you want the model to predict
    :param small_val: q_loss naturally favors small pred/label, put less weight on those values
    :return:
    """
    qerror = []
    for i in range(len(target)):
        # penalty for negative/too small estimates
        if (input[i] < min_val).data.numpy():
            # influence on loss for a negative estimate is >= penalty_negative constant
            q_err = (1 - input[i]) * penalty_negative
        # use l1_loss for small values, q_loss would explode
        elif (input[i] < small_val).data.numpy() and (target[i] < small_val).data.numpy():
            q_err = torch.abs(target[i] - input[i]) * lambda_small
        # otherwise normal q error
        else:
            if (input[i] > target[i]).data.numpy():
                q_err = torch.log(input[i]) - torch.log(target[i])
            else:
                q_err = torch.log(target[i]) - torch.log(input[i])
        qerror.append(q_err)
    loss = torch.mean(torch.cat(qerror))
    return loss


class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, layers=3):
        super(MLP, self).__init__()
        model = []
        model.append(nn.Linear(input_dim, hidden_dim))
        model.append(nn.ReLU())
        for i in range(layers):
            model.append(nn.Linear(hidden_dim, hidden_dim))
            model.append(nn.ReLU())
        model.append(nn.Dropout(0.9))
        model.append(nn.Linear(hidden_dim, 1))
        self.model = nn.Sequential(*model)
        self.is_train = True

    def forward(self, x1, x2, x3):
        y1 = self.model(x1, x2)
        if self.is_train:
            pred = self.model(y1, x3)
        else:
            pred = y1
        return torch.maximum(pred, torch.tensor(0.01))


class ConcurrentRNN:
    def __init__(
        self,
        stage_model,
        input_size,
        embedding_dim,
        hidden_size,
        output_size=1,
        num_head=4,
        num_layers=4,
        batch_size=128,
        dropout=0.2,
        include_exit=False,
        last_output=True,
        rnn_type="lstm",
    ):
        self.stage_model = stage_model
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_head = num_head
        self.num_layers = num_layers
        self.dropout = dropout
        self.include_exit = include_exit
        self.batch_size = batch_size
        self.rnn_type = rnn_type
        self.loss_func = None
        self.last_output = last_output
        if rnn_type == "vanilla":
            self.model = RNN(input_size, hidden_size, output_size, num_layers)
        elif rnn_type == "lstm":
            self.model = LSTM(
                input_size, embedding_dim, hidden_size, output_size, num_layers, dropout, last_output
            )
        elif rnn_type == "transformer":
            self.model = TransformerModel(input_size, embedding_dim, num_head, hidden_size, num_layers, dropout, output_size)
        else:
            # Todo: implement transformer
            assert False, f"unrecognized rnn type: {rnn_type}"

    def train(
        self,
        df,
        test_df=None,
        lr=0.001,
        weight_decay=2e-5,
        epochs=200,
        loss_function="l1_loss",
        report_every=5,
        val_on_test=False,
    ):
        self.loss_func = loss_function
        predictions = self.stage_model.cache.running_average
        single_query_features = dict()
        for i, f in enumerate(self.stage_model.all_feature):
            single_query_features[i] = f

        if val_on_test:
            assert (
                test_df is not None
            ), "must provide test dataframe to evaluate on test"
            val_df = test_df
            train_df = df
        else:
            # random train-eval split
            train_idx = np.random.choice(
                len(df), size=int(0.85 * len(df)), replace=False
            )
            val_idx = [i for i in range(len(df)) if i not in train_idx]
            val_df = df.iloc[val_idx]
            train_df = df.iloc[train_idx]

        val_x, val_y, val_pre_info_length, val_query_idx = featurize_queries_complex(
            val_df, predictions, single_query_features, include_exit=self.include_exit
        )
        train_x, train_y, train_pre_info_length, train_query_idx = featurize_queries_complex(
            train_df, predictions, single_query_features, include_exit=self.include_exit
        )

        train_dataset = QueryFeatureSeparatedDataset(
            train_x, train_y, train_pre_info_length, train_query_idx
        )
        if self.rnn_type == "transformer":
            collate_fn = collate_fn_padding_transformer()
        else:
            collate_fn = collate_fn_padding
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            collate_fn=collate_fn,
        )
        val_dataset = QueryFeatureSeparatedDataset(val_x, val_y, val_pre_info_length, val_query_idx)
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=collate_fn,
        )
        optimizer = optim.Adam(
            self.model.parameters(), lr=lr, weight_decay=weight_decay
        )
        for epoch in range(epochs):
            batch_loss = 0
            num_batch = 0
            self.model.train()
            for X, x_lengths, y, pre_info_length, query_idx in train_dataloader:
                optimizer.zero_grad()
                pred = self.model(X, x_lengths)
                y = y.reshape(-1, 1)
                if loss_function == "l1_loss":
                    loss = l1_loss(pred, y)
                elif loss_function == "mse_loss":
                    loss = mse_loss(pred, y)
                elif loss_function == "q_loss":
                    loss = q_loss_func(pred, y)
                else:
                    assert False, f"loss function {loss_function} is unrecognized"
                loss.backward()
                nn.utils.clip_grad_norm_(self.model.parameters(), 2)
                optimizer.step()
                batch_loss += loss.item()
                num_batch += 1
            if epoch % report_every == 0:
                train_loss = batch_loss / num_batch
                # Todo: implement eval loss
                print(
                    f"********Epoch {epoch}, training loss: {train_loss} || evaluation loss: ********"
                )
                _ = self.evaluate(val_dataloader, return_per_query=False)

    def predict(self, df, return_per_query=True):
        predictions = self.stage_model.cache.running_average
        single_query_features = dict()
        for i, f in enumerate(self.stage_model.all_feature):
            single_query_features[i] = f
        val_x, val_y, val_pre_info_length, val_query_idx = featurize_queries_complex(
            df, predictions, single_query_features, include_exit=self.include_exit
        )
        val_dataset = QueryFeatureSeparatedDataset(val_x, val_y, val_pre_info_length, val_query_idx)
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=collate_fn_padding,
        )
        return self.evaluate(val_dataloader, return_per_query=return_per_query)

    def evaluate(self, val_dataloader, return_per_query=False):
        self.model.eval()
        all_pred = []
        all_label = []
        all_query_idx = []
        for X, x_lengths, y, pre_info_length, query_idx in tqdm(val_dataloader):
            pred = self.model(X, x_lengths)
            pred = pred.reshape(-1).detach().numpy()
            label = y.numpy()
            all_pred.append(pred)
            all_label.append(label)
            all_query_idx.append(query_idx.numpy())
        all_pred = np.concatenate(all_pred)
        all_pred = np.maximum(all_pred, 0.01)
        all_label = np.concatenate(all_label)
        all_query_idx = np.concatenate(all_query_idx)
        abs_error = np.abs(all_pred - all_label)
        q_error = np.maximum(all_pred / all_label, all_label / all_pred)
        for p in [50, 90, 95]:
            p_a = np.percentile(abs_error, p)
            p_q = np.percentile(q_error, p)
            print(f"{p}% absolute error is {p_a}, q-error is {p_q}")
        if return_per_query:
            preds_per_query = dict()
            labels_per_query = dict()
            for i in range(len(all_query_idx)):
                q_idx = int(all_query_idx[i])
                if q_idx not in preds_per_query:
                    preds_per_query[q_idx] = []
                    labels_per_query[q_idx] = []
                preds_per_query[q_idx].append(all_pred[i])
                labels_per_query[q_idx].append(all_label[i])
            return preds_per_query, labels_per_query
        return all_pred, all_label

    def save_model(self, directory):
        model_path = os.path.join(directory, f"{self.rnn_type}_{self.hidden_size}_{self.num_layers}_{self.loss_func}")
        torch.save(self.model.state_dict(), model_path)

    def load_model(self, directory):
        model_path = os.path.join(directory, f"{self.rnn_type}_{self.hidden_size}_{self.num_layers}_{self.loss_func}")
        self.model.load_state_dict(torch.load(model_path))

In [78]:
rnn = ConcurrentRNN(ss, 
                    input_size=len(ss.all_feature[0]) * 2 + 7,
                    embedding_dim=128,
                    hidden_size=256,
                    num_layers=2
                   )
rnn.train(train_trace_df, eval_trace_df, lr=0.001, loss_function="q_loss", val_on_test=True)

********Epoch 0, training loss: 162.67211886755257 || evaluation loss: ********


100%|███████████████████████████████████████████| 86/86 [00:01<00:00, 49.19it/s]


50% absolute error is 4.17140007019043, q-error is 2.1920669078826904
90% absolute error is 100.94978942871089, q-error is 8.855450630187985
95% absolute error is 175.7823715209958, q-error is 13.901780033111567
********Epoch 5, training loss: 2.9005175814205826 || evaluation loss: ********


100%|███████████████████████████████████████████| 86/86 [00:01<00:00, 68.12it/s]


50% absolute error is 3.0919952392578125, q-error is 1.524682641029358
90% absolute error is 43.867123413085885, q-error is 4.50993366241455
95% absolute error is 101.89528427124, q-error is 7.29283876419066
********Epoch 10, training loss: 7.3729971156910405 || evaluation loss: ********


100%|███████████████████████████████████████████| 86/86 [00:01<00:00, 60.24it/s]


50% absolute error is 2.6798553466796875, q-error is 1.4273020029067993
90% absolute error is 31.294233322143537, q-error is 4.263333320617674
95% absolute error is 74.66441497802728, q-error is 7.143231248855589
********Epoch 15, training loss: 5.791060749051529 || evaluation loss: ********


100%|███████████████████████████████████████████| 86/86 [00:01<00:00, 54.33it/s]


50% absolute error is 2.6489524841308594, q-error is 1.4011907577514648
90% absolute error is 29.342442321777344, q-error is 3.974243402481079
95% absolute error is 64.0548316955564, q-error is 6.5441845893859645


KeyboardInterrupt: 

In [70]:
preds, labels = rnn.predict(eval_trace_df)

NameError: name 'rnn' is not defined

In [14]:
torch.save(rnn.model.state_dict(), "temp")

In [11]:
predictions = rnn.stage_model.cache.running_average
single_query_features = dict()
for i, f in enumerate(rnn.stage_model.all_feature):
    single_query_features[i] = f
val_x, val_y, val_pre_info_length, val_query_idx = featurize_queries_complex(
    eval_trace_df, predictions, single_query_features, include_exit=rnn.include_exit
)
val_dataset = QueryFeatureSeparatedDataset(val_x, val_y, val_pre_info_length, val_query_idx)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=rnn.batch_size,
    shuffle=False,
    collate_fn=collate_fn_padding,
)

In [50]:
for X, x_lengths, y, pre_info_length, query_idx in val_dataloader:
    print(X.shape, x_lengths.shape)
    break

torch.Size([128, 52, 117]) torch.Size([128])


In [12]:
all_pred = []
all_label = []
all_query_idx = []
for X, x_lengths, y, pre_info_length, query_idx in val_dataloader:
    pred = rnn.model(X, x_lengths)
    pred = pred.reshape(-1).detach().numpy()
    label = y.numpy()
    all_pred.append(pred)
    all_label.append(label)
    all_query_idx.append(query_idx.numpy())
all_query_idx = np.concatenate(all_query_idx)
all_pred = np.concatenate(all_pred)
all_pred = np.maximum(all_pred, 0.01)
all_label = np.concatenate(all_label)
preds_per_query = dict()
labels_per_query = dict()
for i in range(len(all_query_idx)):
    q_idx = int(all_query_idx[i])
    if q_idx not in preds_per_query:
        preds_per_query[q_idx] = []
        labels_per_query[q_idx] = []
    preds_per_query[q_idx].append(all_pred[i])
    labels_per_query[q_idx].append(all_label[i])

In [13]:
preds = preds_per_query
labels = labels_per_query

In [18]:
i = 210
idx = np.argsort(preds[i])
print(len(idx))
np.stack((np.asarray(preds[i])[idx], np.asarray(labels[i])[idx]), axis=1)

53


array([[  3.0798578,   3.3398259],
       [  3.239337 ,  82.04062  ],
       [  3.4771993,   3.0326128],
       [  3.734077 ,   3.1415386],
       [  3.7889369,   3.1945686],
       [  3.8987195,   3.3115742],
       [  3.9441261,   3.3261871],
       [  4.0396805,   3.157771 ],
       [  4.2118206,   8.695362 ],
       [  4.2364755,   3.0918634],
       [  4.2902536,  15.797157 ],
       [  4.3081965,   2.8220966],
       [  4.312231 ,   3.8185828],
       [  4.317485 ,   3.8665023],
       [  4.318064 ,   4.8590245],
       [  4.3205786,   3.99297  ],
       [  4.3352427,   3.5229752],
       [  4.372223 ,   4.0343885],
       [  4.4770455,   3.8076053],
       [  4.481619 ,  12.0486765],
       [  4.5407114,  10.00433  ],
       [  4.5709867,  12.684139 ],
       [  4.6152844,   3.1525   ],
       [  4.629574 ,   3.3944745],
       [  4.638817 ,   2.8921077],
       [  4.696923 ,   6.6521206],
       [  4.7498617,   3.2533658],
       [  4.7544203,  93.3347   ],
       [  4.7711816,

In [4]:
get_top_k_table_by_size(plans=plans)

['Nested Loop', 'Index Scan', 'Gather', 'Finalize Aggregate', 'Partial Aggregate', 'Index Only Scan', 'Seq Scan', 'Parallel Seq Scan', 'Parallel Index Scan', 'Hash', 'Hash Join', 'Merge Join', 'Parallel Hash', 'Parallel Hash Join', 'Sort', 'Gather Merge', 'Materialize', 'Aggregate', 'Finalize GroupAggregate', 'Parallel Index Only Scan']


In [5]:
all_feature = []
for i in range(len(plans["parsed_plans"])):
    plan = plans["parsed_plans"][i]
    feature = featurize_one_plan(plan, operators, use_size=True, use_log=True, true_card=False)
    all_feature.append(feature)

In [6]:
features_df = [None] * len(concurrency_df)
for i, rows in concurrency_df.groupby("query_idx"):
    feature = all_feature[i]
    row_idx = rows["index"].values
    for j in row_idx:
        features_df[j] = feature

In [7]:
concurrency_df["features"] = features_df
concurrency_df.head(5)

Unnamed: 0,index,query_idx,runtime,start_time,end_time,pre_exec_info,concur_info,num_concurrent_queries,concur_info_train,num_concurrent_queries_train,features
0,0,29,110.210543,0.0,110.210543,[],"[(143, 0.0, 3.958360195159912), (135, 0.887088...",24,[],0,"[2.0, 27.316941050528534, 2.0, 4.5643502748000..."
1,1,143,3.95836,0.0,3.95836,[],"[(29, 0.0, 110.210542678833), (135, 0.88708800...",2,"[(29, 0.0, 110.210542678833)]",1,"[0.0, 0.0, 1.0, 2.0794427916790545, 0.0, 0.0, ..."
2,2,135,3.47903,0.887088,4.366118,[],"[(29, 0.0, 110.210542678833), (143, 0.0, 3.958...",2,"[(29, 0.0, 110.210542678833), (143, 0.0, 3.958...",2,"[1.0, 14.02569044289473, 1.0, 10.0415088508690..."
3,3,75,63.228388,6.449771,69.678159,"[(143, 0.0, 3.958360195159912), (135, 0.887088...","[(29, 0.0, 110.210542678833), (36, 6.515793, 7...",9,"[(29, 0.0, 110.210542678833)]",1,"[2.0, 8.18868967999963, 2.0, 18.64454854869429..."
4,4,36,0.853417,6.515793,7.36921,"[(143, 0.0, 3.958360195159912), (135, 0.887088...","[(29, 0.0, 110.210542678833), (75, 6.449771, 6...",2,"[(29, 0.0, 110.210542678833), (75, 6.449771, 6...",2,"[4.0, 30.921264920644198, 1.0, 2.7725893472395..."


In [46]:
concurrency_df.to_csv(directory + "temp.csv", header=True, index=False)

In [47]:
df = pd.read_csv(directory + "temp.csv")

In [48]:
df.head(5)

Unnamed: 0,index,query_idx,runtime,start_time,end_time,pre_exec_info,concur_info,num_concurrent_queries,concur_info_train,num_concurrent_queries_train,features
0,0,29,110.210543,0.0,110.210543,[],"[(143, 0.0, 3.958360195159912), (135, 0.887088...",24,[],0,"[2.0, 27.316941050528534, 2.0, 4.5643502748000..."
1,1,143,3.95836,0.0,3.95836,[],"[(29, 0.0, 110.210542678833), (135, 0.88708800...",2,"[(29, 0.0, 110.210542678833)]",1,"[0.0, 0.0, 1.0, 2.0794427916790545, 0.0, 0.0, ..."
2,2,135,3.47903,0.887088,4.366118,[],"[(29, 0.0, 110.210542678833), (143, 0.0, 3.958...",2,"[(29, 0.0, 110.210542678833), (143, 0.0, 3.958...",2,"[1.0, 14.02569044289473, 1.0, 10.0415088508690..."
3,3,75,63.228388,6.449771,69.678159,"[(143, 0.0, 3.958360195159912), (135, 0.887088...","[(29, 0.0, 110.210542678833), (36, 6.515793, 7...",9,"[(29, 0.0, 110.210542678833)]",1,"[2.0, 8.18868967999963, 2.0, 18.64454854869429..."
4,4,36,0.853417,6.515793,7.36921,"[(143, 0.0, 3.958360195159912), (135, 0.887088...","[(29, 0.0, 110.210542678833), (75, 6.449771, 6...",2,"[(29, 0.0, 110.210542678833), (75, 6.449771, 6...",2,"[4.0, 30.921264920644198, 1.0, 2.7725893472395..."


In [10]:
a = concurrency_df["features"].iloc[:5].values

In [13]:
np.asarray(list(a))

array([[ 2.        , 27.31694105,  2.        ,  4.56435027,  1.        ,
         3.46573622,  1.        ,  2.77258935,  1.        ,  2.77258935,
         0.        ,  0.        ,  1.        ,  6.10479325,  0.        ,
         0.        ,  1.        , 15.30238121,  1.        ,  6.10479325,
         1.        , 14.60034318,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  1.        ,  2.07944279,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        , 12.89763858,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  1.

In [15]:
plans['parsed_plans'][0]

{'plain_content': [],
 'plan_parameters': {'op_name': 'Finalize Aggregate',
  'est_startup_cost': 1124502.37,
  'est_cost': 1124502.38,
  'est_card': 1.0,
  'est_width': 8.0,
  'act_startup_cost': 7210.412,
  'act_time': 7213.45,
  'act_card': 1.0,
  'output_columns': [{'aggregation': 'MIN', 'columns': [100]}],
  'act_children_card': 3.0,
  'est_children_card': 2.0,
  'workers_planned': 0},
 'children': [{'plain_content': [],
   'plan_parameters': {'op_name': 'Gather',
    'est_startup_cost': 1124502.16,
    'est_cost': 1124502.37,
    'est_card': 2.0,
    'est_width': 8.0,
    'act_startup_cost': 7210.403,
    'act_time': 7213.444,
    'act_card': 3.0,
    'workers_planned': 0,
    'output_columns': [{'aggregation': 'MIN', 'columns': [100]}],
    'act_children_card': 1.0,
    'est_children_card': 1.0},
   'children': [{'plain_content': [],
     'plan_parameters': {'op_name': 'Partial Aggregate',
      'est_startup_cost': 1123502.16,
      'est_cost': 1123502.17,
      'est_card': 1.0,

In [16]:
my_dict = {'a': 1, 'b': 2, 'c': 3}
print(list(my_dict.keys()))

['a', 'b', 'c']


In [19]:
plans.keys()

dict_keys(['parsed_plans', 'parsed_queries', 'sql_queries', 'database_stats', 'run_kwargs', 'skipped', 'blocks_accessed'])

In [5]:
plans["database_stats"]["table_stats"][3]

{'relname': 'aka_title_brad_source',
 'reltuples': 361472.0,
 'relpages': 7338,
 'relcols': 13}

In [7]:
input_size = 10
batch_size = 3
seq_lengths = [10, 8, 6]  # Lengths of sequences in the batch
max_seq_length = max(seq_lengths)
input_data = torch.randn(batch_size, max_seq_length, input_size)  # Random input data


In [9]:
packed_input = pack_padded_sequence(input_data, seq_lengths, batch_first=True, enforce_sorted=False)

In [14]:
packed_input.data.shape

torch.Size([24, 10])

In [15]:
input_data.shape

torch.Size([3, 10, 10])

In [19]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Assuming you have a custom dataset class, replace YourDataset with your actual dataset class
class YourDataset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

# Example sequences (list of tensors)
sequences = [torch.tensor([1, 2, 3]), torch.tensor([4, 5]), torch.tensor([6])]

# Create a PyTorch dataset
dataset = YourDataset(sequences)

# Define a collate function to pad sequences
def collate_fn(batch):
    # Sort batch by sequence length (optional but recommended for efficiency)
    batch.sort(key=lambda x: len(x), reverse=True)
    # Pad sequences to the maximum length per batch
    padded_batch = pad_sequence(batch, batch_first=True, padding_value=0)
    return padded_batch

# Create a DataLoader with the collate_fn
# Adjust batch_size and other DataLoader parameters as needed
dataloader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)

# Example usage of the DataLoader
for batch in dataloader:
    print(batch)

tensor([[1, 2, 3],
        [4, 5, 0]])
tensor([[6]])


In [None]:
np.concatenate(np.asarray(1), )

In [26]:
concurrency_df["concur_info"].iloc[1]

[(29, 0.0, 110.210542678833), (135, 0.8870880000000001, 4.366118370712281)]

In [27]:
a = concurrency_df["concur_info_train"].iloc[1][0]
b = concurrency_df["concur_info"].iloc[1][0]
a == b

True

In [28]:
torch.FloatTensor(np.ones(4))

tensor([1., 1., 1., 1.])

In [30]:
torch.stack([torch.ones(5), torch.ones(5)])

torch.Size([2, 5])

In [26]:
src_key_padding_mask = torch.zeros((5, 10), dtype=int)


In [20]:
a = torch.ones((5, 10, 11))

In [9]:
a.shape

torch.Size([5, 10, 10])

In [18]:
b = torch.tensor([8, 5, 2, 3, 1])

In [19]:
a[torch.arange(a.shape[0]), b].shape

torch.Size([5, 11])

In [21]:
torch.mean(a, dim=1).shape

torch.Size([5, 11])

In [7]:
pre_info_length = torch.zeros(10, dtype=int)
torch.maximum(pre_info_length, torch.ones(len(pre_info_length)))

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [8]:
int(pre_info_length[0])

0

In [61]:
import torch
import torch.nn as nn
from torch.nn.functional import l1_loss, mse_loss
import torch.optim as optim


# Assuming B is the output of some nn.Module
# Example:
class MyModule(nn.Module):
    def __init__(self, input_size, output_size):
        super(MyModule, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        return self.linear(x)


class MyModule2(nn.Module):
    def __init__(self, input_size, output_size):
        super(MyModule2, self).__init__()
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x, B):
        A_modified = x.clone()  # Detach to prevent gradients from flowing back to A
        A_modified[:, 0, :] = B
        y = self.linear(A_modified.mean(dim=1))
        output = torch.zeros((len(y)+5, 1), requires_grad=False)
        output[:len(y)] = y
        return output[:len(y)]
        
# Define A and B
l = 10
n = 5
model1 = MyModule(l, n)

model2 = MyModule2(n, 1)


In [62]:
ground_truth1 = torch.randn(l, n, requires_grad=False) * 10
ground_truth2 = torch.randn(n, requires_grad=False) * 10


length = 1000
training_A = torch.randn(length, l, n)
training_B = torch.randn(length, l)

def get_label():
    temp = torch.matmul(training_B, ground_truth1)
    temp2 = training_A.clone()
    temp2[:, 0, :] = temp
    temp2 = temp2.mean(dim=1)
    training_Y = torch.matmul(temp, ground_truth2)
    return training_Y
training_Y = get_label()

In [60]:
# Backward pass
output1 = model1(training_B)
print(output1.shape)
output2 = model2(training_A, output1)  # Example of using A in another nn.Module

print(output2.shape)


loss = mse_loss(output2.reshape(-1), training_Y)
print(loss)
# Backward pass
#output.backward(torch.ones_like(output), retain_graph=True)  # Retain the computational graph until B
loss.backward()
learning_rate = 0.01
optimizer = optim.SGD([
    {'params': model1.parameters()},
    {'params': model2.parameters()}
], lr=learning_rate)
optimizer.step()

# Clear gradients
optimizer.zero_grad()
print(model1.linear.weight.grad) 

torch.Size([1000, 5])
torch.Size([1000, 1])
tensor(nan, grad_fn=<MseLossBackward0>)
None


In [67]:
for i in range(1000):
    output1 = model1(training_B)
    output2 = model2(training_A, output1)  # Example of using A in another nn.Module    
    
    loss = mse_loss(output2.reshape(-1), training_Y)
    print(loss)
    optimizer.zero_grad()
    # Backward pass
    #output.backward(torch.ones_like(output), retain_graph=True)  # Retain the computational graph until B
    loss.backward()
    learning_rate = 0.001
    optimizer = optim.SGD([
        {'params': model1.parameters()},
        {'params': model2.parameters()}
    ], lr=learning_rate)
    optimizer.step()
    


tensor(56862.0586, grad_fn=<MseLossBackward0>)
tensor(42278.6797, grad_fn=<MseLossBackward0>)
tensor(30535.4531, grad_fn=<MseLossBackward0>)
tensor(21471.6523, grad_fn=<MseLossBackward0>)
tensor(14750.1699, grad_fn=<MseLossBackward0>)
tensor(9945.3652, grad_fn=<MseLossBackward0>)
tensor(6621.5918, grad_fn=<MseLossBackward0>)
tensor(4387.2656, grad_fn=<MseLossBackward0>)
tensor(2921.5955, grad_fn=<MseLossBackward0>)
tensor(1979.6499, grad_fn=<MseLossBackward0>)
tensor(1384.4246, grad_fn=<MseLossBackward0>)
tensor(1013.4185, grad_fn=<MseLossBackward0>)
tensor(784.6998, grad_fn=<MseLossBackward0>)
tensor(644.9218, grad_fn=<MseLossBackward0>)
tensor(560.0806, grad_fn=<MseLossBackward0>)
tensor(508.8540, grad_fn=<MseLossBackward0>)
tensor(478.0446, grad_fn=<MseLossBackward0>)
tensor(459.5656, grad_fn=<MseLossBackward0>)
tensor(448.4995, grad_fn=<MseLossBackward0>)
tensor(441.8748, grad_fn=<MseLossBackward0>)
tensor(437.9044, grad_fn=<MseLossBackward0>)
tensor(435.5167, grad_fn=<MseLossBackw

In [48]:
a = torch.zeros(10, requires_grad=False)
b = torch.ones(5, requires_grad=True)
b

tensor([1., 1., 1., 1., 1.], requires_grad=True)

In [49]:
a[0:5] = b

In [68]:
output1 = model1(training_B)
output2 = model2(training_A, output1)

In [69]:
output2

tensor([[-8.2346e+02],
        [-5.9103e+02],
        [ 3.6133e+02],
        [ 2.3047e+01],
        [ 3.4213e+02],
        [-3.9027e+02],
        [ 4.2706e+02],
        [-4.0829e+02],
        [ 8.2597e+02],
        [-7.3807e+01],
        [-3.1970e+02],
        [-1.6464e+01],
        [ 4.1696e+02],
        [ 4.9313e+02],
        [-6.6962e+02],
        [ 9.5590e+02],
        [-1.0947e+03],
        [-5.9370e+01],
        [ 4.4486e+02],
        [-9.4193e+01],
        [-5.4159e+02],
        [ 5.8167e+02],
        [-2.4766e+02],
        [-7.1903e+02],
        [ 1.1819e+03],
        [ 1.6983e+02],
        [ 3.5693e+01],
        [-1.7251e+02],
        [-4.9521e+02],
        [ 4.4691e+01],
        [-2.9324e+02],
        [-2.6762e+02],
        [-1.7327e+02],
        [ 1.2206e+03],
        [-2.6931e+02],
        [-4.4499e+01],
        [ 4.2132e+02],
        [ 1.2265e+03],
        [ 2.6808e+02],
        [ 3.8826e+02],
        [ 1.3192e+02],
        [ 5.8664e+01],
        [-4.5859e+02],
        [-4

In [66]:
training_Y

tensor([-8.6309e+02, -6.0070e+02,  3.7279e+02, -5.2874e+00,  3.4174e+02,
        -4.1005e+02,  3.8836e+02, -3.9748e+02,  8.3635e+02, -8.6381e+01,
        -3.3652e+02, -3.0860e+01,  3.8735e+02,  4.9341e+02, -6.9179e+02,
         9.4613e+02, -1.0771e+03, -3.4364e+01,  4.1431e+02, -9.3627e+01,
        -5.2933e+02,  5.6566e+02, -2.7008e+02, -7.2234e+02,  1.1920e+03,
         1.3399e+02,  5.9456e+01, -1.7833e+02, -4.8907e+02,  5.3351e+01,
        -2.9161e+02, -2.8650e+02, -1.5382e+02,  1.2014e+03, -2.7481e+02,
        -3.4673e+01,  4.1682e+02,  1.2285e+03,  2.7759e+02,  3.8001e+02,
         1.5427e+02,  4.8381e+01, -4.6552e+02, -4.3498e+02,  7.9393e+02,
        -1.1762e+03,  3.2600e+02,  5.4239e+02,  1.1846e+01, -2.8549e+02,
        -1.5275e+02, -1.6591e+02,  1.0382e+02,  5.6746e+02,  2.5248e+01,
         1.5879e+02,  2.4320e+02, -1.3353e+02,  1.3669e+03, -1.3951e+02,
         1.3122e+03,  1.4095e+03,  3.4844e+02, -1.1643e+02, -8.7602e+02,
        -8.4981e+00, -3.4938e+02, -9.0152e+02, -3.2

In [80]:
a = [1,2, 3]

In [81]:
a.pop(2)
a

[1, 2]