In [16]:
import argparse
import sys


import numpy as np
from torch_geometric.data import Data

import torch.nn.functional as F
from torch_geometric.loader import DataLoader


from utils import preprocess_dataset



from torch.utils.data import Subset
# Fix the seed of NumPy's global random generator so NumPy-based randomness is repeatable.
# NOTE(review): only NumPy is seeded in this cell; no torch seed is set here.
np.random.seed(13)


In [17]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules (such as `utils`) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
# Simulate command-line arguments.
# Inside a notebook the real sys.argv typically holds the kernel's own launch
# options, which this parser would reject, so it is replaced with the options a
# `python main.py ...` run would receive.
sys.argv = [
    'main.py',
    '--lr', '0.001',
    '--dropout', '0.1',
    '--batch-size', '256',
    '--epochs-autoencoder', '200',
    '--hidden-dim-encoder', '64',
    '--hidden-dim-decoder', '256',
    '--latent-dim', '32',
    '--n-max-nodes', '50',
    '--n-layers-encoder', '2',
    '--n-layers-decoder', '3',
    '--spectral-emb-dim', '10',
    '--epochs-denoise', '100',
    '--timesteps', '500',
    '--hidden-dim-denoise', '512',
    '--n-layers_denoise', '3',
    # '--train-autoencoder',
    '--train-denoiser',
    '--dim-condition', '128',
    '--n-condition', '7'
]

# Initialize the parser
parser = argparse.ArgumentParser(
    description="Hyperparameters for the graph autoencoder (VGAE) and the conditional latent denoising model"
)

# Optimisation settings
parser.add_argument('--lr', type=float, default=1e-3, help="Learning rate for the optimizer, typically a small float value (default: 0.001)")
parser.add_argument('--dropout', type=float, default=0.0, help="Dropout rate (fraction of nodes to drop) to prevent overfitting (default: 0.0)")
parser.add_argument('--batch-size', type=int, default=256, help="Batch size for training, controlling the number of samples per gradient update (default: 256)")

# Autoencoder settings
parser.add_argument('--epochs-autoencoder', type=int, default=200, help="Number of training epochs for the autoencoder (default: 200)")
parser.add_argument('--hidden-dim-encoder', type=int, default=64, help="Hidden dimension size for encoder layers (default: 64)")
parser.add_argument('--hidden-dim-decoder', type=int, default=256, help="Hidden dimension size for decoder layers (default: 256)")
parser.add_argument('--latent-dim', type=int, default=32, help="Dimensionality of the latent space in the autoencoder (default: 32)")
parser.add_argument('--n-max-nodes', type=int, default=50, help="Possible maximum number of nodes in graphs (default: 50)")
parser.add_argument('--n-layers-encoder', type=int, default=2, help="Number of layers in the encoder network (default: 2)")
parser.add_argument('--n-layers-decoder', type=int, default=3, help="Number of layers in the decoder network (default: 3)")
# The help text previously advertised 10 while the parser's real default is 20.
parser.add_argument('--spectral-emb-dim', type=int, default=20, help="Dimensionality of spectral embeddings for representing graph structures (default: 20)")

# Denoiser / diffusion settings
parser.add_argument('--epochs-denoise', type=int, default=100, help="Number of training epochs for the denoising model (default: 100)")
parser.add_argument('--timesteps', type=int, default=500, help="Number of timesteps for the diffusion (default: 500)")
parser.add_argument('--hidden-dim-denoise', type=int, default=512, help="Hidden dimension size for denoising model layers (default: 512)")
# The historical spelling mixes '-' and '_'; it is kept as the primary option and a
# fully hyphenated alias is accepted too. `dest` is pinned so the attribute name
# stays `n_layers_denoise` regardless of which spelling is used.
parser.add_argument('--n-layers_denoise', '--n-layers-denoise', dest='n_layers_denoise', type=int, default=3, help="Number of layers in the denoising model (default: 3)")

# Stage switches: `store_true` flags are off unless the flag is present on the command line.
parser.add_argument('--train-autoencoder', action='store_true', default=False, help="Flag to enable autoencoder (VGAE) training (default: disabled)")
parser.add_argument('--train-denoiser', action='store_true', default=False, help="Flag to enable denoiser training (default: disabled)")

# Conditioning settings
parser.add_argument('--dim-condition', type=int, default=128, help="Dimensionality of conditioning vectors for conditional generation (default: 128)")
parser.add_argument('--n-condition', type=int, default=7, help="Number of distinct condition properties used in conditional vector (default: 7)")

# Parse the arguments (reads the simulated sys.argv set above)
args = parser.parse_args()

# Show the resolved configuration
print(args)

Namespace(lr=0.001, dropout=0.1, batch_size=256, epochs_autoencoder=200, hidden_dim_encoder=64, hidden_dim_decoder=256, latent_dim=32, n_max_nodes=50, n_layers_encoder=2, n_layers_decoder=3, spectral_emb_dim=10, epochs_denoise=100, timesteps=500, hidden_dim_denoise=512, n_layers_denoise=3, train_autoencoder=False, train_denoiser=True, dim_condition=128, n_condition=7)


In [20]:
# Build the three splits with identical preprocessing settings, in train/valid/test order.
trainset, validset, testset = (
    preprocess_dataset(split_name, args.n_max_nodes, args.spectral_emb_dim)
    for split_name in ("train", "valid", "test")
)


100%|██████████| 8000/8000 [00:25<00:00, 311.63it/s]


Dataset ./data/dataset_train.pt saved


100%|██████████| 1000/1000 [00:10<00:00, 98.24it/s]


Dataset ./data/dataset_valid.pt saved
graph_0,This graph comprises 50 nodes and 589 edges. The average degree is equal to 23.56 and there are 3702 triangles in the graph. The global clustering coefficient and the graph's maximum k-core are 0.6226034308779012 and 18 respectively. The graph consists of 3 communities.

graph_1,This graph comprises 20 nodes and 19 edges. The average degree is equal to 1.9 and there are 0 triangles in the graph. The global clustering coefficient and the graph's maximum k-core are 0 and 1 respectively. The graph consists of 4 communities.

graph_2,This graph comprises 28 nodes and 165 edges. The average degree is equal to 11.785714285714286 and there are 387 triangles in the graph. The global clustering coefficient and the graph's maximum k-core are 0.4742647058823529 and 8 respectively. The graph consists of 3 communities.

graph_3,This graph comprises 47 nodes and 1050 edges. The average degree is equal to 44.680851063829785 and there are 14914 triangles i

In [21]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, counting only unmasked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed shape (batch, tokens, hidden) — TODO confirm).
        attention_mask: Mask with one entry per token; it is broadcast across
            the last embedding dimension, so zero entries drop a token from the mean.

    Returns:
        The mask-weighted sum of the token embeddings divided by the mask sum,
        with the divisor clamped to at least 1e-9 so a fully masked row yields
        zeros rather than a division by zero.
    """
    hidden_states = model_output[0]
    # Stretch the mask to the embeddings' shape so it can weight them element-wise.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(1)
    token_counts = weights.sum(1).clamp(min=1e-9)
    return weighted_sum / token_counts


# Tokenizer and encoder are loaded from the same pretrained checkpoint so they stay matched.
_SENTENCE_ENCODER_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(_SENTENCE_ENCODER_NAME)
model = AutoModel.from_pretrained(_SENTENCE_ENCODER_NAME)



Sentence embeddings:
tensor([[ 6.7657e-02,  6.3496e-02,  4.8713e-02,  7.9305e-02,  3.7448e-02,
          2.6528e-03,  3.9375e-02, -7.0984e-03,  5.9361e-02,  3.1537e-02,
          6.0098e-02, -5.2905e-02,  4.0607e-02, -2.5931e-02,  2.9843e-02,
          1.1269e-03,  7.3515e-02, -5.0382e-02, -1.2239e-01,  2.3703e-02,
          2.9727e-02,  4.2477e-02,  2.5634e-02,  1.9951e-03, -5.6919e-02,
         -2.7160e-02, -3.2904e-02,  6.6025e-02,  1.1901e-01, -4.5879e-02,
         -7.2621e-02, -3.2584e-02,  5.2341e-02,  4.5055e-02,  8.2530e-03,
          3.6702e-02, -1.3942e-02,  6.5392e-02, -2.6427e-02,  2.0640e-04,
         -1.3664e-02, -3.6281e-02, -1.9504e-02, -2.8974e-02,  3.9427e-02,
         -8.8409e-02,  2.6243e-03,  1.3671e-02,  4.8306e-02, -3.1157e-02,
         -1.1733e-01, -5.1169e-02, -8.8529e-02, -2.1896e-02,  1.4299e-02,
          4.4417e-02, -1.3482e-02,  7.4339e-02,  2.6638e-02, -1.9876e-02,
          1.7919e-02, -1.0605e-02, -9.0426e-02,  2.1327e-02,  1.4120e-01,
         -6.4718e

In [22]:
# One loader per split, used to batch the text descriptions for embedding.
# No `shuffle` argument is passed, so each loader keeps its default ordering.
_EMBEDDING_BATCH_SIZE = 32
trainloader = DataLoader(trainset, batch_size=_EMBEDDING_BATCH_SIZE)
validloader = DataLoader(validset, batch_size=_EMBEDDING_BATCH_SIZE)
testloader = DataLoader(testset, batch_size=_EMBEDDING_BATCH_SIZE)

In [23]:
# Compute one matrix of sentence embeddings per split, keyed by split name.
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of hard-coding "cuda"; the encoder and its inputs are moved to the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

res = {}
# The "test" entry must use the test loader: iterating `validset` here stored
# validation-text embeddings under the "test" key.
for name, loader in {"train": trainloader, "valid": validloader, "test": testloader}.items():
    batch_embeddings = []
    for data in loader:
        # Only the text descriptions are consumed, so the graph batch itself is not moved.
        # NOTE(review): assumes `data.text` holds the batch's description strings — confirm against preprocess_dataset.
        encoded_input = tokenizer(data.text, padding=True, truncation=True, return_tensors='pt').to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
            sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Collect on the CPU so the stored embeddings do not depend on the compute device.
        batch_embeddings.append(sentence_embeddings.cpu())
    res[name] = torch.concat(batch_embeddings)

In [24]:
# Inspect the shape a single training embedding takes once a leading dimension is added.
first_train_embedding = res["train"][0]
first_train_embedding.unsqueeze(0).shape

torch.Size([1, 384])

In [25]:
def add_embeddings_to_dataset(dataset, mat):
    """Attach row ``i`` of ``mat`` to the ``i``-th item of ``dataset``.

    Each item receives a ``sentence_embeddings`` attribute holding its row with
    a leading dimension of size 1 added. Items are modified in place and the
    same ``dataset`` object is returned.

    Args:
        dataset: Sized, ordered collection of objects that accept attribute
            assignment.
            NOTE(review): assumes iteration yields the stored objects rather
            than copies (true for a plain list) — confirm for other containers.
        mat: Tensor with one row per item, in the same order as ``dataset``.

    Returns:
        The ``dataset`` that was passed in.

    Raises:
        ValueError: If ``dataset`` and ``mat`` differ in length. Checking up
            front keeps a mismatched matrix from being silently accepted or
            from leaving the dataset partially updated.
    """
    if len(dataset) != len(mat):
        raise ValueError(
            f"dataset has {len(dataset)} items but the embedding matrix has {len(mat)} rows"
        )
    for data, embedding in zip(dataset, mat):
        data.sentence_embeddings = embedding.unsqueeze(0)
    return dataset

# Pair each split with its embedding matrix; the returned objects are the
# datasets passed in, now carrying a `sentence_embeddings` attribute per item.
new_trainset, new_validset, new_testset = (
    add_embeddings_to_dataset(split, res[split_name])
    for split, split_name in ((trainset, "train"), (validset, "valid"), (testset, "test"))
)

In [26]:
# Persist the augmented splits to the dataset files under ./data.
_output_files = {
    './data/dataset_train.pt': new_trainset,
    './data/dataset_valid.pt': new_validset,
    './data/dataset_test.pt': new_testset,
}
for _path, _split in _output_files.items():
    torch.save(_split, _path)
