In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported before each cell runs and a kernel restart is not needed.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader
import glob
import wandb
import os
import torch.optim as optimizers

In [3]:
import dfs_code
from torch_geometric.data import InMemoryDataset, Data
import pickle
import torch
import torch.nn as nn
import tqdm
import copy
import pandas as pd
import torch.nn.functional as F


In [4]:
import sys

# Put the repository's source directory first on the import path so the
# `dfs_transformer` import below is looked up there before anywhere else.
sys.path.insert(0, '../../src')
from dfs_transformer import EarlyStopping, DFSCodeSeq2SeqFC, smiles2graph, BERTize



In [5]:
from dfs_transformer import DFSCodeSeq2SeqFCFeatures, Trainer, PubChem, get_n_files, OgbnMag
from dfs_transformer.training.utils import seq_loss, seq_acc, collate_BERT, collate_rnd2min
import argparse
import yaml
import functools
from ml_collections import ConfigDict
# Path (relative to this notebook) of the YAML experiment configuration to load.
fname = '../../config/selfattn/rnd2min10M.yaml'

In [6]:
# Parse the YAML experiment file and wrap the result in a ConfigDict so the
# sections can be read and overridden with attribute access in later cells.
# NOTE(review): yaml.load with FullLoader should only be pointed at trusted files.
with open(fname) as config_file:
    parsed_yaml = yaml.load(config_file, Loader=yaml.FullLoader)
config = ConfigDict(parsed_yaml)

In [7]:
# Run name and W&B mode. Both are passed to wandb.init; the artifact upload in
# the last cell only happens when mode is not "offline".
name, mode = "rnd2min", "offline"

In [8]:
# Override the four size settings of the model section; each one is set to 4
# for this dataset.
for _size_key in ("n_atoms", "n_bonds", "n_node_features", "n_edge_features"):
    setattr(config.model, _size_key, 4)

In [9]:
# Overrides for the data section, applied in the order listed.
_data_overrides = {
    "n_used": None,
    "n_iter_per_split": None,
    "path": "../../results/ogbn_mag/timeout1/",
    "require_min_dfs_code": True,
}
for _data_key, _data_value in _data_overrides.items():
    setattr(config.data, _data_key, _data_value)

In [10]:
# Overrides for the training section.
# NOTE(review): gpu_id=None presumably means "do not pin a specific GPU" -
# confirm against the Trainer implementation.
config.training["n_epochs"] = 100
config.training["gpu_id"] = None

In [11]:
# Start the Weights & Biases run; the full configuration is attached to it as
# a plain dictionary.
wandb_init_kwargs = dict(
    mode=mode,
    project="ogbn-mag",
    entity="dfstransformer",
    name=name,
    config=config.to_dict(),
    job_type="pretraining",
)
run = wandb.init(**wandb_init_kwargs)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
2021-09-17 12:09:33.569884: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/cuda/extras/CUPTI/lib64/:/opt/intel/lib:/opt/intel/mkl/lib/intel64:/opt/intel:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/bin/x86-64_linux:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/python/3.7/x86-64_linux:/opt/intel/clck_latest/lib:/opt/intel/daal/lib:/opt/intel/intelpython3/lib:/opt/intel/ipp/lib:/opt/intel/itac_2019/lib:/opt/intel/itac_latest/lib:/opt/intel/mkl/lib:/opt/intel/mkl_/lib:/opt/intel/mpirt/lib:/opt/intel/tbb/lib:/opt/intel/clck/2019.0/lib:/opt/intel/compilers_and_libraries_2019/linux/lib:/opt/intel/compilers_and_libraries/linux/lib:/opt/intel/itac/2019.0.018/lib:/opt/intel/ita

In [12]:
# Short aliases for the three configuration sections used by the cells below.
m, t, d = config.model, config.training, config.data

# Loss criteria; the cross-entropy skips target entries equal to -1.
ce = nn.CrossEntropyLoss(ignore_index=-1)
bce = nn.BCEWithLogitsLoss()

# One accuracy metric per field: each field's position in this list is bound
# to seq_acc as its `idx` argument.
fields = ['acc-dfs1', 'acc-dfs2', 'acc-atm1', 'acc-atm2', 'acc-bnd']
metrics = dict(
    (field_name, functools.partial(seq_acc, idx=position))
    for position, field_name in enumerate(fields)
)

In [27]:
def collate_fn(dlist):
    """Collate a list of graph samples into one training batch.

    For every sample the node and edge labels are one-hot encoded (widths are
    read from ``config.model.n_node_features`` / ``n_edge_features``), a random
    DFS code is produced by ``dfs_code.rnd_dfs_code_from_torch_geometric`` and
    the sample's stored ``min_dfs_code`` is collected as the target.

    Args:
        dlist: iterable of samples exposing ``node_labels``, ``edge_labels``
            and ``min_dfs_code`` tensors.

    Returns:
        A tuple ``(rnd_codes, node_feats, edge_feats, targets)``. The first
        three are per-sample lists that are not padded; ``targets`` is the
        result of ``pad_sequence`` over the minimum DFS codes with ``-1`` as
        the padding value.
    """
    rnd_codes, node_feats, edge_feats, min_codes = [], [], [], []
    for sample in dlist:
        node_feats.append(F.one_hot(sample.node_labels, config.model.n_node_features).float())
        edge_feats.append(F.one_hot(sample.edge_labels, config.model.n_edge_features).float())
        # The helper returns a (code, index) pair; only the code is used here.
        code, _ = dfs_code.rnd_dfs_code_from_torch_geometric(
            sample,
            sample.node_labels.numpy().tolist(),
            sample.edge_labels.numpy().tolist(),
        )
        rnd_codes.append(torch.tensor(code, dtype=torch.long))
        min_codes.append(sample.min_dfs_code.long())

    # Shorter target sequences are filled up with -1.
    targets = nn.utils.rnn.pad_sequence(min_codes, padding_value=-1)
    return rnd_codes, node_feats, edge_feats, targets

In [14]:
# Bind the model config and the cross-entropy criterion into seq_loss; the
# resulting callable is what gets passed to the Trainer as its loss.
loss = functools.partial(seq_loss, m=m, ce=ce)

In [15]:
# Build the model from the model section of the config.
model = DFSCodeSeq2SeqFC(**m)

# Optionally warm-start from a checkpoint: resuming the last run (early-stopping
# directory) takes precedence over a separately configured pretrained directory.
if t.load_last and t.es_path is not None:
    checkpoint_dir = t.es_path
elif t.pretrained_dir is not None:
    checkpoint_dir = t.pretrained_dir
else:
    checkpoint_dir = None

if checkpoint_dir is not None:
    # The original passed map_location=device, but `device` is never defined in
    # this notebook, so both branches raised NameError. The freshly built model
    # lives on the CPU and load_state_dict copies values into its existing
    # parameters, so loading the checkpoint onto the CPU is sufficient.
    # os.path.join gives the same path as before when the directory ends in '/'
    # and also works when it does not.
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pt'), map_location='cpu')
    model.load_state_dict(state_dict)

In [16]:
# No validation loader is used in this run; None is passed on to the Trainer.
validloader = None

In [17]:
# Load the dataset from the configured results directory; the node/edge limits
# come from the model section and the min-DFS-code flag from the data section.
dataset = OgbnMag(
    d.path,
    max_nodes=m.max_nodes,
    max_edges=m.max_edges,
    require_min_dfs_code=d.require_min_dfs_code,
)


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.68s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36820/36820 [00:01<00:00, 23534.53it/s]


In [28]:
# Shuffled training loader; batches are assembled by the notebook's collate_fn.
loader = DataLoader(
    dataset,
    batch_size=t.batch_size,
    shuffle=True,
    pin_memory=False,
    collate_fn=collate_fn,
)

In [29]:
# Assemble the trainer; every entry of the training config section is forwarded
# as a keyword argument.
trainer = Trainer(
    model,
    loader,
    loss,
    validloader=validloader,
    metrics=metrics,
    wandb_run=run,
    **t
)

In [30]:
# Run training with the trainer configured above.
trainer.fit()

Epoch 1: loss 13.674705 0.0583 0.2167 0.5424 0.6878 0.2301:   2%|█▋                                                                                                                | 6/398 [00:18<19:37,  3.00s/it]


In [None]:
# Store the config next to the checkpoints and, for online runs, upload the
# checkpoint directory as a W&B model artifact.
if t.es_path is None:
    # The checkpoint-loading cell treats es_path=None as a valid setting; here
    # the original string concatenation would have raised a TypeError instead.
    print("t.es_path is not set; skipping config/model export")
else:
    # Make sure the target directory exists so that a finished training run is
    # not lost to a FileNotFoundError when writing the config.
    os.makedirs(t.es_path, exist_ok=True)
    with open(os.path.join(t.es_path, 'config.yaml'), 'w') as f:
        yaml.dump(config.to_dict(), f, default_flow_style=False)
    # Artifacts can only be uploaded when the run is not in offline mode.
    if name is not None and mode != "offline":
        trained_model_artifact = wandb.Artifact(name, type="model", description="trained selfattn model")
        trained_model_artifact.add_dir(t.es_path)
        run.log_artifact(trained_model_artifact)