In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import numpy as np
from torch.utils.data import Dataset, DataLoader
import glob
import wandb
import os
import torch.optim as optimizers

In [3]:
import dfs_code
from torch_geometric.data import InMemoryDataset, Data
import pickle
import torch
import torch.nn as nn
import tqdm
import copy
import pandas as pd
import torch.nn.functional as F


In [4]:
import sys
sys.path = ['../../src'] + sys.path
from dfs_transformer import EarlyStopping, DFSCodeSeq2SeqFC, smiles2graph, BERTize



In [5]:
from dfs_transformer import DFSCodeSeq2SeqFCFeatures, Trainer, PubChem, get_n_files, OgbnMag
from dfs_transformer.training.utils import seq_loss, seq_acc, collate_BERT, collate_rnd2min
import argparse
import yaml
import functools
from ml_collections import ConfigDict


In [6]:
from ogb.nodeproppred import PygNodePropPredDataset

# Open the ogbn-mag node-property-prediction dataset rooted at ../../datasets.
ogbd = PygNodePropPredDataset(root="../../datasets", name="ogbn-mag")

# Official split as provided by the dataset; only the "paper" entries of each
# partition are used below.
split_idx = ogbd.get_idx_split()
train_idx, valid_idx, test_idx = (
    split_idx[part]["paper"] for part in ("train", "valid", "test")
)

Using backend: pytorch


In [7]:
# Turn each split's index tensor into a Python set for O(1) membership tests.
strain_idx, stest_idx, svalid_idx = (
    set(split.numpy().tolist()) for split in (train_idx, test_idx, valid_idx)
)

In [8]:
# Fine-tuning configuration, read from the repository's config directory.
fname = '../../config/selfattn/finetune_ogb.yaml'
with open(fname) as file:
    # NOTE(review): yaml.load with FullLoader can construct more than plain
    # data; acceptable for a trusted local file, do not reuse on untrusted input.
    config = yaml.load(file, Loader=yaml.FullLoader)
# Wrap the parsed mapping so that entries are reachable as attributes.
config = ConfigDict(config)

In [9]:
# Name of the wandb evaluation run; also used as the name of the model
# artifact that is uploaded at the end of the notebook.
name = "rnd2min-finetuned"
# wandb mode handed to wandb.init; the final artifact upload is skipped when
# this is "offline".
mode = "online"

In [10]:
# Fetch the pretrained model artifact inside a short-lived "inference" run.
run = wandb.init(
    job_type="inference",
    entity=config.pretrained_entity,
    project=config.pretrained_project,
    mode=mode,
)
# Always resolve the most recent version of the configured artifact.
model_at = run.use_artifact(config.pretrained_model + ":latest")
# Local directory the artifact files were downloaded to.
model_dir = model_at.download()
# Close this run; a separate run is opened later for the evaluation itself.
run.finish()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchrisxx[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-09-17 17:07:43.623689: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/cuda/extras/CUPTI/lib64/:/opt/intel/lib:/opt/intel/mkl/lib/intel64:/opt/intel:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/bin/x86-64_linux:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/python/3.7/x86-64_linux:/opt/intel/clck_latest/lib:/opt/intel/daal/lib:/opt/intel/intelpython3/lib:/opt/intel/ipp/lib:/opt/intel/itac_2019/lib:/opt/intel/itac_latest/lib:/opt/in

[34m[1mwandb[0m: Downloading large artifact rnd2min:latest, 189.02MB. 3 files... Done. 0:0:0


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [11]:
# Read the configuration file that ships with the downloaded artifact.
with open(model_dir+"/config.yaml") as file:
    # NOTE(review): this file comes from a downloaded artifact; FullLoader is
    # more permissive than safe_load - confirm the artifact source is trusted.
    mconfig = yaml.load(file, Loader=yaml.FullLoader)
# Wrap the parsed mapping so that entries are reachable as attributes.
mconfig = ConfigDict(mconfig)

In [12]:
# Attach the pretrained model's configuration to the fine-tuning config so it
# is included in the config dict logged with the evaluation run.
config.model = mconfig

In [13]:
# Open the wandb run that tracks fine-tuning and evaluation on ogbn-mag.
run = wandb.init(
    job_type="evaluation",
    config=config.to_dict(),
    name=name,
    entity="dfstransformer",
    project="ogbn-mag",
    mode=mode,
)

[34m[1mwandb[0m: wandb version 0.12.2 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-09-17 17:07:50.634629: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/opt/cuda/extras/CUPTI/lib64/:/opt/intel/lib:/opt/intel/mkl/lib/intel64:/opt/intel:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/bin/x86-64_linux:/opt/ibm/ILOG/CPLEX_Studio1210/cplex/python/3.7/x86-64_linux:/opt/intel/clck_latest/lib:/opt/intel/daal/lib:/opt/intel/intelpython3/lib:/opt/intel/ipp/lib:/opt/intel/itac_2019/lib:/opt/intel/itac_latest/lib:/opt/intel/mkl/lib:/opt/intel/mkl_/lib:/opt/intel/mpirt/lib:/opt/intel/tbb/lib:/opt/intel/clck/2019.0/lib:/opt/intel/compilers_and_libraries_2019/linux/lib:/opt/intel/compilers_and_libraries/linux/lib:/opt/intel/itac/2019.0.018/lib:/opt/intel/itac_2019/intel64/lib:/opt/intel/

In [14]:
# Short alias for the `model` section of the pretrained configuration.
m = mconfig.model
# Short alias for the fine-tuning configuration. This is the same object as
# `config`, not a copy, so mutating `t` also mutates `config`.
t = config

In [15]:
# Cross-entropy criterion; targets equal to -1 are ignored.
ce = nn.CrossEntropyLoss(ignore_index=-1)
# Binary cross-entropy on logits. NOTE(review): not referenced by any of the
# visible cells - confirm whether it is still needed.
bce = nn.BCEWithLogitsLoss()

In [16]:
def collate_fn(dlist):
    """Collate a list of graph samples into one batch.

    For every sample ``d`` in ``dlist`` the node and edge labels are one-hot
    encoded (``m.n_node_features`` / ``m.n_edge_features`` classes) and a DFS
    code is obtained from ``dfs_code.rnd_dfs_code_from_torch_geometric``.

    Returns:
        A 4-tuple ``(codes, node_feats, edge_feats, y)``. The first three are
        lists holding one tensor per sample; ``y`` is a long tensor built from
        the samples' ``y`` attributes.
    """
    codes, node_feats, edge_feats, targets = [], [], [], []
    for d in dlist:
        node_feats.append(F.one_hot(d.node_labels, m.n_node_features).float())
        edge_feats.append(F.one_hot(d.edge_labels, m.n_edge_features).float())
        # The second return value (an index) is not needed for the batch.
        code, _ = dfs_code.rnd_dfs_code_from_torch_geometric(
            d,
            d.node_labels.numpy().tolist(),
            d.edge_labels.numpy().tolist(),
        )
        codes.append(torch.tensor(code, dtype=torch.long))
        targets.append(d.y)

    return codes, node_feats, edge_feats, torch.tensor(targets, dtype=torch.long)

In [17]:
# Build the dataset from the configured path; the node/edge limits are taken
# from the pretrained model's configuration.
dataset = OgbnMag(
    t.path,
    max_edges=m.max_edges,
    max_nodes=m.max_nodes,
    require_min_dfs_code=t.require_min_dfs_code,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [01:49<00:00,  5.49s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 736389/736389 [00:15<00:00, 46321.24it/s]


In [18]:
# Translate the official split into positions within `dataset`: each sample's
# `idx` is looked up in the split sets and its dataset position is recorded.
# Samples whose `idx` is in none of the sets are left out of every split.
train_idx, valid_idx, test_idx = [], [], []

for didx, d in enumerate(dataset):
    if d.idx in strain_idx:
        train_idx.append(didx)
    elif d.idx in svalid_idx:
        valid_idx.append(didx)
    elif d.idx in stest_idx:
        test_idx.append(didx)

# Rebind the split names to long tensors of dataset positions.
train_idx, valid_idx, test_idx = (
    torch.tensor(positions, dtype=torch.long)
    for positions in (train_idx, valid_idx, test_idx)
)

In [19]:
# One loader per split. All three draw from the same dataset and differ only
# in the subset of positions their sampler visits.
trainloader, validloader, testloader = (
    DataLoader(
        dataset,
        sampler=torch.utils.data.SubsetRandomSampler(split),
        batch_size=t.batch_size,
        collate_fn=collate_fn,
    )
    for split in (train_idx, valid_idx, test_idx)
)

In [20]:
class TransformerPlusHead(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        encoder: object exposing ``encode(C, N, E, method=...)``.
        n_encoding: size of the feature vector produced by ``encoder.encode``.
        n_classes: number of output classes of the linear head.
        fingerprint: value forwarded to ``encoder.encode`` as ``method``.
    """

    def __init__(self, encoder, n_encoding, n_classes, fingerprint='cls'):
        super().__init__()
        self.fingerprint = fingerprint
        # Keep the encoder registered before the head, as in the original
        # layout, so the submodule order is unchanged.
        self.encoder = encoder
        self.head = nn.Linear(n_encoding, n_classes)

    def forward(self, C, N, E):
        """Encode the batch ``(C, N, E)`` and return the head's outputs."""
        return self.head(self.encoder.encode(C, N, E, method=self.fingerprint))
        

In [21]:
from ogb.nodeproppred import Evaluator

# OGB evaluator for ogbn-mag; its eval() reports an 'acc' score (see the
# format description printed further below).
evaluator = Evaluator(name = 'ogbn-mag')

In [22]:
# Pull a single batch from the training loader to inspect the collated output.
data = next(iter(trainloader))

In [23]:
# Sanity check: scoring the batch labels against themselves must give acc 1.0.
# unsqueeze(1) adds the trailing task dimension the evaluator expects.
evaluator.eval({'y_true':data[-1].unsqueeze(1), 'y_pred':data[-1].unsqueeze(1)})

{'acc': 1.0}

In [24]:
# Show the input/output contract of the evaluator for reference.
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)

==== Expected input format of Evaluator for ogbn-mag
{'y_true': y_true, 'y_pred': y_pred}
- y_true: numpy ndarray or torch tensor of shape (num_node, num_task)
- y_pred: numpy ndarray or torch tensor of shape (num_node, num_task)
where y_pred stores predicted class label (integer),
num_task is 1, and each row corresponds to one node.

==== Expected output format of Evaluator for ogbn-mag
{'acc': acc}
- acc (float): Accuracy score averaged across 1 task(s)



In [25]:
def loss(pred, y, ce=ce):
    """Training loss: apply the criterion ``ce`` to predictions and targets.

    Args:
        pred: model outputs passed as the criterion's first argument.
        y: targets passed as the criterion's second argument.
        ce: criterion to apply; defaults to the module-level cross-entropy.

    Returns:
        Whatever ``ce(pred, y)`` returns.
    """
    criterion = ce
    return criterion(pred, y)

def acc(pred, y):
    """Accuracy of ``pred`` against ``y`` as computed by the OGB evaluator.

    The predicted label is the argmax over dimension 1 of ``pred``. Both the
    labels and the predictions get a trailing dimension of size 1 before they
    are handed to ``evaluator.eval``.

    Returns:
        The evaluator's 'acc' value wrapped in a tensor.
    """
    predicted = torch.argmax(pred, axis=1)
    scores = evaluator.eval({
        'y_true': y.unsqueeze(1),
        'y_pred': predicted.unsqueeze(1),
    })
    return torch.tensor(scores['acc'])
    

In [26]:
# Use the configured GPU when CUDA is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:%d'%t.gpu_id)
else:
    device = torch.device('cpu')

# Instantiate the encoder from the pretrained model's hyperparameters.
encoder = DFSCodeSeq2SeqFC(**m)

# Optionally initialise the encoder with the downloaded pretrained weights.
if t.load_last and model_dir is not None:
    encoder.load_state_dict(
        torch.load(model_dir+'/checkpoint.pt', map_location=device)
    )

In [27]:
# Display the pretrained model's hyperparameters.
m

class: DFSCodeSeq2SeqFCFeatures
dim_feedforward: 2048
emb_dim: 120
max_edges: 500
max_nodes: 250
missing_value: null
n_atoms: 4
n_bonds: 4
n_class_tokens: 1
n_edge_features: 4
n_node_features: 4
nhead: 12
nlayers: 6

In [28]:
# Wrap the encoder with a linear classification head over t.n_classes classes.
# NOTE(review): the factor 5 in the feature size presumably reflects how the
# encoder lays out its per-class-token features - confirm against the
# encoder's encode() implementation.
model = TransformerPlusHead(encoder, m.emb_dim*5*m.n_class_tokens, t.n_classes, fingerprint=t.fingerprint)

In [29]:
# Drop the `model` entry before `t` is expanded into Trainer keyword arguments
# (presumably so it does not collide with the Trainer's own model argument).
# NOTE: `t` is the same object as `config`, so this also removes `model` from
# `config`; anything dumped from `config` afterwards no longer contains it.
del t.model

In [30]:
# Set up training: `loss` is the objective, `acc` is tracked as a metric, the
# wandb run is passed along, and every remaining entry of the fine-tuning
# config `t` is forwarded as a keyword argument.
trainer = Trainer(model, trainloader, loss, validloader=validloader, metrics={'acc': acc}, wandb_run = run, **t)

In [31]:
# Run fine-tuning.
# NOTE(review): the recorded execution of this cell stopped with a CUDA
# out-of-memory error; a smaller t.batch_size may be needed on this GPU.
trainer.fit()

  0%|                                                                                                                                                                                    | 0/11983 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 336.00 MiB (GPU 0; 10.91 GiB total capacity; 9.51 GiB already allocated; 98.31 MiB free; 9.70 GiB reserved in total by PyTorch)

In [None]:
# Evaluate the fine-tuned model on the test split and log the accuracy.
# Switch off training-only behaviour (e.g. dropout) for evaluation.
model.eval()

# Accumulate the number of correct predictions rather than averaging per-batch
# accuracies, so a smaller final batch is weighted by its actual size.
n_correct = 0.0
n_seen = 0
# No gradients are needed for evaluation; this also lowers memory use.
with torch.no_grad():
    for data in tqdm.tqdm(testloader):
        pred = model(*data[:-1])
        accuracy = acc(pred, data[-1])
        batch_size = len(data[-1])
        n_correct += accuracy.item() * batch_size
        n_seen += batch_size

# Keep 0 as the result for an empty test loader.
acc_test = n_correct / n_seen if n_seen else 0
# wandb's log() expects a dict of metric name -> value.
run.log({'Test Accuracy': acc_test})

In [None]:
# Store config and model.
# Join the path components explicitly so the file lands inside t.es_path even
# when the configured path has no trailing separator.
config_path = os.path.join(t.es_path, 'config.yaml')
with open(config_path, 'w') as f:
    yaml.dump(config.to_dict(), f, default_flow_style=False)

# Upload the whole es_path directory as a model artifact, unless wandb runs in
# offline mode.
if name is not None and mode != "offline":
    trained_model_artifact = wandb.Artifact(name, type="model", description="trained selfattn model")
    trained_model_artifact.add_dir(t.es_path)
    run.log_artifact(trained_model_artifact)

In [None]:
# End the session once everything has been stored.
# NOTE(review): the evaluation run is never finished explicitly in the visible
# cells - confirm that wandb finalises it when the process exits.
exit()