In [49]:
%load_ext autoreload
%autoreload 2
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl.data
from torch_geometric.datasets.qm9 import QM9
from torch_geometric.data import DataLoader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
import numpy as np

In [2]:
# [0] Reports the MAE (in eV) divided by the chemical accuracy of the target variable U0.
# The chemical accuracy of U0 is 0.043 eV; see [1, Table 5].

# Reproduced table [0]
# MXMNet: 0.00590/0.043 = 0.13720930232558143
# HMGNN:  0.00592/0.043 = 0.13767441860465118
# MPNN:   0.01935/0.043 = 0.45
# KRR:    0.0251 /0.043 = 0.5837209302325582
# [0] https://paperswithcode.com/sota/formation-energy-on-qm9
# [1] Neural Message Passing for Quantum Chemistry, https://arxiv.org/pdf/1704.01212v2.pdf
# MXMNet https://arxiv.org/pdf/2011.07457v1.pdf
# HMGNN https://arxiv.org/pdf/2009.12710v1.pdf
# MPNN https://arxiv.org/pdf/1704.01212v2.pdf
# KRR HDAD kernel ridge regression https://arxiv.org/pdf/1702.05532.pdf
# HDAD stands for Histogram of Distances, Angles and Dihedral angles.

# [2] Reports the average value of MAE / Chemical Accuracy over all targets
# [2] https://paperswithcode.com/sota/drug-discovery-on-qm9


# get rid of the degenerate molecules

In [16]:
from urllib import request
import tempfile
import os

# Remote text file that the code below saves as "uncharacterized.txt";
# its first whitespace-separated column holds an integer molecule index.
at_url = "https://ndownloader.figshare.com/files/3195404"

# Download into a scratch directory that is deleted again as soon as the
# file has been read, so repeated runs do not accumulate temp directories.
with tempfile.TemporaryDirectory(suffix="gdb9") as tmpdir:
    tmp_path = os.path.join(tmpdir, "uncharacterized.txt")
    request.urlretrieve(at_url, tmp_path)
    with open(tmp_path) as f:
        lines = f.readlines()

# The first nine lines and the last line are dropped (presumably header and
# footer -- confirm against the file). Blank lines are skipped so they cannot
# raise an IndexError when taking the first column.
evilmols = [int(line.split()[0]) for line in lines[9:-1] if line.strip()]

# Names in the 'gdb_<index>' form that the pre-filter compares against.
evilgdbs = ['gdb_%d' % mol_id for mol_id in evilmols]

In [34]:
# Keep a molecule only if its name is absent from the evilgdbs list.
pre_filter = lambda mol: mol.name not in evilgdbs

In [43]:
# Build the QM9 dataset rooted at the local directory, handing it pre_filter.
# NOTE(review): the pre-filter is presumably only applied when the dataset is
# first processed, not when a cached copy is loaded -- confirm.
dataset = QM9('../datasets/qm9_geometric/', pre_filter=pre_filter)

In [44]:
# actually QM9 already automatically gets rid of all the bad examples -.-

In [50]:
# Split / loader configuration (previously inline magic numbers).
SEED = 0
TRAIN_SIZE = 110000
VALID_SIZE = 10000
BATCH_SIZE = 128

# Seed torch's global RNG so the shuffle below -- and therefore the
# train/valid/test split and the training batch order -- is the same on
# every Restart & Run All.
torch.manual_seed(SEED)

# NOTE: this rebinds `dataset`; re-running only this cell permutes the
# already-shuffled dataset again, so re-run from the loading cell instead.
dataset = dataset.shuffle()

# Consecutive, non-overlapping slices of the shuffled dataset.
train_dataset = dataset[:TRAIN_SIZE]
valid_dataset = dataset[TRAIN_SIZE:TRAIN_SIZE + VALID_SIZE]
test_dataset = dataset[TRAIN_SIZE + VALID_SIZE:]

# Only the training loader reshuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [51]:
# Pull a single mini-batch from the training loader; the cells below inspect its fields.
batch = next(iter(train_loader))

In [92]:
# investigate batch

In [70]:
# NOTE(review): presumably maps every node to the index of its graph within
# the mini-batch (the output runs from 0 to 127 for batch_size=128) -- confirm.
batch['batch']

tensor([  0,   0,   0,  ..., 127, 127, 127])

In [86]:
# Count the entries equal to 2 -- presumably the number of atoms in the
# graph with index 2 of this batch (verify the meaning of batch['batch']).
(batch['batch'].detach().numpy() == 2).sum()

21

In [77]:
# node features:

In [82]:
# Node feature matrix. Column layout as noted by the author:
# one_hot(type) (5) + atomic_number, aromatic, sp, sp2, sp3, num_hs (1 each) = 11
batch['x']

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 1.],
        ...,
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.]])

In [83]:
# Shape of the node feature matrix; the second dimension is the 11 features per node.
batch['x'].shape

torch.Size([2266, 11])

In [81]:
# Atomic number of each node (atom) in the batch.
batch['z']

tensor([8, 6, 7,  ..., 1, 1, 1])

In [84]:
# edge features:

In [85]:
# Edge connectivity as a two-row tensor of node indices.
# NOTE(review): presumably row 0 is the source and row 1 the target node -- confirm.
batch['edge_index']

tensor([[   0,    1,    1,  ..., 2263, 2264, 2265],
        [   1,    0,    2,  ..., 2255, 2258, 2260]])

In [91]:
# Edge features: one-hot bond type with 4 classes (single, double, triple, aromatic).
batch['edge_attr']

tensor([[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [1., 0., 0., 0.],
        [1., 0., 0., 0.],
        [1., 0., 0., 0.]])

In [93]:
# Targets: select column 7 of y for every molecule in the batch.
# According to the author's note this column is U0.
batch['y'][:, 7]

tensor([-13330.6152, -11508.6113,  -9522.1328, -10532.2461, -15526.5850,
        -12521.9912,  -9961.0420, -10500.1523,  -9269.8799, -11374.0488,
        -11949.5830, -11746.2314,  -9899.7324, -10500.0879, -10935.5459,
        -11948.6318, -10467.1699, -11542.8604, -10936.4619, -11916.1602,
        -10465.9971, -10499.7939, -11980.9199, -10468.0762, -11980.3975,
        -12349.3623, -10598.5361, -11980.0430, -12424.0635, -11544.2549,
        -12488.9180, -11373.1387, -11406.5303, -10464.4551,  -9893.1094,
        -10499.8291, -11342.2910,  -6693.9434, -10473.8047,  -9555.8594,
        -10936.0420, -10971.0967, -11511.0000, -11371.8154, -13397.6348,
         -9927.8242,  -8551.8311,  -9496.6719, -11844.5283, -10533.1689,
        -11374.5859, -10904.3330, -10338.7227,  -9959.4531, -11340.4824,
        -11845.9775, -11373.9443, -11450.9395, -10498.0254, -10971.6348,
        -11404.5635, -11947.7607, -11947.5020, -11543.1924, -11510.4512,
        -11343.3252, -10906.3037, -13768.1445, -127

In [94]:
# Number of GPUs requested; zero forces CPU execution.
ngpu = 1

# Use the first CUDA device when one is available and requested, else the CPU.
if torch.cuda.is_available() and ngpu > 0:
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Model

# Training