In [1]:
import matplotlib.pyplot as plt
import numpy as np
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.backend import mean, square

from spektral.datasets import qm9
from spektral.layers import EdgeConditionedConv, GlobalAttentionPool
from spektral.utils import label_to_one_hot

from os import path

Using TensorFlow backend.


In [2]:
A_complete, X_complete, E_complete, y_complete = qm9.load_data(return_type='numpy',
                           nf_keys='atomic_num',
                           ef_keys='type',
                           self_loops=True,
                           amount=None)  # Set to None to train on whole dataset
# one-hot labeling of atoms
uniq_X = np.unique(X_complete)
X_complete = label_to_one_hot(X_complete, uniq_X)

Loading QM9 dataset.
Reading SDF


100%|██████████| 133885/133885 [00:38<00:00, 3520.21it/s]


In [3]:
y_complete

Unnamed: 0,mol_id,A,B,C,mu,alpha,homo,lumo,gap,r2,zpve,u0,u298,h298,g298,cv,u0_atom,u298_atom,h298_atom,g298_atom
0,gdb_1,157.71180,157.709970,157.706990,0.0000,13.21,-0.3877,0.1171,0.5048,35.3641,0.044749,-40.478930,-40.476062,-40.475117,-40.498597,6.469,-395.999595,-398.643290,-401.014647,-372.471772
1,gdb_2,293.60975,293.541110,191.393970,1.6256,9.46,-0.2570,0.0829,0.3399,26.1563,0.034358,-56.525887,-56.523026,-56.522082,-56.544961,6.316,-276.861363,-278.620271,-280.399259,-259.338802
2,gdb_3,799.58812,437.903860,282.945450,1.8511,6.31,-0.2928,0.0687,0.3615,19.0002,0.021375,-76.404702,-76.401867,-76.400922,-76.422349,6.002,-213.087624,-213.974294,-215.159658,-201.407171
3,gdb_4,0.00000,35.610036,35.610036,0.0000,16.28,-0.2845,0.0506,0.3351,59.5248,0.026841,-77.308427,-77.305527,-77.304583,-77.327429,8.574,-385.501997,-387.237686,-389.016047,-365.800724
4,gdb_5,0.00000,44.593883,44.593883,2.8937,12.99,-0.3604,0.0191,0.3796,48.7476,0.016601,-93.411888,-93.409370,-93.408425,-93.431246,6.278,-301.820534,-302.906752,-304.091489,-288.720028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133880,gdb_133881,3.59483,2.198990,1.904230,1.6637,69.37,-0.2254,0.0588,0.2842,760.7472,0.127406,-400.633868,-400.628599,-400.627654,-400.663098,23.658,-1603.983913,-1614.898804,-1623.788097,-1492.819438
133881,gdb_133882,3.65648,2.142370,1.904390,1.2976,69.52,-0.2393,0.0608,0.3002,762.6354,0.127495,-400.629713,-400.624444,-400.623500,-400.658942,23.697,-1601.376613,-1612.291504,-1621.181424,-1490.211511
133882,gdb_133883,3.67118,2.143140,1.895010,1.2480,73.60,-0.2233,0.0720,0.2953,780.3553,0.140458,-380.753918,-380.748619,-380.747675,-380.783148,23.972,-1667.045429,-1678.830048,-1688.312964,-1549.143391
133883,gdb_133884,3.52845,2.151310,1.865820,1.9576,77.40,-0.2122,0.0881,0.3003,803.1904,0.152222,-364.720374,-364.714974,-364.714030,-364.749650,24.796,-1794.600439,-1807.210860,-1817.286772,-1670.349892


In [4]:
A, X, E = list(), list(), list()
y = y_complete.sample(10000)
for index, row in y.iterrows():
    A.append(A_complete[index])
    X.append(X_complete[index])
    E.append(E_complete[index])
A = np.stack(A, axis=0)
X = np.stack(X, axis=0)
E = np.stack(E, axis=0)

In [5]:
tasks = list(y.columns)[1:]
num_tasks = len(tasks)
y_list = []
for task in tasks:
    y_list.append(y[[task]].values)
    
key_to_index = dict(zip(tasks, range(len(tasks))))
key_to_mean = dict()
key_to_std = dict()
for task in tasks:
    key_to_mean[task] = np.mean(y[[task]].values)
    key_to_std[task] = np.std(y[[task]].values)

# Transforms the output values to have mean 0 and variance 1
for i in range(len(y_list)):
    y_list[i] = StandardScaler().fit_transform(y_list[i]).reshape(-1, y_list[-1].shape[-1])

In [6]:
clusters = [['A', 'B', 'alpha'], 
               ['C', 'r2', 'u0'],
               ['zpve', 'g298', 'cv'],
               ['lumo', 'u298', 'h298'],
               ['mu', 'homo']]

In [7]:
N = X.shape[-2]           # Number of nodes in the graphs
F = X.shape[-1]           # Node features dimensionality
S = E.shape[-1]           # Edge features dimensionality
n_out = y_list[0].shape[-1]    # Dimensionality of the target
learning_rate = 1e-3      # Learning rate for SGD
epochs = 25               # Number of training epochs # formerly 25
batch_size = 64           # Batch size
es_patience = 5           # Patience fot early stopping
soft = False
soft_weight = 0.1

In [8]:
A_train, A_test, \
X_train, X_test, \
E_train, E_test, \
*y_train_test_list = train_test_split(A, X, E, *y_list, test_size = 0.1)

y_train_list = y_train_test_list[::2]
y_test_list = y_train_test_list[1::2]

In [9]:
X_in = Input(shape=(N, F))
A_in = Input(shape=(N, N))
E_in = Input(shape=(N, N, S))





In [10]:
def create_single_task_model():
    gc1 = EdgeConditionedConv(64, activation='relu')([X_in, A_in, E_in])
    gc2 = EdgeConditionedConv(128, activation='relu')([gc1, A_in, E_in])
    pool = GlobalAttentionPool(256)(gc2)
    dense = Dense(256, activation='relu')(pool)
    output = Dense(n_out)(dense)
    return Model(inputs=[X_in, A_in, E_in], outputs=output)

In [11]:
def create_hard_parameter_sharing_model(num_tasks=1):
    gc1 = EdgeConditionedConv(64, activation='relu')([X_in, A_in, E_in])
    gc2 = EdgeConditionedConv(128, activation='relu')([gc1, A_in, E_in])
    pool = GlobalAttentionPool(256)(gc2)
    dense_list = [Dense(256, activation='relu')(pool) for i in range(num_tasks)]
    output_list = [Dense(n_out)(dense_layer) for dense_layer in dense_list]
    return Model(inputs=[X_in, A_in, E_in], outputs=output_list)

In [12]:
def create_soft_paramter_sharing_model_and_loss(soft_weight, num_tasks=1):
    gc1_list = [EdgeConditionedConv(64, activation='relu')([X_in, A_in, E_in]) for i in range(num_tasks)]
    gc2_list = [EdgeConditionedConv(128, activation='relu')([gc1_layer, A_in, E_in]) for gc1_layer in gc1_list]
    pool_list = [GlobalAttentionPool(256)(gc2_layer) for gc2_layer in gc2_list]
    dense_list = [Dense(256, activation='relu')(pool_layer) for pool_layer in pool_list]
    output_list = [Dense(n_out)(dense_layer) for dense_layer in dense_list]
    model = Model(inputs=[X_in, A_in, E_in], outputs=output_list)
    
    def loss(y_true, y_pred):
        avg_layer_diff = 0
        for i in range(len(dense_list)):
            for j in range(i):
                avg_layer_diff += mean(square(dense_list[i]-dense_list[j]))
        avg_layer_diff /= len(dense_list)
        return mean(square(y_pred - y_true)) + soft_weight*avg_layer_diff
    
    return Model(inputs=[X_in, A_in, E_in], outputs=output_list), loss

In [13]:
def generate_filename(tasks):
    tasks_str = "".join(sorted(tasks))
    return path.join('demo_models', tasks_str + '.h5')

In [14]:
def generate_helper_filename(task):
    return path.join('demo_models', task + '.txt')

In [15]:
def train_and_save_multitask_model(tasks, y_train_list):
    model = create_hard_parameter_sharing_model(len(tasks))
    model.compile(optimizer=Adam(lr=learning_rate), loss='mse')
    es_callback = EarlyStopping(monitor='val_loss', patience=es_patience)
    training_set = [y_train_list[key_to_index[task]] for task in tasks]
    model.fit([X_train, A_train, E_train],
             training_set,
             batch_size=batch_size,
             validation_split=0.1,
             epochs=epochs,
             callbacks=[es_callback])
    model.save_weights(generate_filename(tasks))
    for task in tasks:
        helper_file = generate_helper_filename(task)
        with open(helper_file, 'w') as file:
            print(key_to_mean[task], file=file)
            print(key_to_std[task], file=file)

In [16]:
def load_and_evaluate_model(tasks, y_test_list):
    model = create_hard_parameter_sharing_model(len(tasks))
    model.load_weights(generate_filename(tasks))
    model.compile(optimizer=Adam(lr=learning_rate), loss='mse')
    testing_set = [y_test_list[key_to_index[task]] for task in tasks]
    eval_results = model.evaluate([X_test, A_test, E_test], testing_set, batch_size=batch_size)
    return eval_results

In [17]:
def calculate_property(prop, mol_id):
    for cluster in clusters:
        if prop in cluster:
            model = create_hard_parameter_sharing_model(len(cluster))
            model.load_weights(generate_filename(cluster))
            model.compile(optimizer=Adam(lr=learning_rate), loss='mse')
            predictions = model.predict([[X_complete[mol_id-1]], [A_complete[mol_id-1]], [E_complete[mol_id-1]]])
            mean, std = 0, 1
            with open(generate_helper_filename(prop), 'r') as f:
                lines = f.readlines()
                mean = float(lines[0].strip())
                std = float(lines[1].strip())
            prediction = mean + std * predictions[1 + cluster.index(prop)]
            return prediction[0][0]

In [18]:
for cluster in clusters:
    train_and_save_multitask_model(cluster, y_train_list)





Train on 8100 samples, validate on 900 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Train on 8100 samples, validate on 900 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25


Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Train on 8100 samples, validate on 900 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25


Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Train on 8100 samples, validate on 900 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


Train on 8100 samples, validate on 900 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [19]:
print(load_and_evaluate_model(['A', 'B', 'alpha'], y_test_list))

[0.7329079527854919, 0.36025442719459533, 0.30761341547966004, 0.06504011046886445]


In [26]:
print(calculate_property('A', 13333))
print(y_complete.loc[13333 - 1, 'A'])

4.059586
5.20531


In [21]:
# optimizer = Adam(lr=learning_rate)
# if soft:
#     model, loss = create_soft_paramter_sharing_model_and_loss(X_in, A_in, E_in, soft_weight)
#     model.compile(optimizer=optimizer, loss=loss)
# else:
#     model = create_hard_paramter_sharing_model(X_in, A_in, E_in)
#     model.compile(optimizer=optimizer, loss='mse')
# model.summary()

In [22]:
# log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [23]:
# es_callback = EarlyStopping(monitor='val_loss', patience=es_patience)

# model.fit([X_train, A_train, E_train],
#           y_train_list,
#           batch_size=batch_size,
#           validation_split=0.1,
#           epochs=epochs,
#           callbacks=[es_callback])

In [24]:
# print('Evaluating model.')
# eval_results = model.evaluate([X_test, A_test, E_test],
#                               y_test_list,
#                               batch_size=batch_size)
# print('Done.\n'
#       'Test loss: {}'.format(eval_results))

In [25]:
# preds = model.predict([X_test, A_test, E_test])

# if num_tasks == 1:
#     preds = np.transpose(preds)

# for i in range(num_tasks):
#     plt.figure()
#     plt.scatter(preds[i], y_test_list[i], alpha=0.3)
#     plt.plot()
#     plt.title(tasks[i])
#     plt.xlabel('Predicted')
#     plt.ylabel('Actual')
#     # plt.savefig('graphs/' + '11_5_'+tasks[i]+'_multitask')