This notebook demonstrates how to build, train, save, load, and use a scikit-learn-compatible CGCNN model.

In [1]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES']='1'

In [2]:

#dataset of docs taken from jupyter-dev using:
#     from gaspy import gasdb, defaults
#     import warnings
#     warnings.filterwarnings('ignore')

#     filters = defaults.adsorption_filters('CO')
#     #filters['results.energy'] = {'$gt': -3.5, '$lt': 9.0}
#     #filters['processed_data.movement_data.max_adsorbate_movement']['$lt'] = 4.0 #specifically for OOH vs OH

#     # Establish the fingerprints that are needed for the preprocessing
#     fingerprints = {}
#     fingerprints['atoms']='$atoms'
#     fingerprints['results']='$results'
#     fingerprints['max_surface_movement']='$processed_data.movement_data.max_surface_movement'
#     fingerprints['adsorption_site'] = '$initial_configuration.atoms.atoms'
#     # Pull the documents and then modify them so that they'll work with the preprocessor
#     docs = gasdb.get_adsorption_docs(['CO'],extra_fingerprints=fingerprints, filters=filters)

#CO_docs = pickle.load(open('/home/zulissi/CO_docs_200.pkl','rb'))
    

# with open('/home/zulissi/CO_docs_200.pkl','wb') as fhandle:
#     pickle.dump(CO_docs,fhandle)

## Load the dataset as mongo docs

In [3]:
import random
import pickle

# Load the pickled documents (see the query sketched in the previous cell).
# NOTE: pickle can execute arbitrary code while loading, so only point this
# at a file that comes from a trusted source.
with open('/home/zulissi/CO_docs.pkl', 'rb') as fhandle:
    docs = pickle.load(fhandle)

# Shuffle in place *before* truncating so the 1000-document subset is a
# random sample rather than just the first 1000 entries of the file.
random.shuffle(docs)
docs = docs[:1000]

## Get the size of the features from the data transformer, to be used in setting up the net model

In [4]:
from torch.utils.data import Dataset, DataLoader
import mongo
from cgcnn.data import StructureData, ListDataset, StructureDataTransformer
import numpy as np
import tqdm
from sklearn.preprocessing import StandardScaler


# Fit a standard scaler on the target energies, presented as one feature column.
energies = np.array([entry['energy'] for entry in docs])
scaler = StandardScaler()
scaler.fit(energies.reshape(-1, 1))

# Configure the structure-data transformer used throughout the notebook.
SDT = StructureDataTransformer(
    atom_init_loc='/home/zulissi/software/cgcnn_sklearn/atom_init.json',
    max_num_nbr=7,
    radius=1,
    use_tag=True,
    use_fixed_info=True,
)

SDT_out = SDT.transform(docs)

# The last dimension of the first two arrays of the first transformed item is
# used as the atom / neighbour feature length when building the network below.
structures = SDT_out[0]
orig_atom_fea_len, nbr_fea_len = (structures[idx].shape[-1] for idx in (0, 1))



## CGCNN model with skorch to make it sklearn compliant

In [5]:
from torch.optim import Adam
from sklearn.model_selection import ShuffleSplit
from skorch.callbacks import Checkpoint, LoadInitState #needs skorch 0.4.0, conda-forge version at 0.3.0 doesn't cut it
from cgcnn.data import collate_pool
from skorch import NeuralNetRegressor
from cgcnn.model import CrystalGraphConvNet
import torch
from cgcnn.data import MergeDataset
import skorch.callbacks.base


# Use the GPU when PyTorch reports one is available; otherwise stay on the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else 'cpu'

# Make a checkpoint to save parameters every time there is a new best for validation loss
cp = Checkpoint(monitor='valid_loss_best')


class train_end_load_best_valid_loss(skorch.callbacks.base.Callback):
    """Callback that reloads saved parameters into the net when training ends.

    It is meant to be used together with the ``Checkpoint`` callback above, so
    that the net finishes training holding the parameters of the epoch with the
    best validation loss instead of those of the final epoch.
    """

    def on_train_end(self, net, X=None, y=None, **kwargs):
        """Load ``params.pt`` into ``net``.

        ``X``, ``y`` and any extra keyword arguments are accepted only so the
        hook matches the base-class signature; they are not used.
        """
        # NOTE(review): 'params.pt' is presumably the file the Checkpoint
        # callback writes by default; confirm for the installed skorch version
        # and keep the two in sync if the checkpoint target is ever changed.
        net.load_params('params.pt')


load_best_valid_loss = train_end_load_best_valid_loss()
# skorch regressor wrapping the CGCNN module so it can be used like an sklearn estimator.
net = NeuralNetRegressor(
    CrystalGraphConvNet,
    # Arguments prefixed with ``module__`` are passed on to CrystalGraphConvNet.
    module__orig_atom_fea_len=orig_atom_fea_len,
    module__nbr_fea_len=nbr_fea_len,
    module__atom_fea_len=46,
    module__h_fea_len=83,
    module__n_conv=8,
    module__n_h=4,
    # Training data loader. Memory pinning needs CUDA, so it follows the
    # ``cuda`` flag instead of being switched on unconditionally; this keeps
    # the CPU fallback usable.
    iterator_train__batch_size=214,
    iterator_train__pin_memory=cuda,
    iterator_train__num_workers=0,
    iterator_train__collate_fn=collate_pool,
    # Validation data loader (same pinning rule as above).
    iterator_valid__pin_memory=cuda,
    iterator_valid__num_workers=0,
    iterator_valid__collate_fn=collate_pool,
    # Optimisation settings.
    max_epochs=10,
    lr=np.exp(-5.18),
    optimizer=Adam,
    device=device,
    criterion=torch.nn.L1Loss,
    dataset=MergeDataset,
    # Checkpoint on every new best validation loss, then reload the saved
    # parameters when training ends.
    callbacks=[cp, load_best_valid_loss]
)


## Example converting all the documents up front

In [6]:
import multiprocess as mp

# Transform the documents, then materialise every item up front with a small
# worker pool so that training does not have to convert them on the fly.
SDT_out = SDT.transform(docs)

with mp.Pool(4) as pool:
    SDT_list = list(tqdm.tqdm(pool.imap(lambda x: SDT_out[x], range(len(SDT_out)), chunksize=40),
                              total=len(SDT_out)))

# Make the target list: the raw (unscaled) energies as a single column.
# A scaled version used to be computed here with `scaler.transform(...)` but was
# overwritten on the very next line, so it never reached the model; it has been
# dropped to make explicit that training uses unscaled targets.
target_list = np.array([doc['energy'] for doc in docs]).reshape(-1, 1)


100%|██████████| 1000/1000 [01:40<00:00,  9.97it/s]


# Test single training

In [7]:
# Train the network on the pre-converted structures and their targets
net.fit(X=SDT_list, y=target_list)

  epoch    train_loss    valid_loss    cp     dur
-------  ------------  ------------  ----  ------
      1        [36m0.8001[0m        [32m0.5471[0m     +  1.3488
      2        [36m0.5914[0m        0.5832        0.8138
      3        [36m0.5816[0m        [32m0.5184[0m     +  0.8133
      4        [36m0.5577[0m        [32m0.4731[0m     +  0.8122
      5        [36m0.4889[0m        [32m0.4160[0m     +  0.8125
      6        [36m0.4307[0m        0.6140        0.8127
      7        [36m0.4185[0m        0.8031        0.8117
      8        [36m0.4011[0m        0.6072        0.8123
      9        [36m0.4009[0m        0.4360        0.8120
     10        [36m0.3819[0m        0.4194        0.8129


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=CrystalGraphConvNet(
    (embedding): Linear(in_features=94, out_features=46, bias=True)
    (convs): ModuleList(
      (0): ConvLayer(
        (fc_full): Linear(in_features=98, out_features=92, bias=True)
        (sigmoid): Sigmoid()
        (softplus1): Softplus(beta=1, threshold=20)
        (bn1): BatchNorm1d(92, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm1d(46, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (softplus2): Softplus(beta=1, threshold=20)
      )
      (1): ConvLayer(
        (fc_full): Linear(in_features=98, out_features=92, bias=True)
        (sigmoid): Sigmoid()
        (softplus1): Softplus(beta=1, threshold=20)
        (bn1): BatchNorm1d(92, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (bn2): BatchNorm1d(46, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (softplus2): Softplus(

## Test saving and loading and using a pipeline (single-threaded conversion)

In [8]:
from sklearn.pipeline import make_pipeline
# Chain the transformer and the network so documents can be passed in directly
pipe = make_pipeline(SDT, net)

# Write the sklearn-compatible pipeline to disk
with open('fitted-pipeline.pkl', 'wb') as out_file:
    pickle.dump(pipe, out_file)

In [9]:
# Reload the pipeline from the file written by the previous cell
# ('fitted-pipeline.pkl') and close the handle once it has been read.
# NOTE: only unpickle files you trust; pickle can run arbitrary code on load.
with open('fitted-pipeline.pkl', 'rb') as fhandle:
    pipeline = pickle.load(fhandle)

# Predict straight from the documents; the pipeline applies the transformer first.
pipeline.predict(docs)

array([[ 0.8384002 ],
       [ 0.44950756],
       [ 0.8624066 ],
       [-1.0550358 ],
       [ 0.98745465],
       [ 0.586292  ],
       [ 0.95263827],
       [ 0.98174685],
       [-0.05879573],
       [ 0.800063  ],
       [ 0.9196766 ],
       [-0.32130083],
       [ 0.48017555],
       [ 0.94949406],
       [ 0.5370213 ],
       [-0.55270046],
       [-0.4050538 ],
       [ 0.504645  ],
       [-0.9592412 ],
       [-0.20463465],
       [ 0.72949976],
       [-0.99070495],
       [ 0.7621159 ],
       [ 0.7684037 ],
       [ 0.7612388 ],
       [ 0.22055197],
       [-1.2068889 ],
       [ 0.9720404 ],
       [ 0.6367614 ],
       [ 0.8578729 ],
       [-1.1057863 ],
       [-0.94082546],
       [ 0.81635696],
       [-1.0139832 ],
       [-0.87557876],
       [-0.7303076 ],
       [ 0.9046748 ],
       [ 0.9248668 ],
       [ 0.79441607],
       [-1.1301421 ],
       [-0.03730621],
       [ 0.58609164],
       [ 0.94179046],
       [-0.77236825],
       [ 0.85958314],
       [ 0

# Test sigopt


In [10]:
from sigopt_sklearn.search import SigOptSearchCV
from sklearn.metrics import get_scorer

import os

# Take the SigOpt API token from the environment so that a real token never has
# to be pasted into (and saved with) the notebook. The original placeholder is
# kept as the fallback, so the cell behaves as before when the variable is unset.
client_token = os.environ.get('SIGOPT_API_TOKEN', 'insert_sigopt_token_here')

# Search space: only the number of training epochs is tuned, between 10 and 20.
net_parameters = {'max_epochs': (10, 20)}

# 5-fold cross-validated search over `net`, scored by negative mean absolute error.
clf = SigOptSearchCV(net, net_parameters, cv=5,
                     client_token=client_token, n_jobs=1, n_iter=2,
                     scoring=get_scorer('neg_mean_absolute_error'))

clf.fit(SDT_list, target_list)


Re-initializing module because the following parameters were re-set: atom_fea_len, h_fea_len, n_conv, n_h, nbr_fea_len, orig_atom_fea_len.
Re-initializing optimizer because the following parameters were re-set: lr.
  epoch    train_loss    valid_loss    cp     dur
-------  ------------  ------------  ----  ------
      1        [36m0.9186[0m        [32m0.7733[0m     +  0.6957
      2        [36m0.6103[0m        [32m0.5523[0m     +  0.6514
      3        0.6175        [32m0.4866[0m     +  0.6517
      4        [36m0.5823[0m        0.5521        0.6505
      5        [36m0.5634[0m        0.5235        0.6507
      6        [36m0.4783[0m        [32m0.4176[0m     +  0.6510
      7        [36m0.4322[0m        0.5811        0.6509
      8        0.4400        0.5440        0.6510
      9        [36m0.3997[0m        0.5409        0.6506
     10        0.4102        0.4641        0.6516
     11        [36m0.3914[0m        0.4689        0.6505
     12        [36m0.3908

      2        [36m0.6122[0m        [32m0.5466[0m     +  0.6563
      3        0.6262        [32m0.5233[0m     +  0.6554
      4        [36m0.5973[0m        0.5484        0.6556
      5        [36m0.5839[0m        0.5620        0.6560
      6        [36m0.5293[0m        0.5474        0.6555
      7        [36m0.4597[0m        [32m0.3963[0m     +  0.6547
      8        [36m0.4450[0m        0.5688        0.6562
      9        [36m0.4406[0m        0.5484        0.6553
     10        [36m0.4196[0m        0.4945        0.6558
Re-initializing module because the following parameters were re-set: atom_fea_len, h_fea_len, n_conv, n_h, nbr_fea_len, orig_atom_fea_len.
Re-initializing optimizer because the following parameters were re-set: lr.
  epoch    train_loss    valid_loss    cp     dur
-------  ------------  ------------  ----  ------
      1        [36m0.8385[0m        [32m0.6565[0m     +  0.6522
      2        [36m0.5561[0m        [32m0.5863[0m     +  0.6509


SigOptSearchCV(client_token=None, cv=5, cv_timeout=None, error_score='raise',
        estimator=<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=CrystalGraphConvNet(
    (embedding): Linear(in_features=94, out_features=46, bias=True)
    (convs): ModuleList(
      (0): ConvLayer(
        (fc_full): Linear(in_features=98, out_features=92, bias=True)
        (sigmo...s(beta=1, threshold=20)
    )
    (fc_out): Linear(in_features=83, out_features=1, bias=True)
  ),
),
        fit_params=None, iid=True, n_iter=2, n_jobs=1, n_sug=1,
        opt_timeout=None, param_domains={'max_epochs': (10, 20)},
        pre_dispatch='2*n_jobs', refit=True,
        scoring=make_scorer(mean_absolute_error, greater_is_better=False),
        sigopt_connection=<sigopt.interface.Connection object at 0x7fe600305f28>,
        verbose=0)