This notebook demonstrates how to build, train, save, load, and use a scikit-learn-compatible CGCNN model.

In [1]:
import os
import sys
import numpy as np
import cgcnn

#Select which GPU to use if necessary
#%env CUDA_DEVICE_ORDER=PCI_BUS_ID
#%env CUDA_VISIBLE_DEVICES=0

## Load the dataset as mongo docs

In [2]:
import random
import pickle

#Load a selection of documents
# NOTE(review): pickle can execute arbitrary code while loading; only open files you trust.
with open('CO_docs.pkl', 'rb') as docs_file:
    # The context manager closes the file handle as soon as loading finishes.
    docs = pickle.load(docs_file)

# Shuffle with a private, seeded generator so that the same subset of documents
# is selected on every run without touching the global `random` state.
random.Random(42).shuffle(docs)

# Keep only the documents whose 'energy' lies strictly between -3 and 1.0 ...
docs = [doc for doc in docs if -3 < doc['energy'] < 1.0]
# ... and cap the working set at the first 2000 of them.
docs = docs[:2000]

## Get the size of the features from the data transformer, to be used in setting up the net model

In [3]:
from torch.utils.data import Dataset, DataLoader
import mongo
from cgcnn.data import StructureData, ListDataset, StructureDataTransformer
import numpy as np
import tqdm
from sklearn.preprocessing import StandardScaler


# Transformer that converts the loaded documents into CGCNN inputs.
# NOTE(review): atom_init_loc is an absolute path on one specific machine;
# it has to be adjusted before running anywhere else.
SDT = StructureDataTransformer(
    atom_init_loc='/home/zulissi/software/cgcnn_sklearn/atom_init.json',
    max_num_nbr=12,
    step=0.2,
    radius=1,
    use_tag=True,
    use_fixed_info=True,
)

SDT_out = SDT.transform(docs)

# Only the first transformed entry is inspected; it is used purely to read off sizes.
structures = SDT_out[0]

#Settings necessary to build the model (since they are size of vectors as inputs)
# Presumably element 0 holds the atom features and element 1 the neighbor
# features (inferred from the variable names) -- confirm against cgcnn.data.
orig_atom_fea_len = structures[0].shape[-1]
nbr_fea_len = structures[1].shape[-1]

## CGCNN model with skorch to make it sklearn compliant

In [4]:
from torch.optim import Adam, SGD
from sklearn.model_selection import ShuffleSplit
from skorch.callbacks import Checkpoint, LoadInitState #needs skorch 0.4.0, conda-forge version at 0.3.0 doesn't cut it
from cgcnn.data import collate_pool
from skorch import NeuralNetRegressor
from cgcnn.model import CrystalGraphConvNet
import torch
from cgcnn.data import MergeDataset
import skorch.callbacks.base


# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else 'cpu'

#Make a checkpoint to save parameters every time there is a new best for validation loss
cp = Checkpoint(
    monitor='valid_loss_best',
    fn_prefix='valid_best_',
)

#Callback to load the checkpoint with the best validation loss at the end of training
class train_end_load_best_valid_loss(skorch.callbacks.base.Callback):
    """Reload the parameters stored in 'valid_best_params.pt' when training ends.

    The file name is presumably the one written by the Checkpoint callback that
    is configured with fn_prefix='valid_best_' -- confirm if the prefix changes.
    """

    def on_train_end(self, net, X=None, y=None, **kwargs):
        """Load 'valid_best_params.pt' into ``net``.

        Parameters
        ----------
        net : the skorch net that has just finished training.
        X, y : training data handed to the hook; not used here.
        **kwargs : any further keyword arguments passed to the hook; ignored.
            Accepting them keeps the override compatible with the base
            ``Callback.on_train_end`` signature.
        """
        net.load_params('valid_best_params.pt')


load_best_valid_loss = train_end_load_best_valid_loss()


## Example converting all the documents up front

In [None]:
import multiprocess as mp
from sklearn.model_selection import ShuffleSplit

SDT_out = SDT.transform(docs)

# Materialise every entry of SDT_out up front, using 4 worker processes and
# showing a progress bar while the items are collected.
n_structures = len(SDT_out)
with mp.Pool(4) as pool:
    converted = pool.imap(lambda idx: SDT_out[idx], range(n_structures), chunksize=40)
    SDT_list = list(tqdm.tqdm(converted, total=n_structures))

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
#Make the target list
# One 'energy' value per document, arranged as a single-column 2-D array.
energies = [doc['energy'] for doc in docs]
target_list = np.array(energies).reshape(-1, 1)

## Shuffle and Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the structures (and their targets) as a test set.
# random_state is fixed so the same test set is produced on every run.
SDT_training, SDT_test, target_training, target_test = train_test_split(
    SDT_list, target_list, test_size=0.2, random_state=42)


## Fit the model

In [None]:
from skorch.dataset import CVSplit
from skorch.callbacks.lr_scheduler import WarmRestartLR, LRScheduler

# Splitter with a fixed seed that sets aside 25% of the samples it is given.
train_test_splitter = ShuffleSplit(test_size=0.25, random_state=42)

# 'MultiStepLR' schedule with a single milestone at 100 and gamma=0.1.
LR_schedule = LRScheduler(
    'MultiStepLR',
    milestones=[100],
    gamma=0.1,
)

# sklearn-style regressor wrapping CrystalGraphConvNet. The keyword arguments
# are grouped by the part of the pipeline they configure.
net = NeuralNetRegressor(
    CrystalGraphConvNet,
    # -- module__* arguments (input sizes come from the transformed data) --
    module__orig_atom_fea_len=orig_atom_fea_len,
    module__nbr_fea_len=nbr_fea_len,
    module__atom_fea_len=46,
    module__h_fea_len=83,
    module__n_conv=8,
    module__n_h=4,
    module__classification=False,
    # -- loss / optimisation --
    criterion=torch.nn.MSELoss,
    optimizer=Adam,
    lr=0.0056,
    batch_size=214,
    max_epochs=292,  #188
    # -- dataset, validation split and batch iterators --
    dataset=MergeDataset,
    train_split=CVSplit(cv=train_test_splitter),
    iterator_train__collate_fn=collate_pool,
    iterator_train__num_workers=0,
    iterator_train__pin_memory=True,
    iterator_valid__collate_fn=collate_pool,
    iterator_valid__num_workers=0,
    iterator_valid__pin_memory=True,
    # -- runtime --
    device=device,
    callbacks=[cp, load_best_valid_loss, LR_schedule],
)

In [None]:
# Initialise the net, then fit it on the training structures and their targets.
net.initialize()
net.fit(SDT_training, target_training)

## Plot

In [None]:
import pandas as pd

# Regenerate the first split of the seeded splitter to get train/validation
# indices into SDT_training.
# NOTE(review): assumed to be the same split the net used during fitting -- confirm.
train_indices, valid_indices = next(train_test_splitter.split(SDT_training))

# Run inference over the whole training pool once and reuse the result for
# both the training and the validation subsets (previously predicted twice).
training_pool_actual = np.array(target_training.reshape(-1))
training_pool_predicted = net.predict(SDT_training)

training_data = {'actual_value': training_pool_actual[train_indices],
                 'predicted_value': training_pool_predicted[train_indices].reshape(-1)}
validation_data = {'actual_value': training_pool_actual[valid_indices],
                   'predicted_value': training_pool_predicted[valid_indices].reshape(-1)}
test_data = {'actual_value': np.array(target_test).reshape(-1),
             'predicted_value': net.predict(SDT_test).reshape(-1)}

# One frame per subset, each with 'actual_value' and 'predicted_value' columns.
df_training = pd.DataFrame(training_data)
df_validation = pd.DataFrame(validation_data)
df_test = pd.DataFrame(test_data)

In [None]:
# Write the training and test tables as tab-separated text without the index column.
for frame, filename in ((df_training, 'training.csv'), (df_test, 'test.csv')):
    frame.to_csv(filename, sep='\t', index=False)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def _scatter_with_metrics(ax, df, color, name):
    """Scatter predicted vs. actual values of ``df`` on ``ax`` with a metrics legend entry.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    df : frame with 'actual_value' and 'predicted_value' columns.
    color : marker colour passed to ``ax.scatter``.
    name : subset name shown on the first line of the legend label.

    The legend label reports MAE, RMSE and R^2 of the predictions, each
    formatted with two decimals.
    """
    actual = df['actual_value']
    predicted = df['predicted_value']
    mae = mean_absolute_error(actual, predicted)
    rmse = np.sqrt(mean_squared_error(actual, predicted))
    r2 = r2_score(actual, predicted)
    ax.scatter(actual, predicted, color=color, marker='o', alpha=0.5,
               label='%s\nMAE=%0.2f, RMSE=%0.2f, R$^2$=%0.2f' % (name, mae, rmse, r2))


f, ax = plt.subplots(figsize=(8,8))

# One scatter series per data subset, drawn in the same order as before.
_scatter_with_metrics(ax, df_training, 'orange', 'train')
_scatter_with_metrics(ax, df_validation, 'blue', 'valid')
_scatter_with_metrics(ax, df_test, 'green', 'test')

# Dashed parity (y = x) line spanning the range of the actual training values.
parity_limits = [min(df_training['actual_value']), max(df_training['actual_value'])]
ax.plot(parity_limits, parity_limits, 'k--')

# format graph
ax.tick_params(labelsize=14)
ax.set_xlabel('DFT E (eV)', fontsize=14)
ax.set_ylabel('CGCNN predicted E (eV)', fontsize=14)
ax.set_title('Multi-element ', fontsize=14)
ax.legend(fontsize=12)

plt.show()
