In [None]:
# import os
# os.environ['CUDA_VISIBLE_DEVICES']='1'

In [None]:

#dataset of docs taken from jupyter-dev using:
#     from gaspy import gasdb, defaults
#     import warnings
#     warnings.filterwarnings('ignore')

#     filters = defaults.adsorption_filters('CO')
#     #filters['results.energy'] = {'$gt': -3.5, '$lt': 9.0}
#     #filters['processed_data.movement_data.max_adsorbate_movement']['$lt'] = 4.0 #specifically for OOH vs OH

#     # Establish the fingerprints that are needed for the preprocessing
#     fingerprints = {}
#     fingerprints['atoms']='$atoms'
#     fingerprints['results']='$results'
#     fingerprints['max_surface_movement']='$processed_data.movement_data.max_surface_movement'
#     fingerprints['adsorption_site'] = '$initial_configuration.atoms.atoms'
#     # Pull the documents and then modify them so that they'll work with the preprocessor
#     docs = gasdb.get_adsorption_docs(['CO'],extra_fingerprints=fingerprints, filters=filters)

#CO_docs = pickle.load(open('/home/zulissi/CO_docs_200.pkl','rb'))
    

# with open('/home/zulissi/CO_docs_200.pkl','wb') as fhandle:
#     pickle.dump(CO_docs,fhandle)

## Load the dataset as mongo docs

In [1]:
import random
import pickle

# Load the pickled list of documents, closing the file handle once it is read.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that come from a trusted source.
with open('/home/zulissi/CO_docs.pkl', 'rb') as fhandle:
    docs = pickle.load(fhandle)

# Shuffle in place so the slice below keeps a random 200-document subset.
# The shuffle is unseeded, so the selected subset differs between runs.
random.shuffle(docs)
docs = docs[:200]

## Get the size of the features from the data transformer, to be used in setting up the net model

In [2]:
from torch.utils.data import Dataset, DataLoader
import mongo
from cgcnn.data import StructureData, ListDataset, StructureDataTransformer
import numpy as np
import tqdm
from sklearn.preprocessing import StandardScaler


# Collect the target energies and fit a scaler on them as one 2D column.
energies = np.array([entry['energy'] for entry in docs])
scaler = StandardScaler()
scaler.fit(energies.reshape(-1, 1))

# Settings for the transformer that turns docs into model inputs.
_sdt_settings = dict(
    atom_init_loc='/home/zulissi/software/cgcnn_sklearn/atom_init.json',
    max_num_nbr=9,
    radius=1,
    use_tag=True,
    use_fixed_info=True,
)
SDT = StructureDataTransformer(**_sdt_settings)
SDT_out = SDT.transform(docs)

# Read the feature widths off the first transformed item; the net built
# later needs them as constructor arguments.
# NOTE(review): presumably element 0 holds atom features and element 1
# neighbour features, going by the variable names -- confirm in cgcnn.data.
structures = SDT_out[0]
orig_atom_fea_len = structures[0].shape[-1]
nbr_fea_len = structures[1].shape[-1]



## CGCNN model with skorch to make it sklearn compliant

In [3]:
from torch.optim import Adam
from sklearn.model_selection import ShuffleSplit
from skorch.callbacks import Checkpoint, LoadInitState #needs skorch 0.4.0, conda-forge version at 0.3.0 doesn't cut it
from cgcnn.data import collate_pool
from skorch import NeuralNetRegressor
from cgcnn.model import CrystalGraphConvNet
import torch
from cgcnn.data import MergeDataset


# Use the GPU when torch reports one is available, otherwise fall back to CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else 'cpu'

#This is a little weird, one of the parameters basically needs to match the results 
# that are coming from the transform function, should be fixable 

from sklearn.model_selection import KFold

def train_test_split(X, y):
    """Split X and y into train/test datasets using the first of 5 K-folds.

    Args:
        X: Inputs; must support ``len`` and indexing with an integer index
            array (e.g. a numpy array).
        y: Targets aligned with ``X``, supporting the same kind of indexing.

    Returns:
        Tuple ``(dataset_train, dataset_test)``; each is a list of
        ``(x, y)`` pairs.
    """
    # Only the first split produced by the 5-fold splitter is used.
    kf = KFold(n_splits=5)
    train_idx, test_idx = next(kf.split(X))

    dataset_train = list(zip(X[train_idx], y[train_idx]))
    dataset_test = list(zip(X[test_idx], y[test_idx]))

    return dataset_train, dataset_test

 
# Checkpoint callback that monitors the 'valid_loss_best' history entry.
cp = Checkpoint(monitor='valid_loss_best')

# Arguments for the CrystalGraphConvNet module (passed with skorch's
# `module__` prefix). The two feature lengths are taken from the transformed
# data computed earlier, so they have to agree with the transformer output.
_module_kwargs = dict(
    module__orig_atom_fea_len=orig_atom_fea_len,
    module__nbr_fea_len=nbr_fea_len,
    module__atom_fea_len=46,
    module__h_fea_len=83,
    module__n_conv=8,
    module__n_h=4,
)

# Settings for the training and validation iterators; both use collate_pool
# to assemble batches.
_iterator_kwargs = dict(
    iterator_train__batch_size=214,
    iterator_train__pin_memory=True,
    #iterator_train__num_workers=0,
    iterator_train__collate_fn=collate_pool,
    iterator_valid__pin_memory=True,
    #iterator_valid__num_workers=0,
    iterator_valid__collate_fn=collate_pool,
)

net = NeuralNetRegressor(
    CrystalGraphConvNet,
    max_epochs=10,
    lr=np.exp(-5.18),
    optimizer=Adam,
    device=device,
    criterion=torch.nn.L1Loss,
    dataset=MergeDataset,
    callbacks=[cp],
    **_module_kwargs,
    **_iterator_kwargs
)




## Example converting all the documents up front

In [4]:
import multiprocess as mp

# Convert every document up front.
SDT_out = SDT.transform(docs)

# Pull each transformed item through a 4-worker pool, with a progress bar.
with mp.Pool(4) as pool:
    _converted = pool.imap(lambda x: SDT_out[x],
                           range(len(SDT_out)),
                           chunksize=40)
    SDT_list = list(tqdm.tqdm(_converted, total=len(SDT_out)))

# Build the target tensor: energies as one column, scaled with the scaler
# fitted earlier.
_raw_targets = np.array([doc['energy'] for doc in docs]).reshape(-1, 1)
target_list = scaler.transform(_raw_targets)
y = torch.FloatTensor(target_list)

# Train the network on the pre-converted inputs.
net.fit(SDT_list, y=y)

# Restore the parameters saved in 'params.pt' (best validation loss).
net.load_params('params.pt')


100%|██████████| 200/200 [00:25<00:00,  7.90it/s]


  epoch    train_loss    valid_loss    cp     dur
-------  ------------  ------------  ----  ------
      1        [36m0.7769[0m        [32m2.8486[0m     +  0.7164
      2        1.5297        [32m0.8276[0m     +  0.2098
      3        0.8223        [32m0.8070[0m     +  0.1989
      4        0.8359        0.9374        0.1956
      5        0.9581        0.9444        0.1960
      6        0.9710        0.8743        0.1957
      7        0.9193        0.8117        0.1953
      8        0.8518        [32m0.7725[0m     +  0.1955
      9        0.7934        [32m0.7532[0m     +  0.1954
     10        [36m0.7644[0m        [32m0.7438[0m     +  0.1955


## Test saving and loading and using a pipeline (single-threaded conversion)

In [5]:
from sklearn.pipeline import make_pipeline
# Chain the document transformer and the network into one sklearn pipeline.
pipe = make_pipeline(SDT, net)

# Persist the sklearn-compatible pipeline to disk with pickle.
with open('fitted-pipeline.pkl', 'wb') as fhandle:
    pickle.dump(pipe, fhandle)

In [6]:
# Reload the pipeline from 'fitted-pipeline.pkl', the file this notebook
# actually writes ('fitted-model.pkl' is never produced here), and close the
# file handle once the pickle is read.
with open('fitted-pipeline.pkl', 'rb') as fhandle:
    pipeline = pickle.load(fhandle)

# Predictions are in the scaled target units the net was trained on.
pipeline.predict(docs)

array([[ 0.37539738],
       [ 0.5736287 ],
       [ 0.2945856 ],
       [ 0.28321084],
       [-0.60433006],
       [ 0.7196084 ],
       [ 0.5606486 ],
       [-0.6748233 ],
       [ 0.46784127],
       [-1.0971079 ],
       [ 0.2739984 ],
       [-1.0812374 ],
       [ 0.78952384],
       [ 0.64816505],
       [ 0.89845055],
       [ 0.80677325],
       [ 0.2926386 ],
       [ 0.83383465],
       [ 0.91047287],
       [ 1.0416356 ],
       [-1.0808791 ],
       [-0.32229933],
       [ 0.6984915 ],
       [-0.32724988],
       [ 1.0069227 ],
       [-1.1325859 ],
       [ 0.7002761 ],
       [ 0.35284716],
       [ 0.31941128],
       [ 0.74202013],
       [ 0.70782155],
       [ 0.95855284],
       [ 0.44919765],
       [ 0.75663906],
       [-1.1704398 ],
       [-0.99034244],
       [ 1.0443858 ],
       [ 0.4120098 ],
       [ 0.55018395],
       [-1.220707  ],
       [ 0.6030986 ],
       [-0.3989029 ],
       [ 0.9047002 ],
       [-0.88523614],
       [-0.38830686],
       [-0

## Test saving and loading and using a pipeline (multi-threaded conversion)

In [None]:
import os

# Reload the saved pipeline in the parent process. 'fitted-pipeline.pkl' is
# the file this notebook writes; 'fitted-model.pkl' is never produced here.
with open('fitted-pipeline.pkl', 'rb') as fhandle:
    pipeline = pickle.load(fhandle)


def init():
    """Pool initializer: load a per-worker copy of the pipeline.

    Binds the loaded pipeline to the module-level name ``pipeline2`` so the
    function mapped over the pool can look it up inside each worker.
    """
    # Must name the variable that is actually assigned below; declaring a
    # different global would leave `pipeline2` local to this function.
    global pipeline2
    # Clear the visible-GPU list for this worker before loading the model.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    with open('fitted-pipeline.pkl', 'rb') as fhandle:
        pipeline2 = pickle.load(fhandle)


with mp.Pool(4, initializer=init) as pool:
    # NOTE(review): each task passes a single doc to predict(), whereas the
    # other cells pass a list of docs -- confirm the transformer accepts a
    # bare doc, or wrap it as [x].
    predictions = list(tqdm.tqdm(pool.imap(lambda x: pipeline2.predict(x),
                                           docs,
                                           chunksize=40), total=len(docs)))

    
# SDT = pipeline.named_steps.structuredatatransformer

#     #pipeline.predict(docs)



# SDT_out = SDT.transform(docs)

# with mp.Pool(4) as pool:
#     SDT_list = ListDataset(list(tqdm.tqdm(pool.imap(lambda x: SDT_out[x],range(len(SDT_out)),
#                                                     chunksize=40,
#                                                    initializer=init),total=len(SDT_out))))

# pipeline.named_steps.neuralnetregressor.predict([SDT_list[0:1]])

In [None]:
# NOTE(review): scratch inspection cell. `pipeline2` is only assigned inside
# the pool initializer `init`, so it may not exist in this kernel's namespace.
pipeline2

In [None]:
# Scratch check: call the pipeline's regressor step directly (bypassing the
# transformer step) on a one-item slice of the pre-converted list.
pipeline.named_steps.neuralnetregressor.predict([SDT_list[0:1]])

In [None]:
# Scratch check: display the first transformed item.
SDT_out[0]

In [None]:
# Scratch check: run the regressor step directly on the full transformed output.
pipeline.named_steps.neuralnetregressor.predict(SDT_out)

In [None]:
%debug

## Example of using the pipeline to do conversion and everything (works, but is slow because of dataset loading issues)

In [None]:
# from sklearn.pipeline import make_pipeline

# #y, energies
# target_list = scaler.transform(np.array([doc['energy'] for doc in docs]).reshape(-1,1))
# y = torch.FloatTensor(target_list)

# #Make the sklearn pipeline (convert doc, then net)
# pipe = make_pipeline(SDT, net)

# #Fit the pipeline
# pipe.fit(docs,y=y)

# #Load the best parameters (best validation)
# net.load_params('params.pt')

# #Save the fitted sklearn-compatible pipeline
# with open('fitted-pipeline.pkl','wb') as fhandle:
#     pickle.dump(pipe,fhandle)
