This notebook loads the preprocessed data created by `./sdt/*/create_sdt.py` and `./fingerprint/fingerprint_*.ipynb` and then splits them into train, validate, and test sets.

Note that our GASpy fingerprinter was "trained" on all of our data, but it used only the adsorption energy data on monometallic surfaces. To ensure the integrity of our validation and test metrics, we should put the data points from the monometallic surface structures in the training set explicitly. This way there is no information leak from our validation/test sets into our training sets. We do this allocation here.

We also stratify our data by adsorbate. This means that we first split our data by adsorbate, and then we perform a separate train/validate/test split for each subset of the data. We then concatenate the training, validation, and test sets back together. This ensures that each of our data partitions has a proportional amount of data from each adsorbate.

In [1]:
%load_ext ipycache

  from IPython.utils.traitlets import Unicode


# GASdb

In [2]:
import pickle
import numpy as np


# Load the GASdb documents, their SDTs, and their fingerprints.
# NOTE: `pickle.load` can execute arbitrary code, so only point these paths at
# files written by this project's own preprocessing scripts/notebooks.
with open('pull_data/gaspy/docs.pkl', 'rb') as file_handle:
    docs = pickle.load(file_handle)
print('%i documents/data points' % len(docs))

with open('sdt/gasdb/sdts.pkl', 'rb') as file_handle:
    sdts = pickle.load(file_handle)

with open('fingerprint/fingerprints_gasdb.pkl', 'rb') as file_handle:
    fingerprints = pickle.load(file_handle)

# `zip` silently truncates to its shortest input, so a stale or partially
# written preprocessing file would quietly drop data points. Fail loudly instead.
assert len(sdts) == len(docs), \
    '%i SDTs but %i documents' % (len(sdts), len(docs))
assert len(fingerprints) == len(docs), \
    '%i fingerprints but %i documents' % (len(fingerprints), len(docs))

# Targets = adsorption energies, stored as a column vector (one row per document)
targets = np.array([doc['energy'] for doc in docs]).reshape(-1, 1)

# Zip it all together for easier data management
data = list(zip(docs, sdts, fingerprints, targets))

47279 documents/data points


In [3]:
%%cache splits_gasdb.pkl docs_train docs_val docs_test sdts_train sdts_val sdts_test fingerprints_train fingerprints_val fingerprints_test targets_train targets_val targets_test

import random
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from gaspy.utils import read_rc


# Seed every source of randomness so that a fresh kernel reproduces the same splits
SPLIT_SEED = 42
rng = random.Random(SPLIT_SEED)

# Two successive 80/20 splits need at least this many points to leave every
# partition non-empty; `train_test_split` raises a ValueError on anything smaller.
MIN_SPLIT_SIZE = 3

# Need this to figure out if the structure is monometallic or not.
# NOTE: `pickle.load` can execute arbitrary code; only load trusted project files.
with open(read_rc('gasdb_path') + '/mp_comp_data.pkl', 'rb') as file_handle:
    comp_by_mpid = pickle.load(file_handle)

# Initialize training set and the "remainder set", which we'll eventually split further
# into train, validate, and test
data_train = []
data_remain = []

# Allocate the monometallics to the training set
for doc, sdt, fingerprint, target in data:
    elements = comp_by_mpid[doc['mpid']]
    if len(elements) == 1:
        data_train.append((doc, sdt, fingerprint, target))

    # Leave everything else to be split normally
    else:
        data_remain.append((doc, sdt, fingerprint, target))

# Figure out all of the adsorbates we'll be looking at.
# We will stratify our train/validate/test splits by adsorbate.
adsorbates = {doc['adsorbate'] for doc in docs}

# Bucket the remainder by adsorbate in a single pass instead of re-scanning the
# whole remainder once per adsorbate. Insertion order within a bucket matches
# the order in `data_remain`.
data_by_adsorbate = {}
for datum in data_remain:
    data_by_adsorbate.setdefault(datum[0]['adsorbate'], []).append(datum)

# Stratify our data by adsorbate. Iterate in sorted order because the iteration
# order of a set of strings can change between interpreter sessions, which would
# otherwise change the pre-shuffle ordering from run to run.
data_val = []
data_test = []
for ads in sorted(adsorbates, key=str):
    _data = data_by_adsorbate.get(ads, [])

    # Adsorbates with too few non-monometallic points cannot be split three
    # ways, so we keep what little there is in the training set.
    if len(_data) < MIN_SPLIT_SIZE:
        data_train.extend(_data)
        continue

    # Split out the testing and validation data
    data_cv, _data_test = train_test_split(_data, test_size=0.2,
                                           random_state=SPLIT_SEED)
    _data_train, _data_val = train_test_split(data_cv, test_size=0.2,
                                              random_state=SPLIT_SEED)

    # Concatenate the data in this split with the rest
    data_train.extend(_data_train)
    data_val.extend(_data_val)
    data_test.extend(_data_test)

# Shuffle all the datasets because they've been sorted by both adsorbate and monometallics
rng.shuffle(data_train)
rng.shuffle(data_val)
rng.shuffle(data_test)

# Parse everything back out explicitly, turning each unzipped tuple into a list
docs_train, sdts_train, fingerprints_train, targets_train = map(list, zip(*data_train))
docs_val, sdts_val, fingerprints_val, targets_val = map(list, zip(*data_val))
docs_test, sdts_test, fingerprints_test, targets_test = map(list, zip(*data_test))

# Every data point must land in exactly one partition
assert len(data_train) + len(data_val) + len(data_test) == len(data)

# Report the final splits
print('%i%% train' % round(len(data_train)/len(data) * 100))
print('%i%% validate' % round(len(data_val)/len(data) * 100))
print('%i%% test' % round(len(data_test)/len(data) * 100))

[Saved variables 'docs_test, docs_train, docs_val, fingerprints_test, fingerprints_train, fingerprints_val, sdts_test, sdts_train, sdts_val, targets_test, targets_train, targets_val' to file '/global/project/projectdirs/m2755/ktran/sandbox/uncertainty_benchmarking/preprocessing/splits_gasdb.pkl'.]
68% train
14% validate
18% test


# Catalysis-Hub

In [4]:
import json
import pickle
import numpy as np


# Load the Catalysis-Hub documents, their SDTs, and their fingerprints.
# NOTE: `pickle.load` can execute arbitrary code, so only point these paths at
# files written by this project's own preprocessing scripts/notebooks.
with open('fingerprint/preprocessed_cathub.json', 'rb') as file_handle:
    docs = json.load(file_handle)
print('%i documents/data points' % len(docs))

with open('sdt/cathub/sdts.pkl', 'rb') as file_handle:
    sdts = pickle.load(file_handle)

with open('fingerprint/fingerprints_cathub.pkl', 'rb') as file_handle:
    fingerprints = pickle.load(file_handle)

# `zip` silently truncates to its shortest input, so a stale or partially
# written preprocessing file would quietly drop data points. Fail loudly instead.
assert len(sdts) == len(docs), \
    '%i SDTs but %i documents' % (len(sdts), len(docs))
assert len(fingerprints) == len(docs), \
    '%i fingerprints but %i documents' % (len(fingerprints), len(docs))

# Targets = adsorption energies, stored as a column vector (one row per document)
targets = np.array([doc['energy'] for doc in docs]).reshape(-1, 1)

# Zip it all together for easier data management
data = list(zip(docs, sdts, fingerprints, targets))

30420 documents/data points


In [5]:
%%cache splits_cathub.pkl docs_train docs_val docs_test sdts_train sdts_val sdts_test fingerprints_train fingerprints_val fingerprints_test targets_train targets_val targets_test

import random
import pickle
import numpy as np
from sklearn.model_selection import train_test_split


# Seed every source of randomness so that a fresh kernel reproduces the same splits
SPLIT_SEED = 42
rng = random.Random(SPLIT_SEED)

# Two successive 80/20 splits need at least this many points to leave every
# partition non-empty; `train_test_split` raises a ValueError on anything smaller.
MIN_SPLIT_SIZE = 3

# Initialize training set and the "remainder set", which we'll eventually split further
# into train, validate, and test
data_train = []
data_remain = []

# Allocate the monometallics to the training set
for doc, sdt, fingerprint, target in data:
    # We assume that the number of elements here is the number of elements
    # within the first three neighbor shells of the adsorbate
    elements = {element for neighbor in doc['neighborcoord']
                for element in neighbor.split(':')[-1].split('-')}
    if len(elements) == 1:
        data_train.append((doc, sdt, fingerprint, target))

    # Leave everything else to be split normally
    else:
        data_remain.append((doc, sdt, fingerprint, target))

# Figure out all of the adsorbates we'll be looking at.
# We will stratify our train/validate/test splits by adsorbate.
adsorbates = {doc['adsorbate'] for doc in docs}

# Bucket the remainder by adsorbate in a single pass instead of re-scanning the
# whole remainder once per adsorbate. Insertion order within a bucket matches
# the order in `data_remain`.
data_by_adsorbate = {}
for datum in data_remain:
    data_by_adsorbate.setdefault(datum[0]['adsorbate'], []).append(datum)

# Stratify our data by adsorbate. Iterate in sorted order because the iteration
# order of a set of strings can change between interpreter sessions, which would
# otherwise change the pre-shuffle ordering from run to run.
data_val = []
data_test = []
for ads in sorted(adsorbates, key=str):
    _data = data_by_adsorbate.get(ads, [])

    # Adsorbates with too few non-monometallic points cannot be split three
    # ways, so we keep what little there is in the training set.
    if len(_data) < MIN_SPLIT_SIZE:
        data_train.extend(_data)
        continue

    # Split out the testing and validation data
    data_cv, _data_test = train_test_split(_data, test_size=0.2,
                                           random_state=SPLIT_SEED)
    _data_train, _data_val = train_test_split(data_cv, test_size=0.2,
                                              random_state=SPLIT_SEED)

    # Concatenate the data in this split with the rest
    data_train.extend(_data_train)
    data_val.extend(_data_val)
    data_test.extend(_data_test)

# Shuffle all the datasets because they've been sorted by both adsorbate and monometallics
rng.shuffle(data_train)
rng.shuffle(data_val)
rng.shuffle(data_test)

# Parse everything back out explicitly, turning each unzipped tuple into a list
docs_train, sdts_train, fingerprints_train, targets_train = map(list, zip(*data_train))
docs_val, sdts_val, fingerprints_val, targets_val = map(list, zip(*data_val))
docs_test, sdts_test, fingerprints_test, targets_test = map(list, zip(*data_test))

# Every data point must land in exactly one partition
assert len(data_train) + len(data_val) + len(data_test) == len(data)

# Report the final splits. Round (rather than truncate) the percentages so the
# report is computed the same way as the GASdb one and sums to ~100%.
print('%i%% train' % round(len(data_train)/len(data) * 100))
print('%i%% validate' % round(len(data_val)/len(data) * 100))
print('%i%% test' % round(len(data_test)/len(data) * 100))

[Saved variables 'docs_test, docs_train, docs_val, fingerprints_test, fingerprints_train, fingerprints_val, sdts_test, sdts_train, sdts_val, targets_test, targets_train, targets_val' to file '/global/project/projectdirs/m2755/ktran/sandbox/uncertainty_benchmarking/preprocessing/splits_cathub.pkl'.]
70% train
13% validate
16% test
