To fingerprint an adsorption site, we need to know the adsorbate's neighbors and those neighbors' own neighbors. The data from Catalysis-Hub do not include this information, so we have to work it out ourselves. This notebook does that for us.
# Initialize
Load the data in their raw form. `cathub.pkl` should have been created by `../pull_data/catalysis-hub/pull.py`.

In [1]:
import pickle


# Read the raw Catalysis-Hub dump that the pull script wrote to disk
cathub_pickle_path = '../pull_data/catalysis-hub/cathub.pkl'
with open(cathub_pickle_path, 'rb') as pickle_file:
    cathub_data = pickle.load(pickle_file)

In `../pull_data/catalysis-hub/analyze_data.ipynb`, we learned that the majority of the Catalysis-Hub data came from one source that used Quantum Espresso 5.1 and the BEEF-vdW functional. Let's just grab all the data from there.

In [2]:
# Filter out the data from sources we don't plan to use
from collections import Counter

# Keys that are either constant after filtering or not needed downstream
keys_to_drop = ('pubId', 'dftCode', 'dftFunctional', 'Equation', 'username')

docs = []
for adsorbate, _docs in cathub_data.items():
    for doc in _docs:
        if (doc['pubId'] == 'MamunHighT2019'
                and doc['dftCode'] == 'Quantum ESPRESSO 5.1'
                and doc['dftFunctional'] == 'BEEF-vdW'):
            # NOTE: the documents are edited in place, so `cathub_data` sees
            # these changes too
            doc['adsorbate'] = adsorbate

            # Delete some keys we don't care about
            for key in keys_to_drop:
                del doc[key]

            docs.append(doc)
print('%i data points' % len(docs))


# Display all the adsorbates for clarity's sake. Tally them in a single pass
# (instead of re-scanning `docs` once per adsorbate) and report them in
# first-seen order so the printout is the same from run to run.
adsorbate_counts = Counter(doc['adsorbate'] for doc in docs)
adsorbates = set(adsorbate_counts)
for adsorbate, count in adsorbate_counts.items():
    print('    %i documents for %s' % (count, adsorbate))

30420 data points
    1148 documents for OH
    10074 documents for H
    6664 documents for C
    9000 documents for N
    3534 documents for O


In [3]:
# Display one document (also for clarity). At this stage each document still
# carries its raw atoms objects under the 'systems' key.
docs[0]

{'coverages': '{"H": 0.25}',
 'systems': [Atoms(symbols='Pt3TiPt3TiPt3Ti', pbc=True, cell=[[5.60132660361148, 0.0, 0.0], [-2.8006638018057, 4.85089165671743, 0.0], [0.0, 0.0, 24.5734632610122]], constraint=FixAtoms(indices=[0, 1, 2, 3, 4, 5, 6, 7])),
  Atoms(symbols='H2', pbc=True, cell=[19.9999985846621, 19.9999985846621, 20.7371645324951]),
  Atoms(symbols='Pt3Ti', pbc=True, cell=[3.96073671971094, 3.96073671971094, 3.96073671971094]),
  Atoms(symbols='PtTiPt3TiPt3TiPt2H', pbc=True, cell=[[5.60132660361148, 0.0, 0.0], [2.8006638018057, 4.85089165671743, 0.0], [0.0, 0.0, 24.5734632610122]], constraint=FixAtoms(indices=[0, 1, 2, 3, 4, 5, 6, 7]))],
 'energy': -0.2928098648580715,
 'adsorbate': 'H'}

# Finding neighbor shells
The fingerprinting method we use in GASpy currently requires us to know the neighbors and the neighbors' neighbors. CatHub did not give that to us, so we use GASpy to figure it out.

In [5]:
import numpy as np
from gaspy.utils import multimap
from gaspy.atoms_operators import fingerprint_adslab



def fingerprint_doc(doc):
    '''
    Modifies a document from Catalysis-Hub so that it can be post-processed by GASpy_regressions

    Arg:
        doc     Dictionary with at least a 'systems' key (a list of atoms
                objects) and an 'adsorbate' key (a string of the adsorbate's
                chemical symbols, e.g. 'OH').
    Returns:
        doc     A shallow copy of the input document with every item returned
                by `fingerprint_adslab` added and the 'systems' key removed.
                Neither the input document nor its atoms objects are modified.
    Raises:
        AssertionError  If the trailing atoms of the adsorbate+slab system do
                        not spell out the adsorbate's name.
    '''
    doc = doc.copy()

    # Assume that the largest atoms object is the adsorbate+slab system
    # (`max` returns the first one in case of a tie). `doc.copy()` above is
    # only a shallow copy, so take a copy of the atoms before tagging them;
    # otherwise we would silently change the caller's atoms object.
    adslab = max(doc['systems'], key=len).copy()

    # Verify that the adsorbates are always last in the atoms object. This
    # treats every character of the name as one atom, which holds for
    # single-letter element symbols.
    adsorbate_name = doc['adsorbate']
    adsorbate = adslab[-len(adsorbate_name):]
    if str(adsorbate.symbols) != adsorbate_name:
        # Raised explicitly (instead of via `assert`) so that the check is
        # not stripped out when Python runs with -O
        raise AssertionError('Expected the last atoms of the adslab to be "%s", but found "%s"'
                             % (adsorbate_name, adsorbate.symbols))

    # Now that we know the adsorbate atoms are last, let's tag them appropriately:
    # 1 for adsorbate atoms, 0 for everything else
    tags = np.zeros(len(adslab), int)
    tags[-len(adsorbate):] = 1
    adslab.set_tags(tags)

    # Now we can use GASpy to find the neighbors and such
    fingerprint = fingerprint_adslab(adslab)
    doc.update(fingerprint.items())

    # We don't need the atoms objects anymore
    del doc['systems']

    return doc


# Fingerprint every document and replace `docs` with the results.
# NOTE(review): judging by its keyword arguments, GASpy's `multimap`
# presumably spreads the work over 8 worker processes -- confirm against
# gaspy.utils.
docs = multimap(fingerprint_doc, docs, processes=8, maxtasksperchild=100, chunksize=10, n_calcs=len(docs))

100%|██████████| 30420/30420 [04:36<00:00, 110.17it/s]


In [6]:
# Display the first fingerprinted document to check the newly added keys
docs[0]

{'coverages': '{"H": 0.25}',
 'energy': -0.2928098648580715,
 'adsorbate': 'H',
 'coordination': 'Pt-Pt',
 'neighborcoord': ['Pt:Pt-Pt-Pt-Pt-Pt-Ti-Ti', 'Pt:Pt-Pt-Pt-Pt-Pt-Ti-Ti'],
 'nextnearestcoordination': 'Pt-Pt-Pt-Ti'}

Now save the fingerprinted documents to `preprocessed_cathub.json`.

In [7]:
import json


# Write the fingerprinted documents out as JSON
output_path = 'preprocessed_cathub.json'
with open(output_path, 'w') as json_file:
    json.dump(docs, json_file)