In [None]:
import os
import pickle

from ocpmodels.datasets import LmdbDataset
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import lmdb
import pandas as pd

from data_processing import extract_post_hoc_data, extract_last_frame, get_material_data
from utils import pyg2atoms, SiteAnalyzer
from descriptor import (
    get_local_e, get_ads_e, get_slab_e, get_eff_coord, get_center_coord, 
    get_sum_atomic_adsorbate, get_num_adsorbate, get_sites, get_density, 
    get_H_f, get_band_gap, get_space_groups, get_miller_indices, get_adsorption_energy
)

In [None]:
# Open the OC20 LMDB dataset (the file has to be downloaded from the OC20 repository first).
lmdb_path = "/path/to/lmdb/file"
data = LmdbDataset(dict(src=lmdb_path))

In [None]:
# Keep only the systems needed for the post-hoc analysis: those whose adsorbate
# belongs to the O, H or C1 category.
req_data = extract_post_hoc_data(data)

# Map every system id (sid) to the frame id (fid) of its last frame.
# The result is consumed below via `.items()` as (sid, fid) pairs.
last_frame = extract_last_frame(req_data)

# For Symbolic Regression, use the extract_sr_data function instead to retrieve
# the data objects with H category adsorbates.
# NOTE(review): extract_sr_data is not imported in this notebook; presumably it
# lives next to the other extract_* helpers - confirm before using it.

In [None]:
# Save the last frame of every system into a new LMDB file.
db_path = '/enter/path/to/save/new/lmdb/file'

db = lmdb.open(
    db_path,
    map_size=1099511627776 * 2,  # upper bound on the database size: 2 TiB
    subdir=False,
    meminit=False,
    map_async=True,
)

# Group the frames by system id in a single pass so that each system's frames can
# be looked up directly instead of rescanning all of req_data for every system.
# Assumes item['sid'] is hashable and compares equal to the keys of last_frame.
frames_by_sid = {}
for item in req_data:
    frames_by_sid.setdefault(item['sid'], []).append(item)

# Records are stored under consecutive integer keys ("0", "1", ...), in the order
# of last_frame; `key` ends up holding the number of records written.
key = 0
try:
    # One write transaction for the whole export: it is committed when the `with`
    # block exits normally and aborted if an exception is raised.
    with db.begin(write=True) as txn:
        for sid, fid in tqdm(last_frame.items(), total=len(last_frame)):
            for item in frames_by_sid.get(sid, ()):
                if item['fid'] == fid:
                    txn.put(f"{key}".encode("ascii"), pickle.dumps(item))
                    key += 1
    db.sync()
finally:
    # Always release the environment, even if the export fails half-way.
    db.close()

In [None]:
# Load the LMDB file stored at db_path as a dataset.
new_data = LmdbDataset(dict(src=db_path))

In [None]:
# Download mapping and reference energy pickle files from OC20 github repository
mapping_path = '/path/to/mapping/pickle/file.pkl' # Path to the pickle file containing the system ID to Materials Project ID mapping
ref_path = '/path/to/reference_energy/pickle/file.pkl'  # Path to the pickle file containing reference energy

# Generate an API key on the Materials Project database and expose it through the
# MP_API_KEY environment variable, so the key is never stored in the notebook.
api_key = os.environ.get("MP_API_KEY")
if not api_key:
    raise RuntimeError(
        "Materials Project API key not found: set the MP_API_KEY environment variable."
    )
summary, electronic, mp_sid_dict  = get_material_data(mapping_path, api_key)  # Summary and Electronic structure data from the Materials Project API


# Calculate descriptors for all systems in new_data
# Descriptors computed directly from the dataset objects
local_e = get_local_e(new_data)
ads_e = get_ads_e(new_data)
slab_e = get_slab_e(new_data)
eff_coord = get_eff_coord(new_data)
center_coord = get_center_coord(new_data)
sum_atomic_adsorbate = get_sum_atomic_adsorbate(new_data)
num_adsorbates = get_num_adsorbate(new_data)
sites = get_sites(new_data)
# Descriptors looked up from the Materials Project data
density = get_density(summary, mp_sid_dict)
formation_eng = get_H_f(summary, mp_sid_dict)
band_gap = get_band_gap(electronic, mp_sid_dict)
space_groups = get_space_groups(summary, mp_sid_dict)
# Descriptors that additionally need the mapping / reference energy files
miller_indices = get_miller_indices(mapping_path, new_data)
adsorption_energy = get_adsorption_energy(new_data, ref_path)

In [None]:
# Collect the descriptor dictionaries in a fixed order; each dictionary becomes
# one row of the DataFrame, so the row index is the position in this list.
descriptor_list = [
    local_e,
    ads_e,
    slab_e,
    eff_coord,
    center_coord,
    sum_atomic_adsorbate,
    num_adsorbates,
    sites,
    density,
    formation_eng,
    band_gap,
    space_groups,
    miller_indices,
    adsorption_energy,
]
df = pd.DataFrame(descriptor_list)


In [None]:
# Each row of df is one descriptor and each column one key of the descriptor
# dictionaries (presumably a system id - confirm). Drop every column that is
# missing at least one descriptor.
df_new = df.dropna(axis=1)

# Column labels, listed in exactly the same order as the entries of
# descriptor_list: after the transpose, column i holds descriptor_list[i].
column_names = [
    'local_e',               # local_e
    'adsorbate_e',           # ads_e
    'slab_e',                # slab_e
    'effective_coord',       # eff_coord
    'center_coord',          # center_coord
    'Sum_atomic_adsorbate',  # sum_atomic_adsorbate
    'num_adsorbates',        # num_adsorbates
    'sites',                 # sites
    'density',               # density
    'formation_eng',         # formation_eng
    'band_gap',              # band_gap
    'space_group_no',        # space_groups
    'Miller_index',          # miller_indices
    'Eads',                  # adsorption_energy
]
# Guard against the labels drifting out of sync with descriptor_list again.
assert len(column_names) == len(df_new.index), "column_names must match descriptor_list"

# Transpose so that every row is one system and every column one descriptor.
final_dataset = df_new.T.rename(columns=dict(enumerate(column_names)))
