In [1]:
import numpy as np
import pickle
import sqlite3
from qcapi_utils import QCRecord, Conformation,get_record_id
import requests

import tqdm

In [2]:
# Ask the local server for the records it lists under status=1.
# A timeout keeps a stalled server from hanging the kernel forever, and
# raise_for_status() stops the notebook on an HTTP error instead of parsing
# an error body as if it were the record list.
response = requests.get(
    "http://127.0.0.1:8000/list_records/?status=1",
    timeout=60,
)
response.raise_for_status()
records = response.json()
print(len(records))



178


In [3]:
def convert_numpy(record, keys=None):
    """Return a copy of ``record["conformation"]`` with list values as NumPy arrays.

    Parameters
    ----------
    record : mapping
        Must provide a ``"conformation"`` entry that ``dict()`` can copy.
    keys : iterable of str, optional
        When given, only these entries are kept, in the given order. A key
        that is absent from the conformation raises ``KeyError``.

    Returns
    -------
    dict
        A new dictionary. Values that are ``list`` instances are passed through
        ``np.asarray``; every other value is returned unchanged. The input
        record is not modified.
    """
    conformation = dict(record["conformation"])
    selected = (
        conformation
        if keys is None
        else {name: conformation[name] for name in keys}
    )
    return {
        name: np.asarray(value) if isinstance(value, list) else value
        for name, value in selected.items()
    }

In [4]:
# Conformation fields carried into the dataset, in this order.
keys = [
    "species",
    "coordinates",
    "total_charge",
    "energy",
    "forces",
    "mbis_charges",
    "mbis_volumes",
    # "mbis_volume_ratios",
    "mbis_valence_widths",
]

# One dict per fetched record, restricted to `keys`, with list values as arrays.
dataset_raw = [convert_numpy(record, keys=keys) for record in records]

In [5]:
import yaml
from client import PERIODIC_TABLE

# Load the per-species reference table and print one entry as a sanity check.
with open("atomic_data.yaml", "r") as f:
    atomic_data = yaml.safe_load(f)
print(atomic_data["H_0"])

# Distinct species values that occur anywhere in the raw dataset.
species_per_record = [entry["species"] for entry in dataset_raw]
species_set = np.unique(np.concatenate(species_per_record))
print(species_set)

# Lookup table indexed directly by the species value; slots for species that
# never occur stay at 0.
atomic_energies = np.zeros(species_set.max() + 1)
for species in species_set:
    reference_key = PERIODIC_TABLE[species] + "_0"
    atomic_energies[species] = atomic_data[reference_key]["energy"]
print(atomic_energies)

{'energy': -0.4987605100487198, 'spin': 1, 'nel': 1, 'MBIS_volume': 1.2409718845631814, 'MBIS_sigma': 0.27227412413766605}
[ 1  6  8 11 15]
[   0.           -0.49876051    0.            0.            0.
    0.          -37.87264504    0.          -75.11317841    0.
    0.         -162.30553818    0.            0.            0.
 -341.3059197 ]


In [6]:
# Scale factor applied to energies, forces and the virial below.
# NOTE(review): presumably Hartree -> kcal/mol; confirm the units the server returns.
KCALPERMOL = 627.5096080305927


def _rescale_record(raw):
    """Return a copy of ``raw`` with rescaled energy/forces and two derived fields.

    Adds ``"formation_energy"`` (energy minus the summed per-species reference
    energies from ``atomic_energies``) and ``"virial_tensor"``; overwrites
    ``"energy"`` and ``"forces"`` with their values multiplied by ``KCALPERMOL``.
    """
    atom_species = raw["species"]
    positions = raw["coordinates"]
    raw_forces = raw["forces"]

    # Energy relative to the sum of the per-species reference energies.
    relative_energy = raw["energy"] - np.sum(atomic_energies[atom_species])

    # Negative sum over the first axis of the per-row outer product
    # force x coordinate; keepdims keeps a leading axis of length 1.
    outer_products = raw_forces[:, :, None] * positions[:, None, :]
    virial = -outer_products.sum(axis=0, keepdims=True)

    return {
        **raw,
        "formation_energy": relative_energy * KCALPERMOL,
        "energy": raw["energy"] * KCALPERMOL,
        "forces": raw_forces * KCALPERMOL,
        "virial_tensor": virial * KCALPERMOL,
    }


dataset = [_rescale_record(d) for d in dataset_raw]

In [7]:
# Formation energy divided by the number of atoms, one value per record.
energies = np.array([entry["formation_energy"] for entry in dataset])
natoms = np.array([entry["species"].shape[0] for entry in dataset])
eperatom = energies / natoms

# Summary printed as: mean, standard deviation, minimum, maximum.
print(*(stat(eperatom) for stat in (np.mean, np.std, np.min, np.max)))

-82.45059772254784 1.281363516812011 -85.19286826041558 -75.50506062849448


In [9]:
# Hold out 1% of the records (rounded) for validation; the rest is training.
validation_size = int(round(0.01 * len(dataset)))
train_size = len(dataset) - validation_size

# Seeded generator so the shuffle, and therefore the split, is repeatable.
rng = np.random.default_rng(202420032050)
idx_shuffle = rng.permutation(len(dataset))
print(idx_shuffle[:10])

# The first `train_size` shuffled indices are training, the remainder validation.
train_idx, validation_idx = np.split(idx_shuffle, [train_size])

train_dataset = [dataset[position] for position in train_idx]
validation_dataset = [dataset[position] for position in validation_idx]
print(f"train dataset size : {len(train_dataset):,}")
print(f"validation dataset size : {len(validation_dataset):,}")

[159  14  80 163 114 139  64  68  17  35]
train dataset size : 176
validation dataset size : 2


In [4]:
import pickle

# Previously exported dataset; its "training" list is counted below.
ds_prev = "dataset_ani2x_hard_round2_wb97m-d3bj_def2-tzvppd.pkl"

# NOTE(review): pickle.load can execute code embedded in the file, so only
# point this at files from a trusted source.
with open(ds_prev, mode="rb") as f:
    dataset_prev = pickle.load(f)

n_prev_training = len(dataset_prev["training"])
print(n_prev_training)


69446


In [11]:

label = "dataset_ani2x_hard_round2+dmp_wb97m-d3bj_def2-tzvppd.pkl"

# Build the complete payload BEFORE opening the output file: open(..., "wb")
# truncates immediately, so an error while assembling the lists (for example a
# missing "validation" key in dataset_prev) would otherwise leave an empty
# file behind.
# In each split the new records come first, followed by the previous ones.
merged_dataset = {
    "training": [dataset[j] for j in train_idx] + dataset_prev["training"],
    "validation": [dataset[j] for j in validation_idx] + dataset_prev["validation"],
}

with open(label, "wb") as f:
    pickle.dump(merged_dataset, f)

In [13]:
# Show the kernel's working directory; the relative file names opened in this
# notebook resolve against it.
!pwd

/home/pthomas/Programs/qcAPI
