In [2]:
from pyscf import gto, scf, dft
import numpy as np
from IPython.display import clear_output
import pandas as pd
from pathlib import Path

In [None]:
def np_to_scfinput(atoms, coords):
    '''
    Input: tuple of (coordinates: np.array; atom representation: tuple of strings)
    '''
    # safeguarding here
    # convertion
    raw = []
    for atom, coord in zip(atoms, coords):
        raw.append(' '.join([atom, *coord.astype(str)]))
    
    return ' ;'.join(raw)

def add_gaussian_noise(coords, sigma=0.02, seed=None):
    '''
    Input: np array shape (N,3)
    First row is central atom: 0,0,0. Or any other atom that is kept at the origin
    Second and third rows are hydrogens / all other atoms
    '''
    # default_rng is the new np random Generator. good to use!
    rng = np.random.default_rng(seed)
    noise = rng.normal(loc=0.0, scale=sigma, size=coords.shape)
    coords_w_noise = coords + noise

    # force the first row (central atom) to be 0,0,0
    coords_w_noise[0] = [0,0,0]
    return coords_w_noise

## Prepare water molecule file (default)

1. Water molecule

In [None]:
# actual water molecule data
bond_len = 0.9572
bond_ang = 104.5
atoms = ('O','H','H') # what atom that each row corresponds to 

2. $CO_2$ molecule

In [5]:
bond_len = 1.163
bond_ang = 180
atoms = ('C','O','O') # what atom that each row corresponds to 


In [6]:
x = bond_len * np.sin(np.radians(bond_ang))
y = bond_len * np.cos(np.radians(bond_ang))

coords = np.array([[0,0,0],[x, y, 0],[-x,y,0]]) # put in the actual data
print(coords)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.42426423e-16 -1.16300000e+00  0.00000000e+00]
 [-1.42426423e-16 -1.16300000e+00  0.00000000e+00]]


3. More complex CH3OH molecule: 

In [None]:

atoms = ('C', 'H', 'H', 'H', 'O', 'H')

coords = np.array([
    [ 0.0000,   0.0000,   0.0000]   # C
    [ 0.4970,   0.0280,   0.9736]   # H
    [ 0.2846,   0.8860,  -0.5809]   # H
    [ 0.3124,  -0.9001,  -0.5437]   # H
    [-1.3613,  -0.0150,   0.3035]   # O
    [-1.8357,  -0.0394,  -0.5177]   # H
])

The code

In [None]:
# change this:
#####################
data_dir = Path('./co2_energies_noise') # change this
file_name = 'co2_noise_dft_b3lyp_def2-SVP'
mf_scanner = gto.M().apply(dft.RKS).set(xc='b3lyp', basis='def2-SVP').as_scanner() # an easy way to ask for energy only
####################

data_dir.mkdir(exist_ok=True)

sigma = 0.02

for i in range(1000):
    coords_w_noise = add_gaussian_noise(coords, sigma=sigma, seed=i)
    inp = np_to_scfinput(atoms, coords_w_noise)
    e = mf_scanner(gto.M(atom=inp))

    np.savez_compressed(
        data_dir / f'{file_name}_{i}.npz', # change y
        coords=coords_w_noise,
        energy = e,
        noise_sigma = sigma
    )

<class 'pyscf.scf.hf.RKS_Scanner'> does not have attributes  basis


converged SCF energy = 448.957981018617
SCF not converged.
SCF energy = 1689.31870196331
converged SCF energy = 352.037718049335
converged SCF energy = 723.061716300095
converged SCF energy = 299.622442577248
converged SCF energy = 428.140581254692
converged SCF energy = 868.143918575402
converged SCF energy = 482.56909778607
converged SCF energy = 172.601476726363
converged SCF energy = 1500.16379626547
converged SCF energy = 974.123991510786
converged SCF energy = 672.671437442671
converged SCF energy = 178.354483770494
converged SCF energy = 393.366107844275
converged SCF energy = 228.270487341527
converged SCF energy = 638.702331628277
converged SCF energy = 362.266860711865
converged SCF energy = 1147.80877922503
converged SCF energy = 629.605746384166
converged SCF energy = 303.375372779779
converged SCF energy = 575.959073601811
converged SCF energy = 292.097252278734
converged SCF energy = 514.343535704566
converged SCF energy = 74.3309401116903
converged SCF energy = 660.23116

In [None]:
# inspection of one single file
index = 6
file_path = data_dir / f'{file_name}_{index}.npz'
data = np.load(file_path)
print(data['coords'], data['energy'])

[[ 0.          0.          0.        ]
 [ 0.92395162 -0.21938935  0.02704284]
 [-0.91363515 -0.20972138  0.00579915]] -75.23438594734546
