In [2]:
from pyscf import gto, scf, dft
import numpy as np
from IPython.display import clear_output
import pandas as pd
from pathlib import Path

In [None]:
def np_to_scfinput(atoms, coords):
    '''
    Input: tuple of (coordinates: np.array; atom representation: tuple of strings)
    '''
    # safeguarding here
    # convertion
    raw = []
    for atom, coord in zip(atoms, coords):
        raw.append(' '.join([atom, *coord.astype(str)]))
    
    return ' ;'.join(raw)

def add_gaussian_noise(coords, sigma=0.02, seed=None):
    '''
    Input: np array shape (N,3)
    First row is central atom: 0,0,0. Or any other atom that is kept at the origin
    Second and third rows are hydrogens / all other atoms
    '''
    # default_rng is the new np random Generator. good to use!
    rng = np.random.default_rng(seed)
    noise = rng.normal(loc=0.0, scale=sigma, size=coords.shape)
    coords_w_noise = coords + noise

    # force the first row (central atom) to be 0,0,0
    coords_w_noise[0] = [0,0,0]
    return coords_w_noise

## Prepare water molecule file (default)

1. Water molecule

In [15]:
# actual water molecule data
bond_len = 0.9572
bond_ang = 104.5
atoms = ('O','H','H') # what atom that each row corresponds to 

2. $CO_2$ molecule

In [19]:
bond_len = 1.163
bond_ang = 180
atoms = ('C','O','O') # what atom that each row corresponds to 


In [23]:
x = bond_len * np.sin(np.radians(bond_ang/2))
y = bond_len * np.cos(np.radians(bond_ang/2))

coords = np.array([[0,0,0],[x, y, 0],[-x,y,0]]) # put in the actual data
print(coords)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 1.16300000e+00  7.12132114e-17  0.00000000e+00]
 [-1.16300000e+00  7.12132114e-17  0.00000000e+00]]


3. More complex CH3OH molecule: 

In [27]:
atoms = ('C', 'H', 'H', 'H', 'O', 'H')

coords = np.array([
    [ 0.0000,   0.0000,   0.0000],   # C
    [ 0.4970,   0.0280,   0.9736],   # H
    [ 0.2846,   0.8860,  -0.5809],   # H
    [ 0.3124,  -0.9001,  -0.5437],   # H
    [-1.3613,  -0.0150,   0.3035],   # O
    [-1.8357,  -0.0394,  -0.5177],   # H
])

The code

In [30]:
# change this:
#####################
data_dir = Path('./methanol_energies_noise') # change this
file_name = 'methanol_noise_dft_b3lyp_def2-SVP'
mf_scanner = gto.M().apply(dft.RKS).set(xc='b3lyp', basis='def2-SVP').as_scanner() # an easy way to ask for energy only
####################

data_dir.mkdir(exist_ok=True)

sigma = 0.02

for i in range(1000):
    coords_w_noise = add_gaussian_noise(coords, sigma=sigma, seed=i)
    inp = np_to_scfinput(atoms, coords_w_noise)
    e = mf_scanner(gto.M(atom=inp))

    np.savez_compressed(
        data_dir / f'{file_name}_{i}.npz', # change y
        coords=coords_w_noise,
        energy = e,
        noise_sigma = sigma
    )

converged SCF energy = -114.163110130081
converged SCF energy = -114.164673766301
converged SCF energy = -114.168410489937
converged SCF energy = -114.163826636029
converged SCF energy = -114.148598274082
converged SCF energy = -114.172201319779
converged SCF energy = -114.15233105229
converged SCF energy = -114.165516967075
converged SCF energy = -114.156361671069
converged SCF energy = -114.166229465521
converged SCF energy = -114.170618778698
converged SCF energy = -114.16185522284
converged SCF energy = -114.150265513631
converged SCF energy = -114.174030422965
converged SCF energy = -114.161334827174
converged SCF energy = -114.159369049796
converged SCF energy = -114.169511892625
converged SCF energy = -114.167959732412
converged SCF energy = -114.169820684205
converged SCF energy = -114.171213544392
converged SCF energy = -114.163186151675
converged SCF energy = -114.166466781871
converged SCF energy = -114.164825464472
converged SCF energy = -114.163308775753
converged SCF ener

In [31]:
# inspection of one single file
index = 100
file_path = data_dir / f'{file_name}_{index}.npz'
data = np.load(file_path)
print(data['coords'], data['energy'])

[[ 0.          0.          0.        ]
 [ 0.50787947  0.00877235  0.99502017]
 [ 0.29862911  0.90009947 -0.56599875]
 [ 0.33448694 -0.85524055 -0.55592986]
 [-1.36035578  0.02008469  0.2767404 ]
 [-1.82918851 -0.05318235 -0.51809644]] -114.16080122591144
