In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to local .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
# Read the training table, then keep only the rows where FFV is present.
full_train_data = pd.read_csv("../data/train.csv")
valid_ffv = full_train_data.loc[lambda frame: frame['FFV'].notna()]

# Of those, keep the rows whose SMILES contains exactly two literal '*' characters.
two_star_ffv = valid_ffv.loc[lambda frame: frame['SMILES'].str.count(r"\*").eq(2)]

In [40]:
#https://github.com/rdkit/rdkit/issues/7716
from pyscf import gto, scf
from rdkit import Chem
from rdkit.Chem import rdDistGeom

# Draw a single row with a fixed random_state (so the pick is repeatable)
# and pull out its SMILES string.
sample_smiles = two_star_ffv.sample(n=1, random_state=1)['SMILES'].iloc[0]

def remove_asterisk(smiles):
    """Return `smiles` with every '*' removed and any resulting '()' dropped.

    The '()' removal is a single left-to-right pass performed after the
    asterisks are gone, so a branch that held only '*' (e.g. 'C(*)C')
    collapses to 'CC'. Empty pairs that only appear as a result of that pass
    are not removed again.
    """
    without_stars = "".join(smiles.split('*'))
    return "".join(without_stars.split('()'))

# Build an RDKit molecule from the sampled SMILES with its '*' characters stripped.
mol = Chem.MolFromSmiles(remove_asterisk(sample_smiles))
if mol is None:
    # MolFromSmiles reports a parse failure by returning None rather than raising.
    raise ValueError(f"RDKit could not parse the SMILES derived from {sample_smiles!r}")
mol = Chem.AddHs(mol)

# Embed one 3D conformer with ETKDGv3. The seed is pinned so the geometry, and
# therefore the SCF energy printed below, is the same on every run.
geometry = rdDistGeom.ETKDGv3()
geometry.randomSeed = 1
if rdDistGeom.EmbedMolecule(mol, geometry) < 0:
    # EmbedMolecule returns the new conformer id, or -1 when embedding fails;
    # continuing would hand PySCF an empty geometry.
    raise RuntimeError(f"RDKit failed to embed a 3D conformer for {sample_smiles!r}")

# An XYZ block starts with an atom-count line and a comment line; PySCF only
# wants the "element x y z" rows that follow.
mol_xyz = Chem.MolToXYZBlock(mol).splitlines()[2:]
mol_xyz = '\n'.join(mol_xyz)

#https://pyscf.org/user/using.html
mymol = gto.Mole()
mymol.atom = mol_xyz
mymol.basis = 'sto-3g'

mymol.build()

# Restricted Hartree-Fock on the embedded geometry, then the one-particle
# reduced density matrix from the resulting orbitals.
mf = scf.RHF(mymol)
mf.kernel()
dm1 = mf.make_rdm1()
print(dm1)


converged SCF energy = -1161.14913590736
[[ 2.07030080e+00 -2.28172889e-01 -1.36160615e-02 ...  2.12686957e-04
  -1.23019739e-03 -1.63912127e-03]
 [-2.28172889e-01  8.75305320e-01  4.42143035e-02 ... -9.57427356e-04
   5.60495329e-03  6.89929335e-03]
 [-1.36160615e-02  4.42143035e-02  5.31253510e-01 ...  1.00522717e-03
  -6.84017336e-03 -9.84755836e-03]
 ...
 [ 2.12686957e-04 -9.57427356e-04  1.00522717e-03 ...  6.06607153e-01
  -1.60864213e-03 -5.03202079e-04]
 [-1.23019739e-03  5.60495329e-03 -6.84017336e-03 ... -1.60864213e-03
   6.06015943e-01 -7.77503491e-02]
 [-1.63912127e-03  6.89929335e-03 -9.84755836e-03 ... -5.03202079e-04
  -7.77503491e-02  5.91910010e-01]]


In [None]:
def optimize_ffv(smiles: str, basis: str = 'sto-3g', max_cycle: int = 50, conv: float = 1e-6, basis_set: "str | None" = None):
    """Run a restricted Hartree-Fock calculation for a SMILES string.

    The '*' characters are stripped from `smiles` (via `remove_asterisk`),
    hydrogens are added, a single 3D conformer is embedded with RDKit's
    ETKDGv3, and that geometry is passed to PySCF.

    Args:
        smiles: Input SMILES; may contain '*' characters.
        basis: Name of the basis set handed to PySCF.
        max_cycle: Maximum number of SCF iterations.
        conv: SCF convergence tolerance (assigned to `conv_tol`).
        basis_set: Legacy alias for `basis`. When given, it takes precedence
            over `basis`, which matches the earlier behavior where only
            `basis_set` was honored.

    Returns:
        The `scf.RHF` object after `kernel()` has been called. The calculation
        is not guaranteed to have converged within `max_cycle` iterations;
        check `mf.converged` before trusting the results.

    Raises:
        ValueError: If RDKit cannot parse the stripped SMILES or cannot embed
            a 3D conformer for it.
    """
    # Previously `basis` was accepted but silently ignored; honor it unless
    # the caller explicitly used the `basis_set` alias.
    chosen_basis = basis if basis_set is None else basis_set

    mol = Chem.MolFromSmiles(remove_asterisk(smiles))
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None rather than raising.
        raise ValueError(f"RDKit could not parse the SMILES derived from {smiles!r}")
    mol = Chem.AddHs(mol)

    geometry = rdDistGeom.ETKDGv3()
    if rdDistGeom.EmbedMolecule(mol, geometry) < 0:
        # EmbedMolecule returns -1 on failure; without a conformer there is no
        # geometry to give PySCF.
        raise ValueError(f"RDKit failed to embed a 3D conformer for {smiles!r}")

    # Drop the two XYZ header lines (atom count, comment); keep the atom rows.
    mol_xyz = '\n'.join(Chem.MolToXYZBlock(mol).splitlines()[2:])

    mymol = gto.Mole()
    mymol.atom = mol_xyz
    mymol.basis = chosen_basis
    mymol.build()

    mf = scf.RHF(mymol)
    mf.conv_tol = conv
    mf.max_cycle = max_cycle
    mf.verbose = 4
    mf.kernel()
    return mf

# Quick check of optimize_ffv on the sampled SMILES, capping the SCF at five
# iterations, then show the orbital energies from that run.
mf_test = optimize_ffv(smiles=sample_smiles, max_cycle=5)
print(mf_test.mo_energy)