# Benchmarking Workflow
This workflow performs a calculation at the BHandHLYP/PCSEG-1/HIGH level of theory. It starts with a fragment calculation using HF/PCSEG-0/LOW. The fragment results are then joined as the guess for a full calculation at HF/PCSEG-0/LOW. Next, a BHandHLYP/PCSEG-0/LOW calculation is performed. Its density matrix is projected for a BHandHLYP/PCSEG-1/LOW calculation, and the workflow finishes at BHandHLYP/PCSEG-1/HIGH.

In [None]:
from os.path import join
# Base name of the geometry; the structure is read from input/<geom>.pdb.
geom = "1crn-step-99-conn"
# Basis set names: the first is used for the early stages, the second for the
# projected and target calculations.
bnames = ["pcseg-0", "pcseg-1"]
# Directory in which the calculations are run.
workdir = "scratch_clean"
# Directory that receives copies of the guess matrices and generated input files.
dry_run = "dry_run_clean"
# Directory for pickled intermediates (cluster mapping, shuffle, basis sets).
cache_dir = "cache"

In [None]:
from os.path import exists
from os import mkdir
# Make sure the cache and dry-run directories exist before anything is
# written to them.
for needed_dir in (cache_dir, dry_run):
    if not exists(needed_dir):
        mkdir(needed_dir)

Read in the input file.

In [None]:
from pyntchem.io import read_pdb
# Parse the PDB file of the chosen geometry from the "input" directory.
# NOTE(review): `sys` is also the name of a standard-library module; the name
# is kept because the rest of the notebook refers to the system by it.
with open(join("input", geom + ".pdb")) as ifile:
    sys = read_pdb(ifile)

Remove ionic bonds in the pdb file.

In [None]:
# Clear the connectivity recorded for sodium/chloride fragments.
# The connectivity matrix can be absent (the capping cell further down checks
# `sys.conmat is None`), so there is nothing to clear in that case.
if sys.conmat is not None:
    for fragid in sys:
        # Fragment ids containing "NA" or "CL" are treated as ions.
        if "NA" in fragid or "CL" in fragid:
            # Entry 0 is presumably the ion's only atom -- TODO confirm.
            sys.conmat[fragid][0] = {}

Calculators.

In [None]:
from pyntchem.calculator import JobscriptCalculator
from socket import gethostname
# Calculator shared by the fragment guess and every full-system run below.
calc = JobscriptCalculator(computer="Fugaku", skip=True, verbose=True)
# arguments redacted because they contain private keys
# Extra keyword arguments forwarded to the fragment guess (small), the
# first-basis runs (medium) and the second-basis/target runs (large).
calc_args_small = {}
calc_args_medium = {}
calc_args_large = {}

Input file setup.

In [None]:
def general_setup(inp, accuracy="low", shift=False, hf=False):
    """
    Apply the settings shared by all calculations of this workflow.

    The input file object is modified in place; nothing is returned.

    Args:
        inp: input file object to configure.
        accuracy (str): "low" or "high". Selects the pruned DFT grid (when
          ``hf`` is False) and the integral and SCF thresholds.
        shift (bool): if True, ``vshift`` is set to 0.5 and ``finshift`` to
          False; otherwise ``vshift`` is set to 0.0.
        hf (bool): if True a basic RHF calculation is set up, otherwise the
          custom BHandHLYP functional with a pruned grid.

    Raises:
        ValueError: if ``accuracy`` is neither "low" nor "high".
    """
    # Reject unknown levels before touching the input: an unrecognised value
    # (for example "LOW") would otherwise select the larger grid while
    # leaving the integral and convergence thresholds unset.
    if accuracy not in ("low", "high"):
        raise ValueError('accuracy must be "low" or "high", got '
                         + repr(accuracy))

    # Functional
    if hf:
        inp.set_basic_rhf()
    else:
        # BHandHLYP
        inp.set_custom_dft({"b88": 0.5}, {"lyp": 1.0}, 0.5)
        if accuracy == "low":  # (SG1)
            inp.set_dft_prune_grid(50, 194)
        else:  # (SG2)
            inp.set_dft_prune_grid(75, 302)

    # Convergence help
    inp["scf"].maxdiis = 10
    inp["scf"].onbasdiis = True

    # ETC
    inp["scf"].iprint = 3

    # Performance parameters
    inp["int2"].procslices = 8
    if "dft" in inp:
        inp["dft"].procslices = 16
    inp.set_linear_scaling()
    inp["scf"].findiag = False

    # Solver Parameters
    inp["ntpoly"].thresholdorth = 1e-10
    inp["ntpoly"].convergencethresholdorth = 5e-7
    inp["ntpoly"].thresholdpdm = 1e-8
    inp["ntpoly"].convergencethresholdpdm = 1e-8
    inp["ntpoly"].orthtype = "eig"
    inp["ntpoly"].pdmtype = "eig"

    # Integral Parameters
    if accuracy == "low":
        inp["int2"].prelinkjthreshold = 1e-7
        inp["int2"].prelinkkthreshold = 1e-4
        inp.set_scf_convergence(1e-4, 5e-3)
        # thrpre is only overridden at low accuracy.
        inp["int2"].thrpre = 1e-9
    else:
        inp["int2"].prelinkjthreshold = 1e-8
        inp["int2"].prelinkkthreshold = 1e-5
        inp.set_scf_convergence(1e-5, 1e-4)

    # Shift
    if shift:
        inp["scf"].vshift = 0.5
        inp["scf"].finshift = False
    else:
        inp["scf"].vshift = 0.0

## System Fragments
For the systems containing water molecules, we want to join them into larger clusters to reduce the number of fragment jobs.

In [None]:
def cluster_molecules(sys, k):
    """
    Group the fragments of a system into clusters by position.

    Each fragment is reduced to the mean position of its atoms, the means
    are handed to scipy's ``kmeans2``, and every fragment id is mapped to a
    cluster name of the form "CLU:<label>".

    Args:
        sys: mapping from fragment id to an iterable of atoms; every atom
          must provide ``get_position()``.
        k: number of clusters passed to ``kmeans2``.

    Returns:
        dict: fragment id -> "CLU:<label>".
    """
    from numpy import mean, ravel
    from scipy.cluster.vq import kmeans2

    def fragment_center(fragment):
        """
        The mean atomic position of a fragment as a flat array.
        """
        positions = [atom.get_position() for atom in fragment]
        return ravel(mean(positions, axis=0))

    fragment_ids = list(sys)
    centers = [fragment_center(fragment) for fragment in sys.values()]

    # Only the per-fragment labels are needed, not the cluster centers.
    _, labels = kmeans2(centers, k)
    return {fragid: "CLU:" + str(lab)
            for fragid, lab in zip(fragment_ids, labels)}

In [None]:
from pyntchem.systems import System, copy_bonding_information
from pyntchem.fragments import Fragment
from pickle import load, dump

# Split the system: water fragments ("HOH" in the id) are collected for
# clustering, every other fragment is carried over unchanged.
clustered = System()
watersys = System()
for k, v in sys.items():
    if "HOH" in k:
        watersys[k] = v
    else:
        clustered[k] = v

if len(list(watersys)) > 0:
    pname = join(cache_dir, "mapping_" + geom + ".pickle")
    try:
        # NOTE: unpickling executes code from the file; only use trusted caches.
        with open(pname, "rb") as ifile:
            mapping = load(ifile)
    except Exception as e:
        # No usable cache: report why, recompute the mapping, and store it.
        print(e)
        # About ten waters per cluster, but never fewer than one cluster:
        # kmeans2 rejects a cluster count below 1.
        nclusters = max(1, len(list(watersys)) // 10)
        mapping = cluster_molecules(watersys, nclusters)
        with open(pname, "wb") as ofile:
            dump(mapping, ofile)

    # Merge every water into the fragment of its cluster.
    for k, v in mapping.items():
        if v not in clustered:
            clustered[v] = Fragment()
        clustered[v] += sys[k]
    copy_bonding_information(sys, clustered)
else:
    # Nothing to cluster: use the system as read.
    clustered = sys

Create the shuffled system for load balancing.

In [None]:
def get_shuffled_system(sys):
    """
    Build a system with one atom per fragment, in random order.

    Args:
        sys: mapping of fragments, each of which iterates over its atoms.

    Returns:
        System: single-atom fragments with ids "FRA:0", "FRA:1", ...,
        assigned in a randomly drawn order.
    """
    from pyntchem.systems import System
    from pyntchem.fragments import Fragment
    from random import sample

    # One single-atom fragment per atom, numbered in traversal order.
    per_atom = System()
    every_atom = (atom for fragment in sys.values() for atom in fragment)
    for index, atom in enumerate(every_atom):
        per_atom["FRA:" + str(index)] = Fragment([atom])

    # Draw all ids without replacement (a random permutation) and renumber.
    ids = list(per_atom)
    shuffled = System()
    for index, fragid in enumerate(sample(ids, len(ids))):
        shuffled["FRA:" + str(index)] = per_atom[fragid]

    return shuffled

In [None]:
# Reuse a stored shuffle when it can be loaded; otherwise print the reason,
# build a new shuffle from the clustered system, and store it.
pname = join(cache_dir, "shuffle_" + geom + ".pickle")
try:
    with open(pname, "rb") as cached:
        shuffle_sys = load(cached)
except Exception as err:
    print(err)
    shuffle_sys = get_shuffled_system(clustered)
    with open(pname, "wb") as out:
        dump(shuffle_sys, out)

Compute the charges.

In [None]:
from pyntchem.preprocessing import guess_fragment_charges
# Residue names treated as amino acids.
amino_acids = [
    "ARG", "HIS", "HIP", "LYS", "ASP", "GLU",
    "SER", "THR", "ASN", "GLN", "CYS", "SEC",
    "GLY", "PRO", "ALA", "VAL", "ILE", "LEU",
    "MET", "PHE", "TYR", "TRP",
]

# Collect the fragments whose id starts with an amino-acid name (the part
# before the first ":").
biosys = System()
for fragid, frag in clustered.items():
    resname = fragid.split(":")[0]
    if resname in amino_acids:
        biosys[fragid] = frag

# Charges are only guessed when there is at least one such fragment.
charges = guess_fragment_charges(biosys) if len(list(biosys)) > 0 else {}

# Fixed charges for the ions; water clusters ("CLU:") are skipped so that
# they are not mistaken for chloride.
for k in clustered:
    if "CLU:" not in k:
        if "CL" in k:
            charges[k] = -1
        elif "NA" in k:
            charges[k] = +1

## Fragment Calculations
Now we can compute the fragments. Get the basis sets.

In [None]:
from pyntchem.basis import BasisSet, symlookup
from pickle import load, dump

pname = join(cache_dir, geom + "-basis2.pickle")
try:
    with open(pname, "rb") as cached:
        basis = load(cached)
except Exception:
    # No usable cache: build each requested basis set for the element
    # symbols present in the system, then store the result.
    symlist = {at.sym for frag in sys.values() for at in frag}
    basis = {b: BasisSet(b, {x: symlookup[x] for x in symlist})
             for b in bnames}

    with open(pname, "wb") as out:
        dump(basis, out)

Run.

In [None]:
from pyntchem.inputfile import Inputfile
# Input for the fragment calculations: Hartree-Fock, low accuracy, shift
# enabled, with the SCF guess set to "diagonal".
inp = Inputfile()
general_setup(inp, accuracy="low", shift=True, hf=True)
inp.set_scf_guess("diagonal")

In [None]:
# Request "hydrogen" capping only when connectivity information is available.
capping = None if sys.conmat is None else "hydrogen"

In [None]:
from pyntchem.preprocessing import create_fragment_guess
from pyntchem.postprocessing import NTChemTool
from pyntchem.preprocessing import put_guess_matrix
from contextlib import suppress

# Post-processing helper handed to create_fragment_guess.
tool = NTChemTool()

# Build the fragment guess with the first basis set under <workdir>/<geom>/guess.
# Only the first returned value is kept; the second is discarded.
dens, _ = create_fragment_guess(clustered, inp, basis[bnames[0]],
                                calc, tool, join(workdir, geom, "guess"),
                                capping=capping, charges=charges, 
                                shuffle_sys=shuffle_sys, **calc_args_small)

In [None]:
# Name of the full-system HF run that consumes the fragment guess.
hf_name = basis[bnames[0]].name + "_hf"

# Put guess matrix in the actual calculation directory
# (an OSError is silently ignored).
with suppress(OSError):
    put_guess_matrix(join(workdir, geom), hf_name, dens_alp=dens)

# Put guess matrix in the dry run directory, creating the two directory
# levels first when they are missing (an OSError is silently ignored).
with suppress(OSError):
    for dname in (join(dry_run, geom), join(dry_run, geom, hf_name)):
        if not exists(dname):
            mkdir(dname)
    put_guess_matrix(join(dry_run, geom), hf_name, dens_alp=dens)

## Full calculations
Now we are ready to run on the full system.

In [None]:
# Input for the full-system Hartree-Fock run: low accuracy, shift enabled,
# SCF guess set to "readdens".
inp = Inputfile()
general_setup(inp, accuracy="low", shift=True,  hf=True)
inp.set_scf_guess("readdens")
# Total charge is the sum of the per-fragment charges.
inp["scf"].icharg = sum(charges.values())

In [None]:
from shutil import copy2
# Run the full-system HF calculation on the shuffled system.
run_name = basis[bnames[0]].name + "_hf"
calc.run(shuffle_sys, inp, basis[bnames[0]], name=run_name,
         run_dir=join(workdir, geom), **calc_args_medium)

# Keep a copy of the input file in the dry-run tree.
inp_file = run_name + ".Inp"
copy2(join(workdir, geom, run_name, inp_file),
      join(dry_run, geom, run_name, inp_file))

In [None]:
from time import sleep
# Poll every ten seconds until calc.check_results() returns a true value.
while not calc.check_results():
    sleep(10.0)

# Collect the log of every calculation, keyed as in calc.calculations.
logfiles = {name: job.log for name, job in calc.calculations.items()}

Activate DFT.

In [None]:
# Use the density recorded in the HF log as the guess for the DFT run, in
# both the working and the dry-run tree (an OSError is silently ignored).
guess = logfiles[basis[bnames[0]].name + "_hf"].densalp
for root in (workdir, dry_run):
    with suppress(OSError):
        put_guess_matrix(join(root, geom), basis[bnames[0]].name,
                         dens_alp_file=guess)

In [None]:
# Input for the first DFT run (hf=False selects BHandHLYP): low accuracy,
# shift enabled, SCF guess set to "readdens".
inp = Inputfile()
general_setup(inp, accuracy="low", shift=True,  hf=False)
inp.set_scf_guess("readdens")
# Total charge is the sum of the per-fragment charges.
inp["scf"].icharg = sum(charges.values())

In [None]:
# Run the DFT calculation in the first basis set on the shuffled system.
run_name = basis[bnames[0]].name
calc.run(shuffle_sys, inp, basis[bnames[0]], name=run_name,
         run_dir=join(workdir, geom), **calc_args_medium)

# Keep a copy of the input file in the dry-run tree.
inp_file = run_name + ".Inp"
copy2(join(workdir, geom, run_name, inp_file),
      join(dry_run, geom, run_name, inp_file))

In [None]:
# Poll every ten seconds until calc.check_results() returns a true value.
while not calc.check_results():
    sleep(10.0)

# Collect the log of every calculation, keyed as in calc.calculations.
logfiles = {name: job.log for name, job in calc.calculations.items()}

Project up to a larger basis set.

In [None]:
# Use the density recorded in the first-basis DFT log as the guess for the
# run in the second basis set, in both trees (an OSError is silently ignored).
guess = logfiles[basis[bnames[0]].name].densalp
for root in (workdir, dry_run):
    with suppress(OSError):
        put_guess_matrix(join(root, geom), basis[bnames[1]].name,
                         dens_alp_file=guess)

In [None]:
# Input for the first run in the second basis set: DFT, low accuracy, shift
# enabled, SCF guess set to "readdens".
inp = Inputfile()
general_setup(inp, accuracy="low", shift=True, hf=False)
inp.set_scf_guess("readdens")
# Enable projection; the first basis set is passed to calc.run as
# basis_set_proj when this input is run.
inp.set_project()
inp["projdens"].iprint = 3
# Total charge is the sum of the per-fragment charges.
inp["scf"].icharg = sum(charges.values())

In [None]:
# Run the projected DFT calculation in the second basis set; the first basis
# set is supplied as the projection source.
run_name = basis[bnames[1]].name
calc.run(shuffle_sys, inp, basis[bnames[1]], name=run_name,
         basis_set_proj=basis[bnames[0]], run_dir=join(workdir, geom),
         **calc_args_large)

# Keep a copy of the input file in the dry-run tree.
inp_file = run_name + ".Inp"
copy2(join(workdir, geom, run_name, inp_file),
      join(dry_run, geom, run_name, inp_file))

In [None]:
# Poll every ten seconds until calc.check_results() returns a true value.
while not calc.check_results():
    sleep(10.0)

# Collect the log of every calculation, keyed as in calc.calculations.
logfiles = {name: job.log for name, job in calc.calculations.items()}

In [None]:
# "SCF_Driv" timing of every run whose name contains "pc", divided by 3600
# (presumably seconds to hours -- the chart below labels the axis "Time (h)").
scf_hours = [log.get_timings("SCF_Driv")[0] / 3600
             for name, log in logfiles.items() if "pc" in name]
print(scf_hours)

Target accuracy calculation.

In [None]:
# Use the density recorded in the second-basis log as the guess for the
# "target" run, in both trees (an OSError is silently ignored).
guess = logfiles[basis[bnames[1]].name].densalp
for root in (workdir, dry_run):
    with suppress(OSError):
        put_guess_matrix(join(root, geom), "target", dens_alp_file=guess)

In [None]:
# Input for the final run: DFT at high accuracy with the shift disabled,
# SCF guess set to "readdens".
inp = Inputfile()
general_setup(inp, accuracy="high", shift=False, hf=False)
inp.set_scf_guess("readdens")
# Total charge is the sum of the per-fragment charges.
inp["scf"].icharg = sum(charges.values())

In [None]:
# Run the target-accuracy calculation in the second basis set.
run_name = "target"
calc.run(shuffle_sys, inp, basis[bnames[1]], name=run_name,
         run_dir=join(workdir, geom), **calc_args_large)

# Keep a copy of the input file in the dry-run tree.
inp_file = run_name + ".Inp"
copy2(join(workdir, geom, run_name, inp_file),
      join(dry_run, geom, run_name, inp_file))

In [None]:
# Poll every ten seconds until calc.check_results() returns a true value.
while not calc.check_results():
    sleep(10.0)

# Collect the log of every calculation, keyed as in calc.calculations.
logfiles = {name: job.log for name, job in calc.calculations.items()}

Create the timing chart.

In [None]:
from matplotlib import pyplot as plt
fig, axs = plt.subplots(1, 1)

# Look the runs up under exactly the names that were passed to calc.run, in
# execution order, instead of repeating them as string literals.
small_name = basis[bnames[0]].name
large_name = basis[bnames[1]].name
keys = [small_name + "_hf", small_name, large_name, "target"]
# "SCF_Driv" timing of each stage divided by 3600 (plotted as hours).
times = [logfiles[x].get_timings("SCF_Driv")[0]/3600 for x in keys]
sv = sum(times)

axs.plot(times,
         marker='o', color='C0', linestyle='--', markersize=12)
axs.set_ylabel("Time (h)", fontsize=18)
# The axis has to reach the total, which is drawn as a horizontal line.
axs.set_ylim(0, sv*1.1)
axs.set_xticks(range(len(times)))
axs.set_xticklabels(["PCSEG-0-HF/LOW", "PCSEG-0-DFT/LOW",
                     "PCSEG-1-DFT/LOW", "PCSEG-1-DFT/HIGH"], rotation=90)
axs.axhline(sv, color='k', linewidth=2, label="Total")
axs.tick_params(axis='both', which='major', labelsize=16)
axs.legend(prop={'size': 14})

plt.savefig("timings-dft.png", dpi=600, bbox_inches = "tight")